このブログを検索

9.21.2010

test

package SGM.main;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import SGM.Data.Data;

/*
 Training Set (9,603 docs): LEWISSPLIT="TRAIN";  TOPICS="YES"
 Test Set (3,299 docs): LEWISSPLIT="TEST"; TOPICS="YES"
 Unused (8,676 docs):   LEWISSPLIT="NOT-USED"; TOPICS="YES"
 or TOPICS="NO"
 */
public class Main {
 private static String FileAd = "D:/TESTDATA/Reuter/file";
 private static LinkedList<String> fileList = new LinkedList<String>();

 public static void main(String[] args) {

  setFileList(FileAd);

  SGMReader sgm = new SGMReader(fileList);

  String line = "<TITLE>BAHIA COCOA REVIEW</TITLE>";
  String key = "TITLE";

  System.out.println(getData(line, key));

 }

 private static void setFileList(String address) {
  File file = new File(address);
  String[] list = file.list();

  for (int i = 0; i < list.length; i++) {
   String _ad = new String(address).concat("/" + list[i]);
   if (isFile(_ad)) {
    fileList.add(_ad);
    System.out.println(_ad);
   } else {
    setFileList(_ad);
   }
  }
 }

 private static Boolean isFile(String address) {
  File file = new File(address);
  if (file.isFile()) {
   return true;
  }
  return false;
 }

 private static String getData(String line, String key) {
  int beginIndex = line.indexOf(key) + key.length() + 1;
  int endIndex = line.lastIndexOf(key) - 2;
  line = line.substring(beginIndex, endIndex);
  return line;
 }
}

class SGMReader {
 private String fileAd = "./reut2-001.sgm";
 private String[] element = { "REUTERS", "DATE", "TOPICS", "PLACES", "D", "PEOPLE", "ORGS",
   "EXCHANGES", "COMPANIES", "UNKNOWN", "TEXT", "TITLE", "DATELINE", "BODY", };

 private int testNum = 0;
 private int trainNum = 0;
 private int sum = 0;

 public SGMReader(LinkedList<String> fileList) {
  for (String ad : fileList) {
   this.Reader(ad);
  }

  System.out.println("test:\t" + testNum + "\ntrainNum:\t" + trainNum + "\nsum:" + sum);
 }

 public SGMReader() {
  this.Reader();
 }

 private void Reader(String ad) {
  ArrayList<String> SGM = new ArrayList<String>();
  String title = new String();
  Data data;
  try {
   File fHandle = new File(ad);
   Scanner fScan = new Scanner(fHandle);
   String Mode = "Unused";
   Boolean ReadMode = false;
   String ID = new String();
   Boolean SaveMode = false;

   while (fScan.hasNextLine()) {
    String str = fScan.nextLine();
    // System.out.println(str);

    if (this.haveKey("LEWISSPLIT", str)) {
     SGM.clear();
     ID = new String();
     title = new String();
     ReadMode = false;
     Mode = "Unused";
     SaveMode = false;
     if (this.isTrainingSet(str).equals("TRAIN")) {
      Mode = "TRAIN";
      title = "TRAIN";
      ID = this.getNewID(str);
      trainNum++;
     } else if (this.isTrainingSet(str).equals("TEST")) {
      Mode = "TEST";
      title = "TEST";
      ID = this.getNewID(str);
      testNum++;
     }
    }

    if (Mode != "Unused") {
     if (this.haveKey("TOPICS>", str)) {
      str = str.replace("<D>", "");
      str = str.replace("</D>", "_");
      String topics = this.getData(str, "TOPICS");
      // if (topics.length() != 0) {
      title = title.concat("." + topics + "" + ID + ".txt");
      sum++;
      SaveMode = true;
      // }
     }

     if (this.haveKey("<BODY", str)) {
      str = str.substring(str.indexOf("BODY") + new String("BODY").length() + 1);
      ReadMode = true;
     }
     if (this.haveKey("</BODY", str)) {
      if (SaveMode == true) {
       data = new Data(title, SGM);
      }
      SGM.clear();
      ID = new String();
      title = new String();
      ReadMode = false;
      Mode = "Unused";
      SaveMode = false;
     }

     if (ReadMode == true) {
      SGM.add(str);
      // System.out.println(str);
     }
    }
   }

   System.out.println(SGM.size());
  } catch (Exception e) {
   System.out.println(e.getMessage());
  }
 }

 private String getData(String line, String key) {
  int beginIndex = line.indexOf(key) + key.length() + 1;
  int endIndex = line.lastIndexOf(key) - 2;
  line = line.substring(beginIndex, endIndex);
  return line;
 }

 private String getNewID(String line) {
  int beginIndex = line.indexOf("NEWID") + new String("NEWID").length() + 2;
  int endIndex = line.length() - 2;
  return line.substring(beginIndex, endIndex);
 }

 private String isTrainingSet(String line) {
  if (this.haveKey("YES", line)) {
   if (this.haveKey("TEST", line)) {
    if (!this.haveKey("NOT-USED", line)) {
     return "TEST";
    } else {
     return "Unused";
    }
   } else if (this.haveKey("TRAIN", line)) {
    return "TRAIN";
   } else {
    return "Unused";
   }
  } else {
   return "Unused";
  }
 }

 private Boolean haveKey(String key, String line) {
  if (line.indexOf(key) > 0) {
   return true;
  } else {
   return false;
  }
 }

}

0 件のコメント:

コメントを投稿