package SGM.main;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import SGM.Data.Data;

/*
 * Document split used by this program:
 *   Training Set (9,603 docs): LEWISSPLIT="TRAIN";    TOPICS="YES"
 *   Test Set     (3,299 docs): LEWISSPLIT="TEST";     TOPICS="YES"
 *   Unused       (8,676 docs): LEWISSPLIT="NOT-USED"; TOPICS="YES" or TOPICS="NO"
 */

/**
 * Entry point: collects every regular file below {@link #FileAd}, feeds the
 * list to an {@link SGMReader}, and finally prints a small demonstration of
 * {@link #getData(String, String)}.
 */
public class Main {
    /** Root directory that is scanned recursively for input files. */
    private static String FileAd = "D:/TESTDATA/Reuter/file";

    /** Paths of all regular files found below {@link #FileAd}. */
    private static LinkedList<String> fileList = new LinkedList<String>();

    public static void main(String[] args) {
        setFileList(FileAd);
        // The SGMReader constructor processes every file and prints the counters.
        new SGMReader(fileList);

        String line = "<TITLE>BAHIA COCOA REVIEW</TITLE>";
        String key = "TITLE";
        System.out.println(getData(line, key));
    }

    /**
     * Recursively walks {@code address}, appending every regular file to
     * {@link #fileList} and echoing its path to standard output.
     *
     * @param address path of the directory to scan
     */
    private static void setFileList(String address) {
        String[] list = new File(address).list();
        if (list == null) {
            // File.list() returns null when the path is not a directory or an
            // I/O error occurs; skip it instead of failing with an NPE.
            System.err.println("Cannot list directory: " + address);
            return;
        }
        for (int i = 0; i < list.length; i++) {
            String path = address + "/" + list[i];
            if (isFile(path)) {
                fileList.add(path);
                System.out.println(path);
            } else {
                setFileList(path);
            }
        }
    }

    /**
     * @param address path to test
     * @return {@code true} when {@code address} denotes an existing regular file
     */
    private static Boolean isFile(String address) {
        return new File(address).isFile();
    }

    /**
     * Returns the text enclosed by {@code <key>} and {@code </key>} in {@code line}.
     *
     * @param line a line containing both the opening and the closing tag
     * @param key  tag name without angle brackets
     * @return the text between the first opening and the last closing tag
     * @throws StringIndexOutOfBoundsException if the tags are not both present
     */
    private static String getData(String line, String key) {
        // Skip "key>" after the first occurrence of the tag name.
        int beginIndex = line.indexOf(key) + key.length() + 1;
        // Step back over "</" in front of the last occurrence of the tag name.
        int endIndex = line.lastIndexOf(key) - 2;
        return line.substring(beginIndex, endIndex);
    }
}

/**
 * Line-oriented reader for SGML files whose documents start with a header line
 * carrying {@code TOPICS}, {@code LEWISSPLIT} and {@code NEWID} attributes.
 * For every TRAIN/TEST document that has a {@code <TOPICS>} line, the body
 * lines are handed to {@link Data} together with a generated file title.
 */
class SGMReader {
    private static final String UNUSED = "Unused";
    private static final String TRAIN = "TRAIN";
    private static final String TEST = "TEST";

    /** File processed by the no-argument constructor. */
    private String fileAd = "./reut2-001.sgm";

    /** Tag names that may occur in the input; currently not referenced by the parser. */
    private String[] element = { "REUTERS", "DATE", "TOPICS", "PLACES", "D", "PEOPLE", "ORGS",
            "EXCHANGES", "COMPANIES", "UNKNOWN", "TEXT", "TITLE", "DATELINE", "BODY", };

    /** Number of document headers classified as TEST. */
    private int testNum = 0;
    /** Number of document headers classified as TRAIN. */
    private int trainNum = 0;
    /** Number of {@code <TOPICS>} lines seen inside TRAIN/TEST documents. */
    private int sum = 0;

    /**
     * Processes every file in {@code fileList} and prints the accumulated counters.
     *
     * @param fileList paths of the files to read
     */
    public SGMReader(LinkedList<String> fileList) {
        for (String ad : fileList) {
            this.Reader(ad);
        }
        System.out.println("test:\t" + testNum + "\ntrainNum:\t" + trainNum + "\nsum:" + sum);
    }

    /** Processes the default file {@link #fileAd}. */
    public SGMReader() {
        // The original called a non-existent no-argument Reader(); the default
        // path field is the only sensible argument.
        this.Reader(fileAd);
    }

    /**
     * Reads one file line by line. Failures are reported on standard error
     * and end processing of this file only.
     *
     * @param ad path of the file to read
     */
    private void Reader(String ad) {
        ArrayList<String> SGM = new ArrayList<String>();
        String title = "";
        Scanner fScan = null;
        try {
            fScan = new Scanner(new File(ad));
            String mode = UNUSED;
            String id = "";
            boolean readMode = false;
            boolean saveMode = false;

            while (fScan.hasNextLine()) {
                String str = fScan.nextLine();

                if (haveKey("LEWISSPLIT", str)) {
                    // Header of a new document: drop whatever the previous one left behind.
                    SGM.clear();
                    id = "";
                    title = "";
                    readMode = false;
                    saveMode = false;
                    mode = isTrainingSet(str);
                    if (!UNUSED.equals(mode)) {
                        title = mode;
                        id = getNewID(str);
                        if (TRAIN.equals(mode)) {
                            trainNum++;
                        } else {
                            testNum++;
                        }
                    }
                }

                if (UNUSED.equals(mode)) {
                    continue;
                }

                if (haveKey("TOPICS>", str)) {
                    // "<D>a</D><D>b</D>" becomes "a_b_", so the topics end up in the title.
                    str = str.replace("<D>", "");
                    str = str.replace("</D>", "_");
                    String topics = getData(str, "TOPICS");
                    // Documents with an empty topic list are saved as well.
                    title = title + "." + topics + id + ".txt";
                    sum++;
                    saveMode = true;
                }
                if (haveKey("<BODY", str)) {
                    // Keep only the text that follows "BODY>" on this line.
                    str = str.substring(str.indexOf("BODY") + "BODY".length() + 1);
                    readMode = true;
                }
                if (haveKey("</BODY", str)) {
                    if (saveMode) {
                        // NOTE(review): presumably the Data constructor persists the
                        // document; the list is cleared right below, so Data must not
                        // keep a reference to it - confirm in SGM.Data.Data.
                        new Data(title, SGM);
                    }
                    SGM.clear();
                    id = "";
                    title = "";
                    readMode = false;
                    mode = UNUSED;
                    saveMode = false;
                }
                // The line holding "</BODY" is never stored: readMode is already off.
                if (readMode) {
                    SGM.add(str);
                }
            }

            // Scanner swallows read errors and simply stops; surface them here.
            IOException readError = fScan.ioException();
            if (readError != null) {
                System.err.println("Read error in " + ad + ": " + readError);
            }
            System.out.println(SGM.size());
        } catch (Exception e) {
            // Best effort: report and let the caller continue with the next file.
            System.err.println("Failed to process " + ad + ": " + e);
        } finally {
            if (fScan != null) {
                fScan.close();
            }
        }
    }

    /**
     * Returns the text enclosed by {@code <key>} and {@code </key>} in {@code line}.
     *
     * @param line a line containing both the opening and the closing tag
     * @param key  tag name without angle brackets
     * @return the text between the first opening and the last closing tag
     * @throws StringIndexOutOfBoundsException if the tags are not both present
     */
    private String getData(String line, String key) {
        int beginIndex = line.indexOf(key) + key.length() + 1;
        int endIndex = line.lastIndexOf(key) - 2;
        return line.substring(beginIndex, endIndex);
    }

    /**
     * Extracts the {@code NEWID} value from a document header line. The line
     * is expected to end with {@code NEWID="<id>">}.
     *
     * @param line document header line
     * @return the text between {@code NEWID="} and the trailing {@code ">}
     * @throws StringIndexOutOfBoundsException if the line does not have that shape
     */
    private String getNewID(String line) {
        // +2 skips the '=' and the opening quote.
        int beginIndex = line.indexOf("NEWID") + "NEWID".length() + 2;
        // -2 drops the closing quote and '>'.
        int endIndex = line.length() - 2;
        return line.substring(beginIndex, endIndex);
    }

    /**
     * Classifies a document header line.
     *
     * <p>The decision is based on the actual attribute values rather than on
     * substrings of the whole line, so other attributes whose values merely
     * contain "TRAIN" or "TEST" cannot influence the result.
     *
     * @param line document header line
     * @return {@code "TRAIN"} or {@code "TEST"} when {@code TOPICS="YES"} and
     *         {@code LEWISSPLIT} has that value, otherwise {@code "Unused"}
     */
    private String isTrainingSet(String line) {
        if (!"YES".equals(attributeValue(line, "TOPICS"))) {
            return UNUSED;
        }
        String split = attributeValue(line, "LEWISSPLIT");
        if (TRAIN.equals(split)) {
            return TRAIN;
        }
        if (TEST.equals(split)) {
            return TEST;
        }
        return UNUSED;
    }

    /**
     * Returns the double-quoted value of attribute {@code name} in {@code line}.
     *
     * @param line text to search
     * @param name attribute name
     * @return the value without quotes, or {@code null} if {@code name="..."} is absent
     */
    private static String attributeValue(String line, String name) {
        String marker = name + "=\"";
        int start = line.indexOf(marker);
        if (start < 0) {
            return null;
        }
        start += marker.length();
        int end = line.indexOf('"', start);
        return end < 0 ? null : line.substring(start, end);
    }

    /**
     * @param key  text to look for
     * @param line text to search
     * @return {@code true} when {@code line} contains {@code key} at any
     *         position, including the very first column
     */
    private Boolean haveKey(String key, String line) {
        return line.indexOf(key) >= 0;
    }
}
このブログを検索
9.21.2010
test
ラベル:
test
登録:
コメントの投稿 (Atom)
0 件のコメント:
コメントを投稿