package SGM.main;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import SGM.Data.Data;
/*
Document split targeted by this program (selected by attributes on each document's header line):
Training Set (9,603 docs): LEWISSPLIT="TRAIN"; TOPICS="YES"
Test Set (3,299 docs): LEWISSPLIT="TEST"; TOPICS="YES"
Unused (8,676 docs): LEWISSPLIT="NOT-USED"; TOPICS="YES" or TOPICS="NO"
*/
public class Main {
private static String FileAd = "D:/TESTDATA/Reuter/file";
private static LinkedList<String> fileList = new LinkedList<String>();
public static void main(String[] args) {
setFileList(FileAd);
SGMReader sgm = new SGMReader(fileList);
String line = "<TITLE>BAHIA COCOA REVIEW</TITLE>";
String key = "TITLE";
System.out.println(getData(line, key));
}
private static void setFileList(String address) {
File file = new File(address);
String[] list = file.list();
for (int i = 0; i < list.length; i++) {
String _ad = new String(address).concat("/" + list[i]);
if (isFile(_ad)) {
fileList.add(_ad);
System.out.println(_ad);
} else {
setFileList(_ad);
}
}
}
private static Boolean isFile(String address) {
File file = new File(address);
if (file.isFile()) {
return true;
}
return false;
}
private static String getData(String line, String key) {
int beginIndex = line.indexOf(key) + key.length() + 1;
int endIndex = line.lastIndexOf(key) - 2;
line = line.substring(beginIndex, endIndex);
return line;
}
}
class SGMReader {
private String fileAd = "./reut2-001.sgm";
private String[] element = { "REUTERS", "DATE", "TOPICS", "PLACES", "D", "PEOPLE", "ORGS",
"EXCHANGES", "COMPANIES", "UNKNOWN", "TEXT", "TITLE", "DATELINE", "BODY", };
private int testNum = 0;
private int trainNum = 0;
private int sum = 0;
public SGMReader(LinkedList<String> fileList) {
for (String ad : fileList) {
this.Reader(ad);
}
System.out.println("test:\t" + testNum + "\ntrainNum:\t" + trainNum + "\nsum:" + sum);
}
public SGMReader() {
this.Reader();
}
private void Reader(String ad) {
ArrayList<String> SGM = new ArrayList<String>();
String title = new String();
Data data;
try {
File fHandle = new File(ad);
Scanner fScan = new Scanner(fHandle);
String Mode = "Unused";
Boolean ReadMode = false;
String ID = new String();
Boolean SaveMode = false;
while (fScan.hasNextLine()) {
String str = fScan.nextLine();
// System.out.println(str);
if (this.haveKey("LEWISSPLIT", str)) {
SGM.clear();
ID = new String();
title = new String();
ReadMode = false;
Mode = "Unused";
SaveMode = false;
if (this.isTrainingSet(str).equals("TRAIN")) {
Mode = "TRAIN";
title = "TRAIN";
ID = this.getNewID(str);
trainNum++;
} else if (this.isTrainingSet(str).equals("TEST")) {
Mode = "TEST";
title = "TEST";
ID = this.getNewID(str);
testNum++;
}
}
if (Mode != "Unused") {
if (this.haveKey("TOPICS>", str)) {
str = str.replace("<D>", "");
str = str.replace("</D>", "_");
String topics = this.getData(str, "TOPICS");
// if (topics.length() != 0) {
title = title.concat("." + topics + "" + ID + ".txt");
sum++;
SaveMode = true;
// }
}
if (this.haveKey("<BODY", str)) {
str = str.substring(str.indexOf("BODY") + new String("BODY").length() + 1);
ReadMode = true;
}
if (this.haveKey("</BODY", str)) {
if (SaveMode == true) {
data = new Data(title, SGM);
}
SGM.clear();
ID = new String();
title = new String();
ReadMode = false;
Mode = "Unused";
SaveMode = false;
}
if (ReadMode == true) {
SGM.add(str);
// System.out.println(str);
}
}
}
System.out.println(SGM.size());
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
private String getData(String line, String key) {
int beginIndex = line.indexOf(key) + key.length() + 1;
int endIndex = line.lastIndexOf(key) - 2;
line = line.substring(beginIndex, endIndex);
return line;
}
private String getNewID(String line) {
int beginIndex = line.indexOf("NEWID") + new String("NEWID").length() + 2;
int endIndex = line.length() - 2;
return line.substring(beginIndex, endIndex);
}
private String isTrainingSet(String line) {
if (this.haveKey("YES", line)) {
if (this.haveKey("TEST", line)) {
if (!this.haveKey("NOT-USED", line)) {
return "TEST";
} else {
return "Unused";
}
} else if (this.haveKey("TRAIN", line)) {
return "TRAIN";
} else {
return "Unused";
}
} else {
return "Unused";
}
}
private Boolean haveKey(String key, String line) {
if (line.indexOf(key) > 0) {
return true;
} else {
return false;
}
}
}