001 | package SGM.main; |
002 |
003 | import java.io.File; |
004 | import java.io.IOException; |
005 | import java.util.ArrayList; |
006 | import java.util.LinkedList; |
007 | import java.util.Scanner; |
008 | import java.util.regex.Matcher; |
009 | import java.util.regex.Pattern; |
010 |
011 | import SGM.Data.Data; |
012 |
013 | /* |
014 | Training Set (9,603 docs): LEWISSPLIT="TRAIN"; TOPICS="YES" |
015 | Test Set (3,299 docs): LEWISSPLIT="TEST"; TOPICS="YES" |
016 | Unused (8,676 docs): LEWISSPLIT="NOT-USED"; TOPICS="YES" |
017 | or TOPICS="NO" |
018 | */ |
019 | public class Main { |
020 | private static String FileAd = "D:/TESTDATA/Reuter/file" ; |
021 | private static LinkedList<String> fileList = new LinkedList<String>(); |
022 |
023 | public static void main(String[] args) { |
024 |
025 | setFileList(FileAd); |
026 |
027 | SGMReader sgm = new SGMReader(fileList); |
028 |
029 | String line = "<TITLE>BAHIA COCOA REVIEW</TITLE>" ; |
030 | String key = "TITLE" ; |
031 |
032 | System.out.println(getData(line, key)); |
033 |
034 | } |
035 |
036 | private static void setFileList(String address) { |
037 | File file = new File(address); |
038 | String[] list = file.list(); |
039 |
040 | for ( int i = 0 ; i < list.length; i++) { |
041 | String _ad = new String(address).concat( "/" + list[i]); |
042 | if (isFile(_ad)) { |
043 | fileList.add(_ad); |
044 | System.out.println(_ad); |
045 | } else { |
046 | setFileList(_ad); |
047 | } |
048 | } |
049 | } |
050 |
051 | private static Boolean isFile(String address) { |
052 | File file = new File(address); |
053 | if (file.isFile()) { |
054 | return true ; |
055 | } |
056 | return false ; |
057 | } |
058 |
059 | private static String getData(String line, String key) { |
060 | int beginIndex = line.indexOf(key) + key.length() + 1 ; |
061 | int endIndex = line.lastIndexOf(key) - 2 ; |
062 | line = line.substring(beginIndex, endIndex); |
063 | return line; |
064 | } |
065 | } |
066 |
067 | class SGMReader { |
068 | private String fileAd = "./reut2-001.sgm" ; |
069 | private String[] element = { "REUTERS" , "DATE" , "TOPICS" , "PLACES" , "D" , "PEOPLE" , "ORGS" , |
070 | "EXCHANGES" , "COMPANIES" , "UNKNOWN" , "TEXT" , "TITLE" , "DATELINE" , "BODY" , }; |
071 |
072 | private int testNum = 0 ; |
073 | private int trainNum = 0 ; |
074 | private int sum = 0 ; |
075 |
076 | public SGMReader(LinkedList<String> fileList) { |
077 | for (String ad : fileList) { |
078 | this .Reader(ad); |
079 | } |
080 |
081 | System.out.println( "test:\t" + testNum + "\ntrainNum:\t" + trainNum + "\nsum:" + sum); |
082 | } |
083 |
084 | public SGMReader() { |
085 | this .Reader(); |
086 | } |
087 |
088 | private void Reader(String ad) { |
089 | ArrayList<String> SGM = new ArrayList<String>(); |
090 | String title = new String(); |
091 | Data data; |
092 | try { |
093 | File fHandle = new File(ad); |
094 | Scanner fScan = new Scanner(fHandle); |
095 | String Mode = "Unused" ; |
096 | Boolean ReadMode = false ; |
097 | String ID = new String(); |
098 | Boolean SaveMode = false ; |
099 |
100 | while (fScan.hasNextLine()) { |
101 | String str = fScan.nextLine(); |
102 | // System.out.println(str); |
103 |
104 | if ( this .haveKey( "LEWISSPLIT" , str)) { |
105 | SGM.clear(); |
106 | ID = new String(); |
107 | title = new String(); |
108 | ReadMode = false ; |
109 | Mode = "Unused" ; |
110 | SaveMode = false ; |
111 | if ( this .isTrainingSet(str).equals( "TRAIN" )) { |
112 | Mode = "TRAIN" ; |
113 | title = "TRAIN" ; |
114 | ID = this .getNewID(str); |
115 | trainNum++; |
116 | } else if ( this .isTrainingSet(str).equals( "TEST" )) { |
117 | Mode = "TEST" ; |
118 | title = "TEST" ; |
119 | ID = this .getNewID(str); |
120 | testNum++; |
121 | } |
122 | } |
123 |
124 | if (Mode != "Unused" ) { |
125 | if ( this .haveKey( "TOPICS>" , str)) { |
126 | str = str.replace( "<D>" , "" ); |
127 | str = str.replace( "</D>" , "_" ); |
128 | String topics = this .getData(str, "TOPICS" ); |
129 | // if (topics.length() != 0) { |
130 | title = title.concat( "." + topics + "" + ID + ".txt" ); |
131 | sum++; |
132 | SaveMode = true ; |
133 | // } |
134 | } |
135 |
136 | if ( this .haveKey( "<BODY" , str)) { |
137 | str = str.substring(str.indexOf( "BODY" ) + new String( "BODY" ).length() + 1 ); |
138 | ReadMode = true ; |
139 | } |
140 | if ( this .haveKey( "</BODY" , str)) { |
141 | if (SaveMode == true ) { |
142 | data = new Data(title, SGM); |
143 | } |
144 | SGM.clear(); |
145 | ID = new String(); |
146 | title = new String(); |
147 | ReadMode = false ; |
148 | Mode = "Unused" ; |
149 | SaveMode = false ; |
150 | } |
151 |
152 | if (ReadMode == true ) { |
153 | SGM.add(str); |
154 | // System.out.println(str); |
155 | } |
156 | } |
157 | } |
158 |
159 | System.out.println(SGM.size()); |
160 | } catch (Exception e) { |
161 | System.out.println(e.getMessage()); |
162 | } |
163 | } |
164 |
165 | private String getData(String line, String key) { |
166 | int beginIndex = line.indexOf(key) + key.length() + 1 ; |
167 | int endIndex = line.lastIndexOf(key) - 2 ; |
168 | line = line.substring(beginIndex, endIndex); |
169 | return line; |
170 | } |
171 |
172 | private String getNewID(String line) { |
173 | int beginIndex = line.indexOf( "NEWID" ) + new String( "NEWID" ).length() + 2 ; |
174 | int endIndex = line.length() - 2 ; |
175 | return line.substring(beginIndex, endIndex); |
176 | } |
177 |
178 | private String isTrainingSet(String line) { |
179 | if ( this .haveKey( "YES" , line)) { |
180 | if ( this .haveKey( "TEST" , line)) { |
181 | if (! this .haveKey( "NOT-USED" , line)) { |
182 | return "TEST" ; |
183 | } else { |
184 | return "Unused" ; |
185 | } |
186 | } else if ( this .haveKey( "TRAIN" , line)) { |
187 | return "TRAIN" ; |
188 | } else { |
189 | return "Unused" ; |
190 | } |
191 | } else { |
192 | return "Unused" ; |
193 | } |
194 | } |
195 |
196 | private Boolean haveKey(String key, String line) { |
197 | if (line.indexOf(key) > 0 ) { |
198 | return true ; |
199 | } else { |
200 | return false ; |
201 | } |
202 | } |
203 |
204 | } |
// Web-page residue, not part of the program: "0 comments:" / "Post a comment"