View difference between Paste ID: taXyE03L and
SHOW:
|
|
- or go back to the newest paste.
1 | - | |
1 | + | package util; |
2 | ||
3 | import java.util.AbstractMap; | |
4 | import java.util.ArrayList; | |
5 | import java.util.List; | |
6 | import java.util.Map; | |
7 | import java.util.TreeMap; | |
8 | ||
9 | import java.io.BufferedReader; | |
10 | import java.io.BufferedWriter; | |
11 | import java.io.File; | |
12 | import java.io.FileInputStream; | |
13 | import java.io.InputStreamReader; | |
14 | import java.io.IOException; | |
15 | ||
public class TextSegmenter {
    /** Lexical and concatenated entries must be at least 2 characters. */
    private static final int MIN_LEX_LENGTH = 2;

    /** Words mapped to their relative frequency in the lexicon. */
    private final TreeMap<String, Double> dictionary =
        new TreeMap<String, Double>();

    /** Lines of concatenated text to split into words. */
    private final List<String> concat = new ArrayList<String>();

    public TextSegmenter() {
    }

    /**
     * Opens both files, splits the concatenated text, and closes the
     * streams. The readers are closed in finally blocks so they are
     * released even when splitting throws (the original leaked both
     * readers on error).
     *
     * NOTE(review): the readers use the platform default charset —
     * presumably the inputs are ASCII/platform-encoded; confirm before
     * feeding UTF-8 data on a non-UTF-8 platform.
     *
     * @param lexicon file of comma-separated word,probability lines.
     * @param concat  file of concatenated words to split, one per line.
     * @throws IOException if either file cannot be opened or read.
     */
    public void split( File lexicon, File concat )
        throws IOException {
        BufferedReader lex = new BufferedReader(
            new InputStreamReader( new FileInputStream( lexicon ) ) );

        try {
            BufferedReader col = new BufferedReader(
                new InputStreamReader( new FileInputStream( concat ) ) );

            try {
                split( lex, col );
            }
            finally {
                col.close();
            }
        }
        finally {
            lex.close();
        }
    }

    /**
     * Splits the text read from the given streams. Callers must close the
     * streams.
     *
     * @param lexicon comma-separated word,probability lines.
     * @param concat  concatenated words to split, one line each.
     * @throws IOException if either stream cannot be read.
     */
    public void split( BufferedReader lexicon, BufferedReader concat )
        throws IOException {
        loadLexicon( lexicon );
        loadConcat( concat );
        split();
    }

    /**
     * Iterates over all of the concatenated text, printing each string
     * followed by "::" and its segmentation on standard output.
     */
    private void split() {
        for( String concat : getConcat() ) {
            System.out.printf( "%s::%s\n", concat, segments( concat ) );
        }
    }

    /**
     * Returns a number between 0 and 1 that represents how often the word
     * is used relative to all the other words in the lexicon, or 0 when
     * the word is absent. The original unboxed the map value inside a
     * catch-all block, relying on a NullPointerException to signal a
     * missing word; this checks for null explicitly instead.
     */
    private double getProbability( String s ) {
        Double p = getDictionary().get( s );
        return p == null ? 0.0 : p.doubleValue();
    }

    /**
     * Splits a concatenated phrase into its constituent words.
     *
     * Every substring of the phrase that exists in the lexicon is kept;
     * prepending each hit roughly restores phrase order because the scan
     * visits shorter suffixes first. NOTE(review): no check is made that
     * the retained words tile the phrase exactly — every lexicon hit is
     * returned, space-separated. A leftover debug printf of each
     * word/probability pair was removed from this loop.
     */
    private String segments( String concat ) {
        int length = concat.length();
        List<Map.Entry<String, Double>> words =
            new ArrayList<Map.Entry<String, Double>>();

        // Collect every substring of the phrase that the lexicon knows.
        //
        for( int i = 0; i < length; i++ ) {
            for( int j = 0; j < length - i; j++ ) {
                // Candidate word and its lexicon probability.
                //
                String w = concat.substring( j, length - i );
                double p = getProbability( w );

                // Retain lexicon hits; insert at the front so earlier
                // phrase positions come out first.
                //
                if( p > 0 ) {
                    words.add( 0, new AbstractMap.SimpleEntry<String, Double>( w, p ) );
                }
            }
        }

        StringBuilder result = new StringBuilder( length * 2 );

        for( Map.Entry<String, Double> word : words ) {
            result.append( word.getKey() ).append( ' ' );
        }

        return result.toString();
    }

    /**
     * Loads all the words and word probabilities from the lexicon. Words
     * are separated from the probability by a comma; input is lowercased
     * and entries shorter than {@code MIN_LEX_LENGTH} are ignored.
     * Malformed lines — no comma, or a non-numeric probability — are now
     * skipped instead of aborting the whole load with an
     * ArrayIndexOutOfBoundsException or NumberFormatException.
     */
    private void loadLexicon( BufferedReader lexiconData )
        throws IOException {
        TreeMap<String, Double> dictionary = getDictionary();

        dictionary.clear();

        String line;

        while( (line = lexiconData.readLine()) != null ) {
            String[] lex = line.toLowerCase().split( "," );

            if( lex.length < 2 || lex[0].length() < MIN_LEX_LENGTH ) {
                continue;
            }

            try {
                dictionary.put( lex[0], Double.valueOf( lex[1] ) );
            }
            catch( NumberFormatException ignored ) {
                // Best-effort load: skip rows whose probability column
                // is not a parsable number.
            }
        }
    }

    /**
     * Inserts the lines of concatenated text into the internal list,
     * lowercased. Lines shorter than {@code MIN_LEX_LENGTH} are ignored.
     */
    private void loadConcat( BufferedReader concatData )
        throws IOException {
        List<String> concat = getConcat();

        concat.clear();

        String line;

        while( (line = concatData.readLine()) != null ) {
            if( line.length() >= MIN_LEX_LENGTH ) {
                concat.add( line.toLowerCase() );
            }
        }
    }

    /** Returns the mutable list of concatenated strings to split. */
    private List<String> getConcat() {
        return this.concat;
    }

    /** Returns the mutable word-to-probability dictionary. */
    private TreeMap<String, Double> getDictionary() {
        return this.dictionary;
    }

    /**
     * Command-line entry point: {@code TextSegmenter <lexicon> <concat>}.
     * Prints usage when the argument count is wrong; reports (and stack
     * traces) any failure while splitting.
     */
    public static void main( String[] args )
        throws IOException {
        TextSegmenter ts = new TextSegmenter();

        if( args.length == 2 ) {
            try {
                ts.split( new File( args[0] ), new File( args[1] ) );
            }
            catch( Exception e ) {
                System.err.println( "Error: " + e.getMessage() );
                e.printStackTrace();
            }
        }
        else {
            System.out.println( "TextSegmenter <lexicon> <concat>" );
            System.out.println( "<lexicon> - <word, relative probability>" );
            System.out.println( "<concat> - <words>" );
        }
    }
}