View difference between Paste ID: taXyE03L and
SHOW: | | - or go back to the newest paste.
1-
1+
package util;
2
3
import java.util.AbstractMap;
4
import java.util.ArrayList;
5
import java.util.List;
6
import java.util.Map;
7
import java.util.TreeMap;
8
9
import java.io.BufferedReader;
10
import java.io.BufferedWriter;
11
import java.io.File;
12
import java.io.FileInputStream;
13
import java.io.InputStreamReader;
14
import java.io.IOException;
15
16
/**
 * Looks for lexicon words inside lines of concatenated text.
 *
 * <p>A lexicon (one {@code word,probability} entry per line) and a list of
 * concatenated strings (one per line) are loaded, then for every
 * concatenated string a line of the form {@code concat::words} is written
 * to standard output.</p>
 *
 * <p>Instances keep the most recently loaded lexicon and concatenated text
 * in mutable fields; each call to a public {@code split} method replaces
 * that state.</p>
 */
public class TextSegmenter {
  /** Lexical and concatenated entries must be at least 2 characters. */
  private static final int MIN_LEX_LENGTH = 2;

  /** Lowercased words mapped to their probabilities. */
  private final Map<String, Double> dictionary =
    new TreeMap<String, Double>();

  /** List of concatenated words to split. */
  private final List<String> concat = new ArrayList<String>();

  public TextSegmenter() {
  }

  /**
   * Splits the concatenated text stored in one file using the lexicon stored
   * in another. Both files are read with the platform default character set
   * and are closed before this method returns, whether or not it succeeds.
   *
   * @param lexicon File of {@code word,probability} lines.
   * @param concat File of concatenated strings, one per line.
   * @throws IOException If either file cannot be opened or read, or the
   * lexicon contains an entry without a valid probability.
   */
  public void split( File lexicon, File concat )
    throws IOException {
    BufferedReader lex = null;
    BufferedReader col = null;

    try {
      lex = new BufferedReader(
        new InputStreamReader( new FileInputStream( lexicon ) ) );
      col = new BufferedReader(
        new InputStreamReader( new FileInputStream( concat ) ) );

      split( lex, col );
    }
    finally {
      // Nested so that the second reader is closed even when closing the
      // first one fails.
      //
      try {
        if( lex != null ) {
          lex.close();
        }
      }
      finally {
        if( col != null ) {
          col.close();
        }
      }
    }
  }

  /**
   * Splits the text. Callers must close the streams.
   *
   * @param lexicon Source of {@code word,probability} lines.
   * @param concat Source of concatenated strings, one per line.
   * @throws IOException If reading fails or the lexicon contains an entry
   * without a valid probability.
   */
  public void split( BufferedReader lexicon, BufferedReader concat )
    throws IOException {
    loadLexicon( lexicon );
    loadConcat( concat );
    split();
  }

  /**
   * Iterates over all of the concatenated text, writing each concatenated
   * String and the words found in it to standard output.
   */
  private void split() {
    for( String text : getConcat() ) {
      System.out.printf( "%s::%s\n", text, segments( text ) );
    }
  }

  /**
   * Returns the probability stored in the lexicon for the given word.
   *
   * @param s The word to look up.
   * @return The stored probability, or 0.0 when the word is not in the
   * lexicon.
   */
  private double getProbability( String s ) {
    Double probability = getDictionary().get( s );

    return probability == null ? 0.0 : probability.doubleValue();
  }

  /**
   * Finds the lexicon words that occur inside a concatenated phrase.
   *
   * <p>Every substring with a positive probability is reported, including
   * overlapping ones; no single segmentation is selected. Words are ordered
   * by the position at which they end and, for equal end positions, shortest
   * first. Each word and its probability is also written to standard output
   * as {@code word=probability}.</p>
   *
   * @param concat The concatenated phrase.
   * @return The words found, each followed by a single space.
   */
  private String segments( String concat ) {
    int length = concat.length();
    List<Map.Entry<String, Double>> words =
      new ArrayList<Map.Entry<String, Double>>();

    // Visit the candidates in the order they are reported so each match can
    // be appended. Substrings shorter than MIN_LEX_LENGTH are never stored in
    // the dictionary, so they are not looked up.
    //
    for( int end = MIN_LEX_LENGTH; end <= length; end++ ) {
      for( int start = end - MIN_LEX_LENGTH; start >= 0; start-- ) {
        // Word and probability.
        //
        String w = concat.substring( start, end );
        double p = getProbability( w );

        if( p > 0 ) {
          words.add( new AbstractMap.SimpleEntry<String, Double>( w, p ) );
        }
      }
    }

    StringBuilder result = new StringBuilder( length * 2 );

    for( Map.Entry<String, Double> word : words ) {
      result.append( word.getKey() ).append( ' ' );

      System.out.printf( "%s=%f\n", word.getKey(), word.getValue() );
    }

    return result.toString();
  }

  /**
   * Loads all the words and word probability from the dictionary. Words
   * are separated from the probability by a comma. Words are trimmed and
   * lowercased; lines whose word is shorter than {@link #MIN_LEX_LENGTH}
   * (blank lines included) are skipped. Any previously loaded entries are
   * discarded first.
   *
   * @param lexiconData Source of {@code word,probability} lines.
   * @throws IOException If reading fails, or a retained word has a missing
   * or non-numeric probability.
   */
  private void loadLexicon( BufferedReader lexiconData )
    throws IOException {
    Map<String, Double> dictionary = getDictionary();

    dictionary.clear();

    String line = null;
    int lineNumber = 0;

    while( (line = lexiconData.readLine()) != null ) {
      lineNumber++;

      // String.split drops trailing empty fields, so a line made only of
      // commas yields an empty array.
      //
      String[] lex = line.split( "," );
      String word = lex.length > 0 ? lex[0].trim().toLowerCase() : "";

      if( word.length() < MIN_LEX_LENGTH ) {
        continue;
      }

      if( lex.length < 2 ) {
        throw new IOException( "Lexicon line " + lineNumber +
          " has no probability: " + line );
      }

      try {
        dictionary.put( word, Double.parseDouble( lex[1] ) );
      }
      catch( NumberFormatException e ) {
        throw new IOException( "Lexicon line " + lineNumber +
          " has an invalid probability: " + line, e );
      }
    }
  }

  /**
   * Inserts the lines of concatenated text into the internal list, replacing
   * its previous contents. Lines are lowercased; lines shorter than
   * {@link #MIN_LEX_LENGTH} are skipped.
   *
   * @param concatData Source of concatenated strings, one per line.
   * @throws IOException If reading fails.
   */
  private void loadConcat( BufferedReader concatData )
    throws IOException {
    String line = null;
    List<String> concat = getConcat();

    concat.clear();

    while( (line = concatData.readLine()) != null ) {
      if( line.length() >= MIN_LEX_LENGTH ) {
        concat.add( line.toLowerCase() );
      }
    }
  }

  private List<String> getConcat() {
    return this.concat;
  }

  private Map<String, Double> getDictionary() {
    return this.dictionary;
  }

  /**
   * Command-line entry point. Expects the lexicon file followed by the file
   * of concatenated text; prints usage information otherwise. Failures are
   * reported on standard error.
   *
   * @param args Lexicon file name and concatenated text file name.
   */
  public static void main( String[] args )
    throws IOException {
    TextSegmenter ts = new TextSegmenter();

    if( args.length == 2 ) {
      try {
        ts.split( new File( args[0] ), new File( args[1] ) );
      }
      catch( Exception e ) {
        System.err.println( "Error: " + e.getMessage() );
        e.printStackTrace();
      }
    }
    else {
      System.out.println( "TextSegmenter <lexicon> <concat>" );
      System.out.println( "<lexicon> - <word, relative probability>" );
      System.out.println( "<concat>  - <words>" );
    }
  }
}