package util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
/**
 * Looks up lexicon words inside concatenated (space-free) phrases and prints
 * the matches to standard output.
 *
 * <p>A lexicon is a list of {@code word,probability} lines; a concat source
 * is a list of phrases, one per line. Both are lower-cased on load.
 */
public class TextSegmenter {
  /** Lexical and concatenated entries must be at least 2 characters. */
  private static final int MIN_LEX_LENGTH = 2;

  /** Lower-cased lexicon words mapped to their relative probabilities. */
  private final Map<String, Double> dictionary =
    new TreeMap<String, Double>();

  /** Lower-cased concatenated phrases to split. */
  private final List<String> concat = new ArrayList<String>();

  /** Creates a segmenter with an empty lexicon and no phrases. */
  public TextSegmenter() {
  }

  /**
   * Loads the lexicon and the concatenated phrases from the given files,
   * then prints the words found in each phrase. Files are decoded with the
   * platform default charset.
   *
   * @param lexicon file of {@code word,probability} lines
   * @param concat file of concatenated phrases, one per line
   * @throws IOException if a file cannot be read or a lexicon entry has no
   *         probability field
   */
  public void split( File lexicon, File concat )
    throws IOException {
    BufferedReader lex = new BufferedReader(
      new InputStreamReader( new FileInputStream( lexicon ) ) );

    // Nested try/finally guarantees both readers are closed even when
    // opening the second file, loading, or splitting fails.
    try {
      BufferedReader col = new BufferedReader(
        new InputStreamReader( new FileInputStream( concat ) ) );

      try {
        split( lex, col );
      }
      finally {
        col.close();
      }
    }
    finally {
      lex.close();
    }
  }

  /**
   * Loads the lexicon and the concatenated phrases from the given readers,
   * then prints the words found in each phrase. Callers must close the
   * streams.
   *
   * @param lexicon source of {@code word,probability} lines
   * @param concat source of concatenated phrases, one per line
   * @throws IOException if reading fails or a lexicon entry has no
   *         probability field
   */
  public void split( BufferedReader lexicon, BufferedReader concat )
    throws IOException {
    loadLexicon( lexicon );
    loadConcat( concat );
    split();
  }

  /**
   * Iterates over all of the concatenated text, printing each phrase
   * followed by the lexicon words found in it as {@code phrase::words}.
   */
  private void split() {
    for( String phrase : this.concat ) {
      System.out.printf( "%s::%s\n", phrase, segments( phrase ) );
    }
  }

  /**
   * Returns the probability stored in the lexicon for the given word, or
   * 0.0 when the word is not in the lexicon.
   */
  private double getProbability( String s ) {
    Double probability = this.dictionary.get( s );
    return probability == null ? 0.0 : probability.doubleValue();
  }

  /**
   * Collects every substring of the phrase that is a lexicon word with a
   * positive probability. Matches are ordered by ascending end position
   * and, for the same end position, by descending start position.
   * Overlapping matches are all reported; no single best segmentation is
   * selected. Each match is also printed as a {@code word=probability}
   * diagnostic line.
   *
   * @param concat the lower-cased phrase to examine
   * @return the matching words, each followed by a single space
   */
  private String segments( String concat ) {
    int length = concat.length();
    StringBuilder result = new StringBuilder( length * 2 );

    // Visit candidates directly in output order so matches can be appended
    // rather than inserted at the front of a list.
    for( int end = 1; end <= length; end++ ) {
      for( int start = end - 1; start >= 0; start-- ) {
        String word = concat.substring( start, end );
        double probability = getProbability( word );

        if( probability > 0 ) {
          result.append( word ).append( ' ' );
          System.out.printf( "%s=%f\n", word, probability );
        }
      }
    }

    return result.toString();
  }

  /**
   * Replaces the lexicon with the words and probabilities read from the
   * given source. Each line holds a word and its probability separated by
   * a comma; anything after a second comma is ignored. Surrounding
   * whitespace is dropped, words are lower-cased, and lines whose word is
   * shorter than {@link #MIN_LEX_LENGTH} (including blank lines) are
   * skipped.
   *
   * @throws IOException if reading fails or a retained word has no
   *         probability field
   * @throws NumberFormatException if a probability is not a valid number
   */
  private void loadLexicon( BufferedReader lexiconData )
    throws IOException {
    this.dictionary.clear();

    String line;
    int lineNumber = 0;

    while( (line = lexiconData.readLine()) != null ) {
      lineNumber++;

      // Locale.ROOT keeps the keys stable regardless of the default locale.
      String[] fields = line.toLowerCase( Locale.ROOT ).split( "," );

      // A line consisting only of commas splits into an empty array.
      String word = fields.length == 0 ? "" : fields[0].trim();

      if( word.length() < MIN_LEX_LENGTH ) {
        continue;
      }

      if( fields.length < 2 ) {
        throw new IOException( "Malformed lexicon entry at line " +
          lineNumber + " (expected <word>,<probability>): " + line );
      }

      try {
        this.dictionary.put( word, Double.parseDouble( fields[1] ) );
      }
      catch( NumberFormatException e ) {
        // Same exception type as before, with the offending line added.
        NumberFormatException detailed = new NumberFormatException(
          "Invalid probability at lexicon line " + lineNumber + ": " + line );
        detailed.initCause( e );
        throw detailed;
      }
    }
  }

  /**
   * Replaces the internal list of phrases with the lines read from the
   * given source. Surrounding whitespace is dropped, phrases are
   * lower-cased, and phrases shorter than {@link #MIN_LEX_LENGTH} are
   * skipped.
   *
   * @throws IOException if reading fails
   */
  private void loadConcat( BufferedReader concatData )
    throws IOException {
    this.concat.clear();

    String line;

    while( (line = concatData.readLine()) != null ) {
      String phrase = line.trim();

      if( phrase.length() >= MIN_LEX_LENGTH ) {
        this.concat.add( phrase.toLowerCase( Locale.ROOT ) );
      }
    }
  }

  /**
   * Command-line entry point. Expects the lexicon file and the concat file
   * as its two arguments; prints usage information otherwise.
   */
  public static void main( String args[] )
    throws IOException {
    TextSegmenter ts = new TextSegmenter();

    if( args.length == 2 ) {
      try {
        ts.split( new File( args[0] ), new File( args[1] ) );
      }
      catch( Exception e ) {
        System.err.println( "Error: " + e.getMessage() );
        e.printStackTrace();
      }
    }
    else {
      System.out.println( "TextSegmenter <lexicon> <concat>" );
      System.out.println( "<lexicon> - <word, relative probability>" );
      System.out.println( "<concat> - <words>" );
    }
  }
}