package util;

import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.IOException;

/**
 * Looks up lexicon words inside strings of concatenated (space-free) text.
 *
 * A lexicon of {@code word,probability} lines and a list of concatenated
 * strings are loaded, then for each concatenated string the lexicon words
 * found inside it are written to standard output.
 */
public class TextSegmenter {
  /** Lexical and concatenated entries must be at least 2 characters. */
  private static final int MIN_LEX_LENGTH = 2;

  /** Words (lower case) and the probability value read from the lexicon. */
  private final TreeMap<String, Double> dictionary =
    new TreeMap<String, Double>();

  /** List of concatenated words to split (lower case). */
  private final List<String> concat = new ArrayList<String>();

  public TextSegmenter() {
  }

  /**
   * Reads the lexicon and the concatenated text from files, then processes
   * every concatenated entry. Both files are read using the platform default
   * character set and are closed before this method returns, even when
   * reading or parsing fails.
   *
   * @param lexicon File of {@code word,probability} lines.
   * @param concat File of concatenated text, one entry per line.
   * @throws IOException If either file cannot be opened or read.
   */
  public void split( File lexicon, File concat ) throws IOException {
    BufferedReader lex = new BufferedReader(
      new InputStreamReader( new FileInputStream( lexicon ) ) );

    try {
      BufferedReader col = new BufferedReader(
        new InputStreamReader( new FileInputStream( concat ) ) );

      try {
        split( lex, col );
      }
      finally {
        col.close();
      }
    }
    finally {
      lex.close();
    }
  }

  /**
   * Splits the text. Callers must close the streams.
   *
   * @param lexicon Source of {@code word,probability} lines.
   * @param concat Source of concatenated text, one entry per line.
   * @throws IOException If either stream cannot be read.
   * @throws NumberFormatException If a lexicon probability is not a number.
   */
  public void split( BufferedReader lexicon, BufferedReader concat )
    throws IOException {
    loadLexicon( lexicon );
    loadConcat( concat );
    split();
  }

  /**
   * Iterates over all of the concatenated text, writing each entry and the
   * lexicon words found in it to standard output as {@code entry::words}.
   */
  private void split() {
    for( String text : getConcat() ) {
      System.out.printf( "%s::%s\n", text, segments( text ) );
    }
  }

  /**
   * Returns the value stored in the lexicon for the given word, or 0 when
   * the word is absent. The value is presumably a relative frequency between
   * 0 and 1, but the loader does not enforce that range.
   */
  private double getProbability( String s ) {
    // A TreeMap with natural ordering rejects null keys; treat as unknown.
    if( s == null ) {
      return 0.0;
    }

    Double probability = getDictionary().get( s );

    return probability == null ? 0.0 : probability.doubleValue();
  }

  /**
   * Finds every lexicon word with a positive probability that occurs as a
   * substring of the given text. Each match is written to standard output as
   * {@code word=probability}; the returned string holds the matches, each
   * followed by a single space.
   *
   * Matches are ordered by ascending end position and, for equal end
   * positions, from shortest to longest. No attempt is made (yet) to select
   * the combination of words that best covers the text.
   */
  private String segments( String concat ) {
    int length = concat.length();

    List<Map.Entry<String, Double>> words =
      new ArrayList<Map.Entry<String, Double>>();

    // Walk the end index upwards and the start index downwards so matches
    // can be appended in their final order (no insertion at the list head).
    //
    for( int end = 1; end <= length; end++ ) {
      for( int start = end - 1; start >= 0; start-- ) {
        // Word and probability.
        //
        String w = concat.substring( start, end );
        double p = getProbability( w );

        // Retain words that comprise the concatenated string in order.
        //
        if( p > 0 ) {
          words.add( new AbstractMap.SimpleEntry<String, Double>( w, p ) );
        }
      }
    }

    StringBuilder result = new StringBuilder( length * 2 );

    for( Map.Entry<String, Double> word : words ) {
      result.append( word.getKey() ).append( ' ' );
      System.out.printf( "%s=%f\n", word.getKey(), word.getValue() );
    }

    return result.toString();
  }

  /**
   * Loads all the words and word probability from the dictionary. Words
   * are separated from the probability by a comma. Any previously loaded
   * entries are discarded.
   *
   * Lines are lower-cased independently of the default locale. Lines whose
   * word is shorter than {@link #MIN_LEX_LENGTH}, or that have no
   * probability field after the comma, are skipped.
   *
   * @throws IOException If the data cannot be read.
   * @throws NumberFormatException If a probability field is not a number.
   */
  private void loadLexicon( BufferedReader lexiconData ) throws IOException {
    String line = null;
    TreeMap<String, Double> dictionary = getDictionary();
    dictionary.clear();

    while( (line = lexiconData.readLine()) != null ) {
      String[] lex = line.toLowerCase( Locale.ROOT ).split( "," );

      // Guard against lines without a comma (no probability to parse).
      if( lex.length >= 2 && lex[0].length() >= MIN_LEX_LENGTH ) {
        dictionary.put( lex[0], Double.parseDouble( lex[1] ) );
      }
    }
  }

  /**
   * Inserts the lines of concatenated text into the internal list, replacing
   * any previously loaded lines. Lines are lower-cased independently of the
   * default locale; lines shorter than {@link #MIN_LEX_LENGTH} are skipped.
   *
   * @throws IOException If the data cannot be read.
   */
  private void loadConcat( BufferedReader concatData ) throws IOException {
    String line = null;
    List<String> concat = getConcat();
    concat.clear();

    while( (line = concatData.readLine()) != null ) {
      if( line.length() >= MIN_LEX_LENGTH ) {
        concat.add( line.toLowerCase( Locale.ROOT ) );
      }
    }
  }

  private List<String> getConcat() {
    return this.concat;
  }

  private TreeMap<String, Double> getDictionary() {
    return this.dictionary;
  }

  /**
   * Command-line entry point. Expects the lexicon file followed by the
   * concatenated text file; prints usage information otherwise.
   */
  public static void main( String args[] ) throws IOException {
    TextSegmenter ts = new TextSegmenter();

    if( args.length == 2 ) {
      try {
        ts.split( new File( args[0] ), new File( args[1] ) );
      }
      catch( Exception e ) {
        System.err.println( "Error: " + e.getMessage() );
        e.printStackTrace();
      }
    }
    else {
      System.out.println( "TextSegmenter <lexicon> <concat>" );
      System.out.println( "  <lexicon> - file of word,probability lines" );
      System.out.println( "  <concat>  - file of concatenated text, one entry per line" );
    }
  }
}