Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package util;
- import java.util.AbstractMap;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.Map;
- import java.util.TreeMap;
- import java.io.BufferedReader;
- import java.io.BufferedWriter;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.InputStreamReader;
- import java.io.IOException;
- public class TextSegmenter {
- /** Lexical and concatenated entries must be at least 2 characters. */
- private static final int MIN_LEX_LENGTH = 2;
- /** Words and frequencies. */
- private TreeMap<String, Double> dictionary =
- new TreeMap<String, Double>();
- /** List of concatenated words to split. */
- private List<String> concat = new ArrayList<String>();
- public TextSegmenter() {
- }
- public void split( File lexicon, File concat )
- throws IOException {
- BufferedReader lex = new BufferedReader(
- new InputStreamReader( new FileInputStream( lexicon ) ) );
- BufferedReader col = new BufferedReader(
- new InputStreamReader( new FileInputStream( concat ) ) );
- split( lex, col );
- lex.close();
- col.close();
- }
- /**
- * Splits the text. Callers must close the streams.
- */
- public void split( BufferedReader lexicon, BufferedReader concat )
- throws IOException {
- loadLexicon( lexicon );
- loadConcat( concat );
- split();
- }
- /**
- * Iterates over all of the contatenated text, splitting each concatenated
- * String into English words.
- */
- private void split() {
- for( String concat : getConcat() ) {
- System.out.printf( "%s::%s\n", concat, segments( concat ) );
- }
- }
- /**
- * Returns a number between 0 and 1 that represents how often the word is
- * used relative to all the other words in the lexicon.
- */
- private double getProbability( String s ) {
- try {
- return getDictionary().get( s );
- }
- catch( Exception e ) {
- return 0.0;
- }
- }
- /**
- * Splits a concatenated phrase into its constituent words.
- */
- private String segments( String concat ) {
- int length = concat.length();
- List<Map.Entry<String, Double>> words =
- new ArrayList<Map.Entry<String, Double>>();
- // Put all the words that exist in the string into a map.
- //
- for( int i = 0; i < length; i++ ) {
- for( int j = 0; j < length - i; j++ ) {
- // Word and probability.
- //
- String w = concat.substring( j, length - i );
- double p = getProbability( w );
- // Retain words that comprise the concatenated string in order.
- //
- if( p > 0 ) {
- words.add( 0, new AbstractMap.SimpleEntry<String, Double>( w, p ) );
- }
- }
- }
- StringBuilder result = new StringBuilder( length * 2 );
- // Find the possibilities that account for the most words.
- //
- for( Map.Entry<String, Double> word : words ) {
- result.append( word.getKey() ).append( ' ' );
- System.out.printf( "%s=%f\n", word.getKey(), word.getValue() );
- }
- return result.toString();
- }
- /**
- * Loads all the words and word probability from the dictionary. Words
- * are separated from the probability by a comma.
- */
- private void loadLexicon( BufferedReader lexiconData )
- throws IOException {
- String line = null;
- TreeMap<String, Double> dictionary = getDictionary();
- dictionary.clear();
- while( (line = lexiconData.readLine()) != null ) {
- String[] lex = line.toLowerCase().split( "," );
- if( lex[0].length() >= MIN_LEX_LENGTH ) {
- dictionary.put( lex[0], Double.parseDouble( lex[1] ) );
- }
- }
- }
- /**
- * Inserts the lines of concatenated text into the internal list.
- */
- private void loadConcat( BufferedReader concatData )
- throws IOException {
- String line = null;
- List<String> concat = getConcat();
- concat.clear();
- while( (line = concatData.readLine()) != null ) {
- if( line.length() >= MIN_LEX_LENGTH ) {
- concat.add( line.toLowerCase() );
- }
- }
- }
- private List<String> getConcat() {
- return this.concat;
- }
- private TreeMap<String, Double> getDictionary() {
- return this.dictionary;
- }
- public static void main( String args[] )
- throws IOException {
- TextSegmenter ts = new TextSegmenter();
- if( args.length == 2 ) {
- try {
- ts.split( new File( args[0] ), new File( args[1] ) );
- }
- catch( Exception e ) {
- System.err.println( "Error: " + e.getMessage() );
- e.printStackTrace();
- }
- }
- else {
- System.out.println( "TextSegmenter <lexicon> <concat>" );
- System.out.println( "<lexicon> - <word, relative probability>" );
- System.out.println( "<concat> - <words>" );
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement