View difference between Paste ID: taXyE03L and
SHOW:
|
|
- or go back to the newest paste.
1 | - | |
1 | + | package util; |
2 | ||
3 | import java.util.AbstractMap; | |
4 | import java.util.ArrayList; | |
5 | import java.util.List; | |
6 | import java.util.Map; | |
7 | import java.util.TreeMap; | |
8 | ||
9 | import java.io.BufferedReader; | |
10 | import java.io.BufferedWriter; | |
11 | import java.io.File; | |
12 | import java.io.FileInputStream; | |
13 | import java.io.InputStreamReader; | |
14 | import java.io.IOException; | |
15 | ||
public class TextSegmenter {
    /** Lexical and concatenated entries must be at least 2 characters. */
    private static final int MIN_LEX_LENGTH = 2;

    /** Words mapped to their relative frequency in the lexicon. */
    private final TreeMap<String, Double> dictionary =
        new TreeMap<String, Double>();

    /** Lines of concatenated text to split into words. */
    private final List<String> concat = new ArrayList<String>();

    public TextSegmenter() {
    }

    /**
     * Opens both files, splits the concatenated text, and closes the
     * streams. The readers are closed in finally blocks so they are
     * released even when splitting throws (the original leaked both
     * readers on error).
     *
     * NOTE(review): the readers use the platform default charset —
     * presumably the inputs are ASCII/platform-encoded; confirm before
     * feeding UTF-8 data on a non-UTF-8 platform.
     *
     * @param lexicon file of comma-separated word,probability lines.
     * @param concat  file of concatenated words to split, one per line.
     * @throws IOException if either file cannot be opened or read.
     */
    public void split( File lexicon, File concat )
        throws IOException {
        BufferedReader lex = new BufferedReader(
            new InputStreamReader( new FileInputStream( lexicon ) ) );

        try {
            BufferedReader col = new BufferedReader(
                new InputStreamReader( new FileInputStream( concat ) ) );

            try {
                split( lex, col );
            }
            finally {
                col.close();
            }
        }
        finally {
            lex.close();
        }
    }

    /**
     * Splits the text read from the given streams. Callers must close the
     * streams.
     *
     * @param lexicon comma-separated word,probability lines.
     * @param concat  concatenated words to split, one line each.
     * @throws IOException if either stream cannot be read.
     */
    public void split( BufferedReader lexicon, BufferedReader concat )
        throws IOException {
        loadLexicon( lexicon );
        loadConcat( concat );
        split();
    }

    /**
     * Iterates over all of the concatenated text, printing each string
     * followed by "::" and its segmentation on standard output.
     */
    private void split() {
        for( String concat : getConcat() ) {
            System.out.printf( "%s::%s\n", concat, segments( concat ) );
        }
    }

    /**
     * Returns a number between 0 and 1 that represents how often the word
     * is used relative to all the other words in the lexicon, or 0 when
     * the word is absent. The original unboxed the map value inside a
     * catch-all block, relying on a NullPointerException to signal a
     * missing word; this checks for null explicitly instead.
     */
    private double getProbability( String s ) {
        Double p = getDictionary().get( s );
        return p == null ? 0.0 : p.doubleValue();
    }

    /**
     * Splits a concatenated phrase into its constituent words.
     *
     * Every substring of the phrase that exists in the lexicon is kept;
     * prepending each hit roughly restores phrase order because the scan
     * visits shorter suffixes first. NOTE(review): no check is made that
     * the retained words tile the phrase exactly — every lexicon hit is
     * returned, space-separated. A leftover debug printf of each
     * word/probability pair was removed from this loop.
     */
    private String segments( String concat ) {
        int length = concat.length();
        List<Map.Entry<String, Double>> words =
            new ArrayList<Map.Entry<String, Double>>();

        // Collect every substring of the phrase that the lexicon knows.
        //
        for( int i = 0; i < length; i++ ) {
            for( int j = 0; j < length - i; j++ ) {
                // Candidate word and its lexicon probability.
                //
                String w = concat.substring( j, length - i );
                double p = getProbability( w );

                // Retain lexicon hits; insert at the front so earlier
                // phrase positions come out first.
                //
                if( p > 0 ) {
                    words.add( 0, new AbstractMap.SimpleEntry<String, Double>( w, p ) );
                }
            }
        }

        StringBuilder result = new StringBuilder( length * 2 );

        for( Map.Entry<String, Double> word : words ) {
            result.append( word.getKey() ).append( ' ' );
        }

        return result.toString();
    }

    /**
     * Loads all the words and word probabilities from the lexicon. Words
     * are separated from the probability by a comma; input is lowercased
     * and entries shorter than {@code MIN_LEX_LENGTH} are ignored.
     * Malformed lines — no comma, or a non-numeric probability — are now
     * skipped instead of aborting the whole load with an
     * ArrayIndexOutOfBoundsException or NumberFormatException.
     */
    private void loadLexicon( BufferedReader lexiconData )
        throws IOException {
        TreeMap<String, Double> dictionary = getDictionary();

        dictionary.clear();

        String line;

        while( (line = lexiconData.readLine()) != null ) {
            String[] lex = line.toLowerCase().split( "," );

            if( lex.length < 2 || lex[0].length() < MIN_LEX_LENGTH ) {
                continue;
            }

            try {
                dictionary.put( lex[0], Double.valueOf( lex[1] ) );
            }
            catch( NumberFormatException ignored ) {
                // Best-effort load: skip rows whose probability column
                // is not a parsable number.
            }
        }
    }

    /**
     * Inserts the lines of concatenated text into the internal list,
     * lowercased. Lines shorter than {@code MIN_LEX_LENGTH} are ignored.
     */
    private void loadConcat( BufferedReader concatData )
        throws IOException {
        List<String> concat = getConcat();

        concat.clear();

        String line;

        while( (line = concatData.readLine()) != null ) {
            if( line.length() >= MIN_LEX_LENGTH ) {
                concat.add( line.toLowerCase() );
            }
        }
    }

    /** Returns the mutable list of concatenated strings to split. */
    private List<String> getConcat() {
        return this.concat;
    }

    /** Returns the mutable word-to-probability dictionary. */
    private TreeMap<String, Double> getDictionary() {
        return this.dictionary;
    }

    /**
     * Command-line entry point: {@code TextSegmenter <lexicon> <concat>}.
     * Prints usage when the argument count is wrong; reports (and stack
     * traces) any failure while splitting.
     */
    public static void main( String[] args )
        throws IOException {
        TextSegmenter ts = new TextSegmenter();

        if( args.length == 2 ) {
            try {
                ts.split( new File( args[0] ), new File( args[1] ) );
            }
            catch( Exception e ) {
                System.err.println( "Error: " + e.getMessage() );
                e.printStackTrace();
            }
        }
        else {
            System.out.println( "TextSegmenter <lexicon> <concat>" );
            System.out.println( "<lexicon> - <word, relative probability>" );
            System.out.println( "<concat> - <words>" );
        }
    }
}