Want more features on Pastebin? Sign Up, it's FREE!
Guest

Untitled

By: a guest on Jan 2nd, 2011  |  syntax: Java  |  size: 4.73 KB  |  views: 836  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
This paste has a previous version, view the difference. Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. package util;
  2.  
  3. import java.util.AbstractMap;
  4. import java.util.ArrayList;
  5. import java.util.List;
  6. import java.util.Map;
  7. import java.util.TreeMap;
  8.  
  9. import java.io.BufferedReader;
  10. import java.io.BufferedWriter;
  11. import java.io.File;
  12. import java.io.FileInputStream;
  13. import java.io.InputStreamReader;
  14. import java.io.IOException;
  15.  
  /**
   * Command-line utility that loads a lexicon of words with relative
   * probabilities and a list of concatenated strings, then prints, for each
   * concatenated string, the lexicon words that occur inside it.
   *
   * <p>NOTE(review): selecting the best segmentation among the candidate
   * words is not implemented; {@code segments} only lists every candidate.
   */
  public class TextSegmenter {
    /** Lexical and concatenated entries must be at least 2 characters. */
    private static final int MIN_LEX_LENGTH = 2;

    /** Lower-cased words mapped to the probability read from the lexicon. */
    private final TreeMap<String, Double> dictionary =
      new TreeMap<String, Double>();

    /** Lower-cased lines of concatenated words to split. */
    private final List<String> concat = new ArrayList<String>();

    public TextSegmenter() {
    }

    /**
     * Opens both files, processes them with
     * {@link #split(BufferedReader, BufferedReader)} and closes them again,
     * even when loading or processing fails. The files are decoded with the
     * JVM's default charset.
     *
     * @param lexicon file of {@code word,probability} lines
     * @param concat file with one concatenated string per line
     * @throws IOException if a file cannot be opened or read, or the lexicon
     *   contains a malformed line
     */
    public void split( File lexicon, File concat )
      throws IOException {
      BufferedReader lex = new BufferedReader(
        new InputStreamReader( new FileInputStream( lexicon ) ) );

      try {
        BufferedReader col = new BufferedReader(
          new InputStreamReader( new FileInputStream( concat ) ) );

        try {
          split( lex, col );
        }
        finally {
          col.close();
        }
      }
      finally {
        lex.close();
      }
    }

    /**
     * Loads the lexicon and the concatenated text (replacing anything loaded
     * by an earlier call), then prints the result for every concatenated
     * line to standard output. Callers must close the streams.
     *
     * @param lexicon source of {@code word,probability} lines
     * @param concat source of concatenated strings, one per line
     * @throws IOException if reading fails or the lexicon contains a
     *   malformed line
     */
    public void split( BufferedReader lexicon, BufferedReader concat )
      throws IOException {
      loadLexicon( lexicon );
      loadConcat( concat );
      split();
    }

    /**
     * Iterates over all of the concatenated text, printing each concatenated
     * String followed by {@code ::} and the words found in it.
     */
    private void split() {
      for( String concat : getConcat() ) {
        System.out.printf( "%s::%s\n", concat, segments( concat ) );
      }
    }

    /**
     * Returns the probability stored in the lexicon for the given word, or
     * 0.0 when the word is not in the lexicon.
     */
    private double getProbability( String s ) {
      Double probability = getDictionary().get( s );

      return probability == null ? 0.0 : probability.doubleValue();
    }

    /**
     * Lists every substring of the phrase that is a lexicon word with a
     * probability greater than zero. Words are ordered by the position at
     * which they end (ascending) and, for equal end positions, shortest
     * first. Each word is followed by a single space.
     *
     * <p>As a side effect, one {@code word=probability} trace line is written
     * to standard output for every word found.
     * NOTE(review): the trace shares stdout with the results; consider
     * routing it elsewhere once the segmentation logic is finished.
     */
    private String segments( String concat ) {
      int length = concat.length();
      StringBuilder result = new StringBuilder( length * 2 );

      // The lexicon never holds words shorter than MIN_LEX_LENGTH (see
      // loadLexicon), so shorter substrings need not be looked up.
      //
      for( int end = MIN_LEX_LENGTH; end <= length; end++ ) {
        for( int start = end - MIN_LEX_LENGTH; start >= 0; start-- ) {
          // Word and probability.
          //
          String w = concat.substring( start, end );
          double p = getProbability( w );

          if( p > 0 ) {
            result.append( w ).append( ' ' );
            System.out.printf( "%s=%f\n", w, p );
          }
        }
      }

      return result.toString();
    }

    /**
     * Replaces the dictionary with the words and probabilities read from the
     * given data. Each line is lower-cased and split on commas: the first
     * field is the word (surrounding whitespace removed), the second is its
     * probability, and any further fields are ignored. Lines whose word is
     * shorter than {@link #MIN_LEX_LENGTH} (including blank lines) are
     * skipped.
     *
     * @throws IOException if reading fails, or a line with an acceptable
     *   word has a missing or non-numeric probability
     */
    private void loadLexicon( BufferedReader lexiconData )
      throws IOException {
      String line = null;
      int lineNumber = 0;
      TreeMap<String, Double> dictionary = getDictionary();

      dictionary.clear();

      while( (line = lexiconData.readLine()) != null ) {
        lineNumber++;

        // Locale.ROOT keeps the lower-casing independent of the user's locale.
        //
        String[] lex = line.toLowerCase( java.util.Locale.ROOT ).split( "," );

        // split() drops trailing empty fields, so a line such as "," yields
        // an empty array.
        //
        String word = lex.length == 0 ? "" : lex[0].trim();

        if( word.length() < MIN_LEX_LENGTH ) {
          continue;
        }

        if( lex.length < 2 ) {
          throw new IOException(
            "Lexicon line " + lineNumber + " has no probability: " + line );
        }

        try {
          dictionary.put( word, Double.parseDouble( lex[1] ) );
        }
        catch( NumberFormatException e ) {
          throw new IOException(
            "Lexicon line " + lineNumber + " has an invalid probability: "
            + line, e );
        }
      }
    }

    /**
     * Replaces the internal list with the lower-cased lines of concatenated
     * text. Lines shorter than {@link #MIN_LEX_LENGTH} are skipped.
     *
     * @throws IOException if reading fails
     */
    private void loadConcat( BufferedReader concatData )
      throws IOException {
      String line = null;
      List<String> concat = getConcat();

      concat.clear();

      while( (line = concatData.readLine()) != null ) {
        if( line.length() >= MIN_LEX_LENGTH ) {
          concat.add( line.toLowerCase( java.util.Locale.ROOT ) );
        }
      }
    }

    private List<String> getConcat() {
      return this.concat;
    }

    private TreeMap<String, Double> getDictionary() {
      return this.dictionary;
    }

    /**
     * Runs the segmenter on the lexicon file and the concatenated-text file
     * named by the two arguments; prints usage help for any other argument
     * count. Failures are reported on standard error rather than rethrown.
     */
    public static void main( String[] args )
      throws IOException {
      TextSegmenter ts = new TextSegmenter();

      if( args.length == 2 ) {
        try {
          ts.split( new File( args[0] ), new File( args[1] ) );
        }
        catch( Exception e ) {
          System.err.println( "Error: " + e.getMessage() );
          e.printStackTrace();
        }
      }
      else {
        System.out.println( "TextSegmenter <lexicon> <concat>" );
        System.out.println( "<lexicon> - <word, relative probability>" );
        System.out.println( "<concat>  - <words>" );
      }
    }
  }
clone this paste RAW Paste Data