Guest User

[Java] KeywordsGuesser

a guest
Jul 5th, 2013
1,132
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. /**
  2.  * Guesses keywords from an input string, based on the frequency of the words.
  3.  *
  4.  * @see <a href="http://lucene.apache.org/">http://lucene.apache.org/</a>
  5.  */
  6. public class KeywordsGuesser {
  7.  
  8.     /** Lucene version. */
  9.     private static Version LUCENE_VERSION = Version.LUCENE_36;
  10.  
  11.     /**
  12.      * Keyword holder, composed by a unique stem, its frequency, and a set of found corresponding
  13.      * terms for this stem.
  14.      */
  15.     public static class Keyword implements Comparable<Keyword> {
  16.  
  17.         /** The unique stem. */
  18.         private String stem;
  19.  
  20.         /** The frequency of the stem. */
  21.         private Integer frequency;
  22.  
  23.         /** The found corresponding terms for this stem. */
  24.         private Set<String> terms;
  25.  
  26.         /**
  27.          * Unique constructor.
  28.          *
  29.          * @param stem The unique stem this instance must hold.
  30.          */
  31.         public Keyword(String stem) {
  32.             this.stem = stem;
  33.             terms = new HashSet<String>();
  34.             frequency = 0;
  35.         }
  36.  
  37.         /**
  38.          * Add a found corresponding term for this stem. If this term has been already found, it
  39.          * won't be duplicated but the stem frequency will still be incremented.
  40.          *
  41.          * @param term The term to add.
  42.          */
  43.         private void add(String term) {
  44.             terms.add(term);
  45.             frequency++;
  46.         }
  47.  
  48.         /**
  49.          * Gets the unique stem of this instance.
  50.          *
  51.          * @return The unique stem.
  52.          */
  53.         public String getStem() {
  54.             return stem;
  55.         }
  56.  
  57.         /**
  58.          * Gets the frequency of this stem.
  59.          *
  60.          * @return The frequency.
  61.          */
  62.         public Integer getFrequency() {
  63.             return frequency;
  64.         }
  65.  
  66.         /**
  67.          * Gets the list of found corresponding terms for this stem.
  68.          *
  69.          * @return The list of found corresponding terms.
  70.          */
  71.         public Set<String> getTerms() {
  72.             return terms;
  73.         }
  74.  
  75.         /**
  76.          * Used to reverse sort a list of keywords based on their frequency (from the most frequent
  77.          * keyword to the least frequent one).
  78.          */
  79.         @Override
  80.         public int compareTo(Keyword o) {
  81.             return o.frequency.compareTo(frequency);
  82.         }
  83.  
  84.         /**
  85.          * Used to keep unicity between two keywords: only their respective stems are taken into
  86.          * account.
  87.          */
  88.         @Override
  89.         public boolean equals(Object obj) {
  90.             return obj instanceof Keyword && obj.hashCode() == hashCode();
  91.         }
  92.  
  93.         /**
  94.          * Used to keep unicity between two keywords: only their respective stems are taken into
  95.          * account.
  96.          */
  97.         @Override
  98.         public int hashCode() {
  99.             return Arrays.hashCode(new Object[] { stem });
  100.         }
  101.  
  102.         /**
  103.          * User-readable representation of a keyword: "[stem] x[frequency]".
  104.          */
  105.         @Override
  106.         public String toString() {
  107.             return stem + " x" + frequency;
  108.         }
  109.  
  110.     }
  111.  
  112.     /**
  113.      * Stemmize the given term.
  114.      *
  115.      * @param term The term to stem.
  116.      * @return The stem of the given term.
  117.      * @throws IOException If an I/O error occured.
  118.      */
  119.     private static String stemmize(String term) throws IOException {
  120.  
  121.         // tokenize term
  122.         TokenStream tokenStream = new ClassicTokenizer(LUCENE_VERSION, new StringReader(term));
  123.         // stemmize
  124.         tokenStream = new PorterStemFilter(tokenStream);
  125.  
  126.         Set<String> stems = new HashSet<String>();
  127.         CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
  128.         // for each token
  129.         while (tokenStream.incrementToken()) {
  130.             // add it in the dedicated set (to keep unicity)
  131.             stems.add(token.toString());
  132.         }
  133.  
  134.         // if no stem or 2+ stems have been found, return null
  135.         if (stems.size() != 1) {
  136.             return null;
  137.         }
  138.  
  139.         String stem = stems.iterator().next();
  140.  
  141.         // if the stem has non-alphanumerical chars, return null
  142.         if (!stem.matches("[\\w-]+")) {
  143.             return null;
  144.         }
  145.  
  146.         return stem;
  147.     }
  148.  
  149.     /**
  150.      * Tries to find the given example within the given collection. If it hasn't been found, the
  151.      * example is automatically added in the collection and is then returned.
  152.      *
  153.      * @param collection The collection to search into.
  154.      * @param example The example to search.
  155.      * @return The existing element if it has been found, the given example otherwise.
  156.      */
  157.     private static <T> T find(Collection<T> collection, T example) {
  158.         for (T element : collection) {
  159.             if (element.equals(example)) {
  160.                 return element;
  161.             }
  162.         }
  163.         collection.add(example);
  164.         return example;
  165.     }
  166.  
  167.     /**
  168.      * Extracts text content from the given URL and guesses keywords within it (needs jsoup parser).
  169.      *
  170.      * @param The URL to read.
  171.      * @return A set of potential keywords. The first keyword is the most frequent one, the last the
  172.      *         least frequent.
  173.      * @throws IOException If an I/O error occured.
  174.      * @see <a href="http://jsoup.org/">http://jsoup.org/</a>
  175.      */
  176.     public static List<Keyword> guessFromUrl(String url) throws IOException {
  177.         // get textual content from url
  178.         Document doc = Jsoup.connect(url).get();
  179.         String content = doc.body().text();
  180.         // guess keywords from this content
  181.         return guessFromString(content);
  182.     }
  183.  
  184.     /**
  185.      * Guesses keywords from given input string.
  186.      *
  187.      * @param input The input string.
  188.      * @return A set of potential keywords. The first keyword is the most frequent one, the last the
  189.      *         least frequent.
  190.      * @throws IOException If an I/O error occured.
  191.      */
  192.     public static List<Keyword> guessFromString(String input) throws IOException {
  193.  
  194.         // hack to keep dashed words (e.g. "non-specific" rather than "non" and "specific")
  195.         input = input.replaceAll("-+", "-0");
  196.         // replace any punctuation char but dashes and apostrophes and by a space
  197.         input = input.replaceAll("[\\p{Punct}&&[^'-]]+", " ");
  198.         // replace most common english contractions
  199.         input = input.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", "");
  200.  
  201.         // tokenize input
  202.         TokenStream tokenStream = new ClassicTokenizer(LUCENE_VERSION, new StringReader(input));
  203.         // to lower case
  204.         tokenStream = new LowerCaseFilter(LUCENE_VERSION, tokenStream);
  205.         // remove dots from acronyms (and "'s" but already done manually above)
  206.         tokenStream = new ClassicFilter(tokenStream);
  207.         // convert any char to ASCII
  208.         tokenStream = new ASCIIFoldingFilter(tokenStream);
  209.         // remove english stop words
  210.         tokenStream = new StopFilter(LUCENE_VERSION, tokenStream, EnglishAnalyzer.getDefaultStopSet());
  211.  
  212.         List<Keyword> keywords = new LinkedList<Keyword>();
  213.         CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
  214.  
  215.         // for each token
  216.         while (tokenStream.incrementToken()) {
  217.             String term = token.toString();
  218.             // stemmize
  219.             String stem = stemmize(term);
  220.             if (stem != null) {
  221.                 // create the keyword or get the existing one if any
  222.                 Keyword keyword = find(keywords, new Keyword(stem.replaceAll("-0", "-")));
  223.                 // add its corresponding initial token
  224.                 keyword.add(term.replaceAll("-0", "-"));
  225.             }
  226.         }
  227.  
  228.         // reverse sort by frequency
  229.         Collections.sort(keywords);
  230.  
  231.         return keywords;
  232.     }
  233.  
  234. }
Advertisement
Add Comment
Please, Sign In to add comment