Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /**
- * Guesses keywords from an input string, based on the frequency of the words.
- *
- * @see <a href="http://lucene.apache.org/">http://lucene.apache.org/</a>
- */
- public class KeywordsGuesser {
- /** Lucene version. */
- private static Version LUCENE_VERSION = Version.LUCENE_36;
- /**
- * Keyword holder, composed by a unique stem, its frequency, and a set of found corresponding
- * terms for this stem.
- */
- public static class Keyword implements Comparable<Keyword> {
- /** The unique stem. */
- private String stem;
- /** The frequency of the stem. */
- private Integer frequency;
- /** The found corresponding terms for this stem. */
- private Set<String> terms;
- /**
- * Unique constructor.
- *
- * @param stem The unique stem this instance must hold.
- */
- public Keyword(String stem) {
- this.stem = stem;
- terms = new HashSet<String>();
- frequency = 0;
- }
- /**
- * Add a found corresponding term for this stem. If this term has been already found, it
- * won't be duplicated but the stem frequency will still be incremented.
- *
- * @param term The term to add.
- */
- private void add(String term) {
- terms.add(term);
- frequency++;
- }
- /**
- * Gets the unique stem of this instance.
- *
- * @return The unique stem.
- */
- public String getStem() {
- return stem;
- }
- /**
- * Gets the frequency of this stem.
- *
- * @return The frequency.
- */
- public Integer getFrequency() {
- return frequency;
- }
- /**
- * Gets the list of found corresponding terms for this stem.
- *
- * @return The list of found corresponding terms.
- */
- public Set<String> getTerms() {
- return terms;
- }
- /**
- * Used to reverse sort a list of keywords based on their frequency (from the most frequent
- * keyword to the least frequent one).
- */
- @Override
- public int compareTo(Keyword o) {
- return o.frequency.compareTo(frequency);
- }
- /**
- * Used to keep unicity between two keywords: only their respective stems are taken into
- * account.
- */
- @Override
- public boolean equals(Object obj) {
- return obj instanceof Keyword && obj.hashCode() == hashCode();
- }
- /**
- * Used to keep unicity between two keywords: only their respective stems are taken into
- * account.
- */
- @Override
- public int hashCode() {
- return Arrays.hashCode(new Object[] { stem });
- }
- /**
- * User-readable representation of a keyword: "[stem] x[frequency]".
- */
- @Override
- public String toString() {
- return stem + " x" + frequency;
- }
- }
- /**
- * Stemmize the given term.
- *
- * @param term The term to stem.
- * @return The stem of the given term.
- * @throws IOException If an I/O error occured.
- */
- private static String stemmize(String term) throws IOException {
- // tokenize term
- TokenStream tokenStream = new ClassicTokenizer(LUCENE_VERSION, new StringReader(term));
- // stemmize
- tokenStream = new PorterStemFilter(tokenStream);
- Set<String> stems = new HashSet<String>();
- CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
- // for each token
- while (tokenStream.incrementToken()) {
- // add it in the dedicated set (to keep unicity)
- stems.add(token.toString());
- }
- // if no stem or 2+ stems have been found, return null
- if (stems.size() != 1) {
- return null;
- }
- String stem = stems.iterator().next();
- // if the stem has non-alphanumerical chars, return null
- if (!stem.matches("[\\w-]+")) {
- return null;
- }
- return stem;
- }
- /**
- * Tries to find the given example within the given collection. If it hasn't been found, the
- * example is automatically added in the collection and is then returned.
- *
- * @param collection The collection to search into.
- * @param example The example to search.
- * @return The existing element if it has been found, the given example otherwise.
- */
- private static <T> T find(Collection<T> collection, T example) {
- for (T element : collection) {
- if (element.equals(example)) {
- return element;
- }
- }
- collection.add(example);
- return example;
- }
- /**
- * Extracts text content from the given URL and guesses keywords within it (needs jsoup parser).
- *
- * @param The URL to read.
- * @return A set of potential keywords. The first keyword is the most frequent one, the last the
- * least frequent.
- * @throws IOException If an I/O error occured.
- * @see <a href="http://jsoup.org/">http://jsoup.org/</a>
- */
- public static List<Keyword> guessFromUrl(String url) throws IOException {
- // get textual content from url
- Document doc = Jsoup.connect(url).get();
- String content = doc.body().text();
- // guess keywords from this content
- return guessFromString(content);
- }
- /**
- * Guesses keywords from given input string.
- *
- * @param input The input string.
- * @return A set of potential keywords. The first keyword is the most frequent one, the last the
- * least frequent.
- * @throws IOException If an I/O error occured.
- */
- public static List<Keyword> guessFromString(String input) throws IOException {
- // hack to keep dashed words (e.g. "non-specific" rather than "non" and "specific")
- input = input.replaceAll("-+", "-0");
- // replace any punctuation char but dashes and apostrophes and by a space
- input = input.replaceAll("[\\p{Punct}&&[^'-]]+", " ");
- // replace most common english contractions
- input = input.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", "");
- // tokenize input
- TokenStream tokenStream = new ClassicTokenizer(LUCENE_VERSION, new StringReader(input));
- // to lower case
- tokenStream = new LowerCaseFilter(LUCENE_VERSION, tokenStream);
- // remove dots from acronyms (and "'s" but already done manually above)
- tokenStream = new ClassicFilter(tokenStream);
- // convert any char to ASCII
- tokenStream = new ASCIIFoldingFilter(tokenStream);
- // remove english stop words
- tokenStream = new StopFilter(LUCENE_VERSION, tokenStream, EnglishAnalyzer.getDefaultStopSet());
- List<Keyword> keywords = new LinkedList<Keyword>();
- CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
- // for each token
- while (tokenStream.incrementToken()) {
- String term = token.toString();
- // stemmize
- String stem = stemmize(term);
- if (stem != null) {
- // create the keyword or get the existing one if any
- Keyword keyword = find(keywords, new Keyword(stem.replaceAll("-0", "-")));
- // add its corresponding initial token
- keyword.add(term.replaceAll("-0", "-"));
- }
- }
- // reverse sort by frequency
- Collections.sort(keywords);
- return keywords;
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment