Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package org.common.libraries.etc;
/**
 * Static helpers that compare two strings using a pseudo-fuzzy,
 * case-insensitive matching algorithm and express the similarity as an
 * integer rating between {@code 0} (nothing in common) and
 * {@link #MAX_RATING} (full match).
 *
 * <p>Both methods are stateless and only read their arguments.
 *
 * @author Manuel Iglesias Alonso [email protected]
 */
public class FuzzyMatch {

    /**
     * Rating when the two strings to be compared fully match.
     */
    public static final int MAX_RATING = 1000;

    /** Characters weighted as vowels by {@link #rateWords}; only these five ASCII letters. */
    private static final String VOWELS = "aeiou";

    /** Weight of a matched vowel in {@link #rateWords} (lower than a non-vowel). */
    private static final int VOWEL_VALUE = 750 * MAX_RATING / 1000;

    /** Weight of a matched non-vowel character in {@link #rateWords}. */
    private static final int NON_VOWEL_VALUE = 1000 * MAX_RATING / 1000;

    /** Upper bound of the penalty applied when the matched span and the key differ in length. */
    private static final int MATCH_TO_KEY_MAX_CORRECTION = 1200 * MAX_RATING / 1000;

    /** Upper bound of the penalty applied when the matched span covers only part of the longer item. */
    private static final int MATCH_TO_ITEM_MAX_CORRECTION = 1000 * MAX_RATING / 1000;

    /** A word rating must be strictly greater than this to count as a match in {@link #rateSentences}. */
    private static final int GOOD_MATCH_RATING_LIMIT = 690 * MAX_RATING / 1000;

    /** Runs of '_' or of non-word characters; each run is replaced by a single space. */
    private static final java.util.regex.Pattern WORD_SEPARATORS =
            java.util.regex.Pattern.compile("_+|\\P{IsWord}+");

    /** Runs of whitespace used to split a normalized sentence into words. */
    private static final java.util.regex.Pattern WHITESPACE =
            java.util.regex.Pattern.compile("\\s+");

    /**
     * Rates how well two words match (fuzzy, case insensitive).
     *
     * <p>The shorter word is the <em>key</em>; its characters are searched, in
     * order, in the longer word. Every key character found adds its weight
     * (vowels weigh less than other characters). The sum is then divided by
     * the weight of the whole key plus two penalties that grow when the
     * matched span differs in length from the key or from the longer word.
     * When both words have the same length {@code word2} is used as the key,
     * so swapping the arguments may change the rating of non-identical words.
     *
     * @param word1 Word to be (fuzzy, case insensitive) compared.
     * @param word2 Word to be (fuzzy, case insensitive) compared.
     * @return 0 .. MAX_RATING.
     * @throws NullPointerException if either argument is {@code null}.
     */
    public static int rateWords(String word1, String word2) {
        // Locale.ROOT keeps the comparison independent of the JVM default
        // locale (e.g. Turkish dotless 'i').
        final String lower1 = word1.toLowerCase(java.util.Locale.ROOT);
        final String lower2 = word2.toLowerCase(java.util.Locale.ROOT);

        if (lower1.isEmpty() && lower2.isEmpty()) {
            return MAX_RATING; // Two empty strings are identical.
        }
        if (lower1.isEmpty() || lower2.isEmpty()) {
            return 0; // Different if only one word is empty.
        }

        // 'key': reference word (shorter). 'string': word searched in (longer).
        // Lengths are taken after lower-casing because case mapping may change them.
        final String key;
        final String string;
        if (lower2.length() > lower1.length()) {
            key = lower1;
            string = lower2;
        } else {
            key = lower2;
            string = lower1;
        }
        if (string.equals(key)) {
            return MAX_RATING;
        }

        // long accumulators: MAX_RATING * value would overflow int for long keys.
        long value = 0;
        long valueIfAllKeyCharactersFoundInString = 0;
        int firstMatchIndex = -1; // Index in 'string' of the first key character found.
        int searchFrom = 0;       // Key characters must appear in order: search only past the last match.

        for (int k = 0; k < key.length(); k++) {
            final char c = key.charAt(k);
            final int charValue = VOWELS.indexOf(c) > -1 ? VOWEL_VALUE : NON_VOWEL_VALUE;
            valueIfAllKeyCharactersFoundInString += charValue;

            final int index = string.indexOf(c, searchFrom);
            if (index > -1) {
                if (firstMatchIndex == -1) {
                    firstMatchIndex = index; // If string="0123456789" & key="35", leading not matched: 3 ("012").
                }
                searchFrom = index + 1;
                value += charValue;
            }
        }

        if (firstMatchIndex > -1) {
            // For string="0123456789" & key="25", match block is "2345" (length 4 of 10).
            final int matchLength = searchFrom - firstMatchIndex;
            valueIfAllKeyCharactersFoundInString += lengthCorrection(matchLength, key.length(), string.length());
        }
        return (int) (MAX_RATING * value / valueIfAllKeyCharactersFoundInString);
    }

    /**
     * Rates how well two sentences match (fuzzy, case insensitive).
     *
     * <p>Each sentence is split into words on underscores and non-word
     * characters. The sentence with fewer words is the <em>key</em>; when both
     * have the same number of words {@code sentence1} is the key. Each key
     * word is compared with {@link #rateWords} against the words of the other
     * sentence, starting after the word matched by the previous key word, and
     * the best rating above the good-match limit is added to the result. The
     * sum is divided by the maximum achievable value plus the same two
     * length penalties used by {@link #rateWords}.
     *
     * @param sentence1 Sentence to be (fuzzy, case insensitive) compared.
     * @param sentence2 Sentence to be (fuzzy, case insensitive) compared.
     * @return 0 .. MAX_RATING.
     * @throws NullPointerException if either argument is {@code null}.
     */
    public static int rateSentences(String sentence1, String sentence2) {
        String[] keyWords = splitIntoWords(sentence1);
        String[] words = splitIntoWords(sentence2);

        if (keyWords.length == 0 && words.length == 0) {
            return MAX_RATING; // Two empty sentences are identical.
        }
        if (keyWords.length == 0 || words.length == 0) {
            return 0; // Different if only one sentence is empty.
        }
        if (keyWords.length > words.length) {
            // 'words': sentence that has more words. 'keyWords': reference sentence, has fewer words.
            final String[] temp = words;
            words = keyWords;
            keyWords = temp;
        }

        long value = 0;           // long: MAX_RATING * value would overflow int for long sentences.
        int firstMatchIndex = -1; // Index in 'words' of the first good match.
        int searchFrom = 0;       // Index of the word that follows the last good match.

        for (String keyWord : keyWords) {
            int maxRating = 0;
            int bestIndex = -1;
            for (int i = searchFrom; i < words.length; i++) {
                final int rating = rateWords(keyWord, words[i]);
                if (rating > GOOD_MATCH_RATING_LIMIT && rating > maxRating) {
                    maxRating = rating;
                    bestIndex = i;
                }
            }
            if (bestIndex > -1) { // A good match has been found for 'keyWord'.
                value += maxRating;
                if (firstMatchIndex == -1) {
                    // If words={"0",..,"9"} & key={"3","5"}, leading not matched: 3 ({"0","1","2"}).
                    firstMatchIndex = bestIndex;
                }
                searchFrom = bestIndex + 1;
            }
        }

        long valueIfAllKeyWordsFoundInWords = (long) keyWords.length * MAX_RATING;
        if (firstMatchIndex > -1) {
            // For words={"0",..,"9"} & key={"2","5"}, match block is {"2","3","4","5"} (length 4 of 10).
            final int matchLength = searchFrom - firstMatchIndex;
            valueIfAllKeyWordsFoundInWords += lengthCorrection(matchLength, keyWords.length, words.length);
        }
        return (int) (MAX_RATING * value / valueIfAllKeyWordsFoundInWords);
    }

    /**
     * Splits a sentence into words: runs of '_' and of non-word characters
     * become separators. Returns an empty array when the sentence contains no
     * word characters at all.
     */
    private static String[] splitIntoWords(String sentence) {
        final String normalized = WORD_SEPARATORS.matcher(sentence).replaceAll(" ").strip();
        if (normalized.isEmpty()) {
            // Splitting "" would yield {""} (length 1), hiding the empty case from callers.
            return new String[0];
        }
        return WHITESPACE.split(normalized);
    }

    /**
     * Penalty added to the divisor of a rating. It is 0 when the matched span
     * has the same length as both the key and the searched item, and grows as
     * the span differs from either of them.
     */
    private static int lengthCorrection(int matchLength, int keyLength, int itemLength) {
        // '0 .. MATCH_TO_KEY_MAX_CORRECTION'. Best match 0 if 'matchLength == keyLength'.
        final long matchToKeyCorrection =
                (long) MATCH_TO_KEY_MAX_CORRECTION * Math.abs(matchLength - keyLength) / (matchLength + keyLength);
        // '0 .. MATCH_TO_ITEM_MAX_CORRECTION'. Best match 0 if 'matchLength == itemLength'.
        final long matchToItemCorrection =
                MATCH_TO_ITEM_MAX_CORRECTION - ((long) matchLength * MATCH_TO_ITEM_MAX_CORRECTION / itemLength);
        return (int) (matchToKeyCorrection + matchToItemCorrection);
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement