Advertisement
Guest User

FuzzyMatch

a guest
Feb 1st, 2020
320
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 7.83 KB | None | 0 0
  1. /*
  2.  * To change this license header, choose License Headers in Project Properties.
  3.  * To change this template file, choose Tools | Templates
  4.  * and open the template in the editor.
  5.  */
  6. package org.common.libraries.etc;
  7.  
  8. /**
  9.  * Its methods compare two strings using a, pseudo-fuzzy, case-insensitive,
  10.  * matching algorithm.
  11.  *
  12.  * @author Manuel Iglesias Alonso [email protected]
  13.  */
  14. public class FuzzyMatch {
  15.  
  16.     /**
  17.      * Rating when the two strings to be compared fully match.
  18.      */
  19.     public static final int MAX_RATING = 1000;
  20.     private static final String VOWELS = "aeiou";                                    // Used in rateWords method.
  21.     private static final int VOWEL_VALUE = 750 * MAX_RATING / 1000;                  // Used in rateWords method.
  22.     private static final int NON_VOWEL_VALUE = 1000 * MAX_RATING / 1000;             // Used in rateWords method.
  23.     private static final int MATCH_TO_KEY_MAX_CORRECTION = 1200 * MAX_RATING / 1000; // Used in rateWords & rateSentences methods.
  24.     private static final int MATCH_TO_ITEM_MAX_CORRECTION = 1000 * MAX_RATING / 1000;// Used in rateWords & rateSentences methods.
  25.     private static final int GOOD_MATCH_RATING_LIMIT = 690 * MAX_RATING / 1000;      // Used in rateSentences method.
  26.  
  27.     /**
  28.      *
  29.      * @param word1 Word to be (fuzzy, case insensitive) compared.
  30.      * @param word2 Word to be (fuzzy, case insensitive) compared.
  31.      * @return 0 .. MAX_RATING.
  32.      */
  33.     public static int rateWords(String word1, String word2) {
  34.         final String key;// 'key':    Reference word: Shorter.
  35.         String string;   // 'string': Word: Longer.
  36.         boolean isVowel;
  37.         int index, matchToKeyCorrection, matchToStringCorrection,
  38.                 matchLength = -1, value = 0, valueIfAllKeyCharactersFoundInString = 0,
  39.                 keyLength = word1.length(), stringLength = word2.length();
  40.         if (stringLength == 0 && keyLength == 0) {
  41.             return MAX_RATING;// Two empty strings are identical.
  42.         }
  43.         if (stringLength == 0 || keyLength == 0) {
  44.             return 0;// Different if only one word is empty.
  45.         }
  46.         if (stringLength > keyLength) {
  47.             key = word1.toLowerCase();
  48.             string = word2.toLowerCase();
  49.         } else {
  50.             key = word2.toLowerCase();
  51.             string = word1.toLowerCase();
  52.             index = stringLength;    // Swap values.
  53.             stringLength = keyLength;// Swap values.
  54.             keyLength = index;       // Swap values.
  55.         }
  56.         if (string.equals(key)) {
  57.             return MAX_RATING;
  58.         }
  59.         for (char c : key.toCharArray()) {
  60.             if (VOWELS.indexOf(c) > -1) {
  61.                 isVowel = true;
  62.                 valueIfAllKeyCharactersFoundInString += VOWEL_VALUE;
  63.             } else {
  64.                 isVowel = false;
  65.                 valueIfAllKeyCharactersFoundInString += NON_VOWEL_VALUE;
  66.             }
  67.             index = string.indexOf(c);
  68.             if (index > -1) {
  69.                 string = string.substring(index + 1);
  70.                 if (matchLength == -1) {// First 'key's character that is found in 'string'.
  71.                     matchLength = stringLength - string.length() - 1;// Number of not matched leading characters.
  72.                 }                                                    // If string="0123456789" & key="35", leading_not_matched: 3 ("012").
  73.                 if (isVowel) {
  74.                     value += VOWEL_VALUE;
  75.                 } else {
  76.                     value += NON_VOWEL_VALUE;
  77.                 }
  78.             }
  79.         }
  80.         if (matchLength > -1) {
  81.             matchLength = stringLength - matchLength - string.length();// For string="0123456789" & key="25", match block is "2345" ('matchLength' is 4 - of 10).
  82.             matchToKeyCorrection = MATCH_TO_KEY_MAX_CORRECTION * Math.abs(matchLength - keyLength) / (matchLength + keyLength);// '0 .. MAX_CORRECTION'. Best match 0 if 'matchLength == keyLength'.
  83.             matchToStringCorrection = MATCH_TO_ITEM_MAX_CORRECTION - (matchLength * MATCH_TO_ITEM_MAX_CORRECTION / stringLength);// '0 .. MAX_CORRECTION'. Best match 0 if 'matchLength == stringLength'.
  84.             valueIfAllKeyCharactersFoundInString += (matchToKeyCorrection + matchToStringCorrection);// valueIfAllKeyCharactersFoundInString increases if corrections > 0. Best match if corrections == 0.
  85.         }
  86.         return MAX_RATING * value / valueIfAllKeyCharactersFoundInString;
  87.     }
  88.  
  89.     /**
  90.      *
  91.      * @param sentence1 Sentence to be (fuzzy, case insensitive) compared.
  92.      * @param sentence2 Sentence to be (fuzzy, case insensitive) compared.
  93.      * @return 0 .. MAX_RATING.
  94.      */
  95.     public static int rateSentences(String sentence1, String sentence2) {
  96.         int i, rating, maxRating, matchToKeyCorrection, matchToWordsCorrection, value,
  97.                 valueIfAllKeyWordsFoundInWords, wordsLength, keyLength, lastMatchIndex, matchLength;
  98.         String[] temp;
  99.         String[] keyWords = sentence1.replaceAll("_+|\\P{IsWord}+", " ").strip().split("\\s+");// Replace '_' & non_words by ' ' then split on whitespace. Works with accented chars.
  100.         String[] words = sentence2.replaceAll("_+|\\P{IsWord}+", " ").strip().split("\\s+");// '\\p{IsWord}': words; '\\P{IsWord}': not_words.
  101.         if (keyWords.length == 0 && words.length == 0) {
  102.             return MAX_RATING;// Two empty strings are identical.
  103.         }
  104.         if (keyWords.length == 0 || words.length == 0) {
  105.             return 0;// Different if only one sentence is empty.
  106.         }
  107.         if (keyWords.length > words.length) {
  108.             temp = words;
  109.             words = keyWords;// Swap values. 'words':   Sentence: Gas more words.
  110.             keyWords = temp; // Swap values. 'keyWords': Reference sentence: Has less words.
  111.         }
  112.         keyLength = keyWords.length;
  113.         wordsLength = words.length;
  114.         matchLength = -1;
  115.         value = 0;
  116.         lastMatchIndex = 0;
  117.         for (String word : keyWords) {
  118.             maxRating = 0;
  119.             for (i = lastMatchIndex; i < wordsLength; i++) {// Loop starts at index of ('words's) word that follows a word that made a new maxRating.
  120.                 rating = rateWords(word, words[i]);
  121.                 if (rating > GOOD_MATCH_RATING_LIMIT && rating > maxRating) {
  122.                     maxRating = rating;
  123.                     lastMatchIndex = i + 1;// Index of next 'words's word.
  124.                 }
  125.             }
  126.             if (maxRating > 0) {// If a good match has been found for ('words's) 'word'.
  127.                 value += maxRating;
  128.                 if (matchLength == -1) {// First good match found for a 'key's word in 'words'.
  129.                     matchLength = lastMatchIndex - 1;// Number of not matched leading words.
  130.                 }// If words={"0","1","2","3","4","5","6","7","8","9"} & key={"3","5"}, leading_not_matched: 3 ({"0","1","2"}).                
  131.             }
  132.         }
  133.         valueIfAllKeyWordsFoundInWords = keyLength * MAX_RATING;
  134.         if (matchLength > -1) {
  135.             matchLength = lastMatchIndex - matchLength;// For words={"0","1","2","3","4","5","6","7","8","9"} & key={"2","5"}, match block is {"2","3","4","5"} ('matchLength' is 4 - of 10).
  136.             matchToKeyCorrection = MATCH_TO_KEY_MAX_CORRECTION * Math.abs(matchLength - keyLength) / (matchLength + keyLength);// '0 .. MAX_CORRECTION'. Best match 0 if 'matchLength == keyLength'.
  137.             matchToWordsCorrection = MATCH_TO_ITEM_MAX_CORRECTION - (matchLength * MATCH_TO_ITEM_MAX_CORRECTION / wordsLength);// '0 .. MAX_CORRECTION'. Best match 0 if 'matchLength == wordsLength'.
  138.             valueIfAllKeyWordsFoundInWords += (matchToKeyCorrection + matchToWordsCorrection);// valueIfAllKeyWordsFoundInWords increases if corrections > 0. Best match if corrections == 0.
  139.         }
  140.         return MAX_RATING * value / valueIfAllKeyWordsFoundInWords;
  141.     }
  142. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement