mlavoie

String Similarity Ranking

May 7th, 2013
1,638
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. /// <summary>
  2. /// This class implements string comparison algorithm
  3. /// based on character pair similarity
  4. /// Source: http://www.catalysoft.com/articles/StrikeAMatch.html
  5. /// </summary>
  6. public class SimilarityTool
  7. {
  8.     /// <summary>
  9.     /// Compares the two strings based on letter pair matches
  10.     /// </summary>
  11.     /// <param name="str1"></param>
  12.     /// <param name="str2"></param>
  13.     /// <returns>The percentage match from 0.0 to 1.0 where 1.0 is 100%</returns>
  14.     public double CompareStrings(string str1, string str2)
  15.     {
  16.         List<string> pairs1 = WordLetterPairs(str1.ToUpper());
  17.         List<string> pairs2 = WordLetterPairs(str2.ToUpper());
  18.  
  19.         int intersection = 0;
  20.         int union = pairs1.Count + pairs2.Count;
  21.  
  22.         for (int i = 0; i < pairs1.Count; i++)
  23.         {
  24.             for (int j = 0; j < pairs2.Count; j++)
  25.             {
  26.                 if (pairs1[i] == pairs2[j])
  27.                 {
  28.                     intersection++;
  29.                     pairs2.RemoveAt(j);//Must remove the match to prevent "GGGG" from appearing to match "GG" with 100% success
  30.  
  31.                     break;
  32.                 }
  33.             }
  34.         }
  35.  
  36.         return (2.0 * intersection) / union;
  37.     }
  38.  
  39.     /// <summary>
  40.     /// Gets all letter pairs for each
  41.     /// individual word in the string
  42.     /// </summary>
  43.     /// <param name="str"></param>
  44.     /// <returns></returns>
  45.     private List<string> WordLetterPairs(string str)
  46.     {
  47.         List<string> AllPairs = new List<string>();
  48.  
  49.         // Tokenize the string and put the tokens/words into an array
  50.         string[] Words = Regex.Split(str, @"\s");
  51.  
  52.         // For each word
  53.         for (int w = 0; w < Words.Length; w++)
  54.         {
  55.             if (!string.IsNullOrEmpty(Words[w]))
  56.             {
  57.                 // Find the pairs of characters
  58.                 String[] PairsInWord = LetterPairs(Words[w]);
  59.  
  60.                 for (int p = 0; p < PairsInWord.Length; p++)
  61.                 {
  62.                     AllPairs.Add(PairsInWord[p]);
  63.                 }
  64.             }
  65.         }
  66.  
  67.         return AllPairs;
  68.     }
  69.  
  70.     /// <summary>
  71.     /// Generates an array containing every
  72.     /// two consecutive letters in the input string
  73.     /// </summary>
  74.     /// <param name="str"></param>
  75.     /// <returns></returns>
  76.     private string[] LetterPairs(string str)
  77.     {
  78.         int numPairs = str.Length - 1;
  79.  
  80.         string[] pairs = new string[numPairs];
  81.  
  82.         for (int i = 0; i < numPairs; i++)
  83.         {
  84.             pairs[i] = str.Substring(i, 2);
  85.         }
  86.  
  87.         return pairs;
  88.     }
  89. }
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound, or popup ads, we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×