SHARE
TWEET

String Similarity Ranking

mlavoie May 7th, 2013 1,055 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  /// <summary>
  /// Scores how similar two strings are by comparing the adjacent character
  /// pairs ("bigrams") found in each of their whitespace-separated words.
  /// Source: http://www.catalysoft.com/articles/StrikeAMatch.html
  /// </summary>
  public class SimilarityTool
  {
      /// <summary>
      /// Compares the two strings based on letter pair matches.
      /// The comparison is case-insensitive and ignores how the words are
      /// separated, because pairs never span whitespace.
      /// </summary>
      /// <param name="str1">First string to compare; must not be null.</param>
      /// <param name="str2">Second string to compare; must not be null.</param>
      /// <returns>
      /// The percentage match from 0.0 to 1.0 where 1.0 is 100%. When neither
      /// string contains a word of at least two characters there are no pairs
      /// to compare; in that case the result is 1.0 if the strings are equal
      /// ignoring case and 0.0 otherwise.
      /// </returns>
      /// <exception cref="System.ArgumentNullException">
      /// <paramref name="str1"/> or <paramref name="str2"/> is null.
      /// </exception>
      public double CompareStrings(string str1, string str2)
      {
          if (str1 == null)
          {
              throw new ArgumentNullException("str1");
          }

          if (str2 == null)
          {
              throw new ArgumentNullException("str2");
          }

          // Invariant casing keeps the score independent of the current culture.
          string upper1 = str1.ToUpperInvariant();
          string upper2 = str2.ToUpperInvariant();

          List<string> pairs1 = WordLetterPairs(upper1);
          List<string> pairs2 = WordLetterPairs(upper2);

          int union = pairs1.Count + pairs2.Count;
          if (union == 0)
          {
              // No pairs on either side: the formula below would be 0/0 (NaN),
              // so fall back to plain equality of the case-normalized inputs.
              return string.Equals(upper1, upper2, StringComparison.Ordinal) ? 1.0 : 0.0;
          }

          // Count the occurrences of every pair in the second string. Each
          // occurrence may be matched only once, which prevents "GGGG" from
          // appearing to match "GG" with 100% success.
          Dictionary<string, int> remaining = new Dictionary<string, int>(StringComparer.Ordinal);
          foreach (string pair in pairs2)
          {
              int count;
              remaining.TryGetValue(pair, out count);
              remaining[pair] = count + 1;
          }

          int intersection = 0;
          foreach (string pair in pairs1)
          {
              int count;
              if (remaining.TryGetValue(pair, out count) && count > 0)
              {
                  // Consume one occurrence so it cannot be matched again.
                  remaining[pair] = count - 1;
                  intersection++;
              }
          }

          return (2.0 * intersection) / union;
      }

      /// <summary>
      /// Gets all letter pairs for each individual word in the string.
      /// Words are the tokens obtained by splitting on single whitespace
      /// characters; empty tokens (from consecutive whitespace) are skipped.
      /// </summary>
      /// <param name="str">The text to tokenize.</param>
      /// <returns>The letter pairs of every word, in order of appearance.</returns>
      private static List<string> WordLetterPairs(string str)
      {
          List<string> allPairs = new List<string>();

          // Tokenize the string and walk over the tokens/words
          foreach (string word in Regex.Split(str, @"\s"))
          {
              if (!string.IsNullOrEmpty(word))
              {
                  allPairs.AddRange(LetterPairs(word));
              }
          }

          return allPairs;
      }

      /// <summary>
      /// Generates an array containing every two consecutive letters
      /// in the input string.
      /// </summary>
      /// <param name="str">A single word.</param>
      /// <returns>
      /// An array of <c>str.Length - 1</c> two-character strings, or an empty
      /// array when the input has fewer than two characters.
      /// </returns>
      private static string[] LetterPairs(string str)
      {
          // Clamp at zero so an empty input cannot request a negative array size.
          int numPairs = Math.Max(0, str.Length - 1);

          string[] pairs = new string[numPairs];

          for (int i = 0; i < numPairs; i++)
          {
              pairs[i] = str.Substring(i, 2);
          }

          return pairs;
      }
  }
RAW Paste Data
Top