Advertisement
Yunga

SimilarityTool.cs

Jan 22nd, 2015
257
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 1.94 KB | None | 0 0
  1. // This class implements string comparison algorithm based on character pair similarity
  2. // Source: http://www.catalysoft.com/articles/StrikeAMatch.html
  3. // From: http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
  4.  
  5. public class SimilarityTool {
  6.     // Compares the two strings based on letter pair matches
  7.     // Returns the percentage match from 0.0 to 1.0 where 1.0 is 100%
  8.     public double CompareStrings( string str1, string str2 ) {
  9.         List<string> pairs1 = WordLetterPairs( str1.ToUpper() );
  10.         List<string> pairs2 = WordLetterPairs( str2.ToUpper() );
  11.         int intersection = 0;
  12.         int union = pairs1.Count + pairs2.Count;
  13.  
  14.         for ( int i = 0 ; i < pairs1.Count ; i++ ) {
  15.             for ( int j = 0 ; j < pairs2.Count ; j++ ) {
  16.                 if ( pairs1[i] == pairs2[j] ) {
  17.                     intersection++;
  18.                     pairs2.RemoveAt( j ); //Must remove the match to prevent "GGGG" from appearing to match "GG" with 100% success
  19.                     break;
  20.                 }
  21.             }
  22.         }
  23.  
  24.         return ( 2.0 * intersection ) / union;
  25.     }
  26.  
  27.     // Gets all letter pairs for each individual word in the string
  28.     private List<string> WordLetterPairs( string str ) {
  29.         List<string> AllPairs = new List<string>();
  30.         string[] Words = Regex.Split( str, @"\s" ); // Tokenize the string and put the tokens/words into an array
  31.  
  32.         // For each word
  33.         for ( int w = 0 ; w < Words.Length ; w++ ) {
  34.             if ( !string.IsNullOrEmpty( Words[w] ) ) {
  35.                 // Find the pairs of characters
  36.                 String[] PairsInWord = LetterPairs( Words[w] );
  37.  
  38.                 for ( int p = 0 ; p < PairsInWord.Length ; p++ ) {
  39.                     AllPairs.Add( PairsInWord[p] );
  40.                 }
  41.             }
  42.         }
  43.  
  44.         return AllPairs;
  45.     }
  46.  
  47.     // Generates an array containing every two consecutive letters in the input string
  48.     private string[] LetterPairs( string str ) {
  49.         int numPairs = str.Length - 1;
  50.         string[] pairs = new string[numPairs];
  51.  
  52.         for ( int i = 0 ; i < numPairs ; i++ ) {
  53.             pairs[i] = str.Substring( i, 2 );
  54.         }
  55.  
  56.         return pairs;
  57.     }
  58. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement