Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- using System;
- using System.Collections.Generic;
- using System.Linq;
- using System.Web;
- namespace tp
- {
/// <summary>
/// Ranks a list of candidate words by how similar each one is to a reference word.
/// All of the work is done by the constructor; the results come back through its
/// two <c>out</c> parameters.
/// </summary>
public class FindSimilaryLookingWords
{
    /// <summary>
    /// Scores every entry of <paramref name="words"/> against <paramref name="word"/> using
    /// <c>Compute2WordsSimilarity</c> and returns the entries ordered from the highest score
    /// to the lowest.
    /// </summary>
    /// <param name="word">Reference word every candidate is compared with.</param>
    /// <param name="words">Candidate words to rank.</param>
    /// <param name="wordsSortedBySimilarity">Receives the candidates, best match first.</param>
    /// <param name="similarities">
    /// Receives the scores, parallel to <paramref name="wordsSortedBySimilarity"/>.
    /// </param>
    public FindSimilaryLookingWords(string word, List<string> words,
        out List<string> wordsSortedBySimilarity, out List<double> similarities)
    {
        // The out parameters are assigned before any scoring starts; the locals below
        // are just shorter aliases for the same two lists.
        wordsSortedBySimilarity = new List<string>();
        similarities = new List<double>();
        List<string> rankedWords = wordsSortedBySimilarity;
        List<double> rankedScores = similarities;

        foreach (string candidate in words)
        {
            double score;
            new Compute2WordsSimilarity(word, candidate, out score);

            // Walk past every stored score that the new one does not strictly beat.
            // The test is deliberately written as !(a > b): equal scores keep their
            // arrival order and a NaN score never moves ahead of anything.
            int slot = 0;
            while (slot < rankedScores.Count && !(score > rankedScores[slot]))
            {
                slot++;
            }

            rankedWords.Insert(slot, candidate);
            rankedScores.Insert(slot, score);
        }
    }
}
/// <summary>
/// Computes a similarity score between two words. All of the work is done by the
/// constructor; the score comes back through its <c>similarity</c> out parameter.
/// </summary>
public class Compute2WordsSimilarity
{
    /// <summary>
    /// Scores how closely <paramref name="word2"/> resembles <paramref name="word1"/>,
    /// ignoring letter case.
    /// </summary>
    /// <remarks>
    /// The score starts at the sum of all criterion weights (205) and loses points for
    /// differing first/last letters, differing lengths, and for letters that were moved,
    /// removed or added. The remainder is rescaled so that identical words score 100.
    /// Edit penalties are divided by the length of <paramref name="word1"/>, so the measure
    /// is not symmetric, and the result is not clamped: very different words can score
    /// below 0.
    /// </remarks>
    /// <param name="word1">Reference word. <c>null</c> yields a score of 0.</param>
    /// <param name="word2">Word compared with the reference. <c>null</c> yields a score of 0.</param>
    /// <param name="similarity">
    /// Receives the score. An empty <paramref name="word1"/> scores 100 against an empty
    /// <paramref name="word2"/> and 0 against anything else.
    /// </param>
    public Compute2WordsSimilarity(string word1, string word2,
        // 0 = no similarity, 100 = equal
        out double similarity
        )
    {
        if (word1 == null || word2 == null)
        {
            similarity = 0;
            return;
        }

        // Every edit penalty below is divided by word1.Length. Without this guard an empty
        // word1 produced 0/0 = NaN, which also breaks any ordering built on the score.
        if (word1.Length == 0)
        {
            similarity = word2.Length == 0 ? 100 : 0;
            return;
        }

        // Set pairing is case-insensitive, so the first/last letter checks use the same
        // lower-cased copies; otherwise "Hello" and "hello" would not score 100.
        string word1Lc = word1.ToLowerInvariant();
        string word2Lc = word2.ToLowerInvariant();
        double length1 = word1Lc.Length;

        double maxSimilarity = 0;            // sum of all weights
        double removeFromMaxSimilarity = 0;  // accumulated penalty, always <= 0
        double importance;

        bool sameFirst = getFirstChar(word1Lc) == getFirstChar(word2Lc);
        bool sameLast = getLastChar(word1Lc) == getLastChar(word2Lc);

        // neither the first nor the last letter matches: remove 30 points
        importance = 30;
        maxSimilarity += importance;
        if (!sameFirst && !sameLast)
        {
            removeFromMaxSimilarity -= importance;
        }

        // only the last letter matches: remove 15 points
        importance = 15;
        maxSimilarity += importance;
        if (!sameFirst && sameLast)
        {
            removeFromMaxSimilarity -= importance;
        }

        // only the first letter matches: remove 10 points
        importance = 10;
        maxSimilarity += importance;
        if (sameFirst && !sameLast)
        {
            removeFromMaxSimilarity -= importance;
        }

        // word length difference: remove at most 5 points
        importance = 5;
        maxSimilarity += importance;
        int shorter = Math.Min(word1Lc.Length, word2Lc.Length);
        int longer = Math.Max(word1Lc.Length, word2Lc.Length);
        removeFromMaxSimilarity -= importance * (1 - shorter / (double)longer);

        // Pair up the letter runs the two words share and collect the leftovers.
        List<int> setsMatchedIndexesWord1; // indexes in word1
        List<int> setsMatchedIndexesWord2; // indexes in word2
        List<int> setsLengths;             // a matched run has the same length in both words
        List<int> setsMissingIndexes;      // indexes in word1
        List<int> setsMissingLengths;
        List<int> setsAddedIndexes;        // indexes in word2
        List<int> setsAddedLengths;
        pairShuffledSets(word1Lc, word2Lc, out setsMatchedIndexesWord1, out setsMatchedIndexesWord2,
            out setsLengths, out setsMissingIndexes,
            out setsMissingLengths, out setsAddedIndexes, out setsAddedLengths);

        // A matched run counts as shuffled when it starts at a different index in each word.
        double countShuffled = 0;
        double sumShuffled = 0;
        double sumDistanceShuffle = 0;
        for (int i = 0; i < setsMatchedIndexesWord1.Count; i++)
        {
            int distance = Math.Abs(setsMatchedIndexesWord1[i] - setsMatchedIndexesWord2[i]);
            if (distance != 0)
            {
                sumDistanceShuffle += distance;
                sumShuffled += setsLengths[i];
                countShuffled++;
            }
        }

        // share of word1's letters that were shuffled: weight 10
        importance = 10;
        maxSimilarity += importance;
        removeFromMaxSimilarity -= importance * sumShuffled / length1;

        // summed distance the shuffled runs moved: weight 10
        importance = 10;
        maxSimilarity += importance;
        removeFromMaxSimilarity -= importance * sumDistanceShuffle / length1;

        // letters of word1 that are missing from word2: weight 100
        importance = 100;
        maxSimilarity += importance;
        int sumRemoved = setsMissingLengths.Sum();
        removeFromMaxSimilarity -= importance * sumRemoved / length1;

        // letters of word2 that have no counterpart in word1: weight 10
        importance = 10;
        maxSimilarity += importance;
        int sumAdded = setsAddedLengths.Sum();
        removeFromMaxSimilarity -= importance * sumAdded / length1;

        // number of shuffled runs: weight 5
        importance = 5;
        maxSimilarity += importance;
        removeFromMaxSimilarity -= importance * countShuffled / length1;

        // number of missing runs: weight 5
        importance = 5;
        maxSimilarity += importance;
        removeFromMaxSimilarity -= importance * setsMissingIndexes.Count / length1;

        // number of added runs: weight 5
        importance = 5;
        maxSimilarity += importance;
        removeFromMaxSimilarity -= importance * setsAddedIndexes.Count / length1;

        // compute similarity
        similarity = 100 * (maxSimilarity + removeFromMaxSimilarity) / maxSimilarity;
    }

    /// <summary>
    /// Greedily pairs runs of letters shared by both words, longest runs first, and reports
    /// the runs left unpaired on either side. Each letter of either word belongs to at most
    /// one matched run.
    /// </summary>
    /// <param name="word1">First word.</param>
    /// <param name="word2">Second word.</param>
    /// <param name="setsMatchedIndexesWord1">Start of each matched run in word1.</param>
    /// <param name="setsMatchedIndexesWord2">Start of each matched run in word2.</param>
    /// <param name="setsLengths">Length of each matched run.</param>
    /// <param name="setsMissingIndexes">Start of each run of word1 that was not matched.</param>
    /// <param name="setsMissingLengths">Length of each unmatched run of word1.</param>
    /// <param name="setsAddedIndexes">Start of each run of word2 that was not matched.</param>
    /// <param name="setsAddedLengths">Length of each unmatched run of word2.</param>
    private void pairShuffledSets(string word1, string word2,
        out List<int> setsMatchedIndexesWord1, // indexes in word1
        out List<int> setsMatchedIndexesWord2, // indexes in word2
        out List<int> setsLengths,
        out List<int> setsMissingIndexes, // indexes in word1
        out List<int> setsMissingLengths,
        out List<int> setsAddedIndexes, // indexes in word2
        out List<int> setsAddedLengths
        )
    {
        setsMatchedIndexesWord1 = new List<int>();
        setsMatchedIndexesWord2 = new List<int>();
        setsLengths = new List<int>();
        setsMissingIndexes = new List<int>();
        setsMissingLengths = new List<int>();
        setsAddedIndexes = new List<int>();
        setsAddedLengths = new List<int>();

        // One flag per letter: true once the letter belongs to a matched run.
        var takenBits1 = new bool[word1.Length];
        var takenBits2 = new bool[word2.Length];
        bool areAnyBitsFree = true;

        // Try every block length from the whole of word1 down to a single letter...
        for (int blockSize = word1.Length; blockSize >= 1 && areAnyBitsFree; blockSize--)
        {
            // ...taken from every possible position in word1.
            for (int x1 = 0; x1 <= word1.Length - blockSize && areAnyBitsFree; x1++)
            {
                // A block that overlaps letters already paired must not be paired again,
                // otherwise the same letters of word1 are counted twice.
                if (!isRangeFree(takenBits1, x1, blockSize))
                {
                    continue;
                }

                string block = word1.Substring(x1, blockSize);
                int startIndex = 0;
                while (true)
                {
                    // Ordinal search: a hit always spans exactly block.Length characters.
                    // A culture-sensitive search can match a span of a different length,
                    // which would push the range below past the end of word2.
                    int x2 = word2.IndexOf(block, startIndex, StringComparison.OrdinalIgnoreCase);
                    if (x2 == -1)
                    {
                        break;
                    }

                    if (isRangeFree(takenBits2, x2, blockSize))
                    {
                        markRange(takenBits1, x1, blockSize);
                        markRange(takenBits2, x2, blockSize);
                        setsMatchedIndexesWord1.Add(x1);
                        setsMatchedIndexesWord2.Add(x2);
                        setsLengths.Add(blockSize);

                        // Stop searching as soon as either word is fully consumed.
                        areAnyBitsFree = Array.IndexOf(takenBits1, false) != -1
                            && Array.IndexOf(takenBits2, false) != -1;
                        break;
                    }

                    // This occurrence overlaps letters already paired; look further right.
                    startIndex = x2 + 1;
                }
            }
        }

        // Whatever was never paired is "missing" (word1 side) or "added" (word2 side).
        collectFreeRuns(takenBits1, setsMissingIndexes, setsMissingLengths);
        collectFreeRuns(takenBits2, setsAddedIndexes, setsAddedLengths);
    }

    // True when none of the flags in [start, start + length) is set.
    private static bool isRangeFree(bool[] bits, int start, int length)
    {
        for (int i = start; i < start + length; i++)
        {
            if (bits[i])
            {
                return false;
            }
        }
        return true;
    }

    // Sets every flag in [start, start + length).
    private static void markRange(bool[] bits, int start, int length)
    {
        for (int i = start; i < start + length; i++)
        {
            bits[i] = true;
        }
    }

    // Appends the start index and length of every maximal run of unset flags.
    private static void collectFreeRuns(bool[] bits, List<int> indexes, List<int> lengths)
    {
        int i = 0;
        while (i < bits.Length)
        {
            if (bits[i])
            {
                i++;
                continue;
            }

            int runStart = i;
            while (i < bits.Length && !bits[i])
            {
                i++;
            }
            indexes.Add(runStart);
            lengths.Add(i - runStart);
        }
    }

    // First character of s, or '\0' when s is null or empty.
    private char getFirstChar(string s)
    {
        if (s == null || s.Length == 0)
        {
            return '\0';
        }
        return s[0];
    }

    // Last character of s, or '\0' when s is null or empty.
    private char getLastChar(string s)
    {
        if (s == null || s.Length == 0)
        {
            return '\0';
        }
        return s[s.Length - 1];
    }
}
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement