Guest User

Untitled

a guest
May 26th, 2018
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.24 KB | None | 0 0
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Linq;
  4. using System.Text;
  5. using SearchEngineWF.Data_Access_Layer;
  6. using System.Text.RegularExpressions;
  7. using System.Globalization;
  8. namespace SearchEngineWF.Data_Access_Layer
  9. {
  10. class Parser
  11. {
  /// <summary>
  /// Invariant-culture month names ("January" to "December") used to recognise month terms.
  /// DateTimeFormatInfo.MonthNames always returns 13 entries; for a 12-month calendar the 13th entry is an
  /// empty string. It is filtered out here so that an empty term can never be mistaken for a month.
  /// </summary>
  public static List<string> m_lstmonths = DateTimeFormatInfo.InvariantInfo.MonthNames.Where(name => name.Length > 0).ToList();
  /// <summary>
  /// Builds the cleaned term array for one document.
  /// The text is tokenised by SplitDoc2Array and each token is then scrubbed by RemoveSignsFromTerms.
  /// </summary>
  /// <param name="terms">Receives the resulting terms; the array passed in is not read and is replaced.</param>
  public static void Parse(ref string[] terms)
  {
      string[] tokens;

      SplitDoc2Array(out tokens);
      RemoveSignsFromTerms(ref tokens);

      terms = tokens;
  }
  23.  
  /// <summary>
  /// Separators used to cut raw document text into candidate terms.
  /// Tab and a lone carriage return are included because the later sign-stripping step deletes those
  /// characters; without splitting on them first, the words on either side would be fused into one term.
  /// </summary>
  private static readonly string[] m_arrTermDelimiters =
  {
      "\r\n", "\n", "\r", "\t", " ",
      "-", "/", "(", ")", "[", "]", "{", "}", "*", "&", "+", "%", "|", "_", "@", "#", "="
  };

  /// <summary>
  /// Takes the first document stored in Archive.m_dicDocs, splits its text on the term delimiters
  /// (empty pieces are discarded) and removes that document from Archive.m_dicDocs.
  /// </summary>
  /// <param name="termsArray">Receives the pieces of the document text, in their original order.</param>
  private static void SplitDoc2Array(out string[] termsArray)
  {
      // NOTE(review): First() presumably throws InvalidOperationException when no documents remain;
      // callers are expected to check that the archive still holds documents - confirm.
      int docNum = Archive.m_dicDocs.First().Key;
      termsArray = Archive.m_dicDocs[docNum].Split(m_arrTermDelimiters, StringSplitOptions.RemoveEmptyEntries);

      // The document is dropped once it has been split, so the next call picks up the following one.
      Archive.m_dicDocs.Remove(docNum);
  }
  /// <summary>
  /// Matches every character that is not a-z (either case), 0-9, '.', ',' or a space.
  /// Built once: constructing a compiled Regex on every call is expensive.
  /// </summary>
  private static readonly Regex m_rgxDisallowedChars = new Regex("[^a-z0-9., ]", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);

  /// <summary>Characters stripped from both ends of a term after the disallowed characters are removed.</summary>
  private static readonly char[] m_arrTermTrimChars = { '.', ' ', ',' };

  /// <summary>
  /// Cleans every term: removes all characters other than a-z, 0-9, '.', ',' and space, then trims
  /// leading and trailing '.', ',' and spaces. Terms that end up empty are dropped.
  /// </summary>
  /// <param name="terms">
  /// Terms to clean. On return it refers to a new array holding the surviving terms in their original
  /// order; the array that was passed in is left unmodified. A null array is replaced by an empty one
  /// and null elements are skipped.
  /// </param>
  public static void RemoveSignsFromTerms(ref string[] terms)
  {
      if (terms == null)
      {
          terms = new string[0];
          return;
      }

      List<string> cleaned = new List<string>(terms.Length);
      foreach (string raw in terms)
      {
          if (raw == null)
          {
              continue;
          }

          // Trim before testing for emptiness: a term such as "..." only becomes empty after trimming.
          string term = m_rgxDisallowedChars.Replace(raw, string.Empty).Trim(m_arrTermTrimChars);
          if (term.Length > 0)
          {
              cleaned.Add(term);
          }
      }

      terms = cleaned.ToArray();
  }
  /// <summary>
  /// Merges date patterns into single terms, in place. Recognised patterns, where the month must appear
  /// in m_lstmonths (case-sensitive) and is followed by separate numeric terms:
  /// "Month D" (D in 1..31), "Month D YYYY" and "Month YYYY" (YYYY in 1000..9999).
  /// The merged term is stored at the month's position as "Month D", "Month D YYYY" or "Month YYYY",
  /// with the numbers re-formatted (so "05" becomes "5"), and the consumed number terms are removed.
  /// </summary>
  /// <param name="lst_terms">Term list to rewrite; a null list is left untouched.</param>
  public static void ExtractDates(ref List<string> lst_terms)
  {
      if (lst_terms == null)
      {
          return;
      }

      // Count is re-evaluated on every pass because merging shrinks the list.
      // The last term is never tested as a month: a date needs at least one following term.
      for (int i = 0; i < lst_terms.Count - 1; i++)
      {
          string month = lst_terms[i];

          // The month list may contain an empty entry (DateTimeFormatInfo.MonthNames has a blank
          // 13th slot), so empty terms are rejected explicitly instead of being treated as a month.
          if (string.IsNullOrEmpty(month) || !m_lstmonths.Contains(month))
          {
              continue;
          }

          int number;
          if (!TryParseDateNumber(lst_terms[i + 1], out number))
          {
              continue;
          }

          if (number > 0 && number < 32)
          {
              string date = month + " " + number.ToString(CultureInfo.InvariantCulture); // month + day

              int year;
              if (i + 2 < lst_terms.Count && TryParseDateNumber(lst_terms[i + 2], out year) && year > 999 && year < 10000)
              {
                  date += " " + year.ToString(CultureInfo.InvariantCulture); // + year
                  lst_terms.RemoveAt(i + 2);
              }

              lst_terms.RemoveAt(i + 1);
              lst_terms[i] = date;
          }
          else if (number > 999 && number < 10000)
          {
              // "Month YYYY" without a day.
              lst_terms.RemoveAt(i + 1);
              lst_terms[i] = month + " " + number.ToString(CultureInfo.InvariantCulture);
          }
      }
  }

  /// <summary>Parses a term as an integer using culture-independent rules.</summary>
  private static bool TryParseDateNumber(string text, out int value)
  {
      return int.TryParse(text, NumberStyles.Integer, CultureInfo.InvariantCulture, out value);
  }
  /// <summary>
  /// Removes, in place, every term that is contained in Archive.m_HshStopWords.
  /// The relative order of the remaining terms is preserved.
  /// </summary>
  /// <param name="templist">Term list to filter; a null list is left untouched.</param>
  public static void WipeStopWords(ref List<string> templist)
  {
      if (templist == null)
      {
          return;
      }

      // RemoveAll compacts the list in a single pass; calling RemoveAt inside an index loop shifts
      // the tail of the list for every removed term, which is quadratic on stop-word-heavy text.
      templist.RemoveAll(term => Archive.m_HshStopWords.Contains(term));
  }
  /// <summary>
  /// Resolves terms that contain commas, e.g. "123,hello,52".
  /// If the term is a number once its commas are removed (e.g. "1,000") it is kept unchanged;
  /// otherwise it is split on ',' and ' ' and each non-empty piece becomes its own term.
  /// Terms without a comma are kept as they are, and empty terms are dropped.
  /// </summary>
  /// <param name="s">
  /// Term list to process. On return it refers to a new list with the resolved terms in their
  /// original order. A null list is left untouched.
  /// </param>
  public static void CheckPsiks(ref List<string> s)
  {
      if (s == null)
      {
          return;
      }

      string[] separators = { ",", " " };
      List<string> fixedList = new List<string>(s.Count);

      // Every term is visited, including the last one: stopping at Count - 1 would silently
      // discard the final term of the list.
      foreach (string term in s)
      {
          if (string.IsNullOrEmpty(term))
          {
              continue;
          }

          if (term.IndexOf(',') < 0)
          {
              fixedList.Add(term);
              continue;
          }

          string withoutCommas = term.Replace(",", string.Empty);
          double ignored;

          // Invariant culture keeps the number test independent of the machine's regional settings.
          if (double.TryParse(withoutCommas, NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out ignored))
          {
              fixedList.Add(term); // a number written with commas stays one term
          }
          else
          {
              fixedList.AddRange(term.Split(separators, StringSplitOptions.RemoveEmptyEntries));
          }
      }

      s = fixedList;
  }
  /// <summary>
  /// Registers the terms of one document in the archive.
  /// The document index is the current number of entries in Archive.m_dTerms and its name is read from
  /// Archive.m_dicDocNames at that index. Two dictionaries are created for the document:
  /// Archive.m_dTerms[index] maps each term to its number of occurrences, and
  /// Archive.m_TermsPointers[name] maps each term to the list of positions (indexes into
  /// <paramref name="s"/>) at which it occurs.
  /// </summary>
  /// <param name="s">The document's terms, in document order.</param>
  public static void addTerms2Dic(ref List<string> s)
  {
      int i = Archive.m_dTerms.Count;
      string DocName = Archive.m_dicDocNames[i];

      // Both dictionaries use the same case-insensitive comparer so that they agree on what counts as
      // "the same term"; otherwise "Hello" and "hello" share one counter but get separate position lists.
      Dictionary<string, List<int>> positionsByTerm = new Dictionary<string, List<int>>(StringComparer.OrdinalIgnoreCase);
      Dictionary<string, int> countsByTerm = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
      Archive.m_TermsPointers[DocName] = positionsByTerm;
      Archive.m_dTerms[i] = countsByTerm;

      for (int k = 0; k < s.Count; k++)
      {
          string term = s[k];

          // Record where the term occurs.
          List<int> positions;
          if (!positionsByTerm.TryGetValue(term, out positions))
          {
              positions = new List<int>();
              positionsByTerm[term] = positions;
          }
          positions.Add(k);

          // Count the occurrence; TryGetValue leaves the counter at 0 for a term not seen before.
          int seen;
          countsByTerm.TryGetValue(term, out seen);
          countsByTerm[term] = seen + 1;
      }
  }
  189. }
  190. }
Add Comment
Please, Sign In to add comment