Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- using System;
- using System.Collections.Generic;
- using System.Linq;
- using System.Text;
- using SearchEngineWF.Data_Access_Layer;
- using System.Text.RegularExpressions;
- using System.Globalization;
- namespace SearchEngineWF.Data_Access_Layer
- {
- class Parser
- {
/// <summary>
/// English (invariant culture) month names, "January" through "December".
/// DateTimeFormatInfo.MonthNames always returns 13 entries; for 12-month calendars the
/// 13th is an empty string, so empty entries are filtered out here. Otherwise an empty
/// term would be recognized as a month name.
/// </summary>
public static List<string> m_lstmonths = new List<string>(
    DateTimeFormatInfo.InvariantInfo.MonthNames.Where(name => name.Length > 0));
/// <summary>
/// Produces the cleaned term array for the next document held in the archive.
/// The document is fetched and split by SplitDoc2Array, and the resulting raw terms are
/// then normalized by RemoveSignsFromTerms.
/// </summary>
/// <param name="terms">
/// Receives the cleaned terms. The array passed in is not read; it is replaced.
/// </param>
public static void Parse(ref string[] terms)
{
    string[] rawTerms;
    SplitDoc2Array(out rawTerms);
    RemoveSignsFromTerms(ref rawTerms);
    terms = rawTerms;
}
/// <summary>
/// Separators used to cut a document into raw terms. Built once instead of on every call.
/// </summary>
private static readonly string[] m_arrDelimiters =
{
    "\n", "\r\n", "-", " ", "/", "(", ")", "[", "]", "{", "}",
    "*", "&", "+", "%", "|", "_", "@", "#", "="
};

/// <summary>
/// Takes the first document in Archive.m_dicDocs, splits its text on the delimiters in
/// m_arrDelimiters (empty pieces are discarded) and then removes that document from
/// Archive.m_dicDocs, so each call consumes one document.
/// </summary>
/// <param name="termsArray">Receives the raw, not yet cleaned, terms of the document.</param>
/// <remarks>
/// NOTE(review): First() is expected to throw when Archive.m_dicDocs is empty, so callers
/// presumably check that documents remain before calling - confirm against the callers.
/// </remarks>
private static void SplitDoc2Array(out string[] termsArray)
{
    int docNum = Archive.m_dicDocs.First().Key;
    termsArray = Archive.m_dicDocs[docNum].Split(m_arrDelimiters, StringSplitOptions.RemoveEmptyEntries);
    Archive.m_dicDocs.Remove(docNum);
}
/// <summary>
/// Matches every character that is not a-z (case-insensitive), 0-9, '.', ',' or a space.
/// Created once and shared, since building a compiled regex on every call is wasteful.
/// </summary>
private static readonly Regex m_rgxForbiddenChars = new Regex(
    "[^a-z0-9., ]",
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);

/// <summary>
/// Cleans every term: characters other than letters a-z, digits, '.', ',' and space are
/// removed, then leading and trailing '.', ',' and spaces are trimmed. Terms that end up
/// empty are dropped, including terms that consisted only of punctuation such as "...".
/// </summary>
/// <param name="terms">
/// The terms to clean. On return it refers to a new array holding only the non-empty
/// cleaned terms, in their original order. The array passed in is not modified.
/// </param>
public static void RemoveSignsFromTerms(ref string[] terms)
{
    char[] edgeChars = { '.', ' ', ',' };
    List<string> cleaned = new List<string>(terms.Length);

    foreach (string term in terms)
    {
        string stripped = m_rgxForbiddenChars.Replace(term, string.Empty).Trim(edgeChars);

        // Check emptiness only after trimming, otherwise "..." or "," would be kept as "".
        if (stripped.Length > 0)
        {
            cleaned.Add(stripped);
        }
    }

    terms = cleaned.ToArray();
}
/// <summary>
/// Scans the term list for dates written as "Month Day", "Month Day Year" or "Month Year"
/// and merges each match into a single space-separated term, for example the three terms
/// "May", "12", "2010" become the one term "May 12 2010".
/// A month is any term found in m_lstmonths (exact, case-sensitive match), a day is an
/// integer from 1 to 31 and a year is an integer from 1000 to 9999.
/// </summary>
/// <param name="lst_terms">
/// The term list; it is modified in place (merged terms are removed, the month term is
/// replaced by the full date). Numbers are re-formatted, so a day written "05" becomes "5".
/// </param>
public static void ExtractDates(ref List<string> lst_terms)
{
    // A month needs at least one following term, hence Count - 1.
    for (int i = 0; i < lst_terms.Count - 1; i++)
    {
        string month = lst_terms[i];

        // The empty-string guard keeps an empty term from being treated as a month in case
        // m_lstmonths contains the empty 13th entry of DateTimeFormatInfo.MonthNames.
        if (string.IsNullOrEmpty(month) || !m_lstmonths.Contains(month))
        {
            continue;
        }

        int number;
        if (!int.TryParse(lst_terms[i + 1], NumberStyles.Integer, CultureInfo.InvariantCulture, out number))
        {
            continue;
        }

        if (number > 0 && number < 32)
        {
            // "Month Day", optionally followed by a four-digit year.
            string date = month + ' ' + number.ToString(CultureInfo.InvariantCulture);

            int year;
            if (i + 2 < lst_terms.Count
                && int.TryParse(lst_terms[i + 2], NumberStyles.Integer, CultureInfo.InvariantCulture, out year)
                && year > 999 && year < 10000)
            {
                date += ' ' + year.ToString(CultureInfo.InvariantCulture);
                lst_terms.RemoveAt(i + 2); // remove the year first so index i + 1 stays valid
            }

            lst_terms.RemoveAt(i + 1);
            lst_terms[i] = date;
        }
        else if (number > 999 && number < 10000)
        {
            // "Month Year"
            lst_terms[i] = month + ' ' + number.ToString(CultureInfo.InvariantCulture);
            lst_terms.RemoveAt(i + 1);
        }
    }
}
/// <summary>
/// Removes from the list every term that is contained in Archive.m_HshStopWords.
/// The remaining terms keep their relative order.
/// </summary>
/// <param name="templist">The term list; it is filtered in place.</param>
public static void WipeStopWords(ref List<string> templist)
{
    // RemoveAll compacts the list in one pass; removing items one at a time with RemoveAt
    // shifts the tail of the list on every hit.
    templist.RemoveAll(term => Archive.m_HshStopWords.Contains(term));
}
/// <summary>
/// Resolves terms that contain commas. If the term with its commas removed parses as a
/// number (e.g. "1,000"), the term is kept unchanged. Otherwise (e.g. "123,hello,52") it
/// is split on commas and spaces and each non-empty piece becomes its own term.
/// Terms without a comma are kept as they are; empty terms are dropped.
/// </summary>
/// <param name="s">
/// The term list. On return it refers to a new list with the resolved terms, in order.
/// </param>
public static void CheckPsiks(ref List<string> s)
{
    string[] separators = { ",", " " };
    List<string> fixedList = new List<string>(s.Count);

    // Every term is visited, including the last one.
    foreach (string term in s)
    {
        if (string.IsNullOrEmpty(term))
        {
            continue;
        }

        bool keepWhole = term.IndexOf(',') < 0;
        if (!keepWhole)
        {
            // Invariant culture keeps the decision independent of the machine's locale.
            double ignored;
            keepWhole = double.TryParse(
                term.Replace(",", string.Empty),
                NumberStyles.Float | NumberStyles.AllowThousands,
                CultureInfo.InvariantCulture,
                out ignored);
        }

        if (keepWhole)
        {
            fixedList.Add(term);
        }
        else
        {
            fixedList.AddRange(term.Split(separators, StringSplitOptions.RemoveEmptyEntries));
        }
    }

    s = fixedList;
}
/// <summary>
/// Registers the terms of one document in the archive. The document index is the current
/// number of entries in Archive.m_dTerms and its name is looked up in Archive.m_dicDocNames.
/// Two structures are filled:
/// Archive.m_dTerms[index] maps each term to its number of occurrences, and
/// Archive.m_TermsPointers[name] maps each term to the list of positions (indexes in
/// <paramref name="s"/>) at which it occurs.
/// </summary>
/// <param name="s">The document's terms, in document order. The list is not modified.</param>
/// <remarks>
/// NOTE(review): occurrence counts use a case-insensitive comparer while the position map
/// uses the default case-sensitive one, so "May" and "may" share a count but have separate
/// position lists. Confirm whether that difference is intended.
/// </remarks>
public static void addTerms2Dic(ref List<string> s)
{
    int docIndex = Archive.m_dTerms.Count;
    string docName = Archive.m_dicDocNames[docIndex];

    Dictionary<string, List<int>> positionsByTerm = new Dictionary<string, List<int>>();
    Archive.m_TermsPointers[docName] = positionsByTerm;

    Dictionary<string, int> countsByTerm = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
    Archive.m_dTerms[docIndex] = countsByTerm;

    for (int k = 0; k < s.Count; k++)
    {
        string term = s[k];

        // Record the position of this occurrence.
        List<int> positions;
        if (!positionsByTerm.TryGetValue(term, out positions))
        {
            positions = new List<int>();
            positionsByTerm[term] = positions;
        }
        positions.Add(k);

        // Count the occurrence; a term not seen before starts from 0.
        int count;
        countsByTerm.TryGetValue(term, out count);
        countsByTerm[term] = count + 1;
    }
}
- }
- }
Add Comment
Please, Sign In to add comment