#%% md

# 1. Text processing

We will create a pipeline of text preprocessing.

# 1.1 Normalization

The first step is normalization.
It might include:
* converting all letters to lower or upper case
* converting numbers into words or removing numbers
* removing punctuation, accent marks and other diacritics
* removing white spaces
* expanding abbreviations

In this exercise it is enough to produce lowercase text without special characters and digits and without unnecessary space symbols. An optional sketch of diacritic and digit removal follows the normalization example below.

How could neural networks be used for text normalization?

#%%

import re

# normalize text: lowercase it and keep only word characters
# (the '*' stays in the character class so that wildcard queries survive normalization)
def normalize(text):
    text = text.lower()
    text = ' '.join(re.findall(r'[\w*]+', text))
    return text

#%%

text = """This sentence is going to be lemmatized. Borrowed from Latin per sē (“by itself”), from per (“by, through”) and sē (“itself, himself, herself, themselves”)"""
text = normalize(text)
print(text)

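#%% md

The bullet list above also mentions removing accent marks/diacritics and numbers, which `normalize` does not handle. Below is a minimal, optional sketch of those two steps using only the standard library; the helper name `strip_accents_and_digits` is ours and not part of the original exercise.

#%%

import unicodedata

def strip_accents_and_digits(text):
    # decompose accented characters and drop the combining marks
    text = unicodedata.normalize('NFKD', text)
    text = ''.join(c for c in text if not unicodedata.combining(c))
    # drop digits
    return re.sub(r'\d+', '', text)

print(strip_accents_and_digits('per sē café 2020'))  # -> 'per se cafe ' (whitespace cleanup is left to normalize above)
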
#%% md

# 1.2 Tokenize
Use the nltk tokenizer to tokenize the text.

#%%

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

#%%

# tokenize text using the nltk lib
from nltk.tokenize import word_tokenize
def tokenize(text):
    return word_tokenize(text)

#%%

tokens = tokenize(text)
print(tokens)

#%% md

# 1.3 Lemmatization
What is the difference between stemming and lemmatization? (A short comparison cell follows the example below.)

[Optional reading](https://towardsdatascience.com/state-of-the-art-multilingual-lemmatization-f303e8ff1a8)

#%%

from nltk.stem import WordNetLemmatizer

def lemmatization(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

#%%

lemmed = lemmatization(tokens)
print(lemmed)

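#%% md

To make the stemming vs. lemmatization question concrete, here is a small comparison cell we added (it assumes `PorterStemmer` is available in the installed nltk): stemming chops suffixes by rule, while lemmatization maps a word to its dictionary form.

#%%

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
for w in ['studies', 'wolves', 'running']:
    # the stemmer output may not be a real word; the lemmatizer returns a dictionary form
    print(w, '->', stemmer.stem(w), 'vs', lemmatizer.lemmatize(w))
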
#%% md

# 1.4 Stop words
The next step is to remove stop words. Take the list of stop words from nltk.

#%%

from nltk.corpus import stopwords

def remove_stop_word(tokens):
    # build the set once instead of calling stopwords.words() for every token
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

#%%

clean = remove_stop_word(lemmed)
print(clean)

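#%% md

For reference, a quick look at what the nltk stop word list actually contains (an optional cell we added):

#%%

english_stop_words = stopwords.words('english')
print(len(english_stop_words), english_stop_words[:10])
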
#%% md

# 1.5 Pipeline
Run the complete pipeline in one function.

#%%

def preprocess(text):
    return remove_stop_word(lemmatization(tokenize(normalize(text))))


#%%

clean = preprocess(text)
print(clean)

#%% md

# SoundEx algorithm

#%%

def soundex(word):
  """Simplified SoundEx: keep the first letter, encode the rest as digits,
  collapse adjacent duplicate codes, drop vowels/h/w/y, pad to three digits."""
  if not word:
    return ''
  first_letter = word[0]
  word = word[1:]
  # map letters to SoundEx digit classes ('0' marks letters dropped later)
  for i in range(len(word)):
    c = word[i]
    if c in 'aeiouhwy':
      word = word.replace(c, '0')
    elif c in 'bfpv':
      word = word.replace(c, '1')
    elif c in 'cgjkqsxz':
      word = word.replace(c, '2')
    elif c in 'dt':
      word = word.replace(c, '3')
    elif c == 'l':
      word = word.replace(c, '4')
    elif c in 'mn':
      word = word.replace(c, '5')
    elif c == 'r':
      word = word.replace(c, '6')
  # collapse runs of identical adjacent codes
  res = word[0] if word else ''
  for i in range(1, len(word)):
    if word[i] != word[i - 1]:
      res += word[i]
  # drop the '0' placeholders and pad with trailing zeros to three digits
  res = res.replace('0', '')
  if len(res) < 3:
    res = res + '0' * (3 - len(res))
  return first_letter + res[:3]

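#%% md

A quick usage cell we added to check that the encoder behaves as intended: phonetically similar spellings should map to the same code.

#%%

# with this implementation 'moscow' and 'moskow' share a code, as do 'robert' and 'rupert'
for w in ['moscow', 'moskow', 'robert', 'rupert']:
  print(w, '->', soundex(w))
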
#%% md

# Levenshtein distance

#%%

def levenstein_distance(a, b):
  # d[i][j] = edit distance between the first i characters of a and the first j characters of b
  d = [[0 for j in range(len(b) + 1)] for i in range(len(a) + 1)]
  for i in range(1, len(a) + 1):
    d[i][0] = i
  for j in range(1, len(b) + 1):
    d[0][j] = j
  for i in range(1, len(a) + 1):
    for j in range(1, len(b) + 1):
      x = 0 if a[i - 1] == b[j - 1] else 1
      d[i][j] = min(d[i - 1][j - 1] + x, d[i - 1][j] + 1, d[i][j - 1] + 1)
  return d[len(a)][len(b)]

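#%% md

A small sanity check we added: the classic example "kitten" → "sitting" needs three edits.

#%%

print(levenstein_distance('kitten', 'sitting'))  # expected: 3
print(levenstein_distance('express', 'excess'))  # expected: 2 (delete 'r', substitute 'p' -> 'c')
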
#%% md

# 2. Collection

Download the Reuters data from here:
https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz

Read the data description here:
https://archive.ics.uci.edu/ml/datasets/reuters-21578+text+categorization+collection

The function should return a list of strings - raw texts. Remove HTML tags using the bs4 package.

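#%% md

The tag stripping itself is a one-liner with BeautifulSoup; a tiny illustration (a cell we added, with a made-up sample string) before the full download-and-parse function:

#%%

from bs4 import BeautifulSoup

sample = '<REUTERS><TEXT>Oil prices <b>rose</b> today.</TEXT></REUTERS>'
print(BeautifulSoup(sample, 'html.parser').get_text())  # -> Oil prices rose today.
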
#%%

import requests
import tarfile
import os
from bs4 import BeautifulSoup

def get_collection():
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz'
    # download the archive once
    if not os.path.exists('data.tar.gz'):
      resp = requests.get(url, timeout=5)
      with open('data.tar.gz', "wb") as output:
        output.write(resp.content)
    # unpack it once
    if not os.path.exists('dataset'):
      os.mkdir('dataset')
      tar = tarfile.open('data.tar.gz', "r:gz")
      tar.extractall('dataset')
      tar.close()

    # collect (raw text, document id) pairs from the .sgm files
    collection = []
    i = 0
    for filename in os.listdir('dataset'):
      if '.sgm' in filename:
        with open(os.path.join('dataset', filename), "rb") as f:
          soup = BeautifulSoup(f.read(), 'html.parser')
          texts = soup.find_all("text")
          for news in texts:
            collection.append((news.text, i))
            i += 1
    return collection

#%%

collection = get_collection()
print(len(collection))

#%%

print(collection[0])

#%% md

# Prefix tree dictionary

#%%

def add(w, prefix_dict):
  """Insert word w into the character trie; '!' marks the end of a word."""
  d = prefix_dict
  for c in w:
    if c not in d:
      d[c] = {}
    d = d[c]
  d['!'] = 1


def add_text(text):
  """Add every token to the prefix trie and its reverse to the suffix trie."""
  global prefix_dict, r_prefix_dict
  for w in text:
    add(w, prefix_dict)
    add(w[::-1], r_prefix_dict)


def traverse(tree, word):
  """Collect every complete word stored under the given subtree."""
  res = []
  keys = tree.keys()
  if '!' in keys:
    res.append(word)
  for c in keys:
    if c == '!':
      continue
    res += traverse(tree[c], word + c)
  return res


def find_words(pref, tree):
  """Return all words in the trie that start with the given prefix."""
  d = tree
  res = []
  for c in pref:
    if c in d:
      d = d[c]
    else:
      return res
  if '!' in d:
    res.append(pref)
  for c in d.keys():
    if c == '!':
      continue
    res += traverse(d[c], pref + c)
  return res

prefix_dict = {}
r_prefix_dict = {}

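#%% md

A tiny demonstration we added of how the two tries support wildcard lookups (the `demo_*` names are ours): the forward trie answers "which words start with this prefix", and the trie of reversed words answers "which words end with this suffix".

#%%

demo_prefix, demo_suffix = {}, {}
for w in ['express', 'excess', 'exam', 'press']:
  add(w, demo_prefix)
  add(w[::-1], demo_suffix)

print(find_words('ex', demo_prefix))                      # words starting with 'ex'
print([w[::-1] for w in find_words('sse', demo_suffix)])  # words ending with 'ess'
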
#%% md

# 3. Inverted index
You will work with the Boolean search model. Construct a dictionary which maps words to their postings lists.

#%%

def make_index(collection):
    """Map each preprocessed word to the set of document ids (postings) it occurs in."""
    inverted_index = {}
    for element in collection:
      text = preprocess(element[0])
      posting = element[1]
      if posting % 1000 == 0:
        print(f"Posting [{posting}] processed")
      for word in text:
        if word not in inverted_index:
          inverted_index[word] = set()
        inverted_index[word].add(posting)
    for key in inverted_index.keys():
      inverted_index[key] = list(inverted_index[key])
    return inverted_index

def make_soundex_index(collection):
    """Build a SoundEx-keyed inverted index and a map from SoundEx codes to the original words."""
    inverted_index = {}
    soundex_dict = {}
    for element in collection:
      # no lemmatization here: the prefix/suffix tries and SoundEx work on surface forms
      text = remove_stop_word(tokenize(normalize(element[0])))
      add_text(text)
      posting = element[1]
      if posting % 1000 == 0:
        print(f"Posting [{posting}] processed")
      for word in text:
        soundexed = soundex(word)
        if soundexed not in inverted_index:
          inverted_index[soundexed] = set()
        if soundexed not in soundex_dict:
          soundex_dict[soundexed] = set()
        inverted_index[soundexed].add(posting)
        soundex_dict[soundexed].add(word)
    for key in inverted_index.keys():
      inverted_index[key] = list(inverted_index[key])
      soundex_dict[key] = list(soundex_dict[key])
    return inverted_index, soundex_dict

#%%

import json

# build the indexes once and cache them on disk
# (note: the prefix/suffix tries are only filled while make_soundex_index runs, not when loading from cache)
if os.path.exists("index.json"):
    with open("index.json", "r") as f:
        index = json.load(f)
else:
    index = make_index(collection)
    with open("index.json", "w") as f:
        json.dump(index, f)

if os.path.exists("soundex_index.json"):
    with open("soundex_index.json", "r") as f:
        soundex_index = json.load(f)
    with open("soundex_dict.json", "r") as f:
        soundex_dict = json.load(f)
else:
    soundex_index, soundex_dict = make_soundex_index(collection)
    with open("soundex_index.json", "w") as f:
        json.dump(soundex_index, f)
    with open("soundex_dict.json", "w") as f:
        json.dump(soundex_dict, f)

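#%% md

An optional sanity check on the loaded index (a cell we added; "oil" is only an assumption about a term that should be very frequent in Reuters news):

#%%

print('distinct terms in index:', len(index))
print('documents containing "oil":', len(index.get('oil', [])))
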
#%% md

# 4. Query processing

Using the given search query, find all relevant documents. In the Boolean model a relevant document is one that contains all words from the query.

Return the list of relevant document indexes.

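#%% md

The core of Boolean retrieval is just set intersection over postings lists; a toy illustration we added before the full `search` implementation, which additionally handles wildcards and misspellings:

#%%

toy_index = {'oil': {0, 2, 5}, 'price': {1, 2, 5}, 'opec': {2}}
# documents relevant to the query "oil price" = intersection of the two postings sets
print(toy_index['oil'] & toy_index['price'])  # -> {2, 5}
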
#%%

from math import ceil


def retrieve_documents(words):
  """Retrieve all documents containing any of the given words (OR over postings)."""
  words = lemmatization(words)
  res = set()
  for w in words:
    if w in index:
      res = res | set(index[w])
  return res


def find_alternatives(word):
  """
  Use SoundEx to find similar words, then keep the closest 10% of them
  sorted by Levenshtein distance.
  """
  s = soundex(word)
  soundex_list = soundex_dict.get(s, [])
  if not soundex_list:
    return []
  distances = [levenstein_distance(word, w) for w in soundex_list]
  # sort the candidate words by their distance to the query word
  distances, soundex_list = zip(*sorted(zip(distances, soundex_list)))
  n = ceil(len(soundex_list) * 0.1)
  return soundex_list[:n]


def reverse_words(words):
  return [w[::-1] for w in words]


def search(query):
  query = preprocess(query)
  relevant_documents = None
  for term in query:
    if '*' in term:
      if term.count('*') > 1:  # search with several stars is not implemented
        continue
      x = term.find('*')
      # words that match the prefix before '*' AND the suffix after '*'
      matching_words = (
          set(find_words(term[:x], prefix_dict)) &
          set(reverse_words(find_words(term[x + 1:][::-1], r_prefix_dict)))
      )
      docs = retrieve_documents(matching_words)
    elif term in index:
      docs = set(index[term])
    else:
      # unknown word: fall back to SoundEx-based spelling alternatives
      docs = retrieve_documents(find_alternatives(term))
    relevant_documents = docs if relevant_documents is None else relevant_documents & docs
  return list(relevant_documents) if relevant_documents is not None else []

#%%

query = 'moskow ex*ess'
relevant = search(query)
print(len(relevant))
print(relevant)
if len(relevant) > 0:
  print(collection[relevant[0]])
#%%

# minimal Flask app; the view function is named hello so it does not shadow the inverted index dict
from flask import Flask

app = Flask(__name__)

@app.route('/')
@app.route('/index')
def hello():
    return "Hello, World!"

# to serve it, something like app.run(port=5000) could be called here