#%% md
# 1. Text processing
We will create a pipeline of text preprocessing.
# 1.1 Normalization
The first step is normalization.
It might include:
* converting all letters to lower or upper case
* converting numbers into words or removing numbers
* removing punctuation, accent marks and other diacritics
* removing white spaces
* expanding abbreviations
In this exercise it is enough to have lowercase text without special characters and digits, and without unnecessary space symbols.
How could neural networks be used for text normalization?
#%%
import re

def normalize(text):
    # Lowercase, drop digits, and keep only word characters;
    # '*' is preserved so wildcard queries survive normalization.
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)
    text = ' '.join(re.findall(r'[\w*]+', text))
    return text
#%%
text = """This sentence is going to be lemmatized. Borrowed from Latin per sē (“by itself”), from per (“by, through”) and sē (“itself, himself, herself, themselves”)"""
text = normalize(text)
print(text)
#%% md
# 1.2 Tokenization
Use the nltk tokenizer to tokenize the text.
#%%
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
#%%
# Tokenize text using the nltk word tokenizer
from nltk.tokenize import word_tokenize

def tokenize(text):
    return word_tokenize(text)
#%%
tokens = tokenize(text)
print(tokens)
#%% md
# 1.3 Lemmatization
What is the difference between stemming and lemmatization? (A short comparison follows below.)
[Optional reading](https://towardsdatascience.com/state-of-the-art-multilingual-lemmatization-f303e8ff1a8)
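#%% md
In short: a stemmer chops off suffixes by rule, while a lemmatizer maps a word to its dictionary form. A minimal side-by-side sketch (PorterStemmer is chosen here just for illustration):
#%%
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
for w in ['studies', 'wolves', 'better']:
    # Stemming is rule-based and may produce non-words ('studi');
    # lemmatization returns valid dictionary forms ('study', 'wolf').
    print(w, '->', stemmer.stem(w), '|', lemmatizer.lemmatize(w))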
#%%
from nltk.stem import WordNetLemmatizer

def lemmatization(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]
#%%
lemmed = lemmatization(tokens)
print(lemmed)
#%% md
# 1.4 Stop words
The next step is to remove stop words. Take the list of stop words from nltk.
#%%
from nltk.corpus import stopwords

def remove_stop_word(tokens):
    stop_words = set(stopwords.words('english'))  # build the set once instead of per token
    return [word for word in tokens if word not in stop_words]
#%%
clean = remove_stop_word(lemmed)
print(clean)
#%% md
# 1.5 Pipeline
Run the complete pipeline in one function.
#%%
def preprocess(text):
    return remove_stop_word(lemmatization(tokenize(normalize(text))))
#%%
clean = preprocess(text)
print(clean)
#%% md
# Soundex algorithm
#%%
def soundex(word):
    """A simplified Soundex: keep the first letter, encode the rest as digits,
    collapse adjacent duplicates, drop the vowel placeholders, pad to three digits."""
    first_letter = word[0]
    word = word[1:]
    # Map each letter to its Soundex digit; vowels and h, w, y become '0'.
    # str.replace rewrites every occurrence, which is safe here because
    # a given letter always maps to the same digit.
    for i in range(len(word)):
        c = word[i]
        if c in 'aeiouhwy':
            word = word.replace(c, '0')
        elif c in 'bfpv':
            word = word.replace(c, '1')
        elif c in 'cgjkqsxz':
            word = word.replace(c, '2')
        elif c in 'dt':
            word = word.replace(c, '3')
        elif c == 'l':
            word = word.replace(c, '4')
        elif c in 'mn':
            word = word.replace(c, '5')
        elif c == 'r':
            word = word.replace(c, '6')
    # Collapse runs of identical digits, then drop the '0' placeholders.
    res = word[0] if word else ''
    for i in range(1, len(word)):
        if word[i] != word[i - 1]:
            res += word[i]
    res = res.replace('0', '')
    # Pad with trailing zeros (standard Soundex pads on the right).
    if len(res) < 3:
        res = res + '0' * (3 - len(res))
    return first_letter + res[:3]
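#%% md
A quick sanity check. This is a simplified variant (the first letter's own code is not used when collapsing duplicates), so codes may differ slightly from textbook Soundex; lowercase input is assumed:
#%%
for w in ['moskow', 'moscow', 'express']:
    print(w, '->', soundex(w))
# 'moskow' and 'moscow' map to the same code,
# which is what makes the phonetic index useful for misspellings.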
#%% md
# Levenshtein distance
#%%
def levenshtein_distance(a, b):
    # Standard dynamic programming over a (len(a)+1) x (len(b)+1) table:
    # d[i][j] is the edit distance between a[:i] and b[:j].
    d = [[0 for j in range(len(b) + 1)] for i in range(len(a) + 1)]
    for i in range(1, len(a) + 1):
        d[i][0] = i
    for j in range(1, len(b) + 1):
        d[0][j] = j
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            x = 0 if a[i - 1] == b[j - 1] else 1  # substitution cost
            d[i][j] = min(d[i - 1][j - 1] + x,  # substitute
                          d[i - 1][j] + 1,      # delete
                          d[i][j - 1] + 1)      # insert
    return d[len(a)][len(b)]
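#%% md
A couple of quick checks ('kitten' -> 'sitting' is the classic example with distance 3):
#%%
print(levenshtein_distance('kitten', 'sitting'))  # 3
print(levenshtein_distance('moskow', 'moscow'))   # 1
print(levenshtein_distance('abc', 'abc'))         # 0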
#%% md
# 2. Collection
Download the Reuters data from here:
https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz
Read the data description here:
https://archive.ics.uci.edu/ml/datasets/reuters-21578+text+categorization+collection
The function should return a list of raw texts. Remove the SGML/HTML tags using the bs4 package.
#%%
import requests
import tarfile
import os
from bs4 import BeautifulSoup

def get_collection():
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz'
    resp = requests.get(url, timeout=5)
    with open('data.tar.gz', 'wb') as output:
        output.write(resp.content)
    if not os.path.exists('dataset'):
        os.mkdir('dataset')
    with tarfile.open('data.tar.gz', 'r:gz') as tar:
        tar.extractall('dataset')
    # Collect (text, document_id) pairs from every .sgm file.
    collection = []
    i = 0
    for filename in os.listdir('dataset'):
        if filename.endswith('.sgm'):
            with open(os.path.join('dataset', filename), 'rb') as f:
                soup = BeautifulSoup(f.read(), 'html.parser')
            for news in soup.find_all('text'):
                collection.append((news.text, i))
                i += 1
    return collection
#%%
collection = get_collection()
print(len(collection))
#%%
print(collection[0])
#%% md
# Prefix tree dictionary
#%%
def add(w, prefix_dict):
    """Insert a word into the trie; '!' marks the end of a word."""
    d = prefix_dict
    for c in w:
        if c not in d:
            d[c] = {}
        d = d[c]
    d['!'] = 1

def add_text(text):
    """Index every token in both a forward and a reversed trie
    (the reversed trie answers suffix queries)."""
    global prefix_dict, r_prefix_dict
    for w in text:
        add(w, prefix_dict)
        add(w[::-1], r_prefix_dict)

def traverse(tree, word):
    """Collect every word stored in the subtree rooted at `tree`."""
    res = []
    if '!' in tree:
        res.append(word)
    for c in tree:
        if c == '!':
            continue
        res += traverse(tree[c], word + c)
    return res

def find_words(pref, tree):
    """Return all indexed words that start with `pref`."""
    d = tree
    res = []
    for c in pref:
        if c in d:
            d = d[c]
        else:
            return res
    if '!' in d:
        res.append(pref)
    for c in d:
        if c == '!':
            continue
        res += traverse(d[c], pref + c)
    return res

prefix_dict = {}
r_prefix_dict = {}
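#%% md
A small demonstration on a toy vocabulary (the words below are chosen purely for illustration):
#%%
demo_trie = {}
for w in ['express', 'excess', 'exam', 'trade']:
    add(w, demo_trie)
print(find_words('ex', demo_trie))  # ['express', 'excess', 'exam']
print(find_words('tr', demo_trie))  # ['trade']
print(find_words('zz', demo_trie))  # []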
#%% md
# 3. Inverted index
You will work with the boolean search model. Construct a dictionary which maps words to their postings.
#%%
def make_index(collection):
    inverted_index = {}
    for text_raw, posting in collection:
        text = preprocess(text_raw)
        if posting % 1000 == 0:
            print(f"Posting [{posting}] processed")
        for word in text:
            if word not in inverted_index:
                inverted_index[word] = set()
            inverted_index[word].add(posting)
    # Convert postings to lists so the index can be serialized to JSON.
    for key in inverted_index:
        inverted_index[key] = list(inverted_index[key])
    return inverted_index

def make_soundex_index(collection):
    """Build a phonetic index (soundex code -> postings) and a dictionary
    mapping each soundex code to the words that produce it. Tokens are
    deliberately not lemmatized here, so original surface forms are kept.
    This is also where the prefix trees are populated."""
    inverted_index = {}
    soundex_dict = {}
    for text_raw, posting in collection:
        text = remove_stop_word(tokenize(normalize(text_raw)))
        add_text(text)  # populate the forward and reversed tries
        if posting % 1000 == 0:
            print(f"Posting [{posting}] processed")
        for word in text:
            soundexed = soundex(word)
            if soundexed not in inverted_index:
                inverted_index[soundexed] = set()
            if soundexed not in soundex_dict:
                soundex_dict[soundexed] = set()
            inverted_index[soundexed].add(posting)
            soundex_dict[soundexed].add(word)
    for key in inverted_index:
        inverted_index[key] = list(inverted_index[key])
        soundex_dict[key] = list(soundex_dict[key])
    return inverted_index, soundex_dict
#%%
import json

# Note: the prefix trees are only populated inside make_soundex_index,
# so wildcard search finds nothing when the soundex index is loaded from cache.
if os.path.exists("index.json"):
    with open("index.json") as f:
        index = json.load(f)
else:
    index = make_index(collection)
    with open("index.json", "w") as f:
        json.dump(index, f)

if os.path.exists("soundex_index.json") and os.path.exists("soundex_dict.json"):
    with open("soundex_index.json") as f:
        soundex_index = json.load(f)
    with open("soundex_dict.json") as f:
        soundex_dict = json.load(f)
else:
    soundex_index, soundex_dict = make_soundex_index(collection)
    with open("soundex_index.json", "w") as f:
        json.dump(soundex_index, f)
    with open("soundex_dict.json", "w") as f:
        json.dump(soundex_dict, f)
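#%% md
A quick look at what was built. The sample term 'oil' is an assumption (a word that is frequent in Reuters news); any common term would do:
#%%
print(len(index), 'terms in the inverted index')
print(len(soundex_index), 'soundex codes')
print(index.get('oil', [])[:10])  # first few postings for the sample term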
#%% md
# 4. Query processing
Using a given search query, find all relevant documents. In the boolean model a relevant document is one which contains all words from the query.
Return the list of relevant document indexes.
#%%
from math import ceil

def retrieve_documents(words):
    """Retrieve all documents containing any of the given words."""
    words = lemmatization(list(words))
    res = set()
    for w in words:
        res |= set(index.get(w, []))  # guard: a lemma may be absent from the index
    return res

def find_alternatives(word):
    """
    Use soundex to find phonetically similar words and return
    the closest 10% of them by Levenshtein distance.
    """
    soundex_list = soundex_dict.get(soundex(word), [])
    if not soundex_list:
        return []
    distances = [levenshtein_distance(word, w) for w in soundex_list]
    soundex_list = [w for _, w in sorted(zip(distances, soundex_list))]
    n = ceil(len(soundex_list) * 0.1)
    return soundex_list[:n]

def reverse_words(words):
    return [w[::-1] for w in words]

def search(query):
    query = preprocess(query)
    relevant_documents = None
    for term in query:
        if '*' in term:
            if term.count('*') > 1:  # search with several stars is not implemented
                continue
            x = term.find('*')
            # Words matching the prefix before '*' and the suffix after it
            # (the suffix is looked up reversed in the reversed trie).
            matches = (set(find_words(term[:x], prefix_dict)) &
                       set(reverse_words(find_words(term[x + 1:][::-1], r_prefix_dict))))
            docs = retrieve_documents(matches)
        elif term in index:
            docs = set(index[term])
        else:
            # Unknown word: fall back to phonetically similar alternatives.
            docs = retrieve_documents(find_alternatives(term))
        # Boolean AND: intersect postings across all query terms.
        if relevant_documents is None:
            relevant_documents = docs
        else:
            relevant_documents &= docs
    return list(relevant_documents) if relevant_documents is not None else []
#%%
query = 'moskow ex*ess'  # 'moskow' exercises the soundex fallback, 'ex*ess' the wildcard path
relevant = search(query)
print(len(relevant))
print(relevant)
if len(relevant) > 0:
    print(collection[relevant[0]])
#%%
from flask import Flask

app = Flask(__name__)

@app.route('/')
@app.route('/index')
def hello():
    # Renamed from index() so it does not shadow the inverted index variable.
    return "Hello, World!"