#%% md
# 1. Text processing
We will create a pipeline of text preprocessing.
# 1.1 Normalization
The first step is normalization.
It might include:
* converting all letters to lower or upper case
* converting numbers into words or removing numbers
* removing punctuation, accent marks and other diacritics
* removing white spaces
* expanding abbreviations
In this exercise it is enough to have lowercase text without special characters and digits, and without unnecessary space symbols.
How could neural networks be used for text normalization?
#%%
import re

def normalize(text):
    # Lowercase, drop digits, and keep only word characters;
    # '*' is preserved so wildcard queries survive normalization.
    text = text.lower()
    text = re.sub(r'\d+', ' ', text)
    text = ' '.join(re.findall(r'[\w*]+', text))
    return text
#%%
text = """This sentence is going to be lemmatized. Borrowed from Latin per sē (“by itself”), from per (“by, through”) and sē (“itself, himself, herself, themselves”)"""
text = normalize(text)
print(text)
#%% md
# 1.2 Tokenization
Use the nltk tokenizer to tokenize the text.
#%%
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
#%%
# Tokenize text using the nltk word tokenizer
from nltk.tokenize import word_tokenize

def tokenize(text):
    return word_tokenize(text)
#%%
tokens = tokenize(text)
print(tokens)
#%% md
# 1.3 Lemmatization
What is the difference between stemming and lemmatization? (A short comparison follows below.)
[Optional reading](https://towardsdatascience.com/state-of-the-art-multilingual-lemmatization-f303e8ff1a8)
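#%% md
In short: a stemmer chops off suffixes by rule, while a lemmatizer maps a word to its dictionary form. A minimal side-by-side sketch (PorterStemmer is chosen here just for illustration):
#%%
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
for w in ['studies', 'wolves', 'better']:
    # Stemming is rule-based and may produce non-words ('studi');
    # lemmatization returns valid dictionary forms ('study', 'wolf').
    print(w, '->', stemmer.stem(w), '|', lemmatizer.lemmatize(w))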
#%%
from nltk.stem import WordNetLemmatizer

def lemmatization(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]
#%%
lemmed = lemmatization(tokens)
print(lemmed)
#%% md
# 1.4 Stop words
The next step is to remove stop words. Take the list of stop words from nltk.
#%%
from nltk.corpus import stopwords

def remove_stop_word(tokens):
    stop_words = set(stopwords.words('english'))  # build the set once instead of per token
    return [word for word in tokens if word not in stop_words]
#%%
clean = remove_stop_word(lemmed)
print(clean)
#%% md
# 1.5 Pipeline
Run the complete pipeline in one function.
#%%
def preprocess(text):
    return remove_stop_word(lemmatization(tokenize(normalize(text))))
#%%
clean = preprocess(text)
print(clean)
#%% md
# Soundex algorithm
#%%
def soundex(word):
    """A simplified Soundex: keep the first letter, encode the rest as digits,
    collapse adjacent duplicates, drop the vowel placeholders, pad to three digits."""
    first_letter = word[0]
    word = word[1:]
    # Map each letter to its Soundex digit; vowels and h, w, y become '0'.
    # str.replace rewrites every occurrence, which is safe here because
    # a given letter always maps to the same digit.
    for i in range(len(word)):
        c = word[i]
        if c in 'aeiouhwy':
            word = word.replace(c, '0')
        elif c in 'bfpv':
            word = word.replace(c, '1')
        elif c in 'cgjkqsxz':
            word = word.replace(c, '2')
        elif c in 'dt':
            word = word.replace(c, '3')
        elif c == 'l':
            word = word.replace(c, '4')
        elif c in 'mn':
            word = word.replace(c, '5')
        elif c == 'r':
            word = word.replace(c, '6')
    # Collapse runs of identical digits, then drop the '0' placeholders.
    res = word[0] if word else ''
    for i in range(1, len(word)):
        if word[i] != word[i - 1]:
            res += word[i]
    res = res.replace('0', '')
    # Pad with trailing zeros (standard Soundex pads on the right).
    if len(res) < 3:
        res = res + '0' * (3 - len(res))
    return first_letter + res[:3]
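#%% md
A quick sanity check. This is a simplified variant (the first letter's own code is not used when collapsing duplicates), so codes may differ slightly from textbook Soundex; lowercase input is assumed:
#%%
for w in ['moskow', 'moscow', 'express']:
    print(w, '->', soundex(w))
# 'moskow' and 'moscow' map to the same code,
# which is what makes the phonetic index useful for misspellings.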
#%% md
# Levenshtein distance
#%%
def levenshtein_distance(a, b):
    # Standard dynamic programming over a (len(a)+1) x (len(b)+1) table:
    # d[i][j] is the edit distance between a[:i] and b[:j].
    d = [[0 for j in range(len(b) + 1)] for i in range(len(a) + 1)]
    for i in range(1, len(a) + 1):
        d[i][0] = i
    for j in range(1, len(b) + 1):
        d[0][j] = j
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            x = 0 if a[i - 1] == b[j - 1] else 1  # substitution cost
            d[i][j] = min(d[i - 1][j - 1] + x,  # substitute
                          d[i - 1][j] + 1,      # delete
                          d[i][j - 1] + 1)      # insert
    return d[len(a)][len(b)]
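#%% md
A couple of quick checks ('kitten' -> 'sitting' is the classic example with distance 3):
#%%
print(levenshtein_distance('kitten', 'sitting'))  # 3
print(levenshtein_distance('moskow', 'moscow'))   # 1
print(levenshtein_distance('abc', 'abc'))         # 0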
#%% md
# 2. Collection
Download the Reuters data from here:
https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz
Read the data description here:
https://archive.ics.uci.edu/ml/datasets/reuters-21578+text+categorization+collection
The function should return a list of raw texts. Remove the SGML/HTML tags using the bs4 package.
#%%
import requests
import tarfile
import os
from bs4 import BeautifulSoup

def get_collection():
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz'
    resp = requests.get(url, timeout=5)
    with open('data.tar.gz', 'wb') as output:
        output.write(resp.content)
    if not os.path.exists('dataset'):
        os.mkdir('dataset')
    with tarfile.open('data.tar.gz', 'r:gz') as tar:
        tar.extractall('dataset')
    # Collect (text, document_id) pairs from every .sgm file.
    collection = []
    i = 0
    for filename in os.listdir('dataset'):
        if filename.endswith('.sgm'):
            with open(os.path.join('dataset', filename), 'rb') as f:
                soup = BeautifulSoup(f.read(), 'html.parser')
            for news in soup.find_all('text'):
                collection.append((news.text, i))
                i += 1
    return collection
#%%
collection = get_collection()
print(len(collection))
#%%
print(collection[0])
#%% md
# Prefix tree dictionary
#%%
def add(w, prefix_dict):
    """Insert a word into the trie; '!' marks the end of a word."""
    d = prefix_dict
    for c in w:
        if c not in d:
            d[c] = {}
        d = d[c]
    d['!'] = 1

def add_text(text):
    """Index every token in both a forward and a reversed trie
    (the reversed trie answers suffix queries)."""
    global prefix_dict, r_prefix_dict
    for w in text:
        add(w, prefix_dict)
        add(w[::-1], r_prefix_dict)

def traverse(tree, word):
    """Collect every word stored in the subtree rooted at `tree`."""
    res = []
    if '!' in tree:
        res.append(word)
    for c in tree:
        if c == '!':
            continue
        res += traverse(tree[c], word + c)
    return res

def find_words(pref, tree):
    """Return all indexed words that start with `pref`."""
    d = tree
    res = []
    for c in pref:
        if c in d:
            d = d[c]
        else:
            return res
    if '!' in d:
        res.append(pref)
    for c in d:
        if c == '!':
            continue
        res += traverse(d[c], pref + c)
    return res

prefix_dict = {}
r_prefix_dict = {}
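#%% md
A small demonstration on a toy vocabulary (the words below are chosen purely for illustration):
#%%
demo_trie = {}
for w in ['express', 'excess', 'exam', 'trade']:
    add(w, demo_trie)
print(find_words('ex', demo_trie))  # ['express', 'excess', 'exam']
print(find_words('tr', demo_trie))  # ['trade']
print(find_words('zz', demo_trie))  # []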
#%% md
# 3. Inverted index
You will work with the boolean search model. Construct a dictionary which maps words to their postings.
#%%
def make_index(collection):
    inverted_index = {}
    for text_raw, posting in collection:
        text = preprocess(text_raw)
        if posting % 1000 == 0:
            print(f"Posting [{posting}] processed")
        for word in text:
            if word not in inverted_index:
                inverted_index[word] = set()
            inverted_index[word].add(posting)
    # Convert postings to lists so the index can be serialized to JSON.
    for key in inverted_index:
        inverted_index[key] = list(inverted_index[key])
    return inverted_index

def make_soundex_index(collection):
    """Build a phonetic index (soundex code -> postings) and a dictionary
    mapping each soundex code to the words that produce it. Tokens are
    deliberately not lemmatized here, so original surface forms are kept.
    This is also where the prefix trees are populated."""
    inverted_index = {}
    soundex_dict = {}
    for text_raw, posting in collection:
        text = remove_stop_word(tokenize(normalize(text_raw)))
        add_text(text)  # populate the forward and reversed tries
        if posting % 1000 == 0:
            print(f"Posting [{posting}] processed")
        for word in text:
            soundexed = soundex(word)
            if soundexed not in inverted_index:
                inverted_index[soundexed] = set()
            if soundexed not in soundex_dict:
                soundex_dict[soundexed] = set()
            inverted_index[soundexed].add(posting)
            soundex_dict[soundexed].add(word)
    for key in inverted_index:
        inverted_index[key] = list(inverted_index[key])
        soundex_dict[key] = list(soundex_dict[key])
    return inverted_index, soundex_dict
#%%
import json

# Note: the prefix trees are only populated inside make_soundex_index,
# so wildcard search finds nothing when the soundex index is loaded from cache.
if os.path.exists("index.json"):
    with open("index.json") as f:
        index = json.load(f)
else:
    index = make_index(collection)
    with open("index.json", "w") as f:
        json.dump(index, f)

if os.path.exists("soundex_index.json") and os.path.exists("soundex_dict.json"):
    with open("soundex_index.json") as f:
        soundex_index = json.load(f)
    with open("soundex_dict.json") as f:
        soundex_dict = json.load(f)
else:
    soundex_index, soundex_dict = make_soundex_index(collection)
    with open("soundex_index.json", "w") as f:
        json.dump(soundex_index, f)
    with open("soundex_dict.json", "w") as f:
        json.dump(soundex_dict, f)
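#%% md
A quick look at what was built. The sample term 'oil' is an assumption (a word that is frequent in Reuters news); any common term would do:
#%%
print(len(index), 'terms in the inverted index')
print(len(soundex_index), 'soundex codes')
print(index.get('oil', [])[:10])  # first few postings for the sample term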
#%% md
# 4. Query processing
Using a given search query, find all relevant documents. In the boolean model a relevant document is one which contains all words from the query.
Return the list of relevant document indexes.
#%%
from math import ceil

def retrieve_documents(words):
    """Retrieve all documents containing any of the given words."""
    words = lemmatization(list(words))
    res = set()
    for w in words:
        res |= set(index.get(w, []))  # guard: a lemma may be absent from the index
    return res

def find_alternatives(word):
    """
    Use soundex to find phonetically similar words and return
    the closest 10% of them by Levenshtein distance.
    """
    soundex_list = soundex_dict.get(soundex(word), [])
    if not soundex_list:
        return []
    distances = [levenshtein_distance(word, w) for w in soundex_list]
    soundex_list = [w for _, w in sorted(zip(distances, soundex_list))]
    n = ceil(len(soundex_list) * 0.1)
    return soundex_list[:n]

def reverse_words(words):
    return [w[::-1] for w in words]

def search(query):
    query = preprocess(query)
    relevant_documents = None
    for term in query:
        if '*' in term:
            if term.count('*') > 1:  # search with several stars is not implemented
                continue
            x = term.find('*')
            # Words matching the prefix before '*' and the suffix after it
            # (the suffix is looked up reversed in the reversed trie).
            matches = (set(find_words(term[:x], prefix_dict)) &
                       set(reverse_words(find_words(term[x + 1:][::-1], r_prefix_dict))))
            docs = retrieve_documents(matches)
        elif term in index:
            docs = set(index[term])
        else:
            # Unknown word: fall back to phonetically similar alternatives.
            docs = retrieve_documents(find_alternatives(term))
        # Boolean AND: intersect postings across all query terms.
        if relevant_documents is None:
            relevant_documents = docs
        else:
            relevant_documents &= docs
    return list(relevant_documents) if relevant_documents is not None else []
#%%
query = 'moskow ex*ess'  # 'moskow' exercises the soundex fallback, 'ex*ess' the wildcard path
relevant = search(query)
print(len(relevant))
print(relevant)
if len(relevant) > 0:
    print(collection[relevant[0]])
#%%
from flask import Flask

app = Flask(__name__)

@app.route('/')
@app.route('/index')
def hello():
    # Renamed from index() so it does not shadow the inverted index variable.
    return "Hello, World!"