Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import tensorflow as tf
- from nltk.tokenize import word_tokenize
- import csv
- import random
- import numpy as np
- from collections import Counter
- from nltk.stem import WordNetLemmatizer
- lemmatizer = WordNetLemmatizer()
- filenames = ["Andy-Weir-The-Martian.csv"]
def deleteChars(string):
    """Replace punctuation and known HTML fragments with spaces and lowercase.

    Args:
        string: raw title/html text scraped from a review page.

    Returns:
        The cleaned, lowercased string; every stripped character/fragment
        becomes a single space so adjacent tokens stay separated.
    """
    # Multi-character HTML fragments first (they contain '/' which is also
    # stripped as a single character below).
    for fragment in ("<br/>", "</span>",
                     '<span class="a-size-base review-text">'):
        string = string.replace(fragment, " ")
    # One C-level pass instead of 17 chained .replace() calls.
    # Fix: the backslash was meant to be stripped too — the original line
    # was commented out because '\' was left unescaped; restored here.
    table = str.maketrans({c: " " for c in '!?,.;:()[]{}~/\\|'})
    return string.translate(table).lower()
def shortenAmountWords(words_title, words_html,
                       title_bounds=(1, 400), html_bounds=(5, 5000)):
    """Keep only words whose frequency lies strictly inside the given bounds.

    Args:
        words_title: title tokens, with repetitions.
        words_html: html tokens, with repetitions.
        title_bounds: (lo, hi) — keep title words with lo < count < hi.
            Defaults preserve the original hard-coded 1/400 thresholds.
        html_bounds: (lo, hi) — keep html words with lo < count < hi.
            Defaults preserve the original hard-coded 5/5000 thresholds.

    Returns:
        (final_title, final_html): lists of the surviving unique words.
    """
    print("Number of words before: title ", len(words_title), ", html ", len(words_html))

    def _filter_by_frequency(words, lo, hi):
        # One Counter pass; words outside (lo, hi) are too rare or too
        # common to be discriminative features.
        counts = Counter(words)
        print(len(counts))
        return [w for w in counts if lo < counts[w] < hi]

    final_title = _filter_by_frequency(words_title, *title_bounds)
    final_html = _filter_by_frequency(words_html, *html_bounds)
    print("Number of words after reducing: title " + str(len(final_title)) + ", html " + str(len(final_html)))
    return final_title, final_html
def createLexicons():
    """Build the title and html word lexicons from every CSV in `filenames`.

    Each input row is expected to hold exactly four space-delimited fields:
    (rating, tailURL, title, html).  Title and html text is cleaned with
    deleteChars, tokenized, deduplicated across all rows, then lemmatized.

    Returns:
        (words_title, words_html): lists of unique lemmatized words.
    """
    print("Creating Lexicons..")
    words_html = set()
    words_title = set()
    for filename in filenames:
        # Fix: open in text mode with an explicit encoding instead of the
        # Python 2 str.decode("utf8") calls, which fail on Python 3.
        with open(filename, 'r', encoding='utf-8', newline='') as file:
            reader = csv.reader(file, delimiter=" ")
            for rating, tailURL, title, html in reader:
                words_title.update(word_tokenize(deleteChars(title)))
                words_html.update(word_tokenize(deleteChars(html)))
        print("Finished File " + filename)
    # Lemmatize once per unique word — cheaper than per occurrence.
    words_title = [lemmatizer.lemmatize(i) for i in words_title]
    words_html = [lemmatizer.lemmatize(i) for i in words_html]
    print("Created Lexicons!")
    print("Lexicon title size: " + str(len(words_title)) + ", Lexicon html size: " + str(len(words_html)))
    return words_title, words_html
    # return shortenAmountWords(list(words_title), list(words_html))
def readCSV(words_title, words_html):
    """Re-encode every input row as lexicon indices into "realCSV.dat".

    For each (rating, tailURL, title, html) row in the files listed in
    `filenames`, writes one output row:
    [rating, space-joined title word indices, space-joined html word indices].
    A word's index is its position in the corresponding lexicon list.

    Args:
        words_title: title lexicon (iteration order defines the indices).
        words_html: html lexicon (iteration order defines the indices).
    """
    print("Reading Files..")
    words_title = list(words_title)
    words_html = list(words_html)
    # Hoist word -> index maps out of the row loop: list.index() per word
    # made the original pass O(rows * lexicon size).
    title_index = {w: i for i, w in enumerate(words_title)}
    html_index = {w: i for i, w in enumerate(words_html)}
    # Bug fix: the original re-opened "realCSV.dat" in 'w' mode for every
    # input row, truncating the file each time, so only the final row ever
    # survived (and the handle was never closed).  Open it once, up front.
    with open("realCSV.dat", 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        for filename in filenames:
            # Fix: explicit encoding instead of Python 2 str.decode("utf8").
            with open(filename, 'r', encoding='utf-8', newline='') as file:
                reader = csv.reader(file, delimiter=" ")
                for rating, tailURL, title, html in reader:
                    title_words = {lemmatizer.lemmatize(w)
                                   for w in word_tokenize(deleteChars(title))}
                    html_words = {lemmatizer.lemmatize(w)
                                  for w in word_tokenize(deleteChars(html))}
                    title_numbers = [str(title_index[w])
                                     for w in title_words if w in title_index]
                    html_numbers = [str(html_index[w])
                                    for w in html_words if w in html_index]
                    writer.writerow([str(rating),
                                     " ".join(title_numbers),
                                     " ".join(html_numbers)])
            print("Finished reading file " + filename)
    print("Finished reading all files!")
# Script entry: build both lexicons from the input CSVs, then rewrite the
# reviews as lexicon-index vectors into "realCSV.dat".
# NOTE(review): readCSV contains no return statement, so `lines` is always
# None — the assignment is kept only for compatibility.
words_title, words_html = createLexicons()
lines = readCSV(words_title, words_html)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement