Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import tensorflow as tf
- from nltk.tokenize import word_tokenize
- import csv
- import random
- import numpy as np
- from collections import Counter
- from nltk.stem import WordNetLemmatizer
- lemmatizer = WordNetLemmatizer()
- filenames = ["Andy-Weir-The-Martian.csv"]
def deleteChars(string):
    """Replace punctuation and known HTML fragments with spaces and lowercase.

    Args:
        string: raw title/html text scraped from a review page.

    Returns:
        The cleaned, lowercased string; every stripped character/fragment
        becomes a single space so adjacent tokens stay separated.
    """
    # Multi-character HTML fragments first (they contain '/' which is also
    # stripped as a single character below).
    for fragment in ("<br/>", "</span>",
                     '<span class="a-size-base review-text">'):
        string = string.replace(fragment, " ")
    # One C-level pass instead of 17 chained .replace() calls.
    # Fix: the backslash was meant to be stripped too — the original line
    # was commented out because '\' was left unescaped; restored here.
    table = str.maketrans({c: " " for c in '!?,.;:()[]{}~/\\|'})
    return string.translate(table).lower()
def shortenAmountWords(words_title, words_html,
                       title_bounds=(1, 400), html_bounds=(5, 5000)):
    """Keep only words whose frequency lies strictly inside the given bounds.

    Args:
        words_title: title tokens, with repetitions.
        words_html: html tokens, with repetitions.
        title_bounds: (lo, hi) — keep title words with lo < count < hi.
            Defaults preserve the original hard-coded 1/400 thresholds.
        html_bounds: (lo, hi) — keep html words with lo < count < hi.
            Defaults preserve the original hard-coded 5/5000 thresholds.

    Returns:
        (final_title, final_html): lists of the surviving unique words.
    """
    print("Number of words before: title ", len(words_title), ", html ", len(words_html))

    def _filter_by_frequency(words, lo, hi):
        # One Counter pass; words outside (lo, hi) are too rare or too
        # common to be discriminative features.
        counts = Counter(words)
        print(len(counts))
        return [w for w in counts if lo < counts[w] < hi]

    final_title = _filter_by_frequency(words_title, *title_bounds)
    final_html = _filter_by_frequency(words_html, *html_bounds)
    print("Number of words after reducing: title " + str(len(final_title)) + ", html " + str(len(final_html)))
    return final_title, final_html
def createLexicons():
    """Build the title and html word lexicons from every CSV in `filenames`.

    Each input row is expected to hold exactly four space-delimited fields:
    (rating, tailURL, title, html).  Title and html text is cleaned with
    deleteChars, tokenized, deduplicated across all rows, then lemmatized.

    Returns:
        (words_title, words_html): lists of unique lemmatized words.
    """
    print("Creating Lexicons..")
    words_html = set()
    words_title = set()
    for filename in filenames:
        # Fix: open in text mode with an explicit encoding instead of the
        # Python 2 str.decode("utf8") calls, which fail on Python 3.
        with open(filename, 'r', encoding='utf-8', newline='') as file:
            reader = csv.reader(file, delimiter=" ")
            for rating, tailURL, title, html in reader:
                words_title.update(word_tokenize(deleteChars(title)))
                words_html.update(word_tokenize(deleteChars(html)))
        print("Finished File " + filename)
    # Lemmatize once per unique word — cheaper than per occurrence.
    words_title = [lemmatizer.lemmatize(i) for i in words_title]
    words_html = [lemmatizer.lemmatize(i) for i in words_html]
    print("Created Lexicons!")
    print("Lexicon title size: " + str(len(words_title)) + ", Lexicon html size: " + str(len(words_html)))
    return words_title, words_html
    # return shortenAmountWords(list(words_title), list(words_html))
def readCSV(words_title, words_html):
    """Re-encode every input row as lexicon indices into "realCSV.dat".

    For each (rating, tailURL, title, html) row in the files listed in
    `filenames`, writes one output row:
    [rating, space-joined title word indices, space-joined html word indices].
    A word's index is its position in the corresponding lexicon list.

    Args:
        words_title: title lexicon (iteration order defines the indices).
        words_html: html lexicon (iteration order defines the indices).
    """
    print("Reading Files..")
    words_title = list(words_title)
    words_html = list(words_html)
    # Hoist word -> index maps out of the row loop: list.index() per word
    # made the original pass O(rows * lexicon size).
    title_index = {w: i for i, w in enumerate(words_title)}
    html_index = {w: i for i, w in enumerate(words_html)}
    # Bug fix: the original re-opened "realCSV.dat" in 'w' mode for every
    # input row, truncating the file each time, so only the final row ever
    # survived (and the handle was never closed).  Open it once, up front.
    with open("realCSV.dat", 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        for filename in filenames:
            # Fix: explicit encoding instead of Python 2 str.decode("utf8").
            with open(filename, 'r', encoding='utf-8', newline='') as file:
                reader = csv.reader(file, delimiter=" ")
                for rating, tailURL, title, html in reader:
                    title_words = {lemmatizer.lemmatize(w)
                                   for w in word_tokenize(deleteChars(title))}
                    html_words = {lemmatizer.lemmatize(w)
                                  for w in word_tokenize(deleteChars(html))}
                    title_numbers = [str(title_index[w])
                                     for w in title_words if w in title_index]
                    html_numbers = [str(html_index[w])
                                    for w in html_words if w in html_index]
                    writer.writerow([str(rating),
                                     " ".join(title_numbers),
                                     " ".join(html_numbers)])
            print("Finished reading file " + filename)
    print("Finished reading all files!")
# Script entry: build both lexicons from the input CSVs, then rewrite the
# reviews as lexicon-index vectors into "realCSV.dat".
# NOTE(review): readCSV contains no return statement, so `lines` is always
# None — the assignment is kept only for compatibility.
words_title, words_html = createLexicons()
lines = readCSV(words_title, words_html)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement