import tensorflow as tf
from nltk.tokenize import word_tokenize
import csv
import random
import numpy as np
from collections import Counter
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
filenames = ["Andy-Weir-The-Martian.csv"]


def deleteChars(string):
    # Replace punctuation and leftover HTML fragments with spaces,
    # then lowercase the result.
    string = string.replace("!", " ")
    string = string.replace("?", " ")
    string = string.replace(",", " ")
    string = string.replace(".", " ")
    string = string.replace(";", " ")
    string = string.replace("<br/>", " ")
    string = string.replace("</span>", " ")
    string = string.replace('<span class="a-size-base review-text">', " ")
    string = string.replace(":", " ")
    string = string.replace("(", " ")
    string = string.replace(")", " ")
    string = string.replace("[", " ")
    string = string.replace("]", " ")
    string = string.replace("{", " ")
    string = string.replace("}", " ")
    string = string.replace("~", " ")
    string = string.replace("/", " ")
    string = string.replace("\\", " ")  # the backslash must be escaped; "\" alone is a syntax error
    string = string.replace("|", " ")
    string = string.lower()
    return string


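# A more compact alternative (a sketch, not used below): the same cleanup
# with two regular expressions. The patterns are an assumption that mirrors
# the replace() calls in deleteChars above, and deleteCharsRegex is a
# hypothetical name.
import re

def deleteCharsRegex(string):
    # Drop the known HTML fragments first, then any listed punctuation character.
    string = re.sub(r'<br/>|</span>|<span class="a-size-base review-text">', " ", string)
    string = re.sub(r'[!?,.;:()\[\]{}~/\\|]', " ", string)
    return string.lower()

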
def shortenAmountWords(words_title, words_html):
    # Keep only words whose frequency falls inside a fixed band, discarding
    # both very rare and very common words.
    print("Number of words before: title", len(words_title), ", html", len(words_html))
    w_counts = Counter(words_title)
    print(len(w_counts))
    final_title = []
    for w in w_counts:
        if 400 > w_counts[w] > 1:
            final_title.append(w)

    w_counts = Counter(words_html)
    print(len(w_counts))
    final_html = []
    for w in w_counts:
        if 5000 > w_counts[w] > 5:
            final_html.append(w)

    print("Number of words after reducing: title " + str(len(final_title)) + ", html " + str(len(final_html)))
    return final_title, final_html


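# Example of the frequency band above (illustrative counts only): given
# Counter({"the": 900, "mars": 40, "xyzzy": 1}), the title filter
# 400 > count > 1 keeps only "mars": "the" is too common and "xyzzy" too rare.

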
def createLexicons():
    print("Creating Lexicons..")
    words_html = set()
    words_title = set()
    for filename in filenames:
        # Each row is expected to hold four space-delimited fields:
        # rating, tailURL, title, html. csv.reader yields str in
        # Python 3, so no .decode() is needed.
        with open(filename, 'r', encoding="utf8", newline="") as file:
            reader = csv.reader(file, delimiter=" ")
            for [rating, tailURL, title, html] in reader:
                title = deleteChars(title)
                titles = word_tokenize(title)
                words_title.update(titles)

                html = deleteChars(html)
                htmls = word_tokenize(html)
                words_html.update(htmls)

        print("Finished File " + filename)

    # Lemmatize once over the deduplicated sets rather than per row.
    words_title = [lemmatizer.lemmatize(i) for i in words_title]
    words_html = [lemmatizer.lemmatize(i) for i in words_html]

    print("Created Lexicons!")
    print("Lexicon title size: " + str(len(words_title)) + ", Lexicon html size: " + str(len(words_html)))
    return words_title, words_html
    #return shortenAmountWords(list(words_title), list(words_html))


def readCSV(words_title, words_html):
    print("Reading Files..")
    words_title = list(words_title)
    words_html = list(words_html)

    # Open the output file once, outside the row loop: reopening it in 'w'
    # mode for every row (as the original draft did) truncates the file each
    # time, so only the final row would survive.
    with open("realCSV.dat", 'w', newline="") as out_file:
        writer = csv.writer(out_file)
        for filename in filenames:
            with open(filename, 'r', encoding="utf8", newline="") as file:
                reader = csv.reader(file, delimiter=" ")
                for [rating, tailURL, title, html] in reader:
                    row_title_words = set()
                    title = deleteChars(title)
                    row_title_words.update(word_tokenize(title))
                    row_title_words = [lemmatizer.lemmatize(i) for i in row_title_words]

                    row_html_words = set()
                    html = deleteChars(html)
                    row_html_words.update(word_tokenize(html))
                    row_html_words = [lemmatizer.lemmatize(i) for i in row_html_words]

                    # Map each word to its position in the lexicon.
                    row_title_numbers = []
                    for word in row_title_words:
                        if word in words_title:
                            row_title_numbers.append(str(words_title.index(word)))

                    row_html_numbers = []
                    for word in row_html_words:
                        if word in words_html:
                            row_html_numbers.append(str(words_html.index(word)))

                    # One output row: rating, then the title and html word
                    # indices as space-separated strings.
                    writer.writerow([str(rating), " ".join(row_title_numbers), " ".join(row_html_numbers)])

            print("Finished reading file " + filename)
    print("Finished reading all files!")


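# Performance note (a sketch, assuming the lexicons fit in memory): the
# list-based membership tests and .index() lookups above are O(n) per word.
# Building a word -> index dict once makes both O(1). The name title_index
# is hypothetical, not part of the script above.
#
# title_index = {word: i for i, word in enumerate(words_title)}
# row_title_numbers = [str(title_index[w]) for w in row_title_words if w in title_index]

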
words_title, words_html = createLexicons()
readCSV(words_title, words_html)  # writes realCSV.dat; readCSV returns nothing