Guest User

Untitled

a guest
Oct 17th, 2018
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.07 KB | None | 0 0
  1. import csv
  2. import sys
  3. import re
  4. import os
  5. from collections import Counter
  6.  
  7. print("Building corpus...")
  8.  
  9. nicknames = []
  10.  
  11.  
  12. def process_text(line):
  13. line = line.replace("‘", "'").replace("’", "'")
  14. line = line.replace("“", '"').replace("”", '"')
  15. line = line.replace("\"\"", "\"").replace(", , ", "\n")
  16. line = re.sub(r"@.*#[0-9]{4}", "", line)
  17. return line
  18.  
  19.  
  20. with open("corpus.txt", "w") as logfile:
  21. for dirname, dirnames, filenames in os.walk('./corpus_data'):
  22. for filename in filenames:
  23. print("Processing {}...".format(os.path.join(dirname, filename)))
  24. with open(os.path.join(dirname, filename), "r") as csvfile:
  25. reader = csv.reader(csvfile, delimiter=";", quotechar="\"")
  26. for row in reader:
  27. if row[0] in nicknames or len(nicknames) == 0:
  28. logfile.write(process_text(row[2].strip() + "\n"))
  29.  
  30. MAX_WORD_COUNT = 10
  31.  
  32. word_freqs = [Counter() for i in range(MAX_WORD_COUNT)]
  33. all_phrases = Counter()
  34.  
  35.  
  36. def consecutive_words(string, n):
  37. words = re.sub('["“”\(\)\[\]]', '', string.rstrip('\n')).split(" ")
  38. for i in range(len(words) - n + 1):
  39. phrase = " ".join(words[i:i + n])
  40. yield phrase
  41.  
  42.  
  43. corpus = open("corpus.txt", "r")
  44.  
  45. for count, line in enumerate(corpus):
  46. for i in range(MAX_WORD_COUNT):
  47. for phrase in consecutive_words(line, i + 1):
  48. word_freqs[i].update([phrase])
  49. all_phrases.update([phrase])
  50. if count % 1000 == 0:
  51. sys.stdout.write("\rAnalyzed {} lines...".format(count))
  52.  
  53. print("all lines analyzed.")
  54.  
  55. for i in range(MAX_WORD_COUNT):
  56. print("\rWriting {}-word-phrases.txt...".format(i + 1))
  57. freqfile = open("{}-word-phrases.txt".format(i + 1), "w")
  58. entry_count = 0
  59. for phrase, count in word_freqs[i].most_common():
  60. if count >= 2:
  61. freqfile.write("\"{}\" - {}\n".format(phrase, count))
  62. entry_count += 1
  63. if entry_count % 1000 == 0:
  64. sys.stdout.write("\rWrote {} entries...".format(entry_count))
  65.  
  66. print("\rWriting all-phrases.txt...")
  67. freqfile = open("all-phrases.txt".format(i + 1), "w")
  68. for phrase, count in all_phrases.most_common():
  69. if count >= 2:
  70. freqfile.write("\"{}\" - {}\n".format(phrase, count))
  71.  
  72. print("All done! Enjoy your analysis.")
Add Comment
Please, Sign In to add comment