Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import csv
import os
import re
import sys
from collections import Counter
print("Building corpus...")

# Values matched against the first column of each CSV row when the corpus is
# built (presumably the message author -- confirm against the export format).
# An empty list disables the filter, so every row is kept.
nicknames = []
# Straightens typographic single and double quotes in one pass.
_QUOTE_TABLE = str.maketrans({"‘": "'", "’": "'", "“": '"', "”": '"'})

# One "@name#1234"-style mention. The name part is non-greedy so that several
# mentions on the same line are removed individually; a greedy ".*" would also
# delete all the text between the first "@" and the last "#dddd".
_MENTION_RE = re.compile(r"@.*?#[0-9]{4}")


def process_text(line):
    """Normalize one piece of message text for the corpus.

    Steps, in order:
      1. typographic quotes are replaced by their ASCII equivalents;
      2. doubled double-quotes (``""``) collapse to a single ``"``;
      3. the separator ``", , "`` becomes a line break;
      4. every ``@...#dddd`` mention is removed.

    Args:
        line: the text to clean (may include a trailing newline).

    Returns:
        The cleaned text.
    """
    line = line.translate(_QUOTE_TABLE)
    line = line.replace('""', '"').replace(", , ", "\n")
    return _MENTION_RE.sub("", line)
# Flatten every CSV file found under ./corpus_data into corpus.txt, writing
# the cleaned third column of each accepted row followed by a newline.
with open("corpus.txt", "w") as logfile:
    for dirname, _dirnames, filenames in os.walk("./corpus_data"):
        for filename in filenames:
            path = os.path.join(dirname, filename)
            print("Processing {}...".format(path))
            # newline="" is what the csv module expects: it lets the reader
            # handle line breaks embedded in quoted fields itself.
            with open(path, "r", newline="") as csvfile:
                reader = csv.reader(csvfile, delimiter=";", quotechar='"')
                for row in reader:
                    # Blank lines parse as [] and malformed rows may be short;
                    # skip them instead of crashing on row[0] / row[2].
                    if len(row) < 3:
                        continue
                    # An empty nickname list means "accept every row".
                    if not nicknames or row[0] in nicknames:
                        logfile.write(process_text(row[2].strip() + "\n"))
# Longest phrase length, in words, that gets counted.
MAX_WORD_COUNT = 10

# word_freqs[k] counts the phrases that are exactly k + 1 words long.
word_freqs = [Counter() for _ in range(MAX_WORD_COUNT)]

# Counts every phrase, whatever its length.
all_phrases = Counter()
# Quote and bracket characters removed before a line is split into words.
# A raw string keeps the bracket escapes valid regex syntax.
_STRIP_CHARS_RE = re.compile(r'["“”()\[\]]')


def consecutive_words(string, n):
    """Yield every run of ``n`` consecutive words found in ``string``.

    Double quotes (straight and typographic), parentheses and square brackets
    are removed first. The text is then split on any whitespace, so repeated
    spaces, a trailing newline or a blank line never produce empty "words".

    Args:
        string: one line of text.
        n: number of words per phrase.

    Yields:
        Each phrase as its words joined by single spaces, in order of
        appearance. Nothing is yielded when the line has fewer than ``n``
        words.
    """
    words = _STRIP_CHARS_RE.sub("", string).split()
    for start in range(len(words) - n + 1):
        yield " ".join(words[start:start + n])
# Count every phrase of 1..MAX_WORD_COUNT words on each line of the corpus.
# The with-block guarantees the corpus file is closed once counting is done.
with open("corpus.txt", "r") as corpus:
    for count, line in enumerate(corpus):
        for i in range(MAX_WORD_COUNT):
            # Materialize once so the same phrases feed both counters.
            phrases = list(consecutive_words(line, i + 1))
            word_freqs[i].update(phrases)
            all_phrases.update(phrases)
        if count % 1000 == 0:
            sys.stdout.write("\rAnalyzed {} lines...".format(count))
            # No newline is written, so flush to make the progress visible.
            sys.stdout.flush()
print("all lines analyzed.")
# Write one "<k>-word-phrases.txt" file per phrase length, listing every phrase
# seen at least twice, most frequent first.
for i, freqs in enumerate(word_freqs):
    print("\rWriting {}-word-phrases.txt...".format(i + 1))
    # The with-block closes (and flushes) each output file before the next one.
    with open("{}-word-phrases.txt".format(i + 1), "w") as freqfile:
        entry_count = 0
        for phrase, count in freqs.most_common():
            # most_common() is ordered from most to least frequent, so the
            # first count below 2 means nothing further qualifies.
            if count < 2:
                break
            freqfile.write("\"{}\" - {}\n".format(phrase, count))
            entry_count += 1
            if entry_count % 1000 == 0:
                sys.stdout.write("\rWrote {} entries...".format(entry_count))
# Write every phrase seen at least twice, regardless of length, to
# all-phrases.txt, most frequent first.
print("\rWriting all-phrases.txt...")
with open("all-phrases.txt", "w") as freqfile:
    for phrase, count in all_phrases.most_common():
        # most_common() is ordered from most to least frequent, so the first
        # count below 2 means nothing further qualifies.
        if count < 2:
            break
        freqfile.write("\"{}\" - {}\n".format(phrase, count))
print("All done! Enjoy your analysis.")
Add Comment
Please, Sign In to add comment