# Untitled

import csv
import sys
import re
import os
from collections import Counter

# Announce the first phase: flattening the CSV exports into corpus.txt.
print("Building corpus...")

# Filter on the first CSV column: a row is kept only when its first column
# is in this list. Leaving the list empty keeps every row.
# NOTE(review): presumably the first column is the message author's
# nickname -- confirm against the CSV export format.
nicknames = []


# Matches "@name#1234"-style tags. The name part is non-greedy so that two
# tags on one line are removed separately instead of swallowing the text
# that sits between them.
_MENTION_RE = re.compile(r"@.*?#[0-9]{4}")

# Curly quotes -> plain ASCII quotes, applied in a single pass.
_QUOTE_MAP = str.maketrans({"‘": "'", "’": "'", "“": '"', "”": '"'})


def process_text(line):
	"""Normalize one chunk of raw message text for the corpus.

	Steps, in order:
	1. Curly single/double quotes become their ASCII equivalents.
	2. Doubled double quotes ("") collapse to a single one.
	3. The separator ", , " becomes a newline.
	4. Every "@...#dddd" tag (an "@", any text, "#" and four digits) is
	   removed.

	Args:
		line: The text to clean.

	Returns:
		The cleaned text.
	"""
	line = line.translate(_QUOTE_MAP)
	line = line.replace("\"\"", "\"").replace(", , ", "\n")
	return _MENTION_RE.sub("", line)


# Flatten every file found under ./corpus_data into corpus.txt. Each file is
# parsed as ";"-delimited CSV; the third column of every accepted row is
# cleaned with process_text() and written out.
with open("corpus.txt", "w") as logfile:
	for dirname, dirnames, filenames in os.walk('./corpus_data'):
		for filename in filenames:
			path = os.path.join(dirname, filename)
			print("Processing {}...".format(path))
			with open(path, "r") as csvfile:
				reader = csv.reader(csvfile, delimiter=";", quotechar="\"")
				for row in reader:
					# Blank lines come back as [] and malformed rows may be
					# short; skip them instead of dying with IndexError.
					if len(row) < 3:
						continue
					# An empty nickname list means "no filter".
					if not nicknames or row[0] in nicknames:
						logfile.write(process_text(row[2].strip() + "\n"))

# Longest phrase, in words, that the analysis tracks.
MAX_WORD_COUNT = 10

# word_freqs[k] tallies the phrases that are exactly k + 1 words long.
word_freqs = [Counter() for _ in range(MAX_WORD_COUNT)]

# One combined tally across every phrase length.
all_phrases = Counter()


# Characters stripped before splitting into words: straight and curly double
# quotes, parentheses and square brackets. Written as a raw string so the
# backslashes reach the regex engine without invalid-escape warnings.
_STRIP_CHARS_RE = re.compile(r'["“”\(\)\[\]]')


def consecutive_words(string: str, n: int):
	"""Yield every run of ``n`` consecutive words in ``string``.

	Trailing newlines are dropped, quote/bracket characters are removed, and
	the rest is split on single spaces. Because the split is on a single
	space, repeated spaces produce empty "words" that take part in phrases.

	Args:
		string: The line of text to scan.
		n: Number of words per phrase.

	Yields:
		Each phrase as its words joined by single spaces, in left-to-right
		order. Nothing is yielded when the line has fewer than ``n`` words.
	"""
	words = _STRIP_CHARS_RE.sub('', string.rstrip('\n')).split(" ")
	for start in range(len(words) - n + 1):
		yield " ".join(words[start:start + n])


# Tally every phrase of 1..MAX_WORD_COUNT words on every corpus line, both
# per phrase length (word_freqs) and overall (all_phrases).
# The with-block makes sure the corpus file handle is closed afterwards.
with open("corpus.txt", "r") as corpus:
	for count, line in enumerate(corpus):
		for i in range(MAX_WORD_COUNT):
			# Materialize once: the same phrases feed two counters.
			phrases = list(consecutive_words(line, i + 1))
			word_freqs[i].update(phrases)
			all_phrases.update(phrases)
		if count % 1000 == 0:
			sys.stdout.write("\rAnalyzed {} lines...".format(count))
			# No newline is written, so flush to make the progress visible.
			sys.stdout.flush()

print("all lines analyzed.")

# Write one report per phrase length: "<n>-word-phrases.txt" lists every
# phrase of that length seen at least twice, most frequent first.
for i in range(MAX_WORD_COUNT):
	print("\rWriting {}-word-phrases.txt...".format(i + 1))
	entry_count = 0
	# The with-block closes (and therefore flushes) each report file.
	with open("{}-word-phrases.txt".format(i + 1), "w") as freqfile:
		for phrase, count in word_freqs[i].most_common():
			# most_common() is sorted by descending count, so the first
			# phrase seen fewer than twice ends the useful part of the list.
			if count < 2:
				break
			freqfile.write("\"{}\" - {}\n".format(phrase, count))
			entry_count += 1
			if entry_count % 1000 == 0:
				sys.stdout.write("\rWrote {} entries...".format(entry_count))
				# No newline is written, so flush to show the progress.
				sys.stdout.flush()

# Write the combined report: every phrase, of any tracked length, that was
# seen at least twice, most frequent first.
print("\rWriting all-phrases.txt...")
# The with-block closes (and therefore flushes) the report file.
with open("all-phrases.txt", "w") as freqfile:
	for phrase, count in all_phrases.most_common():
		# most_common() is sorted by descending count, so nothing after the
		# first phrase seen fewer than twice can qualify.
		if count < 2:
			break
		freqfile.write("\"{}\" - {}\n".format(phrase, count))

print("All done! Enjoy your analysis.")