import threading
import json
import fileinput
import os
import sys
import time
import re
from collections import defaultdict
class Indexer ():
dictionary = defaultdict(list)
categories = defaultdict(list)
users = defaultdict(list)
tweets = []
tweet_counts = 0
tweets_dir = '../data/tweets/';
categories_dir = '../data/categories/';
users_dir = '../data/users/';
dictionary_dir = '../data/dictionary/';
def run (self):
self.index()
def index(self):
for line in fileinput.input(['../data/raw_tweets.txt']):
if self.tweet_counts % 1000 == 0:
self.save_data(self.users, self.users_dir)
self.save_data(self.categories, self.categories_dir)
self.save_data(self.dictionary, self.dictionary_dir)
self.tweet_counts += 1
tweet_array = line.split(",")
#get rid of follower data
tweet_array.pop(-1)
tweet_array.pop(-1)
user_name = tweet_array.pop(-1)
display_user_name = user_name
user_name = user_name.lower()
#get rid of timestamp
tweet_array.pop(0)
#put the tweet back together, incase the actual message contained commas
tweet = ','.join(tweet_array)
tweet_id = self.tweet_counts
self.users[user_name].append(tweet_id)
clean_tweet = re.compile('[^a-zA-Z0-9@#]').sub(' ', tweet)
tweet = self.tokenize_line(tweet_id, clean_tweet)
tweet = (user_name + '%' + tweet).replace(user_name, '' + display_user_name + '');
self.tweets.append(tweet)
def tokenize_line(self, tweet_id, tweet):
tokens = tweet.split(" ")
for token in tokens:
if len(token) == 0:
continue
display_token = token
#index the token after putting it to lower case, this ensures, that case-conflicts don't emerge when fetching said token
token = token.lower()
file_name = self.get_file_name(token, 0, 1)
#replace the category with a link to the categories
if token[0] == "#":
token = token[1:len(token)-1]
tweet = tweet.replace(display_token, '' + display_token + '');
self.categories[token].append(tweet_id)
#make the username clickable
elif token[0] == "@":
token_display = token
token = token[1:len(token)-1]
tweet = tweet.replace(display_token, '' + display_token + '');
self.dictionary[token].append(tweet_id)
return tweet
def get_file_name(self, token, lower, upper):
result = ""
upper += 1
if len(token) >= (upper):
result = token[lower:upper]
elif len(token) >= (lower):
result = token[lower]
return result
#saves the hash content to the given file after
#loading any existing data
def save_data(self, hash, directory):
for token in hash:
if len(token) == 0:
continue
id_list = hash[token]
file_name = self.get_file_name(token, 0, 1)
file_path = directory + file_name + '.txt'
file = self.open_file(file_path)
existing_tweets = self.read_file(file)
if existing_tweets.get(token, False) == False:
existing_tweets[token] = []
existing_tweets[token].extend(id_list)
sorted(existing_tweets[token])
file.close()
#open file handler to write
file = open(file_path, 'w')
file.write(json.dumps(existing_tweets))
file.close()
hash = defaultdict(list)
#returns a file object for a file. if it doesn't exist, it makes it
def open_file(self, file):
if os.path.exists(file):
return open(file, 'r')
open(file, 'w+').close()
return open(file, 'r')
def read_file(self, file):
file_contents = file.read()
if len(file_contents) == 0:
return {}
else:
return json.loads(file_contents)
def _main():
    """Build the full index and print the elapsed wall-clock seconds."""
    indexer = Indexer()
    start = time.time()
    indexer.index()
    elapsed = time.time() - start
    print(elapsed)


# fix: guard the entry point so merely importing this module does not
# trigger a full (and slow) indexing run with file-system side effects.
if __name__ == "__main__":
    _main()