import json
import fileinput
import os
import time
import re
from collections import defaultdict


class Indexer:

    def __init__(self):
        # in-memory posting lists, flushed to disk every 1000 tweets
        self.dictionary = defaultdict(list)  # every token -> [tweet ids]
        self.categories = defaultdict(list)  # hashtags    -> [tweet ids]
        self.users = defaultdict(list)       # user names  -> [tweet ids]
        self.tweets = []                     # rendered tweets, kept in memory
        self.tweet_counts = 0

        self.tweets_dir = '../data/tweets/'
        self.categories_dir = '../data/categories/'
        self.users_dir = '../data/users/'
        self.dictionary_dir = '../data/dictionary/'

        # strips everything except letters, digits, '@' and '#'
        self.clean_pattern = re.compile('[^a-zA-Z0-9@#]')

    def run(self):
        self.index()

    def index(self):
        for line in fileinput.input(['../data/raw_tweets.txt']):
            self.tweet_counts += 1
            # flush the in-memory indexes to disk every 1000 tweets
            if self.tweet_counts % 1000 == 0:
                self.save_data(self.users, self.users_dir)
                self.save_data(self.categories, self.categories_dir)
                self.save_data(self.dictionary, self.dictionary_dir)

            tweet_array = line.split(",")
            # get rid of the follower data (the last two fields)
            tweet_array.pop(-1)
            tweet_array.pop(-1)
            user_name = tweet_array.pop(-1)
            display_user_name = user_name
            user_name = user_name.lower()
            # get rid of the timestamp
            tweet_array.pop(0)
            # put the tweet back together, in case the actual message contained commas
            tweet = ','.join(tweet_array)

            tweet_id = self.tweet_counts
            self.users[user_name].append(tweet_id)

            clean_tweet = self.clean_pattern.sub(' ', tweet)
            tweet = self.tokenize_line(tweet_id, clean_tweet)
            # prefix the tweet with its author and swap in a clickable display
            # name; '%' separates the author from the message, and the
            # '/user/...' URL scheme is an assumption
            tweet = (user_name + '%' + tweet).replace(
                user_name,
                '<a href="/user/' + user_name + '">' + display_user_name + '</a>',
                1)
            self.tweets.append(tweet)

        # flush whatever accumulated after the last full batch of 1000
        self.save_data(self.users, self.users_dir)
        self.save_data(self.categories, self.categories_dir)
        self.save_data(self.dictionary, self.dictionary_dir)

    def tokenize_line(self, tweet_id, tweet):
        tokens = tweet.split(" ")
        for token in tokens:
            if len(token) == 0:
                continue
            display_token = token
            # index the token in lower case; this ensures that case
            # conflicts don't emerge when fetching said token
            token = token.lower()

            # replace the hashtag with a link to its category
            # (the '/category/...' URL scheme is an assumption)
            if token[0] == "#":
                token = token[1:]
                tweet = tweet.replace(
                    display_token,
                    '<a href="/category/' + token + '">' + display_token + '</a>')
                self.categories[token].append(tweet_id)
            # make the user name clickable
            # (the '/user/...' URL scheme is an assumption)
            elif token[0] == "@":
                token = token[1:]
                tweet = tweet.replace(
                    display_token,
                    '<a href="/user/' + token + '">' + display_token + '</a>')

            self.dictionary[token].append(tweet_id)

        return tweet

    # posting lists are bucketed into files named after the token's first
    # one or two characters
    def get_file_name(self, token, lower, upper):
        # slicing clamps to the string length, so a one-character token
        # simply yields a one-character file name
        return token[lower:upper + 1]

    # saves the index content to the given directory after
    # loading and merging any existing data
    def save_data(self, index, directory):
        for token in index:
            if len(token) == 0:
                continue
            id_list = index[token]
            file_name = self.get_file_name(token, 0, 1)
            file_path = directory + file_name + '.txt'

            file = self.open_file(file_path)
            existing_tweets = self.read_file(file)
            file.close()

            if token not in existing_tweets:
                existing_tweets[token] = []
            existing_tweets[token].extend(id_list)
            existing_tweets[token].sort()

            # open a fresh handle to write the merged data back
            file = open(file_path, 'w')
            file.write(json.dumps(existing_tweets))
            file.close()

        # empty the in-memory index so the same ids aren't written twice
        index.clear()

    # returns a read handle for a file; if the file doesn't exist, it is created
    def open_file(self, file):
        if os.path.exists(file):
            return open(file, 'r')
        open(file, 'w+').close()
        return open(file, 'r')

    def read_file(self, file):
        file_contents = file.read()
        if len(file_contents) == 0:
            return {}
        return json.loads(file_contents)


i = Indexer()
start = time.time()
i.index()
end = time.time()
elapsed = end - start
print(elapsed)
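
# The script above only writes the index; a minimal lookup sketch follows,
# assuming posting-list files live under '../data/dictionary/' and are
# bucketed by the token's first two characters, matching
# get_file_name(token, 0, 1). The `lookup` name is hypothetical, not part
# of the original indexer.
def lookup(token, directory='../data/dictionary/'):
    # tokens were indexed in lower case, so normalize the query the same way
    token = token.lower()
    path = directory + token[0:2] + '.txt'
    if not os.path.exists(path):
        return []
    with open(path) as f:
        # each bucket file holds one JSON object: token -> sorted tweet ids
        return json.load(f).get(token, [])

# e.g. lookup('python') reads '../data/dictionary/py.txt' and returns the
# posting list stored under 'python', or [] if the token was never indexed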