# Untitled

import threading
import json
import fileinput
import os
import sys
import time
import re
from collections import defaultdict
class Indexer ():
    """Build on-disk inverted indexes from a comma-separated raw tweet dump.

    Every input line is parsed as
    ``timestamp,message...,user_name,<field>,<field>`` where the message may
    itself contain commas and the two trailing fields (follower data) are
    discarded. Three indexes are maintained, each mapping a lower-cased key
    to the list of ids of the tweets it occurs in:

    * ``dictionary`` -- every token of the message (hashtags and mentions are
      stored without their leading ``#`` / ``@``),
    * ``categories`` -- hashtags only,
    * ``users``      -- the author of each tweet.

    While indexing, the three indexes are periodically merged into JSON files
    (one file per key prefix, see ``get_file_name``) inside the matching
    ``*_dir`` directory and then emptied. ``tweets`` collects the HTML
    rendering of every tweet in memory.
    """

    # Class-level defaults are kept so code reading them off the class keeps
    # working; __init__ gives every instance its own containers so that two
    # indexers never share state by accident.
    dictionary = defaultdict(list)
    categories = defaultdict(list)
    users = defaultdict(list)
    tweets = []
    tweet_counts = 0
    # NOTE(review): tweets_dir is not used anywhere in this class and the
    # rendered tweets are never written to disk here -- confirm whether
    # persisting them is expected.
    tweets_dir = '../data/tweets/'
    categories_dir = '../data/categories/'
    users_dir = '../data/users/'
    dictionary_dir = '../data/dictionary/'
    # Default input file and number of tweets indexed between two flushes.
    raw_tweets_file = '../data/raw_tweets.txt'
    flush_interval = 1000

    # Everything except letters, digits, '@' and '#' separates tokens.
    _NON_TOKEN_CHARS = re.compile('[^a-zA-Z0-9@#]')

    def __init__(self):
        self.dictionary = defaultdict(list)
        self.categories = defaultdict(list)
        self.users = defaultdict(list)
        self.tweets = []
        self.tweet_counts = 0

    def run (self):
        """Index the default raw tweet file (alias for ``index()``)."""
        self.index()

    def index(self, source=None):
        """Read raw tweets from *source* and index them.

        source -- path of the raw tweet file; defaults to
                  ``self.raw_tweets_file``.

        Tweet ids are 1-based and follow the order of the indexed lines.
        The in-memory indexes are flushed to disk every ``flush_interval``
        tweets and once more when the input is exhausted, so the tail of the
        file is not lost.
        """
        if source is None:
            source = self.raw_tweets_file

        with open(source) as raw_tweets:
            for line in raw_tweets:
                fields = line.split(",")
                # A usable line needs a timestamp, a user name and the two
                # follower fields; blank or truncated lines are skipped
                # instead of aborting the whole run.
                if len(fields) < 4:
                    continue

                display_user_name = fields[-3]
                user_name = display_user_name.lower()
                # Drop the timestamp (first field) and the three trailing
                # fields, then re-join in case the message contained commas.
                tweet = ','.join(fields[1:-3])

                self.tweet_counts += 1
                tweet_id = self.tweet_counts
                self.users[user_name].append(tweet_id)

                clean_tweet = self._NON_TOKEN_CHARS.sub(' ', tweet)
                tweet = self.tokenize_line(tweet_id, clean_tweet)
                # Prefix the author link directly; running a str.replace over
                # the whole string would also rewrite the author's name
                # wherever it occurs inside the message and its links.
                # NOTE(review): user_name comes straight from the input and is
                # not HTML/URL escaped here.
                author_link = self._link(user_name, 'u', display_user_name)
                self.tweets.append(author_link + '%' + tweet)

                if self.tweet_counts % self.flush_interval == 0:
                    self._flush()

        self._flush()

    def _flush(self):
        """Merge the three in-memory indexes into their directories."""
        self.save_data(self.users, self.users_dir)
        self.save_data(self.categories, self.categories_dir)
        self.save_data(self.dictionary, self.dictionary_dir)

    def _link(self, query, kind, text):
        """Return the anchor markup pointing at the search page for *query*."""
        return '<a href = "/index.php?query=' + query + '&type=' + kind + '">' + text + '</a>'

    def tokenize_line(self, tweet_id, tweet):
        """Index the tokens of *tweet* and return it with tags/mentions linked.

        tweet_id -- id recorded for every token found.
        tweet    -- message text, tokens separated by single spaces.

        Tokens are indexed lower-cased so lookups are case-insensitive.
        ``#tag`` tokens are recorded in ``categories`` and ``dictionary``
        (without the ``#``); ``@name`` tokens are recorded in ``dictionary``
        (without the ``@``). Both are replaced in the returned text by a link
        that still displays the original token. A lone ``#`` or ``@`` is left
        untouched and not indexed.
        """
        rendered = []
        # The text is rebuilt token by token: a global str.replace would also
        # hit text inside links inserted earlier and tokens that merely share
        # a prefix (e.g. "#py" inside "#python").
        for display_token in tweet.split(" "):
            if not display_token:
                # Empty pieces come from consecutive spaces; keep them so the
                # original spacing survives the final join.
                rendered.append(display_token)
                continue

            token = display_token.lower()
            marker = token[0]
            if marker == "#" or marker == "@":
                token = token[1:]
                if not token:
                    rendered.append(display_token)
                    continue
                if marker == "#":
                    rendered.append(self._link(token, 'c', display_token))
                    self.categories[token].append(tweet_id)
                else:
                    rendered.append(self._link(token, 'u', display_token))
            else:
                rendered.append(display_token)

            self.dictionary[token].append(tweet_id)
        return ' '.join(rendered)

    def get_file_name(self, token, lower, upper):
        """Return the slice of *token* used to name its index file.

        Returns ``token[lower:upper + 1]`` when the token is long enough,
        otherwise the single character ``token[lower]``, otherwise ``""``
        (token too short to have a character at *lower*).
        """
        upper += 1
        if len(token) >= upper:
            return token[lower:upper]
        if len(token) > lower:
            return token[lower]
        return ""

    #saves the hash content to the files below the given directory after
    #loading any existing data
    def save_data(self, hash, directory):
        """Merge the index *hash* into the JSON files in *directory*.

        hash      -- mapping of token -> list of tweet ids. It is emptied
                     in place once written, so the same ids are not appended
                     again by the next flush.
        directory -- existing directory holding one ``<prefix>.txt`` JSON
                     file per token prefix (first two characters).

        Ids already stored for a token are kept; the new ones are appended
        and the list is sorted. Empty tokens are ignored.
        """
        # Group tokens by target file so each file is read and written once.
        by_file = defaultdict(dict)
        for token in hash:
            if len(token) == 0:
                continue
            by_file[self.get_file_name(token, 0, 1)][token] = hash[token]

        for file_name in by_file:
            file_path = os.path.join(directory, file_name + '.txt')
            with self.open_file(file_path) as existing_file:
                existing_tweets = self.read_file(existing_file)

            for token, id_list in by_file[file_name].items():
                merged = existing_tweets.setdefault(token, [])
                merged.extend(id_list)
                merged.sort()

            with open(file_path, 'w') as out_file:
                out_file.write(json.dumps(existing_tweets))

        hash.clear()

    #returns a file object for a file. if it doesn't exist, it makes it
    def open_file(self, file):
        """Return *file* opened for reading, creating it empty if missing."""
        if not os.path.exists(file):
            # Append mode creates the file without ever truncating content.
            open(file, 'a').close()
        return open(file, 'r')

    def read_file(self, file):
        """Return the JSON object stored in the open *file*, ``{}`` if empty."""
        file_contents = file.read()
        if len(file_contents) == 0:
            return {}
        return json.loads(file_contents)

# Script entry: index the default raw tweet file and print the elapsed
# wall-clock time in seconds.
i = Indexer()
start = time.time()
i.run()  # run() simply delegates to index()
end = time.time()
elapsed = end - start
print(elapsed)