Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import fileinput
import json
import os
import re
import sys
import threading
import time
from collections import defaultdict
class Indexer:
    """Build sharded inverted indexes from a comma-separated tweet dump.

    Each raw line is split on commas.  The first field is dropped (the
    original comment calls it the timestamp), the last two fields are dropped
    ("follower data"), the third field from the end is the user name, and
    whatever is left in the middle is re-joined with commas as the message.

    State kept in memory while indexing:

    * ``dictionary`` -- lowercased token -> list of tweet ids
    * ``categories`` -- lowercased hashtag (without ``#``) -> list of tweet ids
    * ``users``      -- lowercased user name -> list of tweet ids
    * ``tweets``     -- rendered HTML for every tweet; tweet id ``n`` is stored
      at index ``n - 1``
    * ``tweet_counts`` -- number of tweets indexed so far (also the last id)

    The three id indexes are flushed to JSON shard files every
    ``flush_interval`` tweets and once more when the input is exhausted.
    ``tweets_dir`` is declared but nothing in this class writes to it; the
    rendered tweets only live in ``self.tweets``.
    """

    # Output directories, one per index.
    tweets_dir = '../data/tweets/'
    categories_dir = '../data/categories/'
    users_dir = '../data/users/'
    dictionary_dir = '../data/dictionary/'

    # Input used by index() when no explicit path is given.
    raw_tweets_path = '../data/raw_tweets.txt'

    # Number of indexed tweets between two flushes to disk.
    flush_interval = 1000

    # Everything except letters, digits, '@' and '#' is turned into a space
    # before tokenizing.  Compiled once instead of once per input line.
    _NON_TOKEN_CHARS = re.compile('[^a-zA-Z0-9@#]')

    def __init__(self):
        # Per-instance containers: as class attributes they were shared (and
        # silently accumulated) across every Indexer instance.
        self.dictionary = defaultdict(list)
        self.categories = defaultdict(list)
        self.users = defaultdict(list)
        self.tweets = []
        self.tweet_counts = 0

    def run(self):
        """Index the default raw tweet file (alias for ``index()``)."""
        self.index()

    def index(self, raw_path=None):
        """Index every tweet in *raw_path* and flush the indexes to disk.

        Args:
            raw_path: Path of the raw tweet dump.  Defaults to
                ``self.raw_tweets_path``.

        Malformed lines (fewer than four comma-separated fields, e.g. blank
        lines) are skipped and do not consume a tweet id.
        """
        if raw_path is None:
            raw_path = self.raw_tweets_path

        # errors='replace': a single undecodable byte must not abort the run;
        # such characters are stripped by _NON_TOKEN_CHARS anyway.
        with open(raw_path, encoding='utf-8', errors='replace') as raw_file:
            for line in raw_file:
                indexed = self._index_line(line)
                if indexed and self.tweet_counts % self.flush_interval == 0:
                    self._flush()

        # Persist whatever was collected since the last periodic flush.
        self._flush()

    def _index_line(self, line):
        """Parse one raw line into the in-memory indexes; False if malformed."""
        fields = line.split(',')
        # Need at least: timestamp, user name and the two trailing fields.
        if len(fields) < 4:
            return False

        display_user_name = fields[-3].strip()
        user_name = display_user_name.lower()
        # The message itself may contain commas, so re-join the middle part.
        message = ','.join(fields[1:-3])

        self.tweet_counts += 1
        tweet_id = self.tweet_counts
        self.users[user_name].append(tweet_id)

        clean_tweet = self._NON_TOKEN_CHARS.sub(' ', message)
        rendered = self.tokenize_line(tweet_id, clean_tweet)

        # Build the author link directly.  Running str.replace(user_name, ...)
        # over the whole string also rewrote the name inside the message and
        # inside hrefs that tokenize_line had just generated.
        author = self._link(user_name, 'u', display_user_name)
        self.tweets.append(author + '%' + rendered)
        return True

    def _flush(self):
        """Write all three id indexes to their shard directories."""
        self.save_data(self.users, self.users_dir)
        self.save_data(self.categories, self.categories_dir)
        self.save_data(self.dictionary, self.dictionary_dir)

    @staticmethod
    def _link(query, link_type, text):
        """Return the anchor markup pointing at /index.php for *query*."""
        return ('<a href = "/index.php?query=' + query
                + '&type=' + link_type + '">' + text + '</a>')

    def tokenize_line(self, tweet_id, tweet):
        """Index the tokens of *tweet* and return it with tags/mentions linked.

        Args:
            tweet_id: Id recorded in the indexes for every token found.
            tweet: Cleaned message text; tokens are separated by single spaces.

        Returns:
            The message with each ``#tag`` and ``@name`` token replaced by an
            anchor (``type=c`` and ``type=u`` respectively).  Spacing is
            preserved exactly.

        Every token is indexed lowercased in ``self.dictionary`` (hashtags and
        mentions without their marker); hashtags are additionally indexed in
        ``self.categories``.
        NOTE(review): the pasted source lost its indentation; indexing *all*
        tokens in the dictionary (not only mentions) is the assumed intent.
        """
        rendered = []
        for display_token in tweet.split(' '):
            token = display_token.lower()
            marker = token[:1]
            if marker in ('#', '@'):
                # Strip only the marker; the old slice also cut the last char.
                token = token[1:]

            if not token:
                # Gap between consecutive spaces, or a bare '#'/'@': keep the
                # text as is and index nothing.
                rendered.append(display_token)
                continue

            # Each token is rendered on its own.  A global str.replace would
            # also hit substrings of other tokens and already inserted links.
            if marker == '#':
                rendered.append(self._link(token, 'c', display_token))
                self.categories[token].append(tweet_id)
            elif marker == '@':
                rendered.append(self._link(token, 'u', display_token))
            else:
                rendered.append(display_token)

            self.dictionary[token].append(tweet_id)

        return ' '.join(rendered)

    def get_file_name(self, token, lower, upper):
        """Return the shard name for *token*.

        Returns ``token[lower:upper + 1]`` when the token is long enough,
        the single character ``token[lower]`` when it is shorter but still
        reaches *lower*, and ``""`` otherwise (previously an IndexError).
        """
        end = upper + 1
        if len(token) >= end:
            return token[lower:end]
        if len(token) > lower:
            return token[lower]
        return ""

    def save_data(self, hash, directory):
        """Merge the id lists in *hash* into the JSON shards under *directory*.

        Args:
            hash: Mapping of token -> list of tweet ids.  It is emptied after a
                successful save so the same ids are not written again by the
                next flush.
            directory: Target directory; created if it does not exist.

        Tokens are grouped by shard (see ``get_file_name``) so each shard file
        is read and rewritten once per call.  Empty tokens are skipped.
        """
        shards = defaultdict(dict)
        for token, id_list in hash.items():
            if not token:
                continue
            shards[self.get_file_name(token, 0, 1)][token] = id_list

        if shards and directory:
            os.makedirs(directory, exist_ok=True)

        for file_name, pending in shards.items():
            file_path = os.path.join(directory, file_name + '.txt')

            with self.open_file(file_path) as shard_file:
                postings = self.read_file(shard_file)

            for token, id_list in pending.items():
                merged = postings.setdefault(token, [])
                merged.extend(id_list)
                # In-place sort; sorted() returned a copy that was discarded.
                merged.sort()

            with open(file_path, 'w', encoding='utf-8') as shard_file:
                shard_file.write(json.dumps(postings))

        # Rebinding the parameter (hash = defaultdict(list)) never affected the
        # caller's mapping; clear it in place instead.
        hash.clear()

    def open_file(self, file):
        """Return a read handle for path *file*, creating an empty file first
        if it does not exist.  The caller is responsible for closing it."""
        if not os.path.exists(file):
            with open(file, 'w', encoding='utf-8'):
                pass
        return open(file, 'r', encoding='utf-8')

    def read_file(self, file):
        """Return the JSON object stored in the open file *file*, or ``{}``
        when the file is empty or contains only whitespace."""
        file_contents = file.read()
        if not file_contents.strip():
            return {}
        return json.loads(file_contents)
# Run a full indexing pass and print how long it took, in seconds.
# Guarded so that importing this module does not start indexing.
if __name__ == "__main__":
    indexer = Indexer()
    # perf_counter is monotonic, so the measurement is unaffected by
    # system clock adjustments during the run.
    start = time.perf_counter()
    indexer.index()
    elapsed = time.perf_counter() - start
    print(elapsed)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement