View difference between Paste ID: Mnmfs7zN and
SHOW: | | - or go back to the newest paste.
1-
1+
import threading
2
import json
3
import fileinput
4
import os
5
import sys
6
import time
7
import re
8
from collections import defaultdict
9
class Indexer ():
    """Build on-disk inverted indexes from a comma-separated tweet dump.

    Each input line is read as ``timestamp,message...,user,x,y`` where the
    last two fields are discarded (the original comments call them
    "follower data") and the message itself may contain commas.

    Three indexes are maintained, each mapping a lower-cased key to the
    list of tweet ids it occurs in:

    * ``users``      - author name -> tweet ids
    * ``categories`` - hashtag (without ``#``) -> tweet ids
    * ``dictionary`` - every token (hashtags/mentions without their
      prefix) -> tweet ids

    The indexes are buffered in memory and periodically merged into JSON
    files under ``users_dir`` / ``categories_dir`` / ``dictionary_dir``;
    the file is chosen from the first two characters of the key.
    Rendered tweets (with HTML links) are only accumulated in
    ``self.tweets``; nothing in this class writes them to ``tweets_dir``.
    """

    # Class-level defaults are kept so that ``Indexer.<attr>`` keeps
    # working; ``__init__`` shadows the mutable ones per instance.
    dictionary = defaultdict(list)
    categories = defaultdict(list)
    users = defaultdict(list)
    tweets = []
    tweet_counts = 0
    tweets_dir = '../data/tweets/'
    categories_dir = '../data/categories/'
    users_dir = '../data/users/'
    dictionary_dir = '../data/dictionary/'
    # Input file and flush cadence; override on the instance if needed.
    raw_tweets_file = '../data/raw_tweets.txt'
    flush_interval = 1000

    # Anything that is not alphanumeric, '@' or '#' becomes a separator.
    # Compiled once instead of once per tweet.
    _non_token_chars = re.compile('[^a-zA-Z0-9@#]')

    def __init__(self):
        # Give every instance its own buffers; the class-level containers
        # would otherwise be shared (and mutated) by all instances.
        self.dictionary = defaultdict(list)
        self.categories = defaultdict(list)
        self.users = defaultdict(list)
        self.tweets = []
        self.tweet_counts = 0

    def run (self):
        """Alias for :meth:`index`."""
        self.index()

    def index(self):
        """Read ``raw_tweets_file`` and index every well-formed line.

        Buffers are merged to disk every ``flush_interval`` tweets and
        once more after the last line, so a trailing partial batch is
        not lost. Lines with fewer than four comma-separated fields
        cannot hold timestamp, user and the two trailing fields and are
        skipped without consuming a tweet id.
        """
        source = fileinput.input([self.raw_tweets_file])
        try:
            for line in source:
                fields = line.split(",")
                if len(fields) < 4:
                    continue

                # fields[0] is the timestamp, fields[-2:] the discarded
                # trailing data, fields[-3] the author. Everything in
                # between is the message, re-joined in case it had commas.
                display_user_name = fields[-3]
                user_name = display_user_name.lower()
                message = ','.join(fields[1:-3])

                self.tweet_counts += 1
                tweet_id = self.tweet_counts
                self.users[user_name].append(tweet_id)

                clean_tweet = self._non_token_chars.sub(' ', message)
                body = self.tokenize_line(tweet_id, clean_tweet)

                # Link only the author prefix. Running str.replace over the
                # whole rendered tweet would also rewrite matches inside
                # the message and inside already generated markup.
                # NOTE(review): user_name comes straight from the input
                # and is not HTML/URL escaped here.
                user_link = ('<a href = "/index.php?query=' + user_name +
                             '&type=u">' + display_user_name + '</a>')
                self.tweets.append(user_link + '%' + body)

                if self.tweet_counts % self.flush_interval == 0:
                    self._flush()
        finally:
            source.close()
        # Persist whatever is left from the last partial batch.
        self._flush()

    def _flush(self):
        """Merge all three in-memory indexes into their directories."""
        self.save_data(self.users, self.users_dir)
        self.save_data(self.categories, self.categories_dir)
        self.save_data(self.dictionary, self.dictionary_dir)

    def tokenize_line(self, tweet_id, tweet):
        """Index the tokens of ``tweet`` and return it with links added.

        ``tweet`` is split on single spaces. Every non-empty token is
        lower-cased and recorded in ``self.dictionary``; ``#tag`` tokens
        are also recorded in ``self.categories``. Hashtags and
        ``@mentions`` are indexed without their prefix and wrapped in an
        ``<a>`` link, keeping the original spelling as the link text.

        The result is rebuilt token by token, so a link is applied to
        exactly the token that produced it and never to substrings of
        other tokens or to markup inserted earlier.
        """
        rendered = []
        for display_token in tweet.split(" "):
            if not display_token:
                # Keep empty pieces so runs of spaces survive the re-join.
                rendered.append(display_token)
                continue

            # Lower-case before indexing so look-ups are case-insensitive.
            token = display_token.lower()
            prefix = token[0]
            if prefix != "#" and prefix != "@":
                rendered.append(display_token)
                self.dictionary[token].append(tweet_id)
                continue

            # Drop only the leading '#'/'@'; the last character belongs
            # to the name.
            name = token[1:]
            if not name:
                # A bare '#' or '@' names nothing: no link, no index entry.
                rendered.append(display_token)
                continue

            if prefix == "#":
                self.categories[name].append(tweet_id)
                link_type = 'c'
            else:
                link_type = 'u'
            rendered.append('<a href = "/index.php?query=' + name +
                            '&type=' + link_type + '">' +
                            display_token + '</a>')
            self.dictionary[name].append(tweet_id)
        return " ".join(rendered)

    def get_file_name(self, token, lower, upper):
        """Return the part of ``token`` used to pick its index file.

        ``upper`` is an inclusive index. If the token reaches ``upper``
        the slice ``token[lower:upper + 1]`` is returned; if it is
        shorter but still has a character at ``lower`` that single
        character is returned; otherwise the result is ``""``.
        """
        upper += 1
        if len(token) >= upper:
            return token[lower:upper]
        # Strictly greater: token[lower] does not exist when
        # len(token) == lower.
        if len(token) > lower:
            return token[lower]
        return ""

    def save_data(self, hash, directory):
        """Merge the buffered index ``hash`` into the files in ``directory``.

        Keys are grouped by target file so each file is read and written
        once per call. Ids are appended to whatever the file already
        holds for that key and the list is sorted in place. Empty keys
        are skipped. ``hash`` is emptied afterwards so the same ids are
        not written again by the next flush.
        """
        pending_by_file = defaultdict(dict)
        for token in hash:
            if len(token) == 0:
                continue
            file_name = self.get_file_name(token, 0, 1)
            pending_by_file[file_name][token] = hash[token]

        for file_name in pending_by_file:
            file_path = directory + file_name + '.txt'
            handle = self.open_file(file_path)
            try:
                existing_tweets = self.read_file(handle)
            finally:
                handle.close()

            pending = pending_by_file[file_name]
            for token in pending:
                id_list = existing_tweets.setdefault(token, [])
                id_list.extend(pending[token])
                id_list.sort()

            with open(file_path, 'w') as out:
                out.write(json.dumps(existing_tweets))

        # Clear in place: rebinding the parameter would leave the
        # caller's buffer untouched.
        hash.clear()

    def open_file(self, file):
        """Open ``file`` for reading, creating it (and its directory) first.

        A missing file is created empty so callers can treat "no index
        yet" and "empty index" the same way.
        """
        if not os.path.exists(file):
            parent = os.path.dirname(file)
            if parent and not os.path.isdir(parent):
                os.makedirs(parent)
            open(file, 'w+').close()
        return open(file, 'r')

    def read_file(self, file):
        """Return the JSON object stored in the open ``file``.

        An empty file yields an empty dict.
        """
        file_contents = file.read()
        if not file_contents:
            return {}
        return json.loads(file_contents)
121
			
122
			
123
		
124
def main():
    """Run a full indexing pass and print the elapsed wall-clock seconds."""
    indexer = Indexer()
    started_at = time.time()
    indexer.index()
    finished_at = time.time()
    # Only the index() call is timed; constructing the Indexer is excluded.
    print (finished_at - started_at)
131
132
# Run the indexer only when this file is executed as a script.
if __name__ == "__main__":
    main()