View difference between Paste ID: Mnmfs7zN and
SHOW:
|
|
- or go back to the newest paste.
1 | - | |
1 | + | import threading |
2 | import json | |
3 | import fileinput | |
4 | import os | |
5 | import sys | |
6 | import time | |
7 | import re | |
8 | from collections import defaultdict | |
class Indexer():
    """Build on-disk inverted indexes (words, hashtags, users) from raw tweets.

    Each usable input line is split on commas and read as::

        <timestamp>,<tweet text ...>,<user name>,<follower field>,<follower field>

    The tweet text may itself contain commas. Tweets are numbered from 1 in
    the order they are read, and that number is the id stored in every index.

    Index entries are buffered in memory and merged into JSON "bucket" files
    (named after the leading characters of each key, see get_file_name) every
    ``save_interval`` tweets and once more when the input is exhausted.
    """

    tweet_counts = 0
    # NOTE(review): tweets_dir is not referenced by any method of this class;
    # the rendered tweets are only kept in memory in self.tweets.
    tweets_dir = '../data/tweets/'
    categories_dir = '../data/categories/'
    users_dir = '../data/users/'
    dictionary_dir = '../data/dictionary/'

    # Default input file used by index() when no path is given.
    raw_tweets_path = '../data/raw_tweets.txt'
    # Number of tweets buffered in memory between two saves.
    save_interval = 1000
    # Everything except letters, digits, '@' and '#' becomes a separator.
    _separator_pattern = re.compile('[^a-zA-Z0-9@#]')

    def __init__(self):
        # Per-instance buffers: as class attributes these containers would be
        # shared (and mutated) by every Indexer instance.
        self.dictionary = defaultdict(list)
        self.categories = defaultdict(list)
        self.users = defaultdict(list)
        self.tweets = []

    def run(self):
        """Index the default raw tweet file."""
        self.index()

    def index(self, path=None):
        """Read tweets from *path* and write the resulting indexes to disk.

        path -- file to read; defaults to ``raw_tweets_path``.

        Lines with fewer than four comma-separated fields (for example blank
        lines) cannot hold a timestamp, a user name and both follower fields,
        so they are skipped and do not consume a tweet id.
        """
        if path is None:
            path = self.raw_tweets_path

        with open(path) as source:
            for line in source:
                fields = line.split(",")
                if len(fields) < 4:
                    continue

                # get rid of follower data (the last two fields)
                del fields[-2:]
                display_user_name = fields.pop()
                user_name = display_user_name.lower()

                # get rid of timestamp
                del fields[0]

                # put the tweet back together, in case the message contained commas
                tweet = ','.join(fields)

                self.tweet_counts += 1
                tweet_id = self.tweet_counts
                self.users[user_name].append(tweet_id)

                clean_tweet = self._separator_pattern.sub(' ', tweet)
                rendered = self.tokenize_line(tweet_id, clean_tweet)

                # Prefix the author link directly. Running str.replace() over
                # the whole rendered tweet would also rewrite any link or word
                # that happens to contain the user name.
                # NOTE(review): the user name is inserted into the HTML as-is
                # (no escaping); confirm the input guarantees a safe alphabet.
                author = self._link(user_name, 'u', display_user_name)
                self.tweets.append(author + '%' + rendered)

                if self.tweet_counts % self.save_interval == 0:
                    self._flush()

        # Persist the final, partially filled batch.
        self._flush()

    def _flush(self):
        """Merge all buffered index entries into their files and empty the buffers."""
        self.save_data(self.users, self.users_dir)
        self.save_data(self.categories, self.categories_dir)
        self.save_data(self.dictionary, self.dictionary_dir)

    @staticmethod
    def _link(query, kind, label):
        """Return the anchor markup that searches for *query* with type *kind*."""
        return ('<a href = "/index.php?query=' + query + '&type=' + kind + '">'
                + label + '</a>')

    def tokenize_line(self, tweet_id, tweet):
        """Index every token of *tweet* and return the tweet with links added.

        tweet_id -- id recorded for each token found in the tweet.
        tweet    -- cleaned tweet text whose tokens are separated by single
                    spaces (consecutive spaces yield empty tokens).

        Tokens are indexed in lower case so lookups are case-insensitive.
        "#name" is recorded in ``categories`` and ``dictionary`` under "name"
        and rendered as a category link; "@name" is recorded in ``dictionary``
        under "name" and rendered as a user link. Other tokens are recorded in
        ``dictionary`` and left untouched. The original spacing is preserved.
        """
        rendered = []
        for display_token in tweet.split(" "):
            if not display_token:
                # Keep empty tokens so that joining restores the spacing.
                rendered.append(display_token)
                continue

            token = display_token.lower()
            marker = token[0]
            if marker == "#":
                token = token[1:]
                rendered.append(self._link(token, 'c', display_token))
                self.categories[token].append(tweet_id)
            elif marker == "@":
                token = token[1:]
                rendered.append(self._link(token, 'u', display_token))
            else:
                rendered.append(display_token)

            self.dictionary[token].append(tweet_id)

        # Rebuilding from the token list (instead of str.replace on the whole
        # tweet) guarantees each token is linked exactly once, even when the
        # same hashtag or mention occurs several times.
        return " ".join(rendered)

    def get_file_name(self, token, lower, upper):
        """Return the bucket name for *token*.

        The result is ``token[lower:upper + 1]`` when the token is long enough
        to reach index *upper*, the single character ``token[lower]`` when the
        token is shorter than that but still has a character at *lower*, and
        an empty string otherwise.
        """
        upper += 1
        if len(token) >= upper:
            return token[lower:upper]
        if len(token) > lower:
            return token[lower]
        return ""

    def save_data(self, hash, directory):
        """Merge the id lists in *hash* into the JSON files under *directory*.

        hash      -- mapping of token -> list of tweet ids; emptied on return
                     so the same ids are never written twice.
        directory -- directory prefix (including the trailing separator) of
                     the bucket files.

        Tokens are grouped by bucket file so each file is read and rewritten
        once per call. Empty tokens are not written. Every stored id list is
        kept sorted.
        """
        pending_by_path = defaultdict(dict)
        for token, id_list in hash.items():
            if not token:
                continue
            file_path = directory + self.get_file_name(token, 0, 1) + '.txt'
            pending_by_path[file_path][token] = id_list

        for file_path, pending in pending_by_path.items():
            with self.open_file(file_path) as handle:
                existing_tweets = self.read_file(handle)

            for token, id_list in pending.items():
                merged = existing_tweets.setdefault(token, [])
                merged.extend(id_list)
                merged.sort()

            with open(file_path, 'w') as handle:
                handle.write(json.dumps(existing_tweets))

        # Clear in place: rebinding the parameter would leave the caller's
        # buffer untouched.
        hash.clear()

    def open_file(self, file):
        """Return *file* opened for reading, creating it (and its directory) if missing."""
        if not os.path.exists(file):
            parent = os.path.dirname(file)
            if parent and not os.path.isdir(parent):
                os.makedirs(parent)
            open(file, 'w').close()
        return open(file, 'r')

    def read_file(self, file):
        """Return the JSON object stored in the open *file*, or {} if it is blank."""
        file_contents = file.read()
        if not file_contents.strip():
            return {}
        return json.loads(file_contents)
121 | ||
122 | ||
123 | ||
def main():
    """Run one full indexing pass and print how many seconds it took."""
    indexer = Indexer()
    started_at = time.time()
    indexer.index()
    # Wall-clock duration of the indexing pass, in seconds.
    print(time.time() - started_at)


if __name__ == "__main__":
    main()