Untitled | a guest | May 3rd, 2010 | Python
import json
import fileinput
import os
import time
import re
from collections import defaultdict


class Indexer:
    def __init__(self):
        # Inverted indexes mapping a token to the list of tweet ids it occurs in.
        self.dictionary = defaultdict(list)
        self.categories = defaultdict(list)
        self.users = defaultdict(list)
        self.tweets = []
        self.tweet_counts = 0
        self.tweets_dir = '../data/tweets/'
        self.categories_dir = '../data/categories/'
        self.users_dir = '../data/users/'
        self.dictionary_dir = '../data/dictionary/'

    def run(self):
        self.index()

    def index(self):
        # Each raw line is assumed to be of the form:
        #   timestamp,message...,user_name,<follower data>,<follower data>
        # so the pops below peel fields off either end of the split line.
        clean_pattern = re.compile('[^a-zA-Z0-9@#]')
        for line in fileinput.input(['../data/raw_tweets.txt']):
            # Flush the in-memory indexes to disk every 1000 tweets.
            if self.tweet_counts and self.tweet_counts % 1000 == 0:
                self.save_data(self.users, self.users_dir)
                self.save_data(self.categories, self.categories_dir)
                self.save_data(self.dictionary, self.dictionary_dir)
            self.tweet_counts += 1
            tweet_array = line.split(",")

            # get rid of follower data
            tweet_array.pop(-1)
            tweet_array.pop(-1)
            user_name = tweet_array.pop(-1)
            display_user_name = user_name
            user_name = user_name.lower()

            # get rid of the timestamp
            tweet_array.pop(0)

            # put the tweet back together, in case the message itself contained commas
            tweet = ','.join(tweet_array)

            tweet_id = self.tweet_counts
            self.users[user_name].append(tweet_id)
            clean_tweet = clean_pattern.sub(' ', tweet)
            tweet = self.tokenize_line(tweet_id, clean_tweet)
            # Prefix the tweet with its author and make the author name clickable.
            tweet = (user_name + '%' + tweet).replace(
                user_name,
                '<a href="/index.php?query=' + user_name + '&type=u">' + display_user_name + '</a>')
            self.tweets.append(tweet)

        # Flush whatever is left over after the loop so the tail of the data is not lost.
        self.save_data(self.users, self.users_dir)
        self.save_data(self.categories, self.categories_dir)
        self.save_data(self.dictionary, self.dictionary_dir)

    def tokenize_line(self, tweet_id, tweet):
        tokens = tweet.split(" ")
        for token in tokens:
            if len(token) == 0:
                continue
            display_token = token
            # Index the lower-cased token so that case conflicts don't emerge
            # when the token is fetched later.
            token = token.lower()

            # replace a #category with a link to that category
            if token[0] == "#":
                token = token[1:]
                tweet = tweet.replace(display_token, '<a href="/index.php?query=' + token + '&type=c">' + display_token + '</a>')
                self.categories[token].append(tweet_id)

            # make an @username clickable
            elif token[0] == "@":
                token = token[1:]
                tweet = tweet.replace(display_token, '<a href="/index.php?query=' + token + '&type=u">' + display_token + '</a>')

            self.dictionary[token].append(tweet_id)
        return tweet

    def get_file_name(self, token, lower, upper):
        # Bucket name for a token's index file: characters [lower, upper] of the
        # token, e.g. get_file_name('python', 0, 1) returns 'py'.
        result = ""
        upper += 1
        if len(token) >= upper:
            result = token[lower:upper]
        elif len(token) > lower:
            result = token[lower]
        return result

    # Merges the contents of the given index into the files under `directory`,
    # loading any existing data first, then resets the in-memory index.
    def save_data(self, index, directory):
        for token in index:
            if len(token) == 0:
                continue
            id_list = index[token]
            file_name = self.get_file_name(token, 0, 1)
            file_path = directory + file_name + '.txt'
            f = self.open_file(file_path)
            existing_tweets = self.read_file(f)
            f.close()
            if token not in existing_tweets:
                existing_tweets[token] = []
            existing_tweets[token].extend(id_list)
            existing_tweets[token].sort()

            # reopen the file to overwrite it with the merged data
            f = open(file_path, 'w')
            f.write(json.dumps(existing_tweets))
            f.close()
        # Clear in place so the caller's dict is actually emptied; rebinding the
        # local name (as the original did) leaves the caller's data intact and
        # would duplicate ids on the next flush.
        index.clear()

    # Returns a read handle for the file, creating it first if it doesn't exist.
    def open_file(self, file_path):
        if os.path.exists(file_path):
            return open(file_path, 'r')
        open(file_path, 'w+').close()
        return open(file_path, 'r')

    def read_file(self, f):
        file_contents = f.read()
        if len(file_contents) == 0:
            return {}
        return json.loads(file_contents)

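# Illustrative sketch (not part of the original paste): one way the JSON bucket
# files written by save_data could be queried back. The function name `lookup`
# and its default directory are assumptions, but the bucket naming mirrors
# get_file_name(token, 0, 1) above.
def lookup(token, directory='../data/dictionary/'):
    token = token.lower()
    bucket = token[0:2] if len(token) >= 2 else token
    file_path = directory + bucket + '.txt'
    if not os.path.exists(file_path):
        return []
    with open(file_path, 'r') as f:
        contents = f.read()
    if len(contents) == 0:
        return []
    # Each bucket file holds one JSON object mapping token -> sorted tweet ids.
    return json.loads(contents).get(token, [])
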
i = Indexer()
start = time.time()
i.index()
end = time.time()
elapsed = end - start
print(elapsed)
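
# Example usage of the lookup sketch above (hypothetical queries):
#   print(lookup('python'))                    # tweet ids whose text contains "python"
#   print(lookup('python', i.categories_dir))  # tweet ids tagged #python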