Advertisement
Guest User

Untitled

a guest
Apr 10th, 2010
210
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 4.68 KB | None | 0 0
  1. <?php
  2. class Index
  3. {
  4.     private $tweets;
  5.     private $categories;
  6.     private $users;
  7.     private $inverted;
  8.     private $files_map;
  9.    
  10.     //file names holding the json-encoded data
  11.     public static $indexed_inverted = 'indexed_inverted';
  12.     public static $indexed_tweets = 'indexed_tweets';
  13.     public static $indexed_categories = 'indexed_categories';
  14.     public static $indexed_users = 'indexed_users';
  15.    
  16.     function Index()
  17.     {
  18.         $this->files_map = array(Index::$indexed_inverted   => &$this->inverted,
  19.                                  Index::$indexed_tweets     => &$this->tweets,  
  20.                                  Index::$indexed_categories => &$this->categories,
  21.                                  Index::$indexed_users      => &$this->users); 
  22.     }
  23.    
  24.     function assignIndices($indices_to_fetch)
  25.     {
  26.         //data validation
  27.         if(!is_array($indices_to_fetch))
  28.         {
  29.             throw new Excepton('$indices_to_fetch needs to be an array');
  30.         }
  31.        
  32.         foreach($indices_to_fetch as $file_name)
  33.         {
  34.             if(!array_key_exists($file_name, $this->files_map))
  35.             {
  36.                 throw new Exception('Key ' . $file_name . ' doesn\t exist in files_map');
  37.             }
  38.             $file_path = 'data/'.$file_name.'.txt';
  39.        
  40.             //pre existing data.. exists
  41.             if(file_exists($file_path))
  42.             {
  43.                 $file_contents = file_get_contents($file_path);
  44.                 $this->files_map[$file_name] = (array) json_decode($file_contents);
  45.             }
  46.         }
  47.     }
  48.    
  49.     function buildIndex($num)
  50.     {
  51.         $read_handle = fopen('data/raw_tweets.txt.'.$num.'', 'r');
  52.         $count = 0;
  53.         while(!feof($read_handle))
  54.         {
  55.             //divide the tweet into an array, an index will exist per n+1 commas
  56.             $current_tweet_array = explode(',', fgets($read_handle));
  57.            
  58.             //lets me know there's progress.. rather than watching an idle console screen
  59.             if($count % 500 == 0)
  60.                 echo "reached $count\n";
  61.            
  62.             //get rid of follower data
  63.             array_pop($current_tweet_array);
  64.             array_pop($current_tweet_array);
  65.            
  66.             $user_name = strtolower(array_pop($current_tweet_array));
  67.            
  68.             //get rid of timestamp
  69.             array_shift($current_tweet_array);
  70.            
  71.             //put the tweet back together, incase the actual message contained commas
  72.             $tweet = implode(",", $current_tweet_array);
  73.            
  74.             //undo the html special characters which appear quite often
  75.             $tweet = htmlspecialchars_decode($tweet);
  76.            
  77.             //the current tweet's id is the previous tweets position in the array
  78.             $tweet_id = count($this->tweets);
  79.            
  80.             //attribute the tweet to the author
  81.             if(!isset($this->users[$user_name]))
  82.             {
  83.                 $this->users[$user_name] = array();
  84.             }
  85.             $this->users[$user_name][] = $tweet_id;
  86.            
  87.             //index only the words from the tweet.
  88.             $clean_tweet =  preg_replace('/[^a-zA-Z0-9@&#_\s]/', ' ', $tweet);
  89.             $tweet = $this->tokenizeLine($clean_tweet, $tweet_id, $tweet);
  90.             $tweet = str_replace($user_name, '<a href = "/index.php?query='.$user_name.'&type=u">'. $user_name .'</a>', ($user_name . '%' . $tweet));
  91.             $this->tweets[] = $tweet;
  92.             $count++;
  93.         }
  94.         $this->writeToFile(array_keys($this->files_map));
  95.     }
  96.    
  97.     private function tokenizeLine($clean_tweet, $tweet_id, $tweet)
  98.     {
  99.         $token = strtok($clean_tweet, " ");
  100.         $words_in_place = array();
  101.         while($token !== false)
  102.         {
  103.             $words_in_place[$token] = 1;
  104.             //replace the category with a link to the categories
  105.             if($token[0] == '#')
  106.             {
  107.                 $token_display = $token;
  108.                 $token = substr($token, 1);
  109.                 $tweet = str_replace($token_display, '<a href = "/index.php?query='.$token.'&type=c">'. $token_display .'</a>', $tweet);
  110.                 if(!isset($this->categories[$token]))
  111.                     $this->categories[$token] = array();
  112.                 $this->categories[$token][] = $tweet_id;
  113.             }  
  114.            
  115.             //don't bother indexing usernames with the general terms, but make it clickable
  116.             if($token[0] == '@')
  117.             {  
  118.                 $token_display = $token;
  119.                 $token = substr($token, 1);
  120.                 $tweet = str_replace($token_display, '<a href = "/index.php?query='.$token.'&type=u">'. $token_display .'</a>', $tweet);
  121.                 $token = strtok(' ');
  122.                 continue;
  123.             }
  124.  
  125.             //index the token after putting it to lower case, this ensures, that case-conflicts don't emerge when fetching said token
  126.             $token = strtolower($token);
  127.             if(!isset($this->inverted[$token]))
  128.             {
  129.                 $this->inverted[$token] = array();
  130.             }
  131.            
  132.             if(!isset($words_in_place[$token]))
  133.             {
  134.                 $this->inverted[$token][] = $tweet_id;
  135.             }
  136.             $token = strtok(' ');
  137.         }
  138.         return $tweet;
  139.     }
  140.    
  141.     private function writeToFile($file_name_array)
  142.     {
  143.         if(is_array($file_name_array))
  144.         {
  145.             foreach($file_name_array as $file_name)
  146.             {
  147.                 //write our indexed data to file in json format
  148.                 $file_path = 'data/'.$file_name.'.txt';
  149.                 unlink($file_path);
  150.                 $write_handle = fopen($file_path, 'w');
  151.                 fwrite($write_handle, json_encode($this->files_map[$file_name]));
  152.                 fclose($write_handle);
  153.             }
  154.         }
  155.     }
  156. }
  157. ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement