Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
/**
 * Builds and persists a small search index over a file of raw tweets.
 *
 * Four structures are maintained in memory and stored as JSON under data/:
 *  - tweets:     list of display strings, "<author link>%<tweet with links>";
 *                a tweet's id is its position in this list
 *  - users:      lower-cased author name => list of tweet ids
 *  - categories: hashtag (without '#', original case) => list of tweet ids
 *  - inverted:   lower-cased term => list of tweet ids
 *
 * NOTE(review): tweets are stored HTML-decoded and are not re-escaped, while
 * the stored string itself contains anchor markup. If the consumer prints the
 * stored string verbatim this is an XSS risk - confirm how it is rendered.
 */
class Index
{
    // The indices start out as empty arrays so count() and [] work on PHP 8.
    private $tweets = array();
    private $categories = array();
    private $users = array();
    private $inverted = array();

    // Maps a file name (see the statics below) to a reference to its index.
    private $files_map = array();

    //file names holding the json-encoded data
    public static $indexed_inverted = 'indexed_inverted';
    public static $indexed_tweets = 'indexed_tweets';
    public static $indexed_categories = 'indexed_categories';
    public static $indexed_users = 'indexed_users';

    /**
     * Wires every index file name to the property that holds its data.
     *
     * The entries are references, so reading or assigning through files_map
     * reads or replaces the corresponding property. A PHP4-style constructor
     * named after the class is no longer called by PHP 8, hence __construct.
     */
    public function __construct()
    {
        $this->files_map = array(
            self::$indexed_inverted => &$this->inverted,
            self::$indexed_tweets => &$this->tweets,
            self::$indexed_categories => &$this->categories,
            self::$indexed_users => &$this->users,
        );
    }

    /**
     * Loads previously written indices from data/<name>.txt into memory.
     *
     * A name whose file does not exist yet is skipped and keeps its current
     * in-memory value.
     *
     * @param array $indices_to_fetch file names, each one a key of files_map
     *
     * @throws InvalidArgumentException if the argument is not an array or
     *                                  contains an unknown file name
     * @throws RuntimeException         if an existing file cannot be read or
     *                                  does not contain valid JSON
     */
    public function assignIndices($indices_to_fetch)
    {
        //data validation
        if (!is_array($indices_to_fetch)) {
            throw new InvalidArgumentException('$indices_to_fetch needs to be an array');
        }

        foreach ($indices_to_fetch as $file_name) {
            if (!is_string($file_name) || !array_key_exists($file_name, $this->files_map)) {
                $shown = is_scalar($file_name) ? $file_name : gettype($file_name);
                throw new InvalidArgumentException('Key ' . $shown . ' doesn\'t exist in files_map');
            }

            $file_path = 'data/' . $file_name . '.txt';
            if (!file_exists($file_path)) {
                continue;
            }

            $file_contents = file_get_contents($file_path);
            if ($file_contents === false) {
                throw new RuntimeException('Unable to read ' . $file_path);
            }

            // An empty file is treated as an empty index rather than an error.
            $decoded = array();
            if (trim($file_contents) !== '') {
                // Decode to nested arrays; stdClass objects would make
                // numeric-looking keys unreachable after an (array) cast on
                // older PHP versions.
                $decoded = json_decode($file_contents, true);
                if ($decoded === null && json_last_error() !== JSON_ERROR_NONE) {
                    throw new RuntimeException('Invalid JSON in ' . $file_path);
                }
            }

            // The cast turns a stored JSON null into an empty array.
            $this->files_map[$file_name] = (array) $decoded;
        }
    }

    /**
     * Indexes data/raw_tweets.txt.<num> and writes all four indices to disk.
     *
     * Each line is read as: timestamp, tweet text (may itself contain
     * commas), author name, and two trailing fields that are discarded.
     * Blank lines and lines with fewer than four fields are skipped. New
     * tweets are appended to whatever is already held in memory.
     *
     * @param mixed $num suffix of the raw tweet file to read
     *
     * @throws RuntimeException if the raw file cannot be opened or an index
     *                          file cannot be written
     */
    public function buildIndex($num)
    {
        $raw_path = 'data/raw_tweets.txt.' . $num;
        $read_handle = fopen($raw_path, 'r');
        if ($read_handle === false) {
            throw new RuntimeException('Unable to open ' . $raw_path);
        }

        $count = 0;
        // Reading until fgets() fails avoids the extra, empty iteration that a
        // feof() loop performs after the last line.
        while (($line = fgets($read_handle)) !== false) {
            if (trim($line) === '') {
                continue;
            }

            //divide the tweet into an array, an index will exist per n+1 commas
            $fields = explode(',', $line);
            if (count($fields) < 4) {
                // not enough fields for timestamp + author + trailing data
                continue;
            }

            //lets me know there's progress.. rather than watching an idle console screen
            if ($count % 500 == 0) {
                echo "reached $count\n";
            }

            //get rid of follower data
            array_pop($fields);
            array_pop($fields);
            $user_name = strtolower(array_pop($fields));

            //get rid of timestamp
            array_shift($fields);

            //put the tweet back together, in case the actual message contained commas,
            //and undo the html special characters which appear quite often
            $tweet = htmlspecialchars_decode(implode(',', $fields));

            //the new tweet's id is its position in the tweets list
            $tweet_id = count($this->tweets);

            //attribute the tweet to the author
            if (!isset($this->users[$user_name])) {
                $this->users[$user_name] = array();
            }
            $this->users[$user_name][] = $tweet_id;

            //index only the words from the tweet.
            $clean_tweet = preg_replace('/[^a-zA-Z0-9@&#_\s]/', ' ', $tweet);
            $linked_tweet = $this->tokenizeLine($clean_tweet, $tweet_id, $tweet);

            // Prepend the author link instead of str_replace()-ing the name
            // over the whole string, which also rewrote matches inside the
            // tweet text and inside the generated href attributes.
            $author = '';
            if ($user_name !== '') {
                $author = '<a href = "/index.php?query=' . $user_name . '&type=u">' . $user_name . '</a>';
            }
            $this->tweets[] = $author . '%' . $linked_tweet;

            $count++;
        }
        fclose($read_handle);

        $this->writeToFile(array_keys($this->files_map));
    }

    /**
     * Indexes the words of one tweet and returns the tweet with links added.
     *
     * - "#tag" records the tweet under category "tag" and also indexes "tag"
     *   as an ordinary term.
     * - "@name" is not indexed.
     * - every other token is lower-cased and added to the inverted index.
     * A tweet id is recorded at most once per term and per category.
     *
     * @param string $clean_tweet tweet text with punctuation replaced by spaces
     * @param int    $tweet_id    id under which the tweet is indexed
     * @param string $tweet       tweet text to decorate with links
     *
     * @return string $tweet with hashtags and mentions wrapped in anchors
     */
    private function tokenizeLine($clean_tweet, $tweet_id, $tweet)
    {
        $seen_terms = array();
        $seen_categories = array();

        // Split on any whitespace: the cleaning regex keeps tabs and the like,
        // which a space-only split would leave glued into tokens.
        $tokens = preg_split('/\s+/', $clean_tweet, -1, PREG_SPLIT_NO_EMPTY);
        foreach ($tokens as $token) {
            if ($token[0] === '#') {
                // substr() returns false instead of '' on older PHP versions
                $token = (string) substr($token, 1);
                if ($token === '') {
                    continue; // a lone '#'
                }
                if (!isset($seen_categories[$token])) {
                    $seen_categories[$token] = true;
                    $this->categories[$token][] = $tweet_id;
                }
            }

            //don't bother indexing usernames with the general terms
            if ($token[0] === '@') {
                continue;
            }

            //lower-casing ensures that case-conflicts don't emerge when fetching the term
            $term = strtolower($token);
            if (isset($seen_terms[$term])) {
                continue;
            }
            // Mark the term only after the check above; marking it first made
            // the check always succeed and left lower-case words unindexed.
            $seen_terms[$term] = true;
            $this->inverted[$term][] = $tweet_id;
        }

        // One pass over whole tokens: every occurrence is linked exactly once,
        // and "#php" can no longer be rewritten inside "#phpunit" or inside an
        // anchor generated earlier.
        return preg_replace_callback('/[a-zA-Z0-9@&#_]+/', array($this, 'linkifyToken'), $tweet);
    }

    /**
     * preg_replace_callback() handler that turns "#tag" / "@name" into a link.
     *
     * @param array $match regex match; $match[0] is the whole token
     *
     * @return string the anchor markup, or the token unchanged when it is not
     *                a hashtag or mention
     */
    private function linkifyToken($match)
    {
        $display = $match[0];
        if ($display[0] === '#') {
            $type = 'c';
        } elseif ($display[0] === '@') {
            $type = 'u';
        } else {
            return $display;
        }

        $target = (string) substr($display, 1);
        if ($target === '') {
            return $display;
        }

        // Tokens may contain '&', '#' or '@', which would otherwise cut the
        // query string short.
        return '<a href = "/index.php?query=' . urlencode($target) . '&type=' . $type . '">' . $display . '</a>';
    }

    /**
     * Writes the named indices to data/<name>.txt as JSON.
     *
     * Does nothing when the argument is not an array.
     *
     * @param array $file_name_array file names, each one a key of files_map
     *
     * @throws RuntimeException if an index cannot be encoded or written
     */
    private function writeToFile($file_name_array)
    {
        if (!is_array($file_name_array)) {
            return;
        }

        foreach ($file_name_array as $file_name) {
            //write our indexed data to file in json format
            $file_path = 'data/' . $file_name . '.txt';

            // Encode before touching the file so a failure cannot leave an
            // empty file behind.
            $json = json_encode($this->files_map[$file_name]);
            if ($json === false) {
                throw new RuntimeException('Unable to JSON-encode ' . $file_name);
            }

            // file_put_contents() truncates an existing file, so no unlink()
            // (which warns when the file is missing) is needed first.
            if (file_put_contents($file_path, $json, LOCK_EX) === false) {
                throw new RuntimeException('Unable to write ' . $file_path);
            }
        }
    }
}
- ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement