Advertisement
HighInBC

Pastebin scraper. Follows API rules, needs whitelisted IP

Jun 14th, 2016
747
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 2.03 KB | None | 0 0
  #!/usr/bin/perl
  #
  #  This is a simple Perl program to scrape pastebin.com.
  #
  #  It saves a copy of every new paste in one file and saves a second
  #  file containing the metadata for that paste.
  #
  #  Offered with no warranties, use at your own risk. It is your
  #  responsibility to make sure you are not disruptive to the API.
  #
  #  The variables $scrape_rate and $fetch_rate can be changed to
  #  adjust the speed. They are currently set to recommended rates.
  #  The variable $data_dir can be changed to where you want the data
  #  stored; this folder must already exist.
  #
  #  This program requires that you whitelist your IP with the API.
  #
  #  Released to the public domain by Ryan Bushby.
  #
  use strict;
  use warnings;
  use utf8;
  use JSON qw( from_json to_json );
  use LWP::Simple;

  # Paste titles are printed to STDOUT as-is; this keeps "Wide character"
  # warnings quiet now that warnings are enabled.
  no warnings 'utf8';

  my $data_dir = 'pastes';

  # Minimum number of seconds between the start of one listing request
  # and the start of the next.
  my $scrape_rate = 60;

  # Number of seconds to pause after each paste download attempt.
  my $fetch_rate = 1;

  -d $data_dir
    or die "Data directory '$data_dir' does not exist, please create it\n";

  # Fetch the listing of recent pastes and decode it.
  #
  # Returns a reference to the array of paste records, or nothing when the
  # request failed or the response was not a JSON array. Failures are
  # reported with warn() so the caller can simply try again next cycle.
  sub fetch_listing {
    my $j_scrape = get('http://pastebin.com/api_scraping.php?limit=100');
    unless ( defined $j_scrape ) {
      warn "Could not fetch the paste listing\n";
      return;
    }

    # from_json dies on malformed input; trap that instead of exiting.
    my $data  = eval { from_json( $j_scrape ) };
    my $error = $@;
    unless ( ref( $data ) eq 'ARRAY' ) {
      $error = "response is not a JSON array\n" unless $error;
      warn "Could not use the paste listing: $error";
      return;
    }

    return $data;
  }

  # Store one paste and its metadata unless it is already cached.
  #
  # Takes one record (hash reference) from the listing. A paste counts as
  # cached once its meta file exists, so the meta file is written only
  # after the content was downloaded successfully.
  #
  # Returns 1 when the paste was saved, 0 when it was already cached, and
  # nothing when the record was unusable or could not be saved.
  sub save_paste {
    my ( $record ) = @_;

    unless ( ref( $record ) eq 'HASH' ) {
      warn "Skipping a listing entry that is not a record\n";
      return;
    }

    my $key  = $record->{key};
    my $date = defined $record->{date} ? $record->{date} : '';

    # Both values come from the network and end up in file names, so
    # refuse anything that could point outside of $data_dir.
    if ( !defined $key || $key eq '' || "$key$date" =~ m{[/\\\0]} ) {
      warn "Skipping a record with an unusable key or date\n";
      return;
    }

    my $filename_content = $data_dir.'/content_'.$date."_$key";
    my $filename_meta    = "$data_dir/meta_$key";

    return 0 if -f $filename_meta;

    print "Saving $key".($record->{'title'} ? " - ".$record->{title} : '')."\n";
    $record->{filename_content} = $filename_content;
    $record->{filename_meta}    = $filename_meta;

    my $status = getstore( $record->{scrape_url}, $filename_content );

    # Pause after every download attempt, successful or not.
    sleep $fetch_rate;

    unless ( is_success( $status ) ) {
      warn "Could not fetch $key (status $status), leaving it uncached\n";
      return;
    }

    my $meta_fh;
    unless ( open( $meta_fh, '>:encoding(UTF-8)', $filename_meta ) ) {
      warn "Could not open $filename_meta for writing: $!\n";
      return;
    }

    my $written = print {$meta_fh} to_json( $record, { pretty => 1 } );

    # Buffered write errors only show up at close time.
    my $closed = close( $meta_fh );
    unless ( $written && $closed ) {
      warn "Could not write $filename_meta: $!\n";

      # A partial meta file would mark the paste as cached; remove it.
      unlink $filename_meta;
      return;
    }

    return 1;
  }

  while( 1 ) {
    my $scrape_time = time() + $scrape_rate;

    if ( my $data = fetch_listing() ) {
      my $overlap = 0;
      my $added   = 0;

      # The listing is walked in reverse of the order the API returned it.
      foreach my $record ( reverse @{$data} ) {
        my $saved = save_paste( $record );
        next unless defined $saved;    # failure was already reported

        if ( $saved ) {
          $added++;
        }
        else {
          $overlap++;
        }
      }

      print( "There were $added records added and $overlap records of overlap\n" );
    }

    # Wait out the rest of the cycle, even after a failed listing request,
    # so that the API is never polled faster than $scrape_rate allows.
    my $time_left = ($scrape_time - time());
    print ( "$time_left seconds to wait\n" ) if ( $time_left > 0 );
    do {
      sleep ( 1 );
    } until ( time() >= $scrape_time );
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement