Advertisement
ZaynerTech

Scraper Aaron Swartz

Jan 17th, 2013
264
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 3.87 KB | None | 0 0
  #!/usr/bin/perl
  # Scraper for Science Magazine.
  #
  # Walks issue numbers downwards from $NEWEST_ISSUE, saving for each issue:
  #   * the table of contents page (in the current directory),
  #   * every "Full Text (PDF)" paper     -> <volume>/<issue>/<file>,
  #   * every supplementary-material page -> mirrored under ./<site path>,
  #   * supplements, movies and tables linked from those pages.
  #
  # Requires the external `curl` binary; line matching and directory creation
  # are done in Perl, so `grep` and `mkdir` are no longer needed.
  #
  # Everything taken from a downloaded page is untrusted: commands are run
  # without a shell (list-form system) and links are validated before they
  # are used as local paths.
  #
  # NOTE(review): the crawler identifies itself as "GoogleBot" and downloads
  # in bulk; confirm this is permitted by the site's terms before running.

  use strict;
  use warnings;

  use File::Basename qw(dirname);
  use File::Path qw(make_path);

  my $BASE_URL      = 'http://www.sciencemag.org';
  my $USER_AGENT    = 'GoogleBot';
  my $NEWEST_ISSUE  = 6116;    # issue number to start from
  my $NEWEST_VOLUME = 339;     # volume that contains $NEWEST_ISSUE
  my $DELAY_SECONDS = 5;       # pause per issue: let us not DDoS them

  # Download $url to $dest with curl. @curl_opts are extra curl options placed
  # right after -s. Returns true when curl exits with status 0.
  sub fetch {
      my ($url, $dest, @curl_opts) = @_;

      # List form: no shell is involved, so metacharacters in scraped links
      # cannot turn into commands.
      my $status = system('curl', '-s', @curl_opts, '-A', $USER_AGENT,
                          $url, '-o', $dest);
      return $status == 0;
  }

  # Create $dir and any missing parents; an existing directory is not an
  # error. Failures are reported with warn and the crawl carries on.
  # Returns true when $dir exists afterwards.
  sub ensure_dir {
      my ($dir) = @_;

      make_path($dir, { error => \my $errors });
      for my $diag (@{ $errors || [] }) {
          my ($path, $message) = %{$diag};
          warn "cannot create $path: $message\n";
      }
      return -d $dir;
  }

  # Return the lines of @$lines that contain the literal string $needle, each
  # preceded by up to $before lines of context (like `grep -B`, but without
  # the "--" group separators). Original order is kept, no line is repeated.
  sub select_lines {
      my ($lines, $needle, $before) = @_;
      $before = 0 unless defined $before;

      my %keep;
      for my $i (0 .. $#{$lines}) {
          next if index($lines->[$i], $needle) < 0;
          my $first = $i - $before;
          $first = 0 if $first < 0;
          $keep{$_} = 1 for $first .. $i;
      }
      return map { $lines->[$_] } sort { $a <=> $b } keys %keep;
  }

  # select_lines() applied to the file at $path. An unreadable or missing
  # file yields an empty list.
  sub matching_lines {
      my ($path, $needle, $before) = @_;

      open my $fh, '<', $path or return;
      my @lines = <$fh>;
      close $fh;
      return select_lines(\@lines, $needle, $before);
  }

  # True when the saved page at $path contains the "Content not found" marker.
  sub is_not_found_page {
      my ($path) = @_;
      my @hits = matching_lines($path, 'Content not found');
      return scalar @hits;
  }

  # Return the $index-th piece of $line when split on double quotes, or undef.
  sub quoted_field {
      my ($line, $index) = @_;
      return (split /"/, $line)[$index];
  }

  # Pick the link out of a table-of-contents line. Lines mentioning "last"
  # carry an extra quoted attribute first, so the link is the fourth piece;
  # otherwise it is the second.
  sub link_from_line {
      my ($line) = @_;
      return quoted_field($line, $line =~ /last/ ? 3 : 1);
  }

  # True when $link is a site-relative path ("/...") with no whitespace and no
  # ".." segment, i.e. safe to append to $BASE_URL and to mirror locally.
  sub is_site_path {
      my ($link) = @_;

      return 0 unless defined $link;
      return 0 unless $link =~ m{\A/\S+\z};
      return 0 if grep { $_ eq '..' } split m{/}, $link;
      return 1;
  }

  # Local file name for a "Download Supplement" link: the eighth path segment
  # when the link mentions "science", otherwise "Supp.<third dot-separated
  # piece>.pdf". Returns undef when that piece is missing.
  sub supplement_filename {
      my ($link) = @_;

      return (split m{/}, $link)[7] if $link =~ /science/;

      my @pieces = split /\./, $link;
      return unless defined $pieces[2];
      return "Supp.$pieces[2].pdf";
  }

  # Download the movie/table linked from $line into a local mirror of the
  # link's first six directories, named after its eighth path segment.
  sub mirror_asset {
      my ($line) = @_;

      my $link = quoted_field($line, 3);
      return unless is_site_path($link);

      my @segments = split m{/}, $link;
      return unless defined $segments[7];

      my $dir = join '/', '.', @segments[1 .. 6];
      return unless ensure_dir($dir);
      return fetch("$BASE_URL$link", "$dir/$segments[7]");
  }

  # Mirror one supplementary-material page at ".$link", then download the
  # supplements (into $issue_dir), movies and tables it links to.
  sub download_supplement_page {
      my ($link, $issue_dir) = @_;

      my $local = ".$link";
      return unless ensure_dir(dirname($local));
      fetch("$BASE_URL$link", $local);

      for my $line (matching_lines($local, 'Download Supplement')) {
          my $target = quoted_field($line, 3);
          next unless is_site_path($target);
          my $file = supplement_filename($target);
          next unless defined $file && length $file;
          fetch("$BASE_URL$target", "$issue_dir/$file");
      }

      mirror_asset($_) for matching_lines($local, 'Movie S');
      mirror_asset($_) for matching_lines($local, 'Table S');
      return;
  }

  # Download every paper and supplement referenced by the TOC saved at $toc.
  sub download_issue {
      my ($toc, $volume, $issue) = @_;

      my $issue_dir = "$volume/$issue";
      return unless ensure_dir($issue_dir);

      # Papers. Every line yields its own link; a line without one is
      # skipped instead of silently reusing the previous line's link.
      for my $line (matching_lines($toc, 'Full Text (PDF)')) {
          my $link = link_from_line($line);
          next unless is_site_path($link);
          my $file = (split m{/}, $link)[4];
          next unless defined $file && length $file;
          fetch("$BASE_URL$link", "$issue_dir/$file");
      }

      # Supplementary material. The two searches overlap, so remember which
      # links were already handled to avoid fetching the same page twice.
      my %seen;
      my @candidates = (
          matching_lines($toc, 'Supporting Online Material', 1),
          matching_lines($toc, 'suppl'),
      );
      for my $line (@candidates) {
          my $link = link_from_line($line);
          next unless is_site_path($link);
          next if $seen{$link}++;
          download_supplement_page($link, $issue_dir);
      }
      return;
  }

  # Crawl every issue from $NEWEST_ISSUE down to 1.
  sub main {
      my $volume = $NEWEST_VOLUME;

      for my $issue (reverse 1 .. $NEWEST_ISSUE) {
          sleep $DELAY_SECONDS;

          my $toc = "$issue.toc";
          fetch("$BASE_URL/content/$volume/$issue.toc", $toc, '--retry', '3');

          # "Content not found" usually means the issue belongs to the
          # previous volume. Only commit to that volume when its page really
          # exists, so one missing issue cannot shift every later lookup.
          if (is_not_found_page($toc)) {
              my $previous = $volume - 1;
              my $fallback = "$issue.$previous";
              fetch("$BASE_URL/content/$previous/$issue.toc", $fallback,
                    '--retry', '3');
              if (!-s $fallback || is_not_found_page($fallback)) {
                  warn "issue $issue not found in volume $volume or $previous\n";
                  next;
              }
              $volume = $previous;
              $toc    = $fallback;
          }

          download_issue($toc, $volume, $issue);
      }
      return;
  }

  # Run the crawl only when executed as a script, so the helpers above can be
  # loaded (e.g. by a test) without touching the network.
  main() unless caller;

  1;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement