Advertisement
Guest User

In Memory of Aaron Swartz

a guest
Jan 16th, 2013
336
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.42 KB | None | 0 0
  #!/usr/bin/perl
  # Scraper for Science Magazine.
  #
  # Walks the issues backwards, starting at $FIRST_ISSUE, and for each issue
  # saves:
  #   * the table of contents (TOC) page, in the current directory;
  #   * every paper linked as "Full Text (PDF)", under VOLUME/ISSUE/;
  #   * every supplementary-material page linked from the TOC, plus the
  #     supplements, movies and tables that page links to.
  # Supplementary pages, movies and tables are saved under the same relative
  # path they have on the site; the intent is that one can browse the saved
  # data starting from each TOC.
  #
  # Needs curl on the PATH. Line matching and directory creation are done in
  # Perl itself, so the external grep and mkdir commands are not required.
  # Just use it on Linux.

  use strict;
  use warnings;

  use File::Basename qw(dirname);
  use File::Path qw(make_path);

  # Host that every scraped, root-relative link is appended to.
  my $SITE = 'http://www.sciencemag.org';

  # Newest issue to fetch; the scraper counts down from here to issue 1.
  my $FIRST_ISSUE = 6116;

  # Volume that $FIRST_ISSUE belongs to.
  my $FIRST_VOLUME = 339;

  # Seconds to pause before each issue. Let us not DDoS them.
  my $DELAY = 5;

  # Download $url to $dest with curl. @curl_opts are extra curl options such
  # as ('--retry', '3'). curl is started through the list form of system(),
  # so no shell ever interprets the scraped URL or path. Returns true on
  # success; a failure is reported with warn() and is not fatal.
  sub fetch {
      my ($url, $dest, @curl_opts) = @_;

      my $status = system('curl', '-s', @curl_opts, '-A', 'GoogleBot',
                          $url, '-o', $dest);
      if ($status == -1) {
          warn "could not run curl: $!\n";
          return 0;
      }
      if ($status != 0) {
          warn "curl failed for $url (exit code " . ($status >> 8) . ")\n";
          return 0;
      }
      return 1;
  }

  # Create $dir and any missing parent directories. An already existing
  # directory is not an error. Returns true on success; failures are reported
  # with warn() and are not fatal.
  sub ensure_dir {
      my ($dir) = @_;

      make_path($dir, { error => \my $errors });
      for my $error (@{$errors}) {
          my ($path, $message) = %{$error};
          warn "cannot create directory $path: $message\n";
      }
      return !@{$errors};
  }

  # Return the lines of file $path (newlines kept), or an empty list when the
  # file cannot be opened, e.g. because the download that should have
  # produced it failed.
  sub read_lines {
      my ($path) = @_;

      open my $fh, '<', $path or return;
      my @lines = <$fh>;
      close $fh;
      return @lines;
  }

  # Return the lines of @{$lines} that contain the literal string $text.
  sub lines_containing {
      my ($lines, $text) = @_;

      return grep { index($_, $text) >= 0 } @{$lines};
  }

  # Return every line of @{$lines} that contains the literal string $text,
  # each preceded by the line just before it, in file order and without
  # repeating a line. This is what "grep -B 1" selects, minus the "--" group
  # separator lines grep prints, which never carry a link.
  sub lines_with_previous {
      my ($lines, $text) = @_;

      my %keep;
      for my $i (0 .. $#{$lines}) {
          next if index($lines->[$i], $text) < 0;
          $keep{$i} = 1;
          $keep{ $i - 1 } = 1 if $i > 0;
      }
      return map { $lines->[$_] } sort { $a <=> $b } keys %keep;
  }

  # Split $line on double quotes and return field number $index (0-based),
  # or undef when the line has fewer fields.
  sub quoted_field {
      my ($line, $index) = @_;

      my @fields = split /"/, $line;
      return $fields[$index];
  }

  # Extract the link from a TOC line. Lines mentioning "last" carry the link
  # in the second quoted value, all other lines in the first one.
  # NOTE(review): presumably "last" is a class attribute that precedes the
  # href on those lines -- confirm against a real TOC page.
  sub toc_link {
      my ($line) = @_;

      return quoted_field($line, $line =~ /last/ ? 3 : 1);
  }

  # True when $link is safe and meaningful to fetch: it must be
  # root-relative (it is appended directly to $SITE, so anything else cannot
  # form a valid URL), must not contain whitespace, and must not contain a
  # ".." segment, because links are also used to build local file names.
  sub usable_link {
      my ($link) = @_;

      return 0 unless defined $link;
      return 0 unless $link =~ m{^/};
      return 0 if $link =~ /\s/;
      return 0 if grep { $_ eq '..' } split m{/}, $link;
      return 1;
  }

  # Download every "Full Text (PDF)" paper listed in the TOC lines
  # @{$toc} into directory $issue_dir. The file is named after the fifth
  # "/"-separated field of the link (index 4).
  sub download_papers {
      my ($toc, $issue_dir) = @_;

      for my $line (lines_containing($toc, 'Full Text (PDF)')) {
          # Each line supplies its own link; a line without a usable link is
          # skipped instead of re-using the link of the previous line.
          my $link = toc_link($line);
          next unless usable_link($link);

          my $file = (split m{/}, $link)[4];
          next unless defined $file && length $file;

          fetch("$SITE$link", "$issue_dir/$file");
      }
      return;
  }

  # Download one movie or table link. The file keeps the first seven path
  # segments of the link as its local location: ./seg1/.../seg6/seg7.
  # Links with fewer than seven segments are skipped.
  sub download_mirrored {
      my ($link) = @_;

      my @segments = split m{/}, $link;    # $segments[0] is '' (leading "/")
      return 0 if @segments < 8;

      my $dir = join '/', '.', @segments[ 1 .. 6 ];
      ensure_dir($dir) or return 0;
      return fetch("$SITE$link", "$dir/$segments[7]");
  }

  # Download the supplement behind one "Download Supplement" link into
  # $issue_dir. Links containing "science" are saved under their eighth
  # "/"-separated field (index 7); any other link is saved as
  # "Supp.<third dot-separated field>.pdf".
  sub download_supplement_file {
      my ($link, $issue_dir) = @_;

      my $file;
      if ($link =~ /science/) {
          $file = (split m{/}, $link)[7];
          return 0 unless defined $file && length $file;
      }
      else {
          my $part = (split /\./, $link)[2];
          $part = '' unless defined $part;
          $file = "Supp.$part.pdf";
      }
      return fetch("$SITE$link", "$issue_dir/$file");
  }

  # Find the supplementary-material pages referenced by the TOC lines
  # @{$toc}, save each page under its site-relative path, and download the
  # supplements (into $issue_dir), movies and tables it links to.
  sub download_supplements {
      my ($toc, $issue_dir) = @_;

      my @candidates = (
          lines_with_previous($toc, 'Supporting Online Material'),
          lines_containing($toc, 'suppl'),
      );

      for my $line (@candidates) {
          my $link = toc_link($line);
          next unless usable_link($link);

          # Mirror the page at ./<link>; every parent directory is created
          # first, including the top-level one.
          my $page = ".$link";
          ensure_dir(dirname($page));
          fetch("$SITE$link", $page);
          my @page_lines = read_lines($page);

          for my $hit (lines_containing(\@page_lines, 'Download Supplement')) {
              my $target = quoted_field($hit, 3);
              next unless usable_link($target);
              download_supplement_file($target, $issue_dir);
          }

          # Movies and tables are handled identically.
          for my $label ('Movie S', 'Table S') {
              for my $hit (lines_containing(\@page_lines, $label)) {
                  my $target = quoted_field($hit, 3);
                  next unless usable_link($target);
                  download_mirrored($target);
              }
          }
      }
      return;
  }

  # Scrape one issue. $volume is the volume the issue is expected in; the
  # volume actually used is returned so the caller can carry it forward.
  sub scrape_issue {
      my ($volume, $issue) = @_;

      my $toc_file = "$issue.toc";
      fetch("$SITE/content/$volume/$issue.toc", $toc_file, '--retry', '3');
      my @toc = read_lines($toc_file);

      # Last issue in volume: go to the next (older) volume and retry once.
      # The retried TOC is saved as ISSUE.VOLUME rather than ISSUE.toc.
      my @not_found = lines_containing(\@toc, 'Content not found');
      if (@not_found) {
          $volume--;
          $toc_file = "$issue.$volume";
          fetch("$SITE/content/$volume/$issue.toc", $toc_file, '--retry', '3');
          @toc = read_lines($toc_file);
      }

      my $issue_dir = "$volume/$issue";
      ensure_dir($issue_dir);

      # Time to parse the web pages to find the papers and files.
      download_papers(\@toc, $issue_dir);
      download_supplements(\@toc, $issue_dir);

      return $volume;
  }

  # Entry point: walk the issues from $FIRST_ISSUE down to 1.
  sub main {
      my $volume = $FIRST_VOLUME;

      for (my $issue = $FIRST_ISSUE; $issue >= 1; $issue--) {
          sleep $DELAY;
          $volume = scrape_issue($volume, $issue);
      }
      return;
  }

  # Run only when executed as a script, not when loaded with require/do.
  main() unless caller;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement