Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/perl
# Scraper for Science Magazine.
#
# Requires the `curl` binary on PATH. Text searching and directory creation
# are done in Perl itself, so external grep/mkdir are not needed.
#
# Walking backwards from START_ISSUE, for every issue this script:
#   * saves the issue's Table of Contents in the current directory,
#   * saves every "Full Text (PDF)" link under <volume>/<issue>/,
#   * mirrors every supplementary-material page under ./<site path> and
#     downloads the supplements, movies and tables that page links to,
# so the stored data can be browsed starting from each TOC.

use strict;
use warnings;

use File::Basename qw(dirname);
use File::Path qw(make_path);

use constant {
    BASE_URL     => 'http://www.sciencemag.org',
    USER_AGENT   => 'GoogleBot',
    PAUSE_SECS   => 5,       # delay between issues
    TOC_RETRIES  => 3,       # curl --retry count for TOC pages only
    START_ISSUE  => 6116,
    START_VOLUME => 339,
};

my $volume = START_VOLUME;

for my $issue (reverse 1 .. START_ISSUE) {
    # Let us not DDoS them.
    sleep(PAUSE_SECS);

    my $toc_file = "$issue.toc";
    fetch(toc_url($volume, $issue), $toc_file, TOC_RETRIES);
    my @toc = read_lines($toc_file);

    # Issue numbers keep counting down across volumes. A "Content not found"
    # page is taken to mean this issue belongs to the previous volume, so
    # step the volume back and fetch the TOC again. The second copy keeps
    # the original "<issue>.<volume>" file name.
    if (select_lines(\@toc, 'Content not found')) {
        $volume--;
        $toc_file = "$issue.$volume";
        fetch(toc_url($volume, $issue), $toc_file, TOC_RETRIES);
        @toc = read_lines($toc_file);
    }

    # Candidate lines for supplementary pages: each "Supporting Online
    # Material" line plus the line before it, then every line mentioning
    # "suppl". Lines may repeat; they are processed in this order.
    my @supp_lines = (
        select_lines(\@toc, 'Supporting Online Material', 1),
        select_lines(\@toc, 'suppl'),
    );
    my @pdf_lines = select_lines(\@toc, 'Full Text (PDF)');

    my $issue_dir = "$volume/$issue";
    ensure_dir($issue_dir);

    # Papers: <volume>/<issue>/<fifth path component of the link>.
    for my $line (@pdf_lines) {
        my $path = site_path(link_after_class($line));
        next unless defined $path;
        my $leaf = (split m{/}, $path)[4];
        next unless defined $leaf && length $leaf;
        fetch(BASE_URL . $path, "$issue_dir/$leaf");
    }

    # Supplementary pages and everything they link to.
    for my $line (@supp_lines) {
        my $path = site_path(link_after_class($line));
        next unless defined $path;

        # Mirror the page itself at ./<site path>.
        my $page = ".$path";
        ensure_dir(dirname($page));
        fetch(BASE_URL . $path, $page);
        my @page_lines = read_lines($page);

        # Supplement downloads go next to the paper.
        for my $hit (select_lines(\@page_lines, 'Download Supplement')) {
            my $href = site_path(quoted_field($hit, 3));
            next unless defined $href;
            my $leaf;
            if ($href =~ /science/) {
                $leaf = (split m{/}, $href)[7];
            }
            else {
                my $tag = (split /\./, $href)[2];
                $leaf = 'Supp.' . (defined $tag ? $tag : '') . '.pdf';
            }
            next unless defined $leaf && length $leaf;
            fetch(BASE_URL . $href, "$issue_dir/$leaf");
        }

        # Movies first, then tables; both are mirrored by site path.
        for my $needle ('Movie S', 'Table S') {
            for my $hit (select_lines(\@page_lines, $needle)) {
                mirror_asset(quoted_field($hit, 3));
            }
        }
    }
}

# Build the Table-of-Contents URL for one volume/issue pair.
sub toc_url {
    my ($vol, $iss) = @_;
    return BASE_URL . "/content/$vol/$iss.toc";
}

# Download $url to $dest with curl. $retries (optional) becomes --retry.
# curl is run without a shell so scraped text can never be interpreted as
# shell syntax. Failures are reported and otherwise ignored (best effort).
sub fetch {
    my ($url, $dest, $retries) = @_;
    my @cmd = ('curl', '-s');
    push @cmd, '--retry', $retries if $retries;
    push @cmd, '-A', USER_AGENT, $url, '-o', $dest;
    system(@cmd) == 0
        or warn "curl failed for $url (status $?)\n";
    return;
}

# Return every line of $path (newlines kept), or nothing if it is unreadable.
sub read_lines {
    my ($path) = @_;
    open my $fh, '<', $path or do {
        warn "cannot read $path: $!\n";
        return;
    };
    my @lines = <$fh>;
    close $fh;
    return @lines;
}

# Return the lines of @$lines containing the literal string $needle, each
# preceded by up to $before lines of leading context (default 0). A line is
# never returned twice by one call and no group separators are emitted.
# In scalar context the result is the number of lines selected.
sub select_lines {
    my ($lines, $needle, $before) = @_;
    $before ||= 0;
    my @picked;
    my $last_taken = -1;
    for my $i (0 .. $#{$lines}) {
        next if index($lines->[$i], $needle) < 0;
        my $from = $i - $before;
        $from = $last_taken + 1 if $from <= $last_taken;
        push @picked, @{$lines}[$from .. $i];
        $last_taken = $i;
    }
    return @picked;
}

# Return the $n-th (0-based) field of $line when split on double quotes,
# or undef when the line has too few fields.
sub quoted_field {
    my ($line, $n) = @_;
    my @fields = split /"/, $line;
    return $fields[$n];
}

# Pick the link out of a TOC line: when the line mentions "last" the link is
# taken from the fourth quote-delimited field, otherwise from the second.
sub link_after_class {
    my ($line) = @_;
    return quoted_field($line, $line =~ /last/ ? 3 : 1);
}

# Return $href if it is a usable site-relative path, otherwise undef.
# It must start with "/" (it is appended to BASE_URL and used as a local
# path) and must not contain a ".." segment that could escape the mirror.
sub site_path {
    my ($href) = @_;
    return unless defined $href;
    return unless $href =~ m{\A/};
    return if $href =~ m{(?:\A|/)\.\.(?:/|\z)};
    return $href;
}

# Download a movie/table link into ./<components 1..6>/<component 7> of its
# site path. Links with fewer than seven path components are skipped.
sub mirror_asset {
    my ($href) = @_;
    my $path = site_path($href);
    return unless defined $path;
    my @parts = split m{/}, $path;
    return if @parts < 8 || !length $parts[7];
    my $dir = join '/', '.', @parts[1 .. 6];
    ensure_dir($dir);
    fetch(BASE_URL . $path, "$dir/$parts[7]");
    return;
}

# Create $dir and any missing parents. Problems are reported, not fatal.
sub ensure_dir {
    my ($dir) = @_;
    make_path($dir, { error => \my $errors });
    for my $problem (@{ $errors || [] }) {
        my ($where, $why) = %{$problem};
        warn "mkdir $where: $why\n";
    }
    return;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement