Advertisement
moytrage

deviantart_downloader.pl

Feb 28th, 2012
928
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Perl 5.85 KB | None | 0 0
# A Downloader for DeviantArt.com galleries. Per author.
# Either use: deviantart_downloader.pl authorname1 authorname2 authorname3 ...
# Or put a list of authors into @authors variable down below like in example.
# In case of problems see program's output (including download log in author's subdir).
# You may send letters to user moytra at Google's gmail.com

# Authors whose galleries get mirrored: command-line arguments take
# precedence; with no arguments the example list below is used.
# (Declared with `our` so it stays legal once strictures are enabled below.)
our @authors = @ARGV ? @ARGV : qw(anry artgerm genzoman artgutierrez kerembeyit jenovah-art sandara yukiusagi1983 pierrerodriguez longai);

use strict;
use warnings;

# ----- Behaviour switches (0 = off, 1 = on) ------------------------------
our $check_existing_image_integrity = 0; # Re-verify images already on disk with `identify` (slow).
our $check_downloaded_image_integrity = 0; # Verify each image right after it is downloaded.
our $auto_remove_corrupted = 1; # Delete corrupted files automatically and retry the download.
our $force_html_redownload = 0; # Re-fetch gallery HTML pages even when a cached copy exists.
our $download_images = 1; # When 0, only the HTML pages are fetched, never the images.
our $ignore_download_errors = 1; # When 1, a file missing on the server is skipped instead of fatal.

# Safety net: abort once the paging offset reaches this value, so a broken
# HTML parse cannot loop forever (the offset advances in steps of 24).
our $offset_limit = 9960;

our $down_log = "down.log"; # Per-author wget log file name.
# Output sink for the `identify` shell command.
# NOTE(review): ":NUL" looks like it is meant to be the Windows NUL device --
# the leading colon is suspicious; confirm (plain "NUL" on Windows,
# "/dev/null" on Unix, as the original comment suggests).
our $null_file = ":NUL";
  24. sub mdir($) {
  25.     my $dir = shift;
  26.     mkdir "$dir" or die "Can't create dir [$dir]." unless -d "$dir";
  27. }
  28.  
  29. sub down($$$) {
  30.     my ($url, $file, $log) = @_;
  31.     print "[Download]";
  32.  
  33.     return (system ("wget -a $log \"$url\" -O $file") == 0);
  34. }
  35.  
  36. sub check_integrity($) {
  37.     my $fname = shift;
  38.     print "[IntegrityCheck]";
  39.     return system("identify -verbose -quiet -regard-warnings \"$fname\" 2>> $null_file 1>> $null_file") == 0;
  40. }
  41.  
  42. sub unique_merge_into($@) {
  43.     my ($fname, @links) = @_;
  44.  
  45.     my %all_links;
  46.  
  47.     if (-e "$fname") {
  48.         open my $flinks, "<$fname" or die "Can't read open [$fname].";
  49.         while (<$flinks>) {
  50.             next if /^\s*$/;
  51.             chomp;
  52.             $all_links{$_} = 1;
  53.         }
  54.         close $flinks;
  55.     }
  56.  
  57.     # Merge existing links with new ones. For history, if some images were removed.
  58.     $all_links{$_} = 1 for @links;
  59.  
  60.     {
  61.         open my $flinks, ">$fname" or die "Can't write open [$fname].";
  62.         print $flinks "$_\n" for sort keys %all_links;
  63.         close $flinks;
  64.     }
  65. }
  66.  
print '*' x 40, "Start!", '*' x 40, "\n";

# Main driver: mirror each author's gallery into ./<author>/.
for my $author (@authors) {
    print "-" x 40, "[$author]", "-" x 40, "\n";

    my $offset = 0; # Gallery paging offset; advances by 24 per fetched page.

    # Directory layout: pages/ caches gallery HTML, super/ holds
    # preview-quality images, full/ holds full-resolution images.
    mdir("$author");
    mdir("$author/pages");
    mdir("$author/super");
    mdir("$author/full");

    my $fetch_next_page = 1; # Set whenever a page yields a NEW image id; loop stops otherwise.
    my @links;               # Raw <a class="thumb"> fragments, persisted at the end.
    my %all_found_images;    # Ids seen this run; duplicate pages terminate the loop.

    PAGE_LOOP:
    while ($fetch_next_page) {
        # Each page equals 24 images

        die "Offset limit reached... Looks like infinite loop, something wrong with parsing HTML pages." if $offset >= $offset_limit;

        $fetch_next_page = 0;
        my $url = "http://$author.deviantart.com/gallery/?offset=$offset";
        my $fname = "$author/pages/offset_".sprintf("%04d", $offset).".html";

        print "[$fname]\n";

        # Remember whether the page came from cache: only a cached copy may be
        # deleted and re-fetched on corruption (see the `redo` below); a page
        # that is corrupt straight off the wire is fatal.
        my $html_existed_before = -e $fname;

        if (not -e $fname or $force_html_redownload) {
            down($url, $fname, "$author/$down_log") or die "Can't download [$url] to [$fname].";
        }

        # Slurp the whole page into $text.
        open my $file, "<$fname" or die "Can't open for reading [$fname].";
        my $text = do { local $/; <$file> };
        close $file;

        # Sanity check: a complete page starts with a doctype or <html> and
        # ends with </html>. `redo` re-runs the loop body without advancing
        # $offset, so the same page is downloaded again exactly once.
        unless ($text =~ /^\s*(<!DOCTYPE html>|<html>)/ and $text =~ /<\/html>\s*$/) {
            if ($html_existed_before and $auto_remove_corrupted) {
                print "[!Corrupted!][Delete][Restart]\n";
                unlink($fname) or die "Can't delete [$fname].";
                redo PAGE_LOOP;
            } else {
                die "Corrupted HTML page [$fname], please resolve manually.";
            }
        }

        print "[Parse]\n\n";

        # Drop everything before the main gallery container; $-[0] is the
        # string offset where the marker matched.
        $text =~ /id="gruze-main"/ or die "Can't find magic marker of main view in HTML. Maybe website has changed encoding...";
        $text = substr($text, $-[0]);

        # Each thumbnail is an <a class="thumb" ...>...</a> fragment; with no
        # capture groups, the /g match in list context yields the full matches.
        for my $full_link ($text =~ /<a class="thumb"[^>]*?>.*?<\/a>/g) {
            $full_link =~ s/[\r\n]/ /g; # Flatten so the ^-anchored match below works.
            $full_link =~ /^(<a.*?>)/ or die "Can't extract first <a> tag from [$full_link].";
            my $link = $1;
            # Attribute soup -> hash (href, super_img, super_fullimg, ...).
            my %attrs = ($link =~ /(\w+)="(.*?)"/g);

            unless ($attrs{super_img}) {
                warn "Empty super_img in link [$link]. Usually always present at DeviantArt. Skipping...";
                next;
            }

            # Prefer the full-resolution URL (quality "f") when the page
            # provides one, otherwise the preview URL (quality "s").
            my ($quality, $url) = exists $attrs{super_fullimg} ? ("f", $attrs{super_fullimg}) : ("s", $attrs{super_img});

            # href looks like http://AUTHOR.deviantart.com/art/Title-Words-12345:
            # split the tail into title part ($1) and numeric id ($2).
            $attrs{href} =~ /^http:\/\/$author\.deviantart\.com\/art\/(.+)$/i or die "Can't parse [$attrs{href}].";
            $1 =~ /^(.+?)-(\d+)$/ or die;
            # NOTE(review): $1 (the title) is interpolated into the sprintf
            # FORMAT string, so a '%' inside a title would corrupt the name --
            # confirm titles can never contain '%'.
            my $id_name = sprintf("${author}_%010d_${quality}_$1", $2);


            # This is a guarantee of no inf loop. If no new images are found from the author, finish...
            next if exists $all_found_images{$id_name};

            $all_found_images{$id_name} = $id_name;

            $url =~ /\.([a-z]+)$/ or die "Can't get file extension from url [$url].";
            my $ftype = $1;
            my $fname = "$author/".($quality eq "s" ? "super" : ($quality eq "f" ? "full" : die))."/$id_name.$ftype";

            print "\t[$fname]\n\t";

            # Optional integrity pass over images already on disk.
            if (-e $fname and $check_existing_image_integrity and not check_integrity($fname)) {
                print "[!Corrupted!]";
                if ($auto_remove_corrupted) {
                    print "[Delete]";
                    unlink($fname);
                } else {
                    die "Auto-delete not allowed... Dying...";
                }
            }

            # Fetch the image unless it is already present (or downloads are off).
            if (not -e $fname and $download_images) {
                unless (down($url, $fname, "$author/$down_log")) {
                    if ($ignore_download_errors) {
                        print "[DownloadFailure][Skip]\n";
                        next;
                    } else {
                        die "Can't download [$url].";
                    }
                }

                check_integrity($fname) or die "Corrupted image [$fname] after download! Please resolve..." if $check_downloaded_image_integrity;
            }

            print "\n\n";
            $fetch_next_page = 1; # New image found on this page -> keep paging.
            push @links, $full_link;
        }

        $offset += 24;
    }

    # Persist the raw thumbnail links, merged with results of previous runs.
    unique_merge_into("$author/links.html", @links);
    unique_merge_into("all_links.html", @links);
}

print '*' x 40, "All Finished! :)", '*' x 40, "\n";
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement