Darayavaush

Multibooru Downloader

Jun 22nd, 2013
#!/usr/bin/perl

# Made by Anonymous; modified by Dariush to work with Danbooru 2
# v.2.1.0 - added blacklist, fixed downloading of tags with over 200 images, generally cleaned up;
# v.2.2.0 - added Gelbooru support, made everything extensible for support for more sites later;
# v.2.3.0 - added Pixiv support, added blacklist addition from the command line, revamped the code, added automatic subdirectory creation, remade the way SIGINT works, unified tag and pool downloads (note that pools aren't explicitly supported right now, but you can still grab them by searching for "pool:XXXX" as a tag);
# v.2.4.0 - changed argument handling so that quotes are no longer required, added folder and file naming schemes; blacklist now supports multi-tag combinations;
# v.2.4.1 - added Pixiv tag downloads. Unfortunately, if they contain Japanese characters, they have to be entered in the parameter section of the script itself, since the command line doesn't pass Unicode to Perl properly;
# v.2.4.2 - added DeviantArt support. No other changes;
# You may contact me via PM on Danbooru or at archsinus@gmail.com

# Parameters that you (yes, YOU) can modify.

my @blacklist = ("amputee","scat","comic monochrome","doll_joints","puru-see","game_cg","yaoi"); #input tags as strings separated by commas; an entry containing spaces is treated as a multi-tag combination
my $tag_override = ""; #intended to be used only when trying to pass Unicode as input (for example, when using Pixiv tags that contain non-Latin symbols, i.e. all of them); I failed to get Unicode to read from ARGV properly. :(
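# Illustrative examples (the values below are hypothetical, not part of the original configuration):
#   a blacklist entry of "comic monochrome" only skips posts tagged with BOTH comic AND monochrome;
#   the same combination can be passed at run time as  -b comic%monochrome  (the % stands for a space);
#   $tag_override = "東方"; would force a literal Pixiv tag, overriding whatever -t was given.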

# Below this line begins the script.

use strict;
use warnings;
use WWW::Mechanize;
use HTTP::Cookies; #needed explicitly for the cookie jar created below
use HTTP::Response;
use threads;
use threads::shared;
use File::Basename;
use Digest::SHA1 qw(sha1_hex);
use URI::Escape;
use Data::Dumper;
use Digest::MD5;

my $stop = 0;
$SIG{'INT'} = 'SIGINT_handler';
$| = 1; #flush stdout immediately
my $user;
my $pass;
my $directory :shared;
$directory = 'images';
my $tags;
my $site = 'dant';
my $limit = 180; #Danbooru hard-caps requests at 200 images per page, so don't set this above 200; 180 leaves some overhead
my $threads = 8;
my $subdir = "<orig>"; #subdirectory naming scheme; other placeholders: <id>, <booru_name>, <booru_fallback=X>
my $name = "<orig>"; #file naming scheme; other placeholders: <hash>, <title>
#open(my $debug, '>', 'debug.txt');
my $exit = 0; #0 is a full run, 1 exits before downloading
my $mech = WWW::Mechanize->new();
$mech->cookie_jar(HTTP::Cookies->new());

if (grep { /-help$|^help$|-h/i } @ARGV)
{
    show_help();
    exit;
}

#data input: rejoin the command line, split it on "-X" style switches and turn the pieces into a switch => value hash
my $args = join(' ',@ARGV);
my @strs = split(/(-\S)\b/,$args);
shift @strs;
$/ = ' '; #so that chomp below strips the trailing space left over from the split
foreach(@strs)  {   s/^\s+//; chomp;    }
my %input = @strs;

$user = $input{"-u"} // '';
$pass = $input{"-p"} // '';
$tags = $input{"-t"} // '';
if ($tag_override ne '')
{
    $tags = $tag_override;
    print "WARNING: tag override is in effect.\n";
}
if (exists $input{"-b"})
{
    push @blacklist, split(' ',$input{"-b"});
    s/%/ /g foreach (@blacklist); #a % in a -b argument stands for a space (multi-tag combination)
}
$directory = $input{"-d"}   if (exists $input{"-d"});
$exit      = $input{"-e"}   if (exists $input{"-e"});
$site      = $input{"-s"}   if (exists $input{"-s"});
$subdir    = $input{"-r"}   if (exists $input{"-r"});
$name      = $input{"-n"}   if (exists $input{"-n"});
$limit     = $input{"-l"}   if (exists $input{"-l"});
$threads   = $input{"-x"}   if (exists $input{"-x"});

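# Illustrative invocations (the script name, tags and paths here are hypothetical):
#   perl multibooru.pl -t touhou -s gel -d images -x 4
#   perl multibooru.pl -u someuser -p somepass -s pixi -t 12345 -r "<booru_name><booru_fallback=<orig> (Pixiv <id>)>"
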
#data handling
$directory =~ s/[\/\\]$//; #strip a trailing slash or backslash
my %url_base = (
        dant => "http://danbooru.donmai.us/post/index.xml",
        gel  => "http://gelbooru.com/index.php?page=dapi&s=post&q=index",
        pixi => "http://www.pixiv.net/member_illust.php",
        pixt => "http://www.pixiv.net/search.php",
        danp => "http://danbooru.donmai.us/pool/show.xml",
        dea  => "deviantart.com/gallery/",
        );

my @auth = ('pix'); #sites matching these patterns require a username and password

print "Downloading '$tags' to $directory from $site.\n\n";

if ($tags eq '' or !exists $url_base{$site} or (($user eq '' or $pass eq '') and grep {$site =~ $_} @auth))
{
    show_help();
    exit;
}
my $url = $url_base{$site};
$url = authorize($url);

my @files :shared;

#refuse subdirectory naming schemes that could dump unrelated downloads into the same folder
die "Non-unique subdirectory name" if (
    (($site =~ 'dan' or $site eq 'gel')
        and $subdir !~ /(<orig>)/
        and $subdir !~ /(<booru_name>)/)
    or  ($site eq 'pixi'
        and $subdir !~ /(<orig>)/
        and ($subdir !~ /(<booru_name>)/ or $subdir !~ /(<booru_fallback=[^>]+>)/)
        and $subdir !~ /(<id>)/)
    or  ($site eq 'pixt'
        and $subdir !~ /(<orig>)/
        and $subdir !~ /(<id>)/)
    or  ($site eq 'dea'
        and $subdir !~ /(<orig>)/
        and $subdir !~ /(<id>)/
        and ($subdir !~ /(<booru_name>)/ or $subdir !~ /(<booru_fallback=[^>]+>)/))
    );

for (my $page = 1; ; $page++)
{
    exit if ($stop);

    fetch_page($page);
    last if (handle_page($mech->content));
}

#print Dumper(@files);
exit if ($exit == 1);
#yay, we have an array of links to files to be downloaded!

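# At this point @files holds direct image URLs; Danbooru entries were prefixed with
# http://danbooru.donmai.us above because its file_url fields are relative.
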
if (!-d $directory)
{   mkdir $directory;   }
chdir $directory;
$subdir = proper($subdir);
if (!-d $subdir)
{   mkdir $subdir;  }
die "Failed to chdir into subdirectory. Please try some other naming scheme." if !chdir $subdir;

my @thr;
my $file;
if ($#files+1 < $threads) { $threads = $#files+1; }

print "\nDownloading ".($#files+1)." files in $threads threads.\n";

#spin up the initial pool of worker threads
for (1..$threads)
{
    if ($file = shift @files)
    {
        $thr[$_] = threads->create(\&save_file, $file);
    }
}

#keep the pool full: whenever a worker becomes joinable, join it and hand the slot the next file
while (sleep 1)
{
    for (1..$threads)
    {
        if ($thr[$_]->is_joinable)
        {
            $thr[$_]->join;
            if ($file = shift @files and !$stop)
            {
                $thr[$_] = threads->create(\&save_file, $file);
            }
        }
    }
    last if (($#files == -1 or $stop) and threads->list == 0);
}

sub handle_page
#returns 0 when there may be more pages to fetch (a non-empty page from a multi-page site);
#returns 1 when this is the last page (the only page of a single-page site, or an empty page of a multi-page one)
{
    my $content = shift;
    if ($site =~ 'dan' or $site eq 'gel')
    {
        $subdir =~ s/<orig>/$tags/g;
        $subdir =~ s/<booru_name>/$tags/g;
        return 1 if ($content !~ /<post (.+)\/>/);
        while ($content =~ /<post (.+)\/>/g)
        {
            my $hash = hashXML($1);
            foreach (@blacklist)
            {
                my @sep_black = split(' ',$_); #separate components of multi-tag blacklisted combinations
                my $black_counter = 0;
                foreach (@sep_black)
                {
                    if ($hash->{tags} =~ /$_/)
                    {
                        $black_counter += 1;
                    }
                }
                if ($black_counter >= 0+@sep_black) #only blacklist the whole post if it matches every tag in the space-separated combination
                {
                    $hash->{blacklisted} = 1;
                    last;
                }
            }
            push @files, ($site =~ 'dan' ? "http://danbooru.donmai.us".$hash->{file_url} : $hash->{file_url}) unless $hash->{blacklisted};
        }
    }
    if ($site =~ 'pix')
    {
        my @links = grep {$_ =~ /illust_id=(\d+)/} map {$_->url} ($mech->links);
        return 1 if (!@links);
        foreach (@links)
        {
            /illust_id=(\d+)/;
            $mech->get("http://spapi.pixiv.net/iphone/illust.php?illust_id=$1");
            my $content = $mech->content;
            $content =~ s/"//g;
            my @fields = split /,/, $content;
            my $url;
            my $manga_pages = '';
            if ($fields[1] != 0 and 0+@fields == 31) #API is working correctly, we can do this the fast way
            {
                $url = $fields[9];
                $manga_pages = $fields[19];
                $url =~ s/(mobile\/)|(_480mw)|(jpg.*$)//g; #we chop the extension because it might be different from the actual one
                $url .= $fields[2];
                $subdir =~ s/<orig>/$fields[24]/g;
            }
            else
            { #API is fucked up :(
                $mech->get("http://pixiv.net/".$_);
                $url = $mech->find_image(url_regex => qr/\d+_m.\S+/)->url;
                $url =~ s/_m//;
                if ($mech->content =~ /<li>Manga (\d+)P<\/li>/) #manga
                {
                    $manga_pages = $1;
                }
                if ($subdir =~ /<orig>/)
                {
                    my $name = $mech->find_link(url_regex => qr/\S+stacc\/([^?\/]+)$/)->url;
                    $name =~ s/^\S+\///;
                    $subdir =~ s/<orig>/$name/g;
                }
            }
            #ID and Danbooru name lookup are independent of the API, so they are done outside of the API-specific blocks
            $subdir =~ s/<id>/$tags/g;
            if ($subdir =~ /<booru_name>/)
            {
                $mech->get("http://danbooru.donmai.us/artists.xml?name=http://www.pixiv.net/member.php?id=$tags");
                if ($mech->content =~ /<name>(\S+)<\/name>/)
                {
                    my $temp = $1;
                    $subdir =~ s/<booru_name>/$temp/g;
                    $subdir =~ s/<booru_fallback=[^>]+>//g;
                } else {
                    $subdir =~ s/<booru_name>//g;
                    $subdir =~ s/<booru_fallback=([^>]+)>/$1/g;
                }
            }
            if ($manga_pages ne '')
            {
                $url =~ /(\S+)(\.\w+)$/;
                for (my $i = 0; $i < $manga_pages; $i++)
                {
                    push @files, $1."_p$i".$2;
                }
            }
            else
            {
                push @files, $url;
            }
        }
    }
    if ($site eq 'dea')
    {
        $subdir =~ s/<orig>/$tags/g;
        if ($subdir =~ /<booru_name>/)
        {
            $mech->get("http://danbooru.donmai.us/artists.xml?name=http://$tags.deviantart.com/");
            if ($mech->content =~ /<name>(\S+)<\/name>/)
            {
                my $temp = $1;
                $subdir =~ s/<booru_name>/$temp/g;
                $subdir =~ s/<booru_fallback=[^>]+>//g;
            } else {
                $subdir =~ s/<booru_name>//g;
                $subdir =~ s/<booru_fallback=([^>]+)>/$1/g;
            }
        }
        my @links = grep {$_ =~ /\/art\/(\S)+#comments/} map {$_->url} ($mech->links);
        return 1 if (!@links);
        foreach (@links)
        {
            s/#comments//;
            $mech->get("http://backend.deviantart.com/oembed?url=$_");
            my $hash = hashJSON($mech->content);
            push @files, $hash->{url};
        }
    }
    print "Unused argument $1 supplied in subdirectory naming scheme.\n" while $subdir =~ /(<[^>]+>)/g;
    $subdir =~ s/<[^>]+>//g;
    return 0;
}

sub authorize
{
    my $lurl = shift;
    if ($site =~ 'dan')
    {
        $lurl .= "?login=".uri_escape($user)."&password_hash=".sha1_hex("choujin-steiner--$pass--");
        #no failure detection because apparently Danbooru doesn't actually care whether the username/password combination is correct
    }
    if ($site =~ 'pix')
    {
        $mech->get('http://www.pixiv.net/login.php');
        $mech->submit_form(
                    with_fields => {
                        pixiv_id    => $user,
                        pass        => $pass,
                        skip        => 1,
                    },
                );
        die("Authorization failed.\n") if ($mech->content =~ /loggedIn = false/);
    }
    print "Authorization successful.\n";

    return $lurl;
}

sub hashJSON
{
    my $string = shift @_;
    my $hash;
    while ($string =~ /"([^"]+)":"*([^,"]*)"*,/g) #JSON, e.g. {"approver_id":13793,"created_...
    {
        $hash->{$1} = $2;
    }
    return $hash;
}

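# Illustrative example: hashJSON('{"id":123,"url":"http://example.com/a.png",')
# yields { id => 123, url => 'http://example.com/a.png' }; note that a field not
# followed by a comma (typically the last one in the object) is skipped by this regex.
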
sub hashXML
{
    my $string = shift @_;
    my $hash;
    while ($string =~ /(\S+)="([^"]*)"/g)
    {
        $hash->{$1} = $2;
    }
    return $hash;
}

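# Illustrative example: hashXML('id="123" tags="touhou solo" file_url="/data/abc.jpg"')
# yields { id => 123, tags => 'touhou solo', file_url => '/data/abc.jpg' }.
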
sub fetch_page
{
    my $page = shift;
    my %local = (
        dant => "<url>&tags=$tags&page=$page&limit=$limit",
        gel  => "<url>&tags=$tags&pid=".($page-1)."&limit=$limit",
        pixi => "<url>?id=$tags&p=$page",
        pixt => "<url>?s_mode=s_tag_full&word=$tags&p=$page",
        danp => "<url>?id=$tags&page=$page&limit=$limit",
        dea  => "http://$tags.<url>?offset=".($page-1)*24,
        );
    my $lurl = $local{$site};
    $lurl =~ s/<url>/$url/;
    print "Getting [$lurl] (page $page)... ";
    my $response = $mech->get($lurl);
    if ($response->is_success)
    {
        print "OK.\n";
        return $response->content;
    }
    else
    {
        print 'Error: ' . $response->code . ' ' . $response->message . "\n";
        return undef;
    }
}

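# Illustrative expansion: with -s gel -t touhou and the default limit, page 2 is fetched as
# http://gelbooru.com/index.php?page=dapi&s=post&q=index&tags=touhou&pid=1&limit=180
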
sub save_file
{
    my $file_url = shift;
    my $local_name = $name;
    $file_url =~ /(([^\/\\]+)\.([^\/\\]+))$/;
    my $filename = $1;
    my $file_id = $2;
    my $ext = $3;
    $local_name =~ s/<orig>/$file_id/g;
    my $temp_name = $local_name;
    $temp_name =~ s/(<[^>]+>)//g; #this includes hashes and other options that can only be filled in after downloading
    #everything that can be done with the file without downloading it happens before this line
    print ''.($#files+1)." files left, saving $filename...\n";
    if (-e "$temp_name.$ext") #duplicate detection only works if the chosen file naming scheme doesn't use hashes
        { print "File already existed, skipping...\n";
        threads->exit; }
    else
        { $mech->get($file_url, ':content_file' => $filename); }
    if ($local_name =~ /<hash>/)
    {
        open (my $fh, '<', $filename) or die "Can't open $filename: $!";
        binmode ($fh);
        my $hash = Digest::MD5->new->addfile($fh)->hexdigest;
        $local_name =~ s/<hash>/$hash/g;
    }
    print "Unused argument $1 supplied in file naming scheme.\n" while $local_name =~ /(<[^>]+>)/g;
    $local_name =~ s/(<[^>]+>)//g;
    if ($name eq "<orig>")
    {
        print "Saved $filename successfully.\n";
    } else {
        rename ($filename, $local_name.'.'.$ext); #this also serves as a backup duplicate detection scheme - pictures may get downloaded a second time, but they quietly overwrite the old version, so the only symptom is a redundant download
        print "Saved $filename (as $local_name.$ext) successfully.\n";
    }
    threads->exit;
}

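# Illustrative naming-scheme expansion (hypothetical file): for a 'booru file d34db33f.jpg,
#   -n "<orig>"        keeps it as d34db33f.jpg (unchanged);
#   -n "<orig>_<hash>" renames it to d34db33f_<md5 of the downloaded file>.jpg.
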
sub show_help
{
        print "Multibooru download script.
Usage: ".basename($0)." -u <username> -p <pass> -t <input> <other options>
Options:
        -t          any input you want to throw at the script - artist ID for Pixiv, tags for 'boorus;
        -d          directory to save images to (a unique subdirectory based on input will be automatically created) (default `$directory');
        -s          site to download from (default Danbooru), syntax:
            'dant' for Danbooru;
            'gel' for Gelbooru;
            'pixi' for Pixiv download by ID;
            'pixt' for Pixiv download by tag (if using this option, use the ".'$tag_override'." variable in the parameter section to ensure correct Unicode handling);
        -b          blacklisted tags that work like simple tags prefixed with '-', but don't take up the two-tag limit and are processed client-side rather than server-side like normal exclusions. This option is therefore best used when you want to exclude a small percentage of posts, not for something like 'long_hair -b touhou'; if you want to exclude a combination of tags, all of which must be present for an image not to be downloaded, use % as the separator, as in 'comic%monochrome'; only applies to 'boorus;
        -r          subdirectory naming scheme: takes a string that consists of any of the following arguments (angle brackets must be included):
            <orig>                  'booru tag for 'boorus, artist name for Pixiv and DA;
            <id>                    numeric ID, only works on Pixiv;
            <booru_name>            'booru tag, works everywhere;
            <booru_fallback=X>      if the artist isn't in the database, X will be substituted (X may contain other bracketed values, so <booru_fallback=<orig> (Pixiv <id>)> is a valid string); otherwise this block is ignored;
                default is <orig>;
        -n          file naming scheme: takes a string that consists of any of the following arguments (angle brackets must be included):
            <orig>                  unchanged file name - hash for 'boorus, image id for Pixiv, title with artist suffix for DA;
            <hash>                  MD5 hash - produces collisions when the same picture has been downloaded from another site (this is a good thing);
            <title>                 work title, supposed to work on Pixiv and DA, but doesn't;
                default is <orig>;
        ===         Things you probably want to mess with end here          ===
        -l          files per page (default $limit); only applies to 'boorus;
        -x          number of threads (default $threads);
        -e          debug code (0 is a normal run, 1 exits after completing the link array but before downloading anything or creating any directories; more options will probably be added later).
        ";
}

sub proper
{
    $_[0] =~ s/_|:/ /g; #note: this modifies the caller's variable in place
    return join('', map { ucfirst("$_") } split(/\b/, $_[0]));
}

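# Illustrative example: proper("long_hair") and proper("pool:1234") return "Long Hair" and "Pool 1234" respectively.
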
sub SIGINT_handler
{
        #@files = ();
        print "Interrupted by SIGINT, stopping...\n";
        $stop = 1;
}