# A downloader for DeviantArt.com galleries, organized per author.
# Either run: deviantart_downloader.pl authorname1 authorname2 authorname3 ...
# or put a list of authors into the @authors variable below, as in the example.
# In case of problems, see the program's output (including the wget download log in each author's subdirectory).
# Questions: user moytra at Google's gmail.com
our @authors = @ARGV ? @ARGV : qw(anry artgerm genzoman artgutierrez kerembeyit jenovah-art sandara yukiusagi1983 pierrerodriguez longai);
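# Example invocation (assumes wget and ImageMagick's `identify` are on PATH,
# since both are shelled out to below):
#   perl deviantart_downloader.pl sandara artgerm
# Per-author layout produced by a run:
#   sandara/pages/offset_0000.html  - saved gallery pages (24 thumbs per page)
#   sandara/super/                  - preview-quality ("super_img") images
#   sandara/full/                   - full-size ("super_fullimg") images
#   sandara/links.html              - merged history of all thumb links seen
#   sandara/down.log                - wget download log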
use strict;
use warnings;
our $check_existing_image_integrity = 0; # If 1, recheck every already-downloaded image for integrity.
our $check_downloaded_image_integrity = 0; # If 1, check integrity right after each download.
our $auto_remove_corrupted = 1; # If 1, automatically remove corrupted files and retry the download.
our $force_html_redownload = 0; # If 1, redownload the HTML gallery pages every time.
our $download_images = 1; # If 0, images are not downloaded; only HTML pages are fetched.
our $ignore_download_errors = 1; # If 1, proceed instead of dying when a file is missing on the server (happens).
our $offset_limit = 9960; # Cap on the gallery offset, to avoid downloading HTML pages infinitely if parsing breaks.
our $down_log = "down.log"; # Download log written by wget.
our $null_file = "NUL"; # Null device; use "/dev/null" on Unix.
# Create directory $dir unless it already exists.
sub mdir($) {
    my $dir = shift;
    mkdir "$dir" or die "Can't create dir [$dir]." unless -d "$dir";
}
# Fetch $url into $file with wget, appending wget's output to $log.
sub down($$$) {
    my ($url, $file, $log) = @_;
    print "[Download]";
    return (system("wget -a \"$log\" \"$url\" -O \"$file\"") == 0);
}
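# A wget-free sketch of the same helper, assuming the LWP::Simple module is
# available (this loses wget's logging, so the $log argument would go unused):
#   use LWP::Simple qw(getstore is_success);
#   is_success(getstore($url, $file)) or die "Can't download [$url].";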
# Verify that an image decodes cleanly, using ImageMagick's `identify`.
sub check_integrity($) {
    my $fname = shift;
    print "[IntegrityCheck]";
    return system("identify -verbose -quiet -regard-warnings \"$fname\" 2>> $null_file 1>> $null_file") == 0;
}
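# `identify` exits non-zero when it can't fully decode the file, and
# -regard-warnings promotes decode warnings (e.g. a truncated file) to errors,
# which is what makes the exit code usable as a corruption test.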
# Merge @links into the link list file $fname, de-duplicating and keeping
# links recorded on previous runs.
sub unique_merge_into($@) {
    my ($fname, @links) = @_;
    my %all_links;
    if (-e $fname) {
        open my $flinks, '<', $fname or die "Can't read open [$fname].";
        while (<$flinks>) {
            next if /^\s*$/;
            chomp;
            $all_links{$_} = 1;
        }
        close $flinks;
    }
    # Merge the existing links with the new ones, for history, in case some images were removed.
    $all_links{$_} = 1 for @links;
    {
        open my $flinks, '>', $fname or die "Can't write open [$fname].";
        print $flinks "$_\n" for sort keys %all_links;
        close $flinks;
    }
}
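# For example, if sandara/links.html already lists links A and B and this run
# scraped B and C, the file ends up holding A, B and C (sorted): a link stays
# on record even after the image disappears from the gallery.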
print '*' x 40, "Start!", '*' x 40, "\n";
for my $author (@authors) {
    print "-" x 40, "[$author]", "-" x 40, "\n";
    my $offset = 0;
    mdir("$author");
    mdir("$author/pages");
    mdir("$author/super");
    mdir("$author/full");
    my $fetch_next_page = 1;
    my @links;
    my %all_found_images;
    PAGE_LOOP:
    while ($fetch_next_page) {
        # Each gallery page holds 24 images.
        die "Offset limit reached... Looks like an infinite loop; something is wrong with parsing the HTML pages." if $offset >= $offset_limit;
        $fetch_next_page = 0;
        my $url = "http://$author.deviantart.com/gallery/?offset=$offset";
        my $fname = "$author/pages/offset_".sprintf("%04d", $offset).".html";
        print "[$fname]\n";
        my $html_existed_before = -e $fname;
        if (not -e $fname or $force_html_redownload) {
            down($url, $fname, "$author/$down_log") or die "Can't download [$url] to [$fname].";
        }
        open my $file, '<', $fname or die "Can't open for reading [$fname].";
        my $text = do { local $/; <$file> };
        close $file;
        # A page that doesn't both start and end with HTML markers was cut off mid-download.
        unless ($text =~ /^\s*(<!DOCTYPE html>|<html>)/ and $text =~ /<\/html>\s*$/) {
            if ($html_existed_before and $auto_remove_corrupted) {
                print "[!Corrupted!][Delete][Restart]\n";
                unlink($fname) or die "Can't delete [$fname].";
                redo PAGE_LOOP;
            } else {
                die "Corrupted HTML page [$fname], please resolve manually.";
            }
        }
        print "[Parse]\n\n";
        $text =~ /id="gruze-main"/ or die "Can't find the magic marker of the main view in the HTML. Maybe the site's markup has changed...";
        $text = substr($text, $-[0]); # Drop everything before the main gallery view.
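        # A matched thumbnail anchor looks roughly like this (illustrative
        # reconstruction, abridged; only class, href, super_img and
        # super_fullimg matter below):
        #   <a class="thumb" href="http://sandara.deviantart.com/art/Fire-Dragon-12345"
        #      super_img="http://...jpg" super_fullimg="http://...jpg">...</a>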
        # /s lets an anchor body span newlines; stray CR/LF are normalized below.
        for my $full_link ($text =~ /<a class="thumb"[^>]*?>.*?<\/a>/gs) {
            $full_link =~ s/[\r\n]/ /g;
            $full_link =~ /^(<a.*?>)/ or die "Can't extract the first <a> tag from [$full_link].";
            my $link = $1;
            my %attrs = ($link =~ /(\w+)="(.*?)"/g);
            unless ($attrs{super_img}) {
                warn "Empty super_img in link [$link]. It is normally always present on DeviantArt. Skipping...";
                next;
            }
            my ($quality, $url) = exists $attrs{super_fullimg} ? ("f", $attrs{super_fullimg}) : ("s", $attrs{super_img});
            $attrs{href} =~ /^http:\/\/$author\.deviantart\.com\/art\/(.+)$/i or die "Can't parse [$attrs{href}].";
            $1 =~ /^(.+?)-(\d+)$/ or die "Can't split [$1] into a title and a numeric id.";
            my $id_name = sprintf("%s_%010d_%s_%s", $author, $2, $quality, $1);
            # This guarantees no infinite loop: if no new images are found for the author, finish.
            next if exists $all_found_images{$id_name};
            $all_found_images{$id_name} = $id_name;
            $url =~ /\.([a-z]+)$/ or die "Can't get a file extension from url [$url].";
            my $ftype = $1;
            my $fname = "$author/".($quality eq "s" ? "super" : ($quality eq "f" ? "full" : die))."/$id_name.$ftype";
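            # e.g. sandara/full/sandara_0000012345_f_Fire-Dragon.jpg; the
            # zero-padded deviation id (%010d) makes filenames sort numerically.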
- print "\t[$fname]\n\t";
- if (-e $fname and $check_existing_image_integrity and not check_integrity($fname)) {
- print "[!Corrupted!]";
- if ($auto_remove_corrupted) {
- print "[Delete]";
- unlink($fname);
- } else {
- die "Auto-delete not allowed... Dying...";
- }
- }
- if (not -e $fname and $download_images) {
- unless (down($url, $fname, "$author/$down_log")) {
- if ($ignore_download_errors) {
- print "[DownloadFailure][Skip]\n";
- next;
- } else {
- die "Can't download [$url].";
- }
- }
- check_integrity($fname) or die "Corrupted image [$fname] after download! Please resolve..." if $check_downloaded_image_integrity;
- }
- print "\n\n";
- $fetch_next_page = 1;
- push @links, $full_link;
- }
        $offset += 24;
    }
    unique_merge_into("$author/links.html", @links);
    unique_merge_into("all_links.html", @links);
}
print '*' x 40, "All Finished! :)", '*' x 40, "\n";