# deviantart_downloader.pl

# A Downloader for DeviantArt.com galleries. Per author.
# Either use: deviantart_downloader.pl authorname1 authorname2 authorname3 ...
# Or put a list of authors into @authors variable down below like in example.
# In case of problems see program's output (including download log in author's subdir).
# You may send letters to user moytra at Google's gmail.com

# Authors whose galleries are processed: the command-line arguments when any
# are given, otherwise the built-in example list.
our @authors = @ARGV ? @ARGV : qw(anry artgerm genzoman artgutierrez kerembeyit jenovah-art sandara yukiusagi1983 pierrerodriguez longai);

use strict;
use warnings;

# When true, every image file already on disk is passed to check_integrity()
# before the script decides whether it still has to be downloaded.
our $check_existing_image_integrity = 0;
# When true, each image is passed to check_integrity() right after its
# download, and the script dies if that check fails.
our $check_downloaded_image_integrity = 0;
# When true, a corrupted cached HTML page or a corrupted existing image is
# deleted and fetched again; when false the script dies instead.
our $auto_remove_corrupted = 1;
# When true, gallery HTML pages are downloaded again even if a cached copy exists.
our $force_html_redownload = 0;
# When 0, images are not downloaded; only the gallery pages are fetched and parsed.
our $download_images = 1;
# When true, an image whose download fails (a file missing on the server
# happens) is skipped; when false the script dies.
our $ignore_download_errors = 1;

# Safety net against downloading HTML pages forever if parsing goes wrong:
# the script dies once the gallery offset reaches this value. The offset
# advances by 24 per fetched page.
our $offset_limit = 9960;

# Name of the wget log file; it lives inside each author's directory and wget
# appends to it.
our $down_log = "down.log";
# File that check_integrity() appends the checker's stdout and stderr to.
# Put /dev/null here on Unix.
our $null_file = ":NUL";

# Make sure directory $dir exists, creating it when it is missing.
#
# Only the last path component is created (plain mkdir, not recursive), so
# the parent directory must already exist.
#
# Returns a true value on success.
# Dies when the directory is missing and cannot be created; the message
# carries the operating system's reason ($!) so the user can tell a
# permission problem from a missing parent.
sub mdir($) {
    my ($dir) = @_;

    # Nothing to do when the directory is already in place.
    return 1 if -d $dir;

    mkdir($dir) or die "Can't create dir [$dir]: $!";
    return 1;
}

# Download $url into $file with wget, appending wget's own log to $log.
#
#   $url  - address to fetch
#   $file - path the content is written to (wget -O)
#   $log  - path of the log file wget appends to (wget -a)
#
# Returns true when wget exits with status 0, false otherwise (including the
# case where wget cannot be started at all).
sub down($$$) {
    my ($url, $file, $log) = @_;
    print "[Download]";

    # List-form system() starts wget directly, without a shell. URLs come from
    # scraped HTML and paths from user-supplied author names, so they must
    # reach wget verbatim: no word splitting on spaces, no expansion of
    # "$", backticks or quotes.
    my $ok = system('wget', '-a', $log, $url, '-O', $file) == 0;

    # wget truncates/creates the -O target before it knows whether the
    # transfer will work. Remove that leftover on failure, otherwise a later
    # "-e" test would mistake an empty or partial file for a finished download.
    unlink $file if not $ok and -e $file;

    return $ok;
}

# Run the external "identify" tool (presumably ImageMagick's - confirm) on the
# image file $fname, treating warnings as errors, and report whether it
# accepted the file.
#
# Everything the tool prints on stdout and stderr is appended to $null_file.
# Returns true when the command exits with status 0, false otherwise.
sub check_integrity($) {
    my ($fname) = @_;
    print "[IntegrityCheck]";

    # The command goes through the shell because of the two redirections.
    my $command = "identify -verbose -quiet -regard-warnings \"$fname\" 2>> $null_file 1>> $null_file";
    my $exit_status = system($command);

    return $exit_status == 0;
}

# Merge @links into the line-oriented list kept in file $fname and rewrite it.
#
#   $fname - path of the list file; it is created when it does not exist yet
#   @links - entries to add, one per output line
#
# Non-blank lines already in the file are kept, so entries recorded on earlier
# runs survive even when they are no longer in @links (history of images that
# were removed since). The file is rewritten with every distinct entry on its
# own line, sorted in plain string order.
#
# Returns a true value. Dies when the file cannot be opened for reading or
# writing, or when the rewritten file cannot be flushed to disk.
sub unique_merge_into($@) {
    my ($fname, @links) = @_;

    my %all_links;

    if (-e $fname) {
        # Three-argument open takes the name literally. The two-argument form
        # strips surrounding whitespace and interprets mode characters, so it
        # can open a different file than the one "-e" just tested.
        open my $in, '<', $fname or die "Can't read open [$fname]: $!";

        # A lexical line variable keeps the caller's $_ untouched.
        while (my $line = <$in>) {
            next if $line =~ /^\s*$/;
            chomp $line;
            $all_links{$line} = 1;
        }
        close $in;
    }

    # Merge existing links with new ones. For history, if some images were removed.
    $all_links{$_} = 1 for @links;

    open my $out, '>', $fname or die "Can't write open [$fname]: $!";
    print {$out} "$_\n" for sort keys %all_links;

    # Buffered write errors (disk full, ...) only surface at close.
    close $out or die "Can't finish writing [$fname]: $!";

    return 1;
}

print '*' x 40, "Start!", '*' x 40, "\n";

# Main driver. For every author:
#   1. create "<author>", "<author>/pages", "<author>/super", "<author>/full";
#   2. walk the gallery page by page (offset grows by 24 per page), caching
#      each HTML page under "<author>/pages";
#   3. for every thumbnail link not seen before, download the image into
#      "full" (when a super_fullimg attribute exists) or "super";
#   4. merge the thumbnail links of all handled images into
#      "<author>/links.html" and the global "all_links.html".
# Paging stops at the first page that yields no image not seen before.
for my $author (@authors) {
    print "-" x 40, "[$author]", "-" x 40, "\n";

    my $offset = 0;

    mdir("$author");
    mdir("$author/pages");
    mdir("$author/super");
    mdir("$author/full");

    my $fetch_next_page = 1;    # cleared for each page, set again when a new image turns up
    my @links;                  # thumbnail <a> elements of the images handled successfully
    my %all_found_images;       # every image id seen so far for this author

    PAGE_LOOP:
    while ($fetch_next_page) {
        # Each page equals 24 images

        die "Offset limit reached... Looks like infinite loop, something wrong with parsing HTML pages." if $offset >= $offset_limit;

        $fetch_next_page = 0;
        my $url = "http://$author.deviantart.com/gallery/?offset=$offset";
        my $fname = "$author/pages/offset_".sprintf("%04d", $offset).".html";

        print "[$fname]\n";

        # Remembered before any download: only a page that was already cached
        # may be silently deleted and fetched again when it turns out broken.
        my $html_existed_before = -e $fname;

        if (not -e $fname or $force_html_redownload) {
            down($url, $fname, "$author/$down_log") or die "Can't download [$url] to [$fname].";
        }

        # Three-argument open: the file name is taken literally.
        open my $file, '<', $fname or die "Can't open for reading [$fname]: $!";
        my $text = do { local $/; <$file> };
        close $file;

        # A complete page starts with a doctype or <html> and ends with </html>.
        unless ($text =~ /^\s*(<!DOCTYPE html>|<html>)/ and $text =~ /<\/html>\s*$/) {
            if ($html_existed_before and $auto_remove_corrupted) {
                print "[!Corrupted!][Delete][Restart]\n";
                unlink($fname) or die "Can't delete [$fname]: $!";
                # Same offset again; the file is gone, so it gets downloaded anew.
                redo PAGE_LOOP;
            } else {
                die "Corrupted HTML page [$fname], please resolve manually.";
            }
        }

        print "[Parse]\n\n";

        # Only look at the part of the page from the main gallery view onwards.
        $text =~ /id="gruze-main"/ or die "Can't find magic marker of main view in HTML. Maybe website has changed encoding...";
        $text = substr($text, $-[0]);

        for my $full_link ($text =~ /<a class="thumb"[^>]*?>.*?<\/a>/g) {
            $full_link =~ s/[\r\n]/ /g;
            $full_link =~ /^(<a.*?>)/ or die "Can't extract first <a> tag from [$full_link].";
            my $link = $1;
            my %attrs = ($link =~ /(\w+)="(.*?)"/g);

            unless ($attrs{super_img}) {
                warn "Empty super_img in link [$link]. Usually always present at DeviantArt. Skipping...";
                next;
            }

            # Prefer the full-size image when the thumbnail advertises one.
            my ($quality, $subdir, $img_url) = exists $attrs{super_fullimg}
                ? ("f", "full",  $attrs{super_fullimg})
                : ("s", "super", $attrs{super_img});

            # \Q...\E: the author name is data, not a pattern.
            $attrs{href} =~ /^http:\/\/\Q$author\E\.deviantart\.com\/art\/(.+)$/i or die "Can't parse [$attrs{href}].";
            my $art_slug = $1;

            # The art slug has the form "<title>-<numeric id>".
            $art_slug =~ /^(.+?)-(\d+)$/ or die "Can't split [$art_slug] into title and numeric id.";
            my ($title, $art_id) = ($1, $2);

            # Scraped text is passed as an argument, never as part of the
            # format, so a "%" in a title cannot be read as a directive.
            my $id_name = sprintf("%s_%010d_%s_%s", $author, $art_id, $quality, $title);

            # This is a guarantee of no inf loop. If no new images are found from the author, finish...
            next if exists $all_found_images{$id_name};

            $all_found_images{$id_name} = $id_name;

            # A new image was found, so there may be more on the next page -
            # even if this particular download is skipped below.
            $fetch_next_page = 1;

            $img_url =~ /\.([a-z]+)$/ or die "Can't get file extension from url [$img_url].";
            my $ftype = $1;
            my $img_file = "$author/$subdir/$id_name.$ftype";

            print "\t[$img_file]\n\t";

            if (-e $img_file and $check_existing_image_integrity and not check_integrity($img_file)) {
                print "[!Corrupted!]";
                if ($auto_remove_corrupted) {
                    print "[Delete]";
                    # If the broken file cannot be removed it would be kept
                    # forever as "already downloaded", so fail loudly.
                    unlink($img_file) or die "Can't delete [$img_file]: $!";
                } else {
                    die "Auto-delete not allowed... Dying...";
                }
            }

            if (not -e $img_file and $download_images) {
                unless (down($img_url, $img_file, "$author/$down_log")) {
                    if ($ignore_download_errors) {
                        print "[DownloadFailure][Skip]\n";
                        next;
                    } else {
                        die "Can't download [$img_url].";
                    }
                }

                if ($check_downloaded_image_integrity) {
                    check_integrity($img_file) or die "Corrupted image [$img_file] after download! Please resolve...";
                }
            }

            print "\n\n";
            push @links, $full_link;
        }

        $offset += 24;
    }

    unique_merge_into("$author/links.html", @links);
    unique_merge_into("all_links.html", @links);
}

print '*' x 40, "All Finished! :)", '*' x 40, "\n";