wayback_machine_downloader.rb

# encoding: UTF-8

require 'thread'
require 'net/http'
require 'open-uri'
require 'fileutils'
require 'cgi'
require 'json'
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'

class WaybackMachineDownloader

    include ArchiveAPI

    VERSION = "1.1.4ex"

    attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count

    def initialize params
        @base_url = params[:base_url]
        @directory = params[:directory]
        @from_timestamp = params[:from_timestamp].to_i
        @to_timestamp = params[:to_timestamp].to_i
        @only_filter = params[:only_filter]
        @exclude_filter = params[:exclude_filter]
        @all = params[:all]
        @list = params[:list]
        @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 99999999999999
        @threads_count = params[:threads_count].to_i
    end

    def backup_name
        if @base_url.include? '//'
            @base_url.split('/')[2]
        else
            @base_url
        end
    end

    def backup_path
        if @directory
            if @directory[-1] == '/'
                @directory
            else
                @directory + '/'
            end
        else
            'websites/' + backup_name + '/'
        end
    end

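    # Filter matching: the only/exclude option is first run through to_regex; when it
    # parses as a regex (e.g. a hypothetical option written as "/\.pdf$/i"), that regex
    # is matched against the file URL, otherwise a case-insensitive substring match is
    # used (e.g. a hypothetical option of "images" matches any URL containing "images").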
    def match_only_filter file_url
        if @only_filter
            only_filter_regex = @only_filter.to_regex
            if only_filter_regex
                only_filter_regex =~ file_url
            else
                file_url.downcase.include? @only_filter.downcase
            end
        else
            true
        end
    end

    def match_exclude_filter file_url
        if @exclude_filter
            exclude_filter_regex = @exclude_filter.to_regex
            if exclude_filter_regex
                exclude_filter_regex =~ file_url
            else
                file_url.downcase.include? @exclude_filter.downcase
            end
        else
            false
        end
    end

    def get_all_snapshots_to_consider
        # Note: Passing a page index parameter allows us to get more snapshots, but from a less fresh index
        print "Getting snapshot pages"
        snapshot_list_to_consider = ""
        snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
        print "."
        snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil)
        print "."
        @maximum_pages.times do |page_index|
            snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
            break if snapshot_list.empty?
            snapshot_list_to_consider += snapshot_list
            print "."
        end
        puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider."
        puts
        snapshot_list_to_consider
    end

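    # Each snapshot line is expected to look like "<14-digit timestamp> <original url>",
    # e.g. "20180812123456 http://example.com/page.html" (an illustrative line, not real
    # data): the first 14 characters are the capture timestamp and the rest, after a
    # single separator character, is the original URL.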
    def get_file_list_curated
        file_list_curated = Hash.new
        get_all_snapshots_to_consider.each_line do |line|
            next unless line.include?('/')
            file_timestamp = line[0..13].to_i
            file_url = line[15..-2]
            file_id = file_url.split('/')[3..-1].join('/')
            file_id = CGI::unescape file_id
            file_id = file_id.tidy_bytes unless file_id == ""

            #file_id = file_id.to_s.split('.')
            #file_id = file_id[0..-2].join('.').to_s + '-' + file_timestamp.to_s + '.' + file_id[-1].to_s

            file_id = file_id.to_s + '|e20ff!44a99|' + file_timestamp.to_s
            #puts "\n" + file_id

            # here the name-based id gets the snapshot timestamp appended so identical paths from different snapshots don't collide

            if file_id.nil?
                puts "Malformed file url, ignoring: #{file_url}"
            else
                if match_exclude_filter(file_url)
                    puts "File url matches exclude filter, ignoring: #{file_url}"
                elsif not match_only_filter(file_url)
                    puts "File url doesn't match only filter, ignoring: #{file_url}"
                elsif file_list_curated[file_id]
                    unless file_list_curated[file_id][:timestamp] > file_timestamp
                        file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
                    end
                else
                    file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
                end
            end
        end
        file_list_curated
        #puts file_list_curated.to_s
    end

    def get_file_list_by_timestamp
        file_list_curated = get_file_list_curated
        file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
        file_list_curated.map do |file_remote_info|
            file_remote_info[1][:file_id] = file_remote_info[0]
            file_remote_info[1]
        end
    end

    def list_files
        # print the curated file list as a JSON array (no trailing comma after the last entry)
        files = get_file_list_by_timestamp
        puts "["
        files[0..-2].each do |file|
            puts file.to_json + ","
        end
        puts files[-1].to_json unless files.empty?
        puts "]"
    end

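    # Downloads run on @threads_count worker threads that pop entries from a shared
    # Queue; the pop is non-blocking, so each thread simply exits once the queue is empty.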
    def download_files
        start_time = Time.now
        puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
        puts

        if file_list_by_timestamp.count == 0
            puts "No files to download."
            puts "Possible reasons:"
            puts "\t* Site is not in the Wayback Machine archive."
            puts "\t* From timestamp too far in the future." if @from_timestamp and @from_timestamp != 0
            puts "\t* To timestamp too far in the past." if @to_timestamp and @to_timestamp != 0
            puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
            puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
            return
        end

        puts "#{file_list_by_timestamp.count} files to download:"

        threads = []
        @processed_file_count = 0
        @threads_count = 1 if @threads_count == 0
        @threads_count.times do
            threads << Thread.new do
                until file_queue.empty?
                    file_remote_info = file_queue.pop(true) rescue nil
                    download_file(file_remote_info) if file_remote_info
                end
            end
        end

        threads.each(&:join)
        end_time = Time.now
        puts
        puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
    end

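    # Creates dir_path, handling the case where part of the path already exists as a
    # plain file because the same URL was captured both as a page and as a directory
    # (a hypothetical example: /about saved as a file, then /about/team needing an
    # /about/ directory). The existing file is moved into the new directory as index.html.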
    def structure_dir_path dir_path
        begin
            FileUtils::mkdir_p dir_path unless File.exist? dir_path
        rescue Errno::EEXIST => e
            error_to_string = e.to_s
            puts "# #{error_to_string}"
            if error_to_string.include? "File exists @ dir_s_mkdir - "
                file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
            elsif error_to_string.include? "File exists - "
                file_already_existing = error_to_string.split("File exists - ")[-1]
            else
                raise "Unhandled directory restructure error # #{error_to_string}"
            end
            file_already_existing_temporary = file_already_existing + '.temp'
            file_already_existing_permanent = file_already_existing + '/index.html'
            FileUtils::mv file_already_existing, file_already_existing_temporary
            FileUtils::mkdir_p file_already_existing
            FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
            puts "#{file_already_existing} -> #{file_already_existing_permanent}"
            structure_dir_path dir_path
        end
    end

    def download_file file_remote_info
        file_url = file_remote_info[:file_url]
        file_id = file_remote_info[:file_id]
        # strip the timestamp marker that was appended in get_file_list_curated
        file_id = file_id.to_s.split('|e20ff!44a99|')
        file_id = file_id[-2]

        file_timestamp = file_remote_info[:timestamp]
        file_path_elements = file_id.split('/')

        file_path_relative = file_path_elements[0..-2]
        file_name = file_path_elements[-1].to_s.split('.')

        #puts "\n" + file_path_elements.to_s + "\n"
        #puts file_path_relative.to_s + "\n"
        #puts file_name.to_s + "\n"
        error_http = 0
        if file_id == ""
            dir_path = backup_path
            file_path = backup_path + 'index-' + file_timestamp.to_s + '.html'
        elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
            dir_path = backup_path + file_path_elements[0..-1].join('/')
            file_path = backup_path + file_path_elements[0..-1].join('/') + '/index-' + file_timestamp.to_s + '.html'
        else
            dir_path = backup_path + file_path_elements[0..-2].join('/')
            file_path = backup_path + file_path_relative.join('/')  + '/' +  file_name[0..-2].join('.') + '-' + file_timestamp.to_s + '.' + file_name[-1]
            #puts "\n file_path:" + file_path + "\n"
            #puts "file_path_relative.join('/'):" + file_path_relative.join('/') + "\n"
            #puts "file_name[0..-2].join('.'):" + file_name[0..-2].join('.') + "\n"
            #puts "file_name[-1]:" + file_name[-1] + "\n"
        end
        if Gem.win_platform?
            # percent-encode characters that are not allowed in Windows file names
            file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
        end
        unless File.exist? file_path
            begin
                structure_dir_path dir_path
                File.open(file_path, "wb") do |file|
                    begin
                        # the id_ flag asks the Wayback Machine for the capture as originally archived, without the toolbar or URL rewriting
                        URI.open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Pragma" => "no-cache", "Cache-Control" => "no-cache", "User-Agent" => "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36", "Upgrade-Insecure-Requests" => "1", "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding" => "identity") do |uri|
                            file.write(uri.read)
                        end
                    rescue OpenURI::HTTPError => e
                        puts "#{file_url} # #{e}"
                        error_http = 1
                        if @all
                            # with @all, keep the error response body under an !$error/ mirror of the normal path
                            if file_id == ""
                                dir_path2 = backup_path + '!$error/'
                                file_path2 = backup_path + '!$error/' + 'index-' + file_timestamp.to_s + '.html'
                            elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
                                dir_path2 = backup_path + '!$error/' + file_path_elements[0..-1].join('/')
                                file_path2 = backup_path + '!$error/' + file_path_elements[0..-1].join('/') + '/index-' + file_timestamp.to_s + '.html'
                            else
                                dir_path2 = backup_path + '!$error/' + file_path_elements[0..-2].join('/')
                                file_path2 = backup_path + '!$error/' + file_path_relative.join('/')  + '/' +  file_name[0..-2].join('.') + '-' + file_timestamp.to_s + '.' + file_name[-1]
                            end

                            if Gem.win_platform?
                                file_path2 = file_path2.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
                            end
                            structure_dir_path dir_path2
                            File.open(file_path2, "wb") do |file2|
                                file2.write(e.io.read)
                                puts "\n#{file_path2} saved anyway."
                            end
                        end
                    rescue StandardError => e
                        puts "#{file_url} # #{e}"
                    end
                end
            rescue StandardError => e
                puts "#{file_url} # #{e}"
            ensure
                if (not @all and File.exist?(file_path) and File.size(file_path) == 0) or error_http == 1
                    File.delete(file_path)
                    #Dir[file_path + '**/*'].select { |d| File.directory? d }.select { |d| (Dir.entries(d) - %w[ . .. ]).empty? }.each { |d| Dir.rmdir d }
                    puts "#{file_path} was empty or not needed and was removed."
                    error_http = 0
                end
            end
            semaphore.synchronize do
                @processed_file_count += 1
                puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
            end
        else
            semaphore.synchronize do
                @processed_file_count += 1
                puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
            end
        end
    end

    def file_queue
        @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
    end

    def file_list_by_timestamp
        @file_list_by_timestamp ||= get_file_list_by_timestamp
    end

    def semaphore
        @semaphore ||= Mutex.new
    end

end
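
# Minimal usage sketch (illustrative only; "example.com" and the option values
# below are assumptions, not taken from a real run):
#
#   downloader = WaybackMachineDownloader.new(
#       base_url: 'http://example.com',
#       threads_count: 4,            # 0 or an absent value falls back to a single thread
#       only_filter: '/\.html$/i',   # a regex string, or a plain substring to match against URLs
#       all: false                   # true also keeps HTTP error responses under !$error/
#   )
#   downloader.download_files        # or downloader.list_files to print the curated list as JSON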