Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # encoding: UTF-8
- require 'thread'
- require 'net/http'
- require 'open-uri'
- require 'fileutils'
- require 'cgi'
- require 'json'
- require_relative 'wayback_machine_downloader/tidy_bytes'
- require_relative 'wayback_machine_downloader/to_regex'
- require_relative 'wayback_machine_downloader/archive_api'
# Downloads full website snapshots from the Internet Archive's Wayback Machine,
# keeping one timestamped copy per archived file.
class WaybackMachineDownloader

  # Provides get_raw_list_from_api (CDX snapshot listing) — see
  # wayback_machine_downloader/archive_api.
  include ArchiveAPI

  VERSION = "1.1.4ex"

  # Options captured in #initialize, exposed for the CLI / callers.
  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
# @param params [Hash] downloader options:
#   :base_url       [String]  URL of the site to back up (required)
#   :directory      [String, nil] destination directory; default "websites/<host>/"
#   :from_timestamp / :to_timestamp [#to_i] snapshot range bounds; 0 means unbounded
#   :only_filter    [String, nil] substring or /regex/ a file url must match
#   :exclude_filter [String, nil] substring or /regex/ that rejects a file url
#   :all            [Object]  when truthy, also keep error pages and empty files
#   :list           [Object]  stored here; presumably drives JSON listing in the CLI — TODO confirm
#   :maximum_pages  [#to_i, nil] max CDX index pages to fetch (default: effectively unlimited)
#   :threads_count  [#to_i]   parallel download workers (0/nil falls back to 1 later)
def initialize params
  @base_url = params[:base_url]
  @directory = params[:directory]
  @from_timestamp = params[:from_timestamp].to_i
  @to_timestamp = params[:to_timestamp].to_i
  @only_filter = params[:only_filter]
  @exclude_filter = params[:exclude_filter]
  @all = params[:all]
  @list = params[:list]
  @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 99999999999999
  @threads_count = params[:threads_count].to_i
end
# Folder name for the backup: the host portion of the base URL when a
# scheme separator ("//") is present, otherwise the base URL as given.
def backup_name
  return @base_url unless @base_url.include?('//')
  @base_url.split('/')[2]
end
# Destination directory for downloaded files, always with a trailing slash.
# Uses @directory when given, otherwise "websites/<backup_name>/".
def backup_path
  return 'websites/' + backup_name + '/' unless @directory
  @directory.end_with?('/') ? @directory : "#{@directory}/"
end
# True (or a truthy match index) when file_url passes the only-filter.
# The filter is treated as a regex when String#to_regex can parse it,
# otherwise as a case-insensitive substring. No filter means everything passes.
def match_only_filter file_url
  return true unless @only_filter
  regex = @only_filter.to_regex
  if regex
    regex =~ file_url
  else
    file_url.downcase.include?(@only_filter.downcase)
  end
end
# Truthy when file_url is rejected by the exclude-filter (regex via
# String#to_regex when parseable, otherwise case-insensitive substring).
# No filter means nothing is excluded.
def match_exclude_filter file_url
  return false unless @exclude_filter
  regex = @exclude_filter.to_regex
  if regex
    regex =~ file_url
  else
    file_url.downcase.include?(@exclude_filter.downcase)
  end
end
# Fetch the raw CDX snapshot listing for the site: the base URL itself,
# then "<base_url>/*" page by page until an empty page or @maximum_pages.
# Returns the concatenated raw listing (one snapshot per line).
def get_all_snapshots_to_consider
  # NOTE: passing a page index parameter allows us to get more snapshots,
  # but from a less fresh index on the Wayback Machine side.
  print "Getting snapshot pages"
  snapshot_list_to_consider = ""
  snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
  print "."
  snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil)
  print "."
  @maximum_pages.times do |page_index|
    snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
    break if snapshot_list.empty?
    snapshot_list_to_consider += snapshot_list
    print "."
  end
  # BUGFIX: output said "snaphots" (typo) — corrected to "snapshots".
  puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider."
  puts
  snapshot_list_to_consider
end
# Parse the raw snapshot listing into a Hash keyed by a unique file id
# ("<path>|e20ff!44a99|<timestamp>"), keeping the newest snapshot per id
# and applying the only/exclude filters.
# Each value is {file_url:, timestamp:}.
def get_file_list_curated
  file_list_curated = Hash.new
  get_all_snapshots_to_consider.each_line do |line|
    next unless line.include?('/')
    # Line layout: 14-digit timestamp, a space, then the file url.
    file_timestamp = line[0..13].to_i
    file_url = line[15..-2]
    # Path components after "scheme://host"; nil when the url has no path.
    path_elements = file_url.split('/')[3..-1]
    if path_elements.nil?
      # BUGFIX: previously `.join` was called on this nil and raised
      # NoMethodError before the malformed-url check could ever run.
      puts "Malformed file url, ignoring: #{file_url}"
      next
    end
    file_id = path_elements.join('/')
    file_id = CGI::unescape file_id
    file_id = file_id.tidy_bytes unless file_id == ""
    # Append the timestamp behind a sentinel so ids stay unique across
    # snapshots of the same path (avoids duplicate-key overwrites);
    # download_file strips this suffix again.
    file_id = file_id.to_s + '|e20ff!44a99|' + file_timestamp.to_s
    if match_exclude_filter(file_url)
      puts "File url matches exclude filter, ignoring: #{file_url}"
    elsif not match_only_filter(file_url)
      puts "File url doesn't match only filter, ignoring: #{file_url}"
    elsif file_list_curated[file_id]
      # Keep the newer snapshot for a given id.
      unless file_list_curated[file_id][:timestamp] > file_timestamp
        file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
      end
    else
      file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
    end
  end
  file_list_curated
end
# Curated file list as an Array of {file_url:, timestamp:, file_id:} hashes,
# ordered newest snapshot first.
def get_file_list_by_timestamp
  sorted_entries = get_file_list_curated.sort_by { |_file_id, info| info[:timestamp] }.reverse
  sorted_entries.map do |file_id, info|
    info[:file_id] = file_id
    info
  end
end
# Print the curated file list to stdout as a JSON array.
# BUGFIX: the previous version printed a comma after every element,
# including the last, producing invalid JSON.
def list_files
  files = get_file_list_by_timestamp
  puts "["
  files.each_with_index do |file, index|
    separator = index == files.size - 1 ? "" : ","
    puts file.to_json + separator
  end
  puts "]"
end
# Download every curated file with @threads_count worker threads pulling
# from a shared queue. Prints a diagnostic summary when nothing matches.
def download_files
  start_time = Time.now
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
  puts
  if file_list_by_timestamp.count == 0
    puts "No files to download."
    puts "Possible reasons:"
    puts "\t* Site is not in Wayback Machine Archive."
    puts "\t* From timestamp too much in the future." if @from_timestamp && @from_timestamp != 0
    puts "\t* To timestamp too much in the past." if @to_timestamp && @to_timestamp != 0
    puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
    puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
    return
  end
  puts "#{file_list_by_timestamp.count} files to download:"
  threads = []
  @processed_file_count = 0
  # Default to a single worker when no thread count was given
  # (was the double negative `unless @threads_count != 0`).
  @threads_count = 1 if @threads_count.zero?
  @threads_count.times do
    threads << Thread.new do
      until file_queue.empty?
        # Non-blocking pop: another worker can drain the queue between the
        # `empty?` check and the pop; ThreadError is rescued to nil then.
        file_remote_info = file_queue.pop(true) rescue nil
        download_file(file_remote_info) if file_remote_info
      end
    end
  end
  threads.each(&:join)
  end_time = Time.now
  puts
  puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
end
# Create dir_path (and parents). When a path component already exists as a
# regular file, Errno::EEXIST is parsed to find the clashing file, which is
# moved to <component>/index.html before retrying recursively.
def structure_dir_path dir_path
  begin
    # BUGFIX: File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
    FileUtils::mkdir_p dir_path unless File.exist? dir_path
  rescue Errno::EEXIST => e
    error_to_string = e.to_s
    puts "# #{error_to_string}"
    # The clashing path is only available by parsing the message; the exact
    # wording differs between Ruby versions, hence the two patterns.
    if error_to_string.include? "File exists @ dir_s_mkdir - "
      file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
    elsif error_to_string.include? "File exists - "
      file_already_existing = error_to_string.split("File exists - ")[-1]
    else
      raise "Unhandled directory restructure error # #{error_to_string}"
    end
    # Move the clashing file aside, create the directory, then restore the
    # file as that directory's index.html.
    file_already_existing_temporary = file_already_existing + '.temp'
    file_already_existing_permanent = file_already_existing + '/index.html'
    FileUtils::mv file_already_existing, file_already_existing_temporary
    FileUtils::mkdir_p file_already_existing
    FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
    puts "#{file_already_existing} -> #{file_already_existing_permanent}"
    # Retry: deeper components may still clash.
    structure_dir_path dir_path
  end
end
# Download one snapshot (file_remote_info: {file_url:, file_id:, timestamp:})
# into backup_path, mirroring the site's directory layout with the snapshot
# timestamp embedded in the file name. On HTTP errors the error body is
# saved under "!$error/" when @all is set. Progress counting is guarded by
# #semaphore so this is safe to call from multiple worker threads.
def download_file file_remote_info
  file_url = file_remote_info[:file_url]
  file_id = file_remote_info[:file_id]
  # Strip the "|e20ff!44a99|<timestamp>" uniqueness suffix added in
  # get_file_list_curated, keeping only the path part.
  file_id = file_id.to_s.split('|e20ff!44a99|')
  file_id = file_id[-2]
  file_timestamp = file_remote_info[:timestamp]
  file_path_elements = file_id.split('/')
  file_path_relative = file_path_elements[0..-2]
  file_name = file_path_elements[-1].to_s.split('.')
  error_http = 0
  if file_id == ""
    # Site root: save as index-<timestamp>.html in the backup root.
    dir_path = backup_path
    file_path = backup_path + 'index-' + file_timestamp.to_s + '.html'
  elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
    # Directory-like url (trailing slash or no extension): index file inside.
    dir_path = backup_path + file_path_elements[0..-1].join('/')
    file_path = backup_path + file_path_elements[0..-1].join('/') + '/index-' + file_timestamp.to_s + '.html'
  else
    # Regular file: insert "-<timestamp>" before the extension.
    dir_path = backup_path + file_path_elements[0..-2].join('/')
    file_path = backup_path + file_path_relative.join('/') + '/' + file_name[0..-2].join('.') + '-' + file_timestamp.to_s + '.' + file_name[-1]
  end
  if Gem.win_platform?
    # Percent-encode characters Windows forbids in file names.
    file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
  end
  unless File.exist? file_path
    begin
      structure_dir_path dir_path
      # BUGFIX: File.open / URI.open — Kernel#open on a URL was removed in
      # Ruby 3.0 and File.exists? in Ruby 3.2.
      File.open(file_path, "wb") do |file|
        begin
          URI.open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Pragma" => "no-cache", "Cache-Control" => "no-cache", "User-Agent" => "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36", "Upgrade-Insecure-Requests" => "1", "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding" => "identity") do |uri|
            file.write(uri.read)
          end
        rescue OpenURI::HTTPError => e
          puts "#{file_url} # #{e}"
          error_http = 1
          if @all
            # Mirror the path computation above, rooted under "!$error/".
            if file_id == ""
              dir_path2 = backup_path + '!$error/'
              file_path2 = backup_path + '!$error/' + 'index-' + file_timestamp.to_s + '.html'
            elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
              dir_path2 = backup_path + '!$error/' + file_path_elements[0..-1].join('/')
              file_path2 = backup_path + '!$error/' + file_path_elements[0..-1].join('/') + '/index-' + file_timestamp.to_s + '.html'
            else
              dir_path2 = backup_path + '!$error/' + file_path_elements[0..-2].join('/')
              file_path2 = backup_path + '!$error/' + file_path_relative.join('/') + '/' + file_name[0..-2].join('.') + '-' + file_timestamp.to_s + '.' + file_name[-1]
            end
            if Gem.win_platform?
              file_path2 = file_path2.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
            end
            structure_dir_path dir_path2
            File.open(file_path2, "wb") do |file2|
              # e.io carries the error response body (e.g. the 404 page).
              file2.write(e.io.read)
              puts "\n#{file_path2} saved anyway."
            end
          end
        rescue StandardError => e
          puts "#{file_url} # #{e}"
        end
      end
    rescue StandardError => e
      puts "#{file_url} # #{e}"
    ensure
      # Remove the main copy when it is empty (and empties are unwanted) or
      # when the download failed; same grouping as the original
      # `not A and B and C or D` (and/or are left-associative, equal precedence).
      if (!@all && File.exist?(file_path) && File.size(file_path) == 0) || error_http == 1
        File.delete(file_path)
        puts "#{file_path} was empty/not need and was removed."
        error_http = 0
      end
    end
    semaphore.synchronize do
      @processed_file_count += 1
      puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
    end
  else
    semaphore.synchronize do
      @processed_file_count += 1
      puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
    end
  end
end
# Lazily-built work queue holding every file_remote_info hash to download,
# shared by the worker threads (Queue is thread-safe).
def file_queue
  @file_queue ||= begin
    queue = Queue.new
    file_list_by_timestamp.each { |file_info| queue << file_info }
    queue
  end
end
# Memoized curated file list (newest snapshots first); the underlying
# fetch hits the network, so compute it at most once.
def file_list_by_timestamp
  @file_list_by_timestamp = get_file_list_by_timestamp if @file_list_by_timestamp.nil?
  @file_list_by_timestamp
end
# Mutex guarding the shared @processed_file_count progress counter.
def semaphore
  @semaphore = Mutex.new if @semaphore.nil?
  @semaphore
end
- end
Add Comment
Please, Sign In to add comment