wayback_machine_downloader.rb

# encoding: UTF-8

require 'thread'
require 'net/http'
require 'open-uri'
require 'fileutils'
require 'cgi'
require 'json'
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'

class WaybackMachineDownloader

    include ArchiveAPI

    VERSION = "1.1.4ex"

    attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count

    def initialize params
        @base_url = params[:base_url]
        @directory = params[:directory]
        @from_timestamp = params[:from_timestamp].to_i
        @to_timestamp = params[:to_timestamp].to_i
        @only_filter = params[:only_filter]
        @exclude_filter = params[:exclude_filter]
        @all = params[:all]
        @list = params[:list]
        @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 99999999999999
        @threads_count = params[:threads_count].to_i
    end

    def backup_name
        if @base_url.include? '//'
            @base_url.split('/')[2]
        else
            @base_url
        end
    end

    def backup_path
        if @directory
            if @directory[-1] == '/'
                @directory
            else
                @directory + '/'
            end
        else
            'websites/' + backup_name + '/'
        end
    end

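    # Filter matching: the only/exclude option is first run through to_regex; when it
    # parses as a regex (e.g. a hypothetical option written as "/\.pdf$/i"), that regex
    # is matched against the file URL, otherwise a case-insensitive substring match is
    # used (e.g. a hypothetical option of "images" matches any URL containing "images").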
    def match_only_filter file_url
        if @only_filter
            only_filter_regex = @only_filter.to_regex
            if only_filter_regex
                only_filter_regex =~ file_url
            else
                file_url.downcase.include? @only_filter.downcase
            end
        else
            true
        end
    end

    def match_exclude_filter file_url
        if @exclude_filter
            exclude_filter_regex = @exclude_filter.to_regex
            if exclude_filter_regex
                exclude_filter_regex =~ file_url
            else
                file_url.downcase.include? @exclude_filter.downcase
            end
        else
            false
        end
    end

    def get_all_snapshots_to_consider
        # Note: Passing a page index parameter allows us to get more snapshots, but from a less fresh index
        print "Getting snapshot pages"
        snapshot_list_to_consider = ""
        snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
        print "."
        snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil)
        print "."
        @maximum_pages.times do |page_index|
            snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
            break if snapshot_list.empty?
            snapshot_list_to_consider += snapshot_list
            print "."
        end
        puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider."
        puts
        snapshot_list_to_consider
    end

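    # Each snapshot line is expected to look like "<14-digit timestamp> <original url>",
    # e.g. "20180812123456 http://example.com/page.html" (an illustrative line, not real
    # data): the first 14 characters are the capture timestamp and the rest, after a
    # single separator character, is the original URL.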
    def get_file_list_curated
        file_list_curated = Hash.new
        get_all_snapshots_to_consider.each_line do |line|
            next unless line.include?('/')
            file_timestamp = line[0..13].to_i
            file_url = line[15..-2]
            file_id = file_url.split('/')[3..-1].join('/')
            file_id = CGI::unescape file_id
            file_id = file_id.tidy_bytes unless file_id == ""

            #file_id = file_id.to_s.split('.')
            #file_id = file_id[0..-2].join('.').to_s + '-' + file_timestamp.to_s + '.' + file_id[-1].to_s

            file_id = file_id.to_s + '|e20ff!44a99|' + file_timestamp.to_s
            #puts "\n" + file_id

            # here the name-based id gets the snapshot timestamp appended so identical paths from different snapshots don't collide

            if file_id.nil?
                puts "Malformed file url, ignoring: #{file_url}"
            else
                if match_exclude_filter(file_url)
                    puts "File url matches exclude filter, ignoring: #{file_url}"
                elsif not match_only_filter(file_url)
                    puts "File url doesn't match only filter, ignoring: #{file_url}"
                elsif file_list_curated[file_id]
                    unless file_list_curated[file_id][:timestamp] > file_timestamp
                        file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
                    end
                else
                    file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
                end
            end
        end
        file_list_curated
        #puts file_list_curated.to_s
    end

    def get_file_list_by_timestamp
        file_list_curated = get_file_list_curated
        file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
        file_list_curated.map do |file_remote_info|
            file_remote_info[1][:file_id] = file_remote_info[0]
            file_remote_info[1]
        end
    end

    def list_files
        # print the curated file list as a JSON array (no trailing comma after the last entry)
        files = get_file_list_by_timestamp
        puts "["
        files[0..-2].each do |file|
            puts file.to_json + ","
        end
        puts files[-1].to_json unless files.empty?
        puts "]"
    end

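    # Downloads run on @threads_count worker threads that pop entries from a shared
    # Queue; the pop is non-blocking, so each thread simply exits once the queue is empty.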
    def download_files
        start_time = Time.now
        puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
        puts

        if file_list_by_timestamp.count == 0
            puts "No files to download."
            puts "Possible reasons:"
            puts "\t* Site is not in the Wayback Machine archive."
            puts "\t* From timestamp too far in the future." if @from_timestamp and @from_timestamp != 0
            puts "\t* To timestamp too far in the past." if @to_timestamp and @to_timestamp != 0
            puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
            puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
            return
        end

        puts "#{file_list_by_timestamp.count} files to download:"

        threads = []
        @processed_file_count = 0
        @threads_count = 1 if @threads_count == 0
        @threads_count.times do
            threads << Thread.new do
                until file_queue.empty?
                    file_remote_info = file_queue.pop(true) rescue nil
                    download_file(file_remote_info) if file_remote_info
                end
            end
        end

        threads.each(&:join)
        end_time = Time.now
        puts
        puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
    end

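    # Creates dir_path, handling the case where part of the path already exists as a
    # plain file because the same URL was captured both as a page and as a directory
    # (a hypothetical example: /about saved as a file, then /about/team needing an
    # /about/ directory). The existing file is moved into the new directory as index.html.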
    def structure_dir_path dir_path
        begin
            FileUtils::mkdir_p dir_path unless File.exist? dir_path
        rescue Errno::EEXIST => e
            error_to_string = e.to_s
            puts "# #{error_to_string}"
            if error_to_string.include? "File exists @ dir_s_mkdir - "
                file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
            elsif error_to_string.include? "File exists - "
                file_already_existing = error_to_string.split("File exists - ")[-1]
            else
                raise "Unhandled directory restructure error # #{error_to_string}"
            end
            file_already_existing_temporary = file_already_existing + '.temp'
            file_already_existing_permanent = file_already_existing + '/index.html'
            FileUtils::mv file_already_existing, file_already_existing_temporary
            FileUtils::mkdir_p file_already_existing
            FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
            puts "#{file_already_existing} -> #{file_already_existing_permanent}"
            structure_dir_path dir_path
        end
    end

    def download_file file_remote_info
        file_url = file_remote_info[:file_url]
        file_id = file_remote_info[:file_id]
        # strip the timestamp marker that was appended in get_file_list_curated
        file_id = file_id.to_s.split('|e20ff!44a99|')
        file_id = file_id[-2]

        file_timestamp = file_remote_info[:timestamp]
        file_path_elements = file_id.split('/')

        file_path_relative = file_path_elements[0..-2]
        file_name = file_path_elements[-1].to_s.split('.')

        #puts "\n" + file_path_elements.to_s + "\n"
        #puts file_path_relative.to_s + "\n"
        #puts file_name.to_s + "\n"
        error_http = 0
        if file_id == ""
            dir_path = backup_path
            file_path = backup_path + 'index-' + file_timestamp.to_s + '.html'
        elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
            dir_path = backup_path + file_path_elements[0..-1].join('/')
            file_path = backup_path + file_path_elements[0..-1].join('/') + '/index-' + file_timestamp.to_s + '.html'
        else
            dir_path = backup_path + file_path_elements[0..-2].join('/')
            file_path = backup_path + file_path_relative.join('/')  + '/' +  file_name[0..-2].join('.') + '-' + file_timestamp.to_s + '.' + file_name[-1]
            #puts "\n file_path:" + file_path + "\n"
            #puts "file_path_relative.join('/'):" + file_path_relative.join('/') + "\n"
            #puts "file_name[0..-2].join('.'):" + file_name[0..-2].join('.') + "\n"
            #puts "file_name[-1]:" + file_name[-1] + "\n"
        end
        if Gem.win_platform?
            # percent-encode characters that are not allowed in Windows file names
            file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
        end
        unless File.exist? file_path
            begin
                structure_dir_path dir_path
                File.open(file_path, "wb") do |file|
                    begin
                        # the id_ flag asks the Wayback Machine for the capture as originally archived, without the toolbar or URL rewriting
                        URI.open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Pragma" => "no-cache", "Cache-Control" => "no-cache", "User-Agent" => "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36", "Upgrade-Insecure-Requests" => "1", "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding" => "identity") do |uri|
                            file.write(uri.read)
                        end
                    rescue OpenURI::HTTPError => e
                        puts "#{file_url} # #{e}"
                        error_http = 1
                        if @all
                            # with @all, keep the error response body under an !$error/ mirror of the normal path
                            if file_id == ""
                                dir_path2 = backup_path + '!$error/'
                                file_path2 = backup_path + '!$error/' + 'index-' + file_timestamp.to_s + '.html'
                            elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
                                dir_path2 = backup_path + '!$error/' + file_path_elements[0..-1].join('/')
                                file_path2 = backup_path + '!$error/' + file_path_elements[0..-1].join('/') + '/index-' + file_timestamp.to_s + '.html'
                            else
                                dir_path2 = backup_path + '!$error/' + file_path_elements[0..-2].join('/')
                                file_path2 = backup_path + '!$error/' + file_path_relative.join('/')  + '/' +  file_name[0..-2].join('.') + '-' + file_timestamp.to_s + '.' + file_name[-1]
                            end

                            if Gem.win_platform?
                                file_path2 = file_path2.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
                            end
                            structure_dir_path dir_path2
                            File.open(file_path2, "wb") do |file2|
                                file2.write(e.io.read)
                                puts "\n#{file_path2} saved anyway."
                            end
                        end
                    rescue StandardError => e
                        puts "#{file_url} # #{e}"
                    end
                end
            rescue StandardError => e
                puts "#{file_url} # #{e}"
            ensure
                if (not @all and File.exist?(file_path) and File.size(file_path) == 0) or error_http == 1
                    File.delete(file_path)
                    #Dir[file_path + '**/*'].select { |d| File.directory? d }.select { |d| (Dir.entries(d) - %w[ . .. ]).empty? }.each { |d| Dir.rmdir d }
                    puts "#{file_path} was empty or not needed and was removed."
                    error_http = 0
                end
            end
            semaphore.synchronize do
                @processed_file_count += 1
                puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
            end
        else
            semaphore.synchronize do
                @processed_file_count += 1
                puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
            end
        end
    end

    def file_queue
        @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
    end

    def file_list_by_timestamp
        @file_list_by_timestamp ||= get_file_list_by_timestamp
    end

    def semaphore
        @semaphore ||= Mutex.new
    end

end
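
# Minimal usage sketch (illustrative only; "example.com" and the option values
# below are assumptions, not taken from a real run):
#
#   downloader = WaybackMachineDownloader.new(
#       base_url: 'http://example.com',
#       threads_count: 4,            # 0 or an absent value falls back to a single thread
#       only_filter: '/\.html$/i',   # a regex string, or a plain substring to match against URLs
#       all: false                   # true also keeps HTTP error responses under !$error/
#   )
#   downloader.download_files        # or downloader.list_files to print the curated list as JSON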