Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
require 'open-uri'

# Helpers for querying the web.archive.org CDX search endpoint.
#
# Meant to be mixed into another object: the query filters are read from the
# including object's @all, @from_timestamp and @to_timestamp instance
# variables (any of which may be unset).
module ArchiveAPI
  # Endpoint prefix; the caller-supplied url is appended verbatim after it.
  CDX_SEARCH_URL = "http://web.archive.org/cdx/search/cdx?url="

  # HTTP headers sent with every request (cache-bypassing, browser-like).
  REQUEST_HEADERS = {
    "Pragma" => "no-cache",
    "Cache-Control" => "no-cache",
    "User-Agent" => "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    "Upgrade-Insecure-Requests" => "1",
    "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding" => "identity"
  }.freeze

  # Upper bound on HTTP attempts per call, so a persistent error cannot loop forever.
  MAX_REQUEST_ATTEMPTS = 5

  # Base pause between attempts, in seconds; multiplied by the attempt number.
  RETRY_DELAY_SECONDS = 1

  # Fetches one page of raw results from the CDX search endpoint.
  #
  # The request URL is printed before every attempt. On an HTTP error the
  # request is retried (printing "try") up to MAX_REQUEST_ATTEMPTS times in
  # total, pausing a little longer before each new attempt.
  #
  # @param url [String] value for the `url` query parameter; appended as-is
  #   (NOTE(review): it is not URL-escaped here — confirm callers pass a safe value)
  # @param page_index [Integer, nil] page number to request, or nil for no `page` parameter
  # @return [String] the response body
  # @raise [OpenURI::HTTPError] when the last allowed attempt still fails
  def get_raw_list_from_api(url, page_index)
    request_url = "#{CDX_SEARCH_URL}#{url}#{parameters_for_api(page_index)}"
    attempts = 0

    begin
      attempts += 1
      puts "\n" + request_url + "\n"
      # URI.open is the open-uri entry point; Kernel#open no longer accepts
      # URLs on Ruby 3.0+. A copy of the headers is passed so the shared
      # frozen constant is never handed to library code.
      URI.open(request_url, REQUEST_HEADERS.dup).read
    rescue OpenURI::HTTPError
      # Give up (re-raising the current error) once the attempt budget is spent.
      raise if attempts >= MAX_REQUEST_ATTEMPTS

      puts "try"
      sleep(RETRY_DELAY_SECONDS * attempts)
      retry
    end
  end

  # Builds the query-string suffix appended after the `url` value.
  #
  # - Adds `&filter=statuscode:200` unless @all is truthy.
  # - Adds `&from=` / `&to=` when the matching timestamp is set and not 0.
  # - Adds `&page=` whenever page_index is non-nil (so page 0 is included).
  #
  # @param page_index [Integer, nil] page number, or nil to omit the parameter
  # @return [String] the parameter string, beginning with "&"
  def parameters_for_api(page_index)
    parameters = ["&fl=timestamp,original&collapse=digest&matchType=domain&gzip=false"]
    parameters << "&filter=statuscode:200" unless @all
    parameters << "&from=#{@from_timestamp}" if @from_timestamp && @from_timestamp != 0
    parameters << "&to=#{@to_timestamp}" if @to_timestamp && @to_timestamp != 0
    parameters << "&page=#{page_index}" if page_index
    parameters.join
  end
end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement