daily pastebin goal
64%
SHARE
TWEET

win-warc-extractor2.py

a guest Jan 1st, 2019 7 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python3
  2. """
  3.    warc-extractor, a simple command line tool for expanding warc files.
  4.    Copyright (C) 2014  Ryan Chartier
  5.    Portions (C) 2012 Internet Archive
  6.  
  7.    This program is free software: you can redistribute it and/or modify
  8.    it under the terms of the GNU General Public License as published by
  9.    the Free Software Foundation, either version 3 of the License, or
  10.    (at your option) any later version.
  11.  
  12.    This program is distributed in the hope that it will be useful,
  13.    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14.    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15.    GNU General Public License for more details.
  16.  
  17.    You should have received a copy of the GNU General Public License
  18.    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19.  
  20.  
  21. warc.utils
  22. ~~~~~~~~~~
  23.  
  24. This file is part of warc
  25.  
  26. :copyright: (c) 2012 Internet Archive
  27.  
  28.  
  29. warc.warc
  30. ~~~~~~~~~
  31.  
  32. Python library to work with WARC files.
  33.  
  34. :copyright: (c) 2012 Internet Archive
  35.  
  36. """
  37.  
  38. from collections import MutableMapping
  39. from http.client import HTTPMessage
  40. from urllib.parse import urlparse, unquote
  41. from pprint import pprint
  42. import os
  43. import re
  44. import argparse
  45. import sys
  46. import mimetypes
  47. import shutil
  48. import email.parser
  49. import gzip
  50. import datetime
  51. import uuid
  52. import re
  53. import io
  54. import hashlib
  55.  
  56. #---------------------------------------------------
  57. #                      warc.utils                  -
  58. #---------------------------------------------------
  59. SEP = re.compile("[;:=]")
  60.  
  61. class CaseInsensitiveDict(MutableMapping):
  62.     """Almost like a dictionary, but keys are case-insensitive.
  63.  
  64.        >>> d = CaseInsensitiveDict(foo=1, Bar=2)
  65.        >>> d['foo']
  66.        1
  67.        >>> d['bar']
  68.        2
  69.        >>> d['Foo'] = 11
  70.        >>> d['FOO']
  71.        11
  72.        >>> d.keys()
  73.        ["foo", "bar"]
  74.    """
  75.     def __init__(self, *args, **kwargs):
  76.         self._d = {}
  77.         self.update(dict(*args, **kwargs))
  78.  
  79.     def __setitem__(self, name, value):
  80.         self._d[name.lower()] = value
  81.  
  82.     def __getitem__(self, name):
  83.         return self._d[name.lower()]
  84.  
  85.     def __delitem__(self, name):
  86.         del self._d[name.lower()]
  87.  
  88.     def __eq__(self, other):
  89.         return isinstance(other, CaseInsensitiveDict) and other._d == self._d
  90.  
  91.     def __iter__(self):
  92.         return iter(self._d)
  93.  
  94.     def __len__(self):
  95.         return len(self._d)
  96.  
  97. class FilePart:
  98.     """File interface over a part of file.
  99.  
  100.    Takes a file and length to read from the file and returns a file-object
  101.    over that part of the file.
  102.    """
  103.     def __init__(self, fileobj, length):
  104.         self.fileobj = fileobj
  105.         self.length = length
  106.         self.offset = 0
  107.         self.buf = b''
  108.  
  109.     def read(self, size=-1):
  110.         if size == -1:
  111.             size = self.length
  112.  
  113.         if len(self.buf) >= size:
  114.             content = self.buf[:size]
  115.             self.buf = self.buf[size:]
  116.         else:
  117.             size = min(size, self.length - self.offset)
  118.             content = self.buf + self.fileobj.read(size - len(self.buf))
  119.             self.buf = b''
  120.         self.offset += len(content)
  121.         return content
  122.  
  123.     def _unread(self, content):
  124.         self.buf = content + self.buf
  125.         self.offset -= len(content)
  126.  
  127.     def readline(self, size=1024):
  128.         chunks = []
  129.         chunk = self.read(size)
  130.         while chunk and b"\n" not in chunk:
  131.             chunks.append(chunk)
  132.             chunk = self.read(size)
  133.  
  134.         if b"\n" in chunk:
  135.             index = chunk.index(b"\n")
  136.             self._unread(chunk[index+1:])
  137.             chunk = chunk[:index+1]
  138.         chunks.append(chunk)
  139.         return b"".join(chunks)
  140.  
  141.     def __iter__(self):
  142.         line = self.readline()
  143.         while line:
  144.             yield line
  145.             line = self.readline()
  146.  
  147. class HTTPObject(CaseInsensitiveDict):
  148.     """Small object to help with parsing HTTP warc entries"""
  149.     def __init__(self, request_file):
  150.         #Parse version line
  151.         id_str_raw = request_file.readline()
  152.         id_str = id_str_raw.decode("iso-8859-1")
  153.         if "HTTP" not in id_str:
  154.             #This is not an HTTP object.
  155.             request_file._unread(id_str_raw)
  156.             raise ValueError("Object is not HTTP.")
  157.  
  158.         words = id_str.split()
  159.         command = path = status = error = version = None
  160.         #If length is not 3 it is a bad version line.
  161.         if len(words) >= 3:
  162.             if words[1].isdigit():
  163.                 version = words[0]
  164.                 error = words[1]
  165.                 status = " ".join(words[2:])
  166.             else:
  167.                 command, path, version = words
  168.  
  169.         self._id = {
  170.             "vline": id_str_raw,
  171.             "command": command,
  172.             "path": path,
  173.             "status": status,
  174.             "error": error,
  175.             "version": version,
  176.         }
  177.  
  178.         self._header, self.hstring = self._parse_headers(request_file)
  179.         super().__init__(self._header)
  180.         self.payload = request_file
  181.         self._content = None
  182.  
  183.     @staticmethod
  184.     def _parse_headers(fp):
  185.         """This is a modification of the python3 http.clint.parse_headers function."""
  186.         headers = []
  187.         while True:
  188.             line = fp.readline(65536)
  189.             headers.append(line)
  190.             if line in (b'\r\n', b'\n', b''):
  191.                 break
  192.         hstring = b''.join(headers)
  193.         return email.parser.Parser(_class=HTTPMessage).parsestr(hstring.decode('iso-8859-1')), hstring
  194.  
  195.     def __repr__(self):
  196.         return(self.vline + str(self._header))
  197.  
  198.     def __getitem__(self, name):
  199.         try:
  200.             return super().__getitem__(name)
  201.         except KeyError:
  202.             value = name.lower()
  203.             if value == "content_type":
  204.                 return self.content.type
  205.             elif value in self.content:
  206.                 return self.content[value]
  207.             elif value in self._id:
  208.                 return self._id[value]
  209.             else:
  210.                 raise
  211.  
  212.     def _reset(self):
  213.         self.payload._unread(self.hstring)
  214.         self.payload._unread(self._id['vline'])
  215.  
  216.     def write_to(self, f):
  217.         f.write(self._id['vline'])
  218.         f.write(self.hstring)
  219.         f.write(self.payload.read())
  220.         f.write(b"\r\n\r\n")
  221.         f.flush()
  222.  
  223.     @property
  224.     def content(self):
  225.         if self._content is None:
  226.             try:
  227.                 string = self._d["content-type"]
  228.             except KeyError:
  229.                 string = ''
  230.             self._content = ContentType(string)
  231.         return self._content
  232.  
  233.     @property
  234.     def vline(self):
  235.         return self._id["vline"].decode("iso-8859-1")
  236.  
  237.     @property
  238.     def version(self):
  239.         return self._id["version"]
  240.  
  241.     def write_payload_to(self, fp):
  242.         encoding = self._header.get("Transfer-Encoding", "None")
  243.         if encoding == "chunked":
  244.             found = b''
  245.             length = int(str(self.payload.readline(), "iso-8859-1").rstrip(), 16)
  246.             while length > 0:
  247.                 found += self.payload.read(length)
  248.                 self.payload.readline()
  249.                 length = int(str(self.payload.readline(), "iso-8859-1").rstrip(), 16)
  250.         else:
  251.             length = int(self._header.get("Content-Length", -1))
  252.             found = self.payload.read(length)
  253.  
  254.         fp.write(found)
  255.  
  256. class ContentType(CaseInsensitiveDict):
  257.     def __init__(self, string):
  258.         data = {}
  259.         self.type = ''
  260.         if string:
  261.             _list = [i.strip() for i in string.lower().split(";")]
  262.             self.type = _list[0]
  263.             #print(string)
  264.  
  265.             data["type"] = _list[0]
  266.             for i in _list[1:]:
  267.                 test = [n.strip() for n in re.split(SEP, i)]
  268.                 if len(test) == 1:
  269.                     print('list length error with string: \'' + string + '\' (extra semicolon?)')
  270.                 else:
  271.                     data[test[0]] = test[1]
  272.  
  273.         super().__init__(data)
  274.  
  275.     def __repr__(self):
  276.         return self.type
  277.  
  278.  
  279. #---------------------------------------------------
  280. #                      warc.warc                   -
  281. #---------------------------------------------------
  282.  
  283. class WARCHeader(CaseInsensitiveDict):
  284.     """The WARC Header object represents the headers of a WARC record.
  285.  
  286.    It provides dictionary like interface for accessing the headers.
  287.  
  288.    The following mandatory fields are accessible also as attributes.
  289.  
  290.        * h.record_id == h['WARC-Record-ID']
  291.        * h.content_length == int(h['Content-Length'])
  292.        * h.date == h['WARC-Date']
  293.        * h.type == h['WARC-Type']
  294.  
  295.    :params headers: dictionary of headers.
  296.    :params defaults: If True, important headers like WARC-Record-ID,
  297.                      WARC-Date, Content-Type and Content-Length are
  298.                      initialized to automatically if not already present.
  299.  
  300.    """
  301.     CONTENT_TYPES = dict(warcinfo='application/warc-fields',
  302.                         response='application/http; msgtype=response',
  303.                         request='application/http; msgtype=request',
  304.                         metadata='application/warc-fields')
  305.  
  306.     KNOWN_HEADERS = {
  307.         "type": "WARC-Type",
  308.         "date": "WARC-Date",
  309.         "record_id": "WARC-Record-ID",
  310.         "ip_address": "WARC-IP-Address",
  311.         "target_uri": "WARC-Target-URI",
  312.         "warcinfo_id": "WARC-Warcinfo-ID",
  313.         "request_uri": "WARC-Request-URI",
  314.         "content_type": "Content-Type",
  315.         "content_length": "Content-Length"
  316.     }
  317.  
  318.     def __init__(self, headers, defaults=False):
  319.         self.version = "WARC/1.0"
  320.         super().__init__(headers)
  321.         if defaults:
  322.             self.init_defaults()
  323.  
  324.     def __repr__(self):
  325.         return "<WARCHeader: type={}, record_id={}>".format(self.type, self.record_id)
  326.  
  327.     def init_defaults(self):
  328.         """Initializes important headers to default values, if not already specified.
  329.  
  330.        The WARC-Record-ID header is set to a newly generated UUID.
  331.        The WARC-Date header is set to the current datetime.
  332.        The Content-Type is set based on the WARC-Type header.
  333.        The Content-Length is initialized to 0.
  334.        """
  335.         if "WARC-Record-ID" not in self:
  336.             self['WARC-Record-ID'] = "<urn:uuid:%s>" % uuid.uuid1()
  337.         if "WARC-Date" not in self:
  338.             self['WARC-Date'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
  339.         if "Content-Type" not in self:
  340.             self['Content-Type'] = WARCHeader.CONTENT_TYPES.get(self.type, "application/octet-stream")
  341.  
  342.     def write_to(self, f):
  343.         """Writes this header to a file, in the format specified by WARC.
  344.        """
  345.         f.write(self.version.encode() + b"\r\n")
  346.         for name, value in self.items():
  347.             name = name.title()
  348.             # Use standard forms for commonly used patterns
  349.             name = name.replace("Warc-", "WARC-").replace("-Ip-", "-IP-").replace("-Id", "-ID").replace("-Uri", "-URI")
  350.             entry = "{}: {}\r\n".format(str(name), str(value)).encode()
  351.             f.write(entry)
  352.  
  353.         # Header ends with an extra CRLF
  354.         f.write(b"\r\n")
  355.  
  356.     @property
  357.     def content_length(self):
  358.         """The Content-Length header as int."""
  359.         return int(self['Content-Length'])
  360.  
  361.     @property
  362.     def type(self):
  363.         """The value of WARC-Type header."""
  364.         return self['WARC-Type']
  365.  
  366.     @property
  367.     def record_id(self):
  368.         """The value of WARC-Record-ID header."""
  369.         return self['WARC-Record-ID']
  370.  
  371.     @property
  372.     def date(self):
  373.         """The value of WARC-Date header."""
  374.         return self['WARC-Date']
  375.  
  376. class WARCRecord(object):
  377.     """The WARCRecord object represents a WARC Record.
  378.    """
  379.     def __init__(self, header=None, payload=None,  headers={}, defaults=True):
  380.         """Creates a new WARC record.
  381.        """
  382.  
  383.         if header is None and defaults is True:
  384.             headers.setdefault("WARC-Type", "response")
  385.  
  386.         self.header = header or WARCHeader(headers, defaults=True)
  387.  
  388.         if defaults is True and 'Content-Length' not in self.header:
  389.             if payload:
  390.                 self.header['Content-Length'] = len(payload)
  391.             else:
  392.                 self.header['Content-Length'] = "0"
  393.  
  394.         if defaults is True and 'WARC-Payload-Digest' not in self.header:
  395.             self.header['WARC-Payload-Digest'] = self._compute_digest(payload)
  396.  
  397.         if isinstance(payload, str):
  398.             payload = payload.encode()
  399.         if isinstance(payload, bytes):
  400.             payload = io.BytesIO(payload)
  401.  
  402.         self.payload = payload
  403.         self._http = None
  404.         self._content = None
  405.  
  406.     def _compute_digest(self, payload):
  407.         return "sha1:" + hashlib.sha1(payload).hexdigest()
  408.  
  409.     def write_to(self, f):
  410.         self.header.write_to(f)
  411.         if self.http:
  412.             self.http._reset()
  413.         f.write(self.payload.read())
  414.         f.write(b"\r\n")
  415.         f.write(b"\r\n")
  416.         f.flush()
  417.  
  418.     @property
  419.     def content(self):
  420.         if self._content is None:
  421.             try:
  422.                 string = self.header["content-type"]
  423.             except KeyError:
  424.                 string = ''
  425.             self._content = ContentType(string)
  426.         return self._content
  427.  
  428.     @property
  429.     def http(self):
  430.         if self._http is None:
  431.             if 'application/http' in self.header['content-type']:
  432.                 self._http = HTTPObject(self.payload)
  433.             else:
  434.                 self._http = False
  435.         return self._http
  436.  
  437.     @property
  438.     def type(self):
  439.         """Record type"""
  440.         return self.header.type
  441.  
  442.     @property
  443.     def url(self):
  444.         """The value of the WARC-Target-URI header if the record is of type "response"."""
  445.         return self.header.get('WARC-Target-URI')
  446.  
  447.     @property
  448.     def ip_address(self):
  449.         """The IP address of the host contacted to retrieve the content of this record.
  450.  
  451.        This value is available from the WARC-IP-Address header."""
  452.         return self.header.get('WARC-IP-Address')
  453.  
  454.     @property
  455.     def date(self):
  456.         """UTC timestamp of the record."""
  457.         return self.header.get("WARC-Date")
  458.  
  459.     @property
  460.     def checksum(self):
  461.         return self.header.get('WARC-Payload-Digest')
  462.  
  463.     def __getitem__(self, name):
  464.         try:
  465.             return self.header[name]
  466.         except KeyError:
  467.             if name == "content_type":
  468.                 return self.content.type
  469.             elif name in self.content:
  470.                 return self.content[name]
  471.  
  472.     def __setitem__(self, name, value):
  473.         self.header[name] = value
  474.  
  475.     def __contains__(self, name):
  476.         return name in self.header
  477.  
  478.     def __repr__(self):
  479.         return "<WARCRecord: type=%r record_id=%s>" % (self.type, self['WARC-Record-ID'])
  480.  
  481.     @staticmethod
  482.     def from_response(response):
  483.         """Creates a WARCRecord from given response object.
  484.  
  485.        This must be called before reading the response. The response can be
  486.        read after this method is called.
  487.  
  488.        :param response: An instance of :class:`requests.models.Response`.
  489.        """
  490.         # Get the httplib.HTTPResponse object
  491.         http_response = response.raw._original_response
  492.  
  493.         # HTTP status line, headers and body as strings
  494.         status_line = "HTTP/1.1 %d %s" % (http_response.status, http_response.reason)
  495.         headers = str(http_response.msg)
  496.         body = http_response.read()
  497.  
  498.         # Monkey-patch the response object so that it is possible to read from it later.
  499.         response.raw._fp = io.BytesIO(body)
  500.  
  501.         # Build the payload to create warc file.
  502.         payload = status_line + "\r\n" + headers + "\r\n" + body
  503.  
  504.         headers = {
  505.             "WARC-Type": "response",
  506.             "WARC-Target-URI": response.request.url.encode('utf-8')
  507.         }
  508.         return WARCRecord(payload=payload, headers=headers)
  509.  
  510. class WARCFile:
  511.     def __init__(self, filename=None, mode=None, fileobj=None, compress=None):
  512.         if fileobj is None:
  513.             fileobj = open(filename, mode or "rb")
  514.             mode = fileobj.mode
  515.         # initiaize compress based on filename, if not already specified
  516.         if compress is None and filename and filename.endswith(".gz"):
  517.             compress = True
  518.  
  519.         if compress:
  520.             fileobj = gzip.open(fileobj, mode)
  521.  
  522.         self.fileobj = fileobj
  523.         self._reader = None
  524.  
  525.     def __enter__(self):
  526.         return self
  527.  
  528.     def __exit__(self, exc_type, exc_value, traceback):
  529.         self.close()
  530.  
  531.     def __iter__(self):
  532.         return iter(self.reader)
  533.  
  534.     @property
  535.     def reader(self):
  536.         if self._reader is None:
  537.             self._reader = WARCReader(self.fileobj)
  538.         return self._reader
  539.  
  540.     def write_record(self, warc_record):
  541.         """Adds a warc record to this WARC file.
  542.        """
  543.         warc_record.write_to(self.fileobj)
  544.  
  545.     def read_record(self):
  546.         """Reads a warc record from this WARC file."""
  547.         return self.reader.read_record()
  548.  
  549.     def close(self):
  550.         self.fileobj.close()
  551.  
  552.     def tell(self):
  553.         """Returns the file offset.
  554.        """
  555.         return self.fileobj.tell()
  556.  
  557. class WARCReader:
  558.     RE_VERSION = re.compile("WARC/(\d+.\d+)\r\n")
  559.     RE_HEADER = re.compile(r"([a-zA-Z_\-]+): *(.*)\r\n")
  560.     SUPPORTED_VERSIONS = ["1.0"]
  561.  
  562.     def __init__(self, fileobj):
  563.         self.fileobj = fileobj
  564.         self.current_payload = None
  565.  
  566.     def read_header(self, fileobj):
  567.         version_line = fileobj.readline().decode("utf-8")
  568.         if not version_line:
  569.             return None
  570.  
  571.         m = self.RE_VERSION.match(version_line)
  572.         if not m:
  573.             raise IOError("Bad version line: %r" % version_line)
  574.         version = m.group(1)
  575.         if version not in self.SUPPORTED_VERSIONS:
  576.             raise IOError("Unsupported WARC version: %s" % version)
  577.  
  578.         headers = {}
  579.         while True:
  580.             line = fileobj.readline().decode("utf-8")
  581.             if line == "\r\n": # end of headers
  582.                 break
  583.             m = self.RE_HEADER.match(line)
  584.             if not m:
  585.                 raise IOError("Bad header line: %r" % line)
  586.             name, value = m.groups()
  587.             headers[name] = value
  588.         return WARCHeader(headers)
  589.  
  590.     def expect(self, fileobj, expected_line, message=None):
  591.         line = fileobj.readline().decode("utf-8")
  592.         if line != expected_line:
  593.             message = message or "Expected %r, found %r" % (expected_line, line)
  594.             raise IOError(message)
  595.  
  596.     def finish_reading_current_record(self):
  597.         # consume the footer from the previous record
  598.         if self.current_payload:
  599.             # consume all data from the current_payload before moving to next record
  600.             self.current_payload.read()
  601.             self.expect(self.current_payload.fileobj, "\r\n")
  602.             self.expect(self.current_payload.fileobj, "\r\n")
  603.             self.current_payload = None
  604.  
  605.     def read_record(self):
  606.         self.finish_reading_current_record()
  607.         fileobj = self.fileobj
  608.  
  609.         header = self.read_header(fileobj)
  610.         if header is None:
  611.             return None
  612.  
  613.         self.current_payload = FilePart(fileobj, header.content_length)
  614.         record = WARCRecord(header, self.current_payload, defaults=False)
  615.         return record
  616.  
  617.     def _read_payload(self, fileobj, content_length):
  618.         size = 0
  619.         while size < content_length:
  620.             chunk_size = min(1024, content_length-size)
  621.             chunk = fileobj.read(chunk_size)
  622.             size += chunk_size
  623.             yield chunk
  624.  
  625.     def __iter__(self):
  626.         record = self.read_record()
  627.         while record is not None:
  628.             yield record
  629.             record = self.read_record()
  630.  
  631. #---------------------------------------------------
  632. #                 Extractor                        -
  633. #---------------------------------------------------
  634.  
  635. counts = {}
  636.  
  637. class filterObject:
  638.     """Basic object for storing filters."""
  639.     def __init__(self, string):
  640.         self.result = True
  641.         if string[0] == "!":
  642.             self.result = False
  643.             string = string[1:]
  644.  
  645.         _list = string.lower().split(":")
  646.  
  647.         self.http = (_list[0] == 'http')
  648.         if self.http:
  649.             del _list[0]
  650.  
  651.         self.k = _list[0]
  652.         self.v = _list[1]
  653.  
  654. def inc(obj, header=False, dic=False):
  655.     """Short script for counting entries."""
  656.     if header:
  657.         try:
  658.             obj = obj[header]
  659.         except KeyError:
  660.             obj = None
  661.  
  662.     holder = counts
  663.     if dic:
  664.         if dic not in counts:
  665.             counts[dic] = {}
  666.         holder = counts[dic]
  667.  
  668.     if obj in holder:
  669.         holder[obj] += 1
  670.     else:
  671.         holder[obj] = 1
  672.  
  673. def warc_records(string, path):
  674.     """Iterates over warc records in path."""
  675.     for filename in os.listdir(path):
  676.         if re.search(string, filename) and ".warc" in filename:
  677.             print("parsing", filename)
  678.             with WARCFile(path + filename) as warc_file:
  679.                 for record in warc_file:
  680.                     yield record
  681.  
  682. def checkFilter(filters, record):
  683.     """Check record against filters."""
  684.     for i in filters:
  685.         if i.http:
  686.             if not record.http:
  687.                 return False
  688.             value = record.http
  689.         else:
  690.             value = record.header
  691.  
  692.         string = value.get(i.k, None)
  693.         if not string or (i.v in string) != i.result:
  694.             return False
  695.     return True
  696.  
  697. def parse(args):
  698.     #Clear output warc file.
  699.     if args.dump == "warc":
  700.         if args.silence:
  701.             print("Recording", args.dump, "to", args.output + ".")
  702.         with open(args.output_path + args.output, "wb"):
  703.             pass
  704.  
  705.     for record in warc_records(args.string, args.path):
  706.         try:
  707.             #Filter out unwanted entries.
  708.             if not checkFilter(args.filter, record):
  709.                 continue
  710.  
  711.             #Increment Index counters.
  712.             if args.silence:
  713.                 inc("records")
  714.                 inc(record,"warc-type", "types")
  715.                 inc(record, "content_type", "warc-content")
  716.                 if record.http:
  717.                     inc(record.http, "content_type", "http-content")
  718.                     inc(record.http, "error", "status")
  719.  
  720.             #Dump records to file.
  721.             if args.dump == "warc":
  722.                 with open(args.output_path + args.output, "ab") as output:
  723.                     record.write_to(output)
  724.  
  725.             if args.dump == "content":
  726.                 url = urlparse(unquote(record['WARC-Target-URI']))
  727.  
  728.                 #Set up folder
  729.                 index = url.path.rfind("/") + 1
  730.                 file = url.path[index:]
  731.                 path = url.path[:index]
  732.  
  733.                 #Process filename
  734.                 if "." not in file:
  735.                     path += file
  736.                     if not path.endswith("/"):
  737.                         path += "/"
  738.  
  739.                     file = 'index.html'
  740.  
  741.                 #Final fixes.
  742.                 path = path.replace(".", "-")
  743.                
  744.                 host = url.hostname.replace('www.', '', 1)
  745.                 path = args.output_path + host + path
  746.                
  747.                 path = path.replace("\\", "\")
  748.                 #path = path.replace("/", "∕")
  749.                 path = path.replace(":", "꞉")
  750.                 path = path.replace("*", "⁎")
  751.                 path = path.replace("?", "︖")
  752.                 path = path.replace("\"", "”")
  753.                 path = path.replace("<", "‹")
  754.                 path = path.replace(">", "›")
  755.                 path = path.replace("|", "⏐")
  756.                
  757.                 illegalWords = ["CON", "PRN","AUX","NUL",
  758.                 "COM1","COM2","COM3","COM4","COM5","COM6","COM7","COM8","COM9","COM0",
  759.                 "LPT1","LPT2","LPT3","LPT4","LPT5","LPT6","LPT7","LPT8","LPT9","LPT0"]
  760.                
  761.                 for item in illegalWords:
  762.                     for z in range(0, path.upper().count("/" + item + "/")):
  763.                         print("Illegal word \"{0}\" encountered in path \"{1}\", prefixing word with \"_\"\n\n".format(item, path))
  764.                         start = path.upper().find("/" + item + "/") + 1
  765.                         end = start + len(item)
  766.                         splitter = path[start:end + 1]
  767.                         joiner = "_" + splitter
  768.                         path = joiner.join(path.split(splitter))
  769.  
  770.                 #Create new directories
  771.                 if not os.path.exists(path):
  772.                     try:
  773.                         os.makedirs(path)
  774.                     except OSError:
  775.                         path = "/".join([i[:25] for i in path.split("/")]).replace("*", "⁎").replace("\\", "\")
  776.                         os.makedirs(path)
  777.  
  778.                 #Test if file has a proper extension.
  779.                 index = file.index(".")
  780.                 suffix = file[index:]
  781.                 content = record.http.get("content_type", "")
  782.                 slist = mimetypes.guess_all_extensions(content)
  783.                 if suffix not in slist:
  784.                     #Correct suffix if we can.
  785.                     suffix = mimetypes.guess_extension(content)
  786.                     if suffix:
  787.                         file = file[:index] + suffix
  788.                     else:
  789.                         inc(record.http, "content_type", "unknown mime type")
  790.  
  791.                 #Check for gzip compression.
  792.                 if record.http.get("content-encoding", None) == "gzip":
  793.                     file += ".gz"
  794.  
  795.                 path += file
  796.  
  797.                 #If Duplicate file then insert numbers
  798.                 index = path.rfind(".")
  799.                 temp = path
  800.                 n = 0
  801.                 while os.path.isfile(temp):
  802.                     n +=1
  803.                     temp = path[:index] + "("+ str(n) + ")" + path[index:]
  804.                 path = temp.replace("\"", "”")
  805.                 path = path.replace("\\", "\")
  806.                 #path = path.replace("/", "∕")
  807.                 path = path.replace(":", "꞉")
  808.                 path = path.replace("*", "⁎")
  809.                 path = path.replace("?", "︖")
  810.                 path = path.replace("\"", "”")
  811.                 path = path.replace("<", "‹")
  812.                 path = path.replace(">", "›")
  813.                 path = path.replace("|", "⏐")
  814.                
  815.  
  816.                 #Write file.
  817.                 if len(path.split('/')[-1]) > 230:
  818.                     print('file \'' + path + '\' has a name that is too long, not writing')
  819.                 else:
  820.                     with open(path.replace("\\", "\"), 'wb') as fp:
  821.                         record.http.write_payload_to(fp)
  822.         except Exception:
  823.             if args.error:
  824.                 if args.silence:
  825.                     print("Error in record. Recording to error.warc.")
  826.                 with open(args.output_path + "error.warc", "ab") as fp:
  827.                     record.write_to(fp)
  828.             else:
  829.                 raise
  830.  
  831.     #print results
  832.     if args.silence:
  833.         print("-----------------------------")
  834.         for i in counts:
  835.             print("\nCount of {}.".format(i))
  836.             pprint(counts[i])
  837.  
  838. if __name__ == "__main__":
  839.     parser = argparse.ArgumentParser(description='Extracts attributes from warc files.')
  840.     parser.add_argument("filter", nargs='*', help="Attributes to filter by. Entries that do not contain filtered elements are ignored. Example: warc-type:response, would ignore all warc entries that are not responses. Attributes in an HTTP object should be prefixed by 'http'. Example, http:error:200.")
  841.     parser.add_argument("-silence", action="store_false", help="Silences output of warc data.")
  842.     parser.add_argument("-error", action="store_true", help="Silences most errors and records problematic warc entries to error.warc.")
  843.     parser.add_argument("-string", default="", help="Regular expression to limit parsed warc files. Defaults to empty string.")
  844.     parser.add_argument("-path", default="./", help="Path to folder containing warc files. Defaults to current folder.")
  845.     parser.add_argument("-output_path", default="data/", help="Path to folder to dump content files. Defaults to data/ folder.")
  846.     parser.add_argument("-output", default="output.warc", help="File to output warc contents. Defaults to 'output.warc'.")
  847.     parser.add_argument("-dump", choices=['warc', 'content'], type=str, help="Dumps all entries that survived filter. 'warc' creates a filtered warc file. 'content' tries to reproduce file structure of archived websites.")
  848.     args = parser.parse_args()
  849.  
  850.     if args.path[-1] != "/":
  851.         args.path += "/"
  852.  
  853.     if args.output_path[-1] != "/":
  854.         args.output_path += "/"
  855.  
  856.     if args.dump:
  857.         if not os.path.exists(args.output_path):
  858.             os.makedirs(args.output_path)
  859.  
  860.     #Forced filters
  861.     if args.dump == "content":
  862.         args.filter.append("warc-type:response")
  863.         args.filter.append("content-type:application/http")
  864.  
  865.     args.filter = [filterObject(i) for i in args.filter]
  866.  
  867.     args.string = re.compile(args.string)
  868.     parse(args)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top