Guest User

Untitled

a guest
Aug 24th, 2020
76
0
Never
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
Python 3.89 KB | None | 0 0
  1. Traceback (most recent call last):
  2.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/requests/adapters.py", line 449, in send
  3.     timeout=timeout
  4.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 727, in urlopen
  5.     method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
  6.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/urllib3/util/retry.py", line 403, in increment
  7.     raise six.reraise(type(error), error, _stacktrace)
  8.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/urllib3/packages/six.py", line 735, in reraise
  9.     raise value
  10.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 677, in urlopen
  11.     chunked=chunked,
  12.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 428, in _make_request
  13.     self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
  14.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 336, in _raise_timeout
  15.     self, url, "Read timed out. (read timeout=%s)" % timeout_value
  16. urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='index.commoncrawl.org', port=443): Read timed out. (read timeout=30.0)
  17.  
  18. During handling of the above exception, another exception occurred:
  19.  
  20. Traceback (most recent call last):
  21.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/cdx_toolkit/myrequests.py", line 33, in myrequests_get
  22.     timeout=(30., 30.), allow_redirects=False)
  23.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/requests/api.py", line 76, in get
  24.     return request('get', url, params=params, **kwargs)
  25.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/requests/api.py", line 61, in request
  26.     return session.request(method=method, url=url, **kwargs)
  27.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/requests/sessions.py", line 530, in request
  28.     resp = self.send(prep, **send_kwargs)
  29.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/requests/sessions.py", line 643, in send
  30.     r = adapter.send(request, **kwargs)
  31.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/requests/adapters.py", line 529, in send
  32.     raise ReadTimeout(e, request=request)
  33. requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='index.commoncrawl.org', port=443): Read timed out. (read timeout=30.0)
  34.  
  35. During handling of the above exception, another exception occurred:
  36. Traceback (most recent call last):
  37.   File "launcher.py", line 80, in <module>
  38.     main()
  39.   File "/home/corpus/corpus-data-processing/utils/urlprocessing.py", line 74, in process_pdfs
  40.     for obj in self.cdx.iter(url, limit=limit, filter="mime:application/pdf"):
  41.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/cdx_toolkit/__init__.py", line 198, in __next__
  42.     self.get_more()
  43.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/cdx_toolkit/__init__.py", line 175, in get_more
  44.     params=self.params, index_list=self.index_list)
  45.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/cdx_toolkit/__init__.py", line 303, in get_for_iter
  46.     resp = myrequests_get(endpoint, params=params, cdx=True)
  47.   File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/cdx_toolkit/myrequests.py", line 63, in myrequests_get
  48.     raise ValueError(string)
  49. ValueError: Final failure for url https://index.commoncrawl.org/CC-MAIN-2020-10-index {'limit': 20, 'filter': ['mime:application/pdf'], 'url': 'https://www.rsa.com/content/*', 'output': 'json', 'page': 0, 'from': '20190820184512'}: HTTPSConnectionPool(host='index.commoncrawl.org', port=443): Read timed out. (read timeout=30.0)
Add Comment
Please sign in to add a comment.