Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Traceback (most recent call last):
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/requests/adapters.py", line 449, in send
- timeout=timeout
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 727, in urlopen
- method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/urllib3/util/retry.py", line 403, in increment
- raise six.reraise(type(error), error, _stacktrace)
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/urllib3/packages/six.py", line 735, in reraise
- raise value
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 677, in urlopen
- chunked=chunked,
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 428, in _make_request
- self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/urllib3/connectionpool.py", line 336, in _raise_timeout
- self, url, "Read timed out. (read timeout=%s)" % timeout_value
- urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='index.commoncrawl.org', port=443): Read timed out. (read timeout=30.0)
- During handling of the above exception, another exception occurred:
- Traceback (most recent call last):
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/cdx_toolkit/myrequests.py", line 33, in myrequests_get
- timeout=(30., 30.), allow_redirects=False)
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/requests/api.py", line 76, in get
- return request('get', url, params=params, **kwargs)
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/requests/api.py", line 61, in request
- return session.request(method=method, url=url, **kwargs)
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/requests/sessions.py", line 530, in request
- resp = self.send(prep, **send_kwargs)
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/requests/sessions.py", line 643, in send
- r = adapter.send(request, **kwargs)
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/requests/adapters.py", line 529, in send
- raise ReadTimeout(e, request=request)
- requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='index.commoncrawl.org', port=443): Read timed out. (read timeout=30.0)
- During handling of the above exception, another exception occurred:
- Traceback (most recent call last):
- File "launcher.py", line 80, in <module>
- main()
- File "/home/corpus/corpus-data-processing/utils/urlprocessing.py", line 74, in process_pdfs
- for obj in self.cdx.iter(url, limit=limit, filter="mime:application/pdf"):
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/cdx_toolkit/__init__.py", line 198, in __next__
- self.get_more()
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/cdx_toolkit/__init__.py", line 175, in get_more
- params=self.params, index_list=self.index_list)
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/cdx_toolkit/__init__.py", line 303, in get_for_iter
- resp = myrequests_get(endpoint, params=params, cdx=True)
- File "/home/corpus/corpus-data-processing/venv/lib/python3.7/site-packages/cdx_toolkit/myrequests.py", line 63, in myrequests_get
- raise ValueError(string)
- ValueError: Final failure for url https://index.commoncrawl.org/CC-MAIN-2020-10-index {'limit': 20, 'filter': ['mime:application/pdf'], 'url': 'https://www.rsa.com/content/*', 'output': 'json', 'page': 0, 'from': '20190820184512'}: HTTPSConnectionPool(host='index.commoncrawl.org', port=443): Read timed out. (read timeout=30.0)
Add Comment
Please sign in to add a comment