Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- """
- Turn a common_crawl identifier into a normal-looking URL
- """
- import sys
- import re
def is_port(s):
    """Return True if *s* is a decimal TCP/UDP port number (0-65535).

    Args:
        s: Candidate string, e.g. the text after the last ``:`` in a path.

    Returns:
        True when *s* consists solely of ASCII digits and its integer value
        does not exceed 65535; False otherwise (including the empty string).
    """
    # [0-9] rather than \d: under Python 3, \d also matches non-ASCII
    # decimal digits (which int() happily parses), and those are never
    # valid in a URL port.  \A...\Z anchors reject a trailing newline.
    if re.search(r"\A[0-9]+\Z", s) is None:
        return False
    return int(s) <= 65535
def crawl_id_to_url(identifier):
    """Convert one common_crawl identifier into a normal-looking URL.

    The identifier is parsed as
    ``<reversed.domain>[/<path>[:<port>]]:<scheme>``, for example
    ``com.example.www/index.html:8080:http``.

    Args:
        identifier: A single identifier with trailing whitespace removed.

    Returns:
        The URL as ``scheme://domain[:port]/path``.  The result always
        contains a ``/`` after the host, even when the path is empty.

    Raises:
        ValueError: If *identifier* contains no ``:`` before the scheme.
    """
    # The scheme is whatever follows the last colon.
    rest, sep, schema = identifier.rpartition(":")
    if not sep:
        raise ValueError("no ':<scheme>' suffix in %r" % (identifier,))

    # Everything before the first slash is the reversed host name; the
    # remainder (possibly empty) is the path.
    domain, _, path = rest.partition("/")

    # A trailing ":<digits>" on the path is treated as the port, but only
    # when the digits form a valid port number; otherwise the colon is
    # assumed to belong to the path itself.
    # NOTE(review): a port is only looked for in the path, so an identifier
    # with a port but no "/" keeps the port inside the domain - confirm
    # such inputs never occur.
    port = None
    maybe_path, sep, maybe_port = path.rpartition(":")
    if sep and is_port(maybe_port):
        path = maybe_path
        port = maybe_port

    # Undo the reversed-label ordering: "com.example.www" -> "www.example.com".
    host = ".".join(reversed(domain.split(".")))
    netloc = host if port is None else host + ":" + port
    return schema + "://" + netloc + "/" + path


def main():
    """Translate each identifier on stdin into a URL on stdout, one per line.

    Blank lines are skipped.  Lines that cannot be parsed are reported on
    stderr and skipped, so one bad record does not abort the whole stream.
    """
    for line in sys.stdin:
        identifier = line.rstrip()
        if not identifier:
            # Blank lines carry no identifier; skip them rather than fail.
            continue
        try:
            url = crawl_id_to_url(identifier)
        except ValueError as err:
            sys.stderr.write("skipping malformed line: %s\n" % (err,))
            continue
        sys.stdout.write(url + "\n")


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement