Guest User

taguqoxora.py

a guest
Feb 10th, 2015
252
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/python
  2.  
  3. """
  4. Turn a common_crawl identifier into a normal-looking URL
  5. """
  6.  
  7. import sys
  8. import re
  9.  
  10. def is_port(s):
  11.     if re.findall(r"\A\d+\Z", s):
  12.         if int(s) <= 65535:
  13.             return True
  14.     return False
  15.  
  16. for line in sys.stdin:
  17.     url = line.rstrip()
  18.     rest, schema =  url.rsplit(":", 1)
  19.  
  20.     try:
  21.         domain, path = rest.split('/', 1)
  22.     except ValueError:
  23.         domain = rest
  24.         path = ''
  25.  
  26.     try:
  27.         maybe_path, maybe_port = path.rsplit(":", 1)
  28.     except ValueError:
  29.         port = None
  30.     else:
  31.         if is_port(maybe_port):
  32.             path = maybe_path
  33.             port = maybe_port
  34.         else:
  35.             port = None
  36.  
  37.     sys.stdout.write(schema + '://' + '.'.join(domain.split('.')[::-1]) + (':' + port if port is not None else '') + '/' + path + "\n")
RAW Paste Data