Advertisement
cmiN

Inefficient link crawler — downloads one page and lists the links found in it

Jan 5th, 2013
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.25 KB | None | 0 0
#! /usr/bin/env python3


import re
import sys
from urllib import error, request
from urllib.parse import urljoin, urlparse
  9.  
  10. def extract_urls(fname, path):
  11.     """Extract urls from file.
  12.    
  13.    Open file in text mode then search
  14.    for all URLs using a simple regex and
  15.    return them as a list.
  16.    """
  17.  
  18.     # how to find links
  19.     template = r'(a.+?href=")([^"]+)"'
  20.  
  21.     # parse file and save the links
  22.     links = set()
  23.     with open(fname) as fin:
  24.         regen = re.finditer(template, fin.read())
  25.         for regex in regen:
  26.             string = regex.group(2)
  27.             if urlparse(string).netloc:
  28.                 pass
  29.             elif string.startswith("#"):
  30.                 continue    # no need for fragments
  31.             elif string.startswith(".."):
  32.                 # a "../" relative link
  33.                 string = path[:path.rfind("/")] +\
  34.                          string[2:]
  35.             else:
  36.                 string = path + "/" + string.strip("./")
  37.             parsed = urlparse(string)
  38.             string = parsed.scheme + "://" + parsed.netloc +\
  39.                      parsed.path
  40.             links.add(string)
  41.  
  42.     return list(links)
  43.  
  44.  
  45. def main(argc, argv):
  46.     if argc != 2:
  47.         print("Usage: {} URL".format(argv[0]))
  48.         return 0
  49.  
  50.     # get name
  51.     parsed = urlparse(argv[1])
  52.     regex = re.search(r"(\w+\.)[a-z]+$", parsed.path,
  53.                       flags=re.I)
  54.     urlPath = parsed.scheme + "://" + parsed.netloc
  55.     if regex:
  56.         urlName = regex.group()
  57.         urlPath += "/" + parsed.path[:regex.start()].strip("/")
  58.     else:
  59.         urlName = "index.html"
  60.  
  61.     # create object from link
  62.     try:
  63.         urlReq = request.Request(parsed.geturl())
  64.     except ValueError:
  65.         print("Error: Invalid URL")
  66.         return 1
  67.     else:
  68.         urlReq.add_header("User-Agent", "Mozilla/5.0")
  69.  
  70.     # open it and wirte data to file
  71.     try:
  72.         uin = request.urlopen(urlReq)
  73.     except error.URLError:
  74.         print("Error: Invalid URL")
  75.         return 1
  76.     else:
  77.         with open(urlName, "wb") as fout:
  78.             fout.write(uin.read())
  79.  
  80.     print(extract_urls(urlName, urlPath))
  81.     return 0
  82.  
  83.  
  84. if __name__ == "__main__":
  85.     rc = main(len(sys.argv), sys.argv)
  86.     sys.exit(rc)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement