Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #! /usr/bin/env python3
- import re
- import sys
- from urllib import request, error
- from urllib.parse import urlparse
def extract_urls(fname, path):
    """Extract hyperlinks from a saved HTML file.

    Open *fname* in text mode, find every ``<a ... href="...">`` target
    with a simple regex, resolve relative references against *path* (a
    base URL such as ``"http://host/dir"``), normalise each result to
    ``scheme://netloc/path`` (query strings and fragments are dropped),
    and return the de-duplicated URLs as a list.

    Fragment-only links (``"#..."``) are skipped entirely.
    """
    # Anchor-tag pattern; group 2 captures the href value.
    href_re = re.compile(r'(a.+?href=")([^"]+)"')
    # Read the whole document up front so the file is closed promptly.
    # Explicit encoding: the platform default is not reliable for web pages.
    with open(fname, encoding="utf-8") as fin:
        content = fin.read()
    links = set()
    for match in href_re.finditer(content):
        target = match.group(2)
        if urlparse(target).netloc:
            pass  # already an absolute URL -- keep as-is
        elif target.startswith("#"):
            continue  # in-page fragment -- no need to record it
        elif target.startswith(".."):
            # "../" relative link: step one directory up from the base.
            target = path[:path.rfind("/")] + target[2:]
        else:
            # Plain relative link: join onto the base path.
            # NOTE(review): strip("./") also removes trailing '.'/'/' chars;
            # kept as-is to preserve the original behaviour.
            target = path + "/" + target.strip("./")
        # Normalise to scheme://host/path only.
        parsed = urlparse(target)
        links.add(parsed.scheme + "://" + parsed.netloc + parsed.path)
    return list(links)
def main(argc, argv):
    """Download the page at ``argv[1]``, save it locally, print its links.

    The page is written to a file named after the last path component of
    the URL (``index.html`` when none is present), then ``extract_urls``
    is run over it and the result is printed.

    Returns a process exit code: 0 on success, 1 on any error
    (bad usage, invalid URL, failed download).
    """
    if argc != 2:
        print("Usage: {} URL".format(argv[0]))
        return 1  # usage error is a failure -- original returned 0
    parsed = urlparse(argv[1])
    # Try to pull a trailing file name ("name.ext") off the URL path.
    match = re.search(r"(\w+\.)[a-z]+$", parsed.path, flags=re.I)
    url_path = parsed.scheme + "://" + parsed.netloc
    if match:
        url_name = match.group()
        url_path += "/" + parsed.path[:match.start()].strip("/")
    else:
        url_name = "index.html"
        # BUG FIX: the directory part of the URL was previously dropped
        # here, so relative links resolved against the wrong base.
        if parsed.path.strip("/"):
            url_path += "/" + parsed.path.strip("/")
    # Create the request object from the link.
    try:
        url_req = request.Request(parsed.geturl())
    except ValueError:
        print("Error: Invalid URL")
        return 1
    url_req.add_header("User-Agent", "Mozilla/5.0")
    # Open it and write the data to a local file.
    try:
        uin = request.urlopen(url_req)
    except error.URLError:
        print("Error: Invalid URL")
        return 1
    with uin, open(url_name, "wb") as fout:
        fout.write(uin.read())
    print(extract_urls(url_name, url_path))
    return 0
# Script entry point: exit with main()'s return code.
if __name__ == "__main__":
    sys.exit(main(len(sys.argv), sys.argv))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement