Advertisement
tom_enos

getLinks

Sep 28th, 2013
112
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.40 KB | None | 0 0
  1. from tkinter import *
  2. from tkinter.simpledialog import askstring
  3. import tkinter.filedialog, urllib.request, urllib.error, sys, re
  4.  
  def getUrl(prompt=None):
      """Ask the user for a site address and return it with an HTTP(S) scheme.

      Args:
          prompt: Optional callable taking ``(title, message)`` and returning
              the entered text, or ``None`` if the user cancelled. When
              omitted, the tkinter ``askstring`` dialog is used.

      Returns:
          The entered address with surrounding whitespace removed. Addresses
          that do not already start with ``http://`` or ``https://``
          (case-insensitive) get ``http://`` prepended.

      Raises:
          SystemExit: If the dialog is cancelled or the input is blank.
      """
      if prompt is None:
          prompt = askstring
      urltoopen = prompt('Address', 'Enter A Site')
      # The dialog yields None on cancel; treat that the same as blank input.
      # str.strip() returns a new string, so the result must be kept.
      urltoopen = urltoopen.strip() if urltoopen is not None else ''
      if not urltoopen:
          print("No address entered")
          sys.exit(0)
      # Accept both schemes so "https://..." is not mangled into
      # "http://https://...".
      if not urltoopen.lower().startswith(("http://", "https://")):
          urltoopen = "http://" + urltoopen
      return urltoopen
  11.  
  def readPage(urltoopen):
      """Download a page and return the contents of its ``<body>`` element(s).

      Args:
          urltoopen: URL to fetch with ``urllib.request.urlopen``.

      Returns:
          A list of strings, one per ``<body>...</body>`` match (tag name
          matched case-insensitively). If the document has no body tags, the
          whole document is returned as the single list element so callers
          always have text to scan.

      Raises:
          SystemExit: If the URL cannot be opened (``URLError``).
      """
      # Keep the try body limited to the network call; the context manager
      # closes the response even if reading fails.
      try:
          with urllib.request.urlopen(urltoopen) as response:
              raw = response.read()
              charset = response.headers.get_content_charset() or 'utf-8'
      except urllib.error.URLError as e:
          print("Error accessing URL",e.reason)
          sys.exit(0)

      # Decode with the charset the server declared; undecodable bytes are
      # replaced rather than aborting the whole run.
      try:
          html = raw.decode(charset, errors='replace')
      except LookupError:
          # The server declared an encoding Python does not know.
          html = raw.decode('utf-8', errors='replace')

      body = re.findall(r'<body.*?>(.*?)</body>', html,
                        re.DOTALL | re.IGNORECASE)
      return body or [html]
  20.  
  def getLinks(body):
      """Collect the distinct absolute HTTP(S) link targets found in a page body.

      Args:
          body: List of body strings; only the first element is scanned.
              An empty list yields an empty result.

      Returns:
          A set of ``href`` values taken from ``<a ...>`` tags whose value
          starts with ``http://`` or ``https://`` (case-insensitive).
          Relative links and other schemes are skipped.
      """
      if not body:
          return set()
      # Capture the href attribute of each anchor tag, accepting either quote
      # style; the back-reference makes the closing quote match the opening.
      matches = re.findall(r'<a\b[^>]*?\shref\s*=\s*(["\'])(.*?)\1', body[0],
                           re.DOTALL | re.IGNORECASE)
      targets = (target.strip() for _quote, target in matches)
      # Require the scheme at the start of the value so that a URL embedded in
      # a relative or javascript: href is not reported as a link.
      return {
          target for target in targets
          if target.lower().startswith(("http://", "https://"))
      }
  32.  
  def writetofile(saveToFile, links):
      """Write each link to ``saveToFile`` on its own line.

      Args:
          saveToFile: Writable text file object (anything with ``write``).
          links: Iterable of link strings. Sets are written in sorted order
              so the output is the same on every run; other iterables keep
              their own order.
      """
      # Set iteration order for strings changes between interpreter runs,
      # which would otherwise reorder the saved file on every run.
      if isinstance(links, (set, frozenset)):
          links = sorted(links)
      for a in links:
          saveToFile.write(a + '\n')
  37.  
  def run():
      """Prompt for a site, extract its links, and save them to a chosen file.

      Creates a hidden Tk root so the dialogs have a parent window, then
      calls ``getUrl``, ``readPage`` and ``getLinks`` in turn and writes the
      result with ``writetofile``. If the save dialog is cancelled, nothing
      is written.
      """
      root = Tk()
      root.withdraw()
      try:
          site = getUrl()
          body = readPage(site)
          links = getLinks(body)
          saveToFile = tkinter.filedialog.asksaveasfile()
          # asksaveasfile() returns None when the user cancels the dialog.
          if saveToFile is None:
              print("Save cancelled; no file written")
              return
          # The context manager closes the file even if writing fails.
          with saveToFile:
              writetofile(saveToFile, links)
      finally:
          # Tear down the hidden root window on every exit path, including
          # SystemExit raised by the helpers.
          root.destroy()

  if __name__ == "__main__":
      run()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement