Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Extract every http/ftp/https URL from a text or HTML file and append the
# (length-limited) URLs, one per line, to an output file.
import re

# Scheme, dotted host name, then an optional path/query tail that may not end
# in '.', ',' or ':' (so sentence punctuation after a URL is not captured).
# Raw string: the regex escapes (\w, \.) must reach `re` untouched instead of
# being parsed as (invalid) string escape sequences.
pattern = re.compile(
    r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
)

##########################
## Change Input File Here
##
INPUT_PATH = "test.html"
##
## Change Output File Here
##
OUTPUT_PATH = "results.txt"
##
## Change Character Limit Here
##
CHAR_LIMIT = 37
##
##########################


def extract_urls(lines, char_limit=CHAR_LIMIT):
    """Return all URLs found in *lines*, each cut to *char_limit* characters.

    Args:
        lines: Any iterable of strings (an open text file works).
        char_limit: Maximum number of characters kept from each URL.

    Returns:
        A list of the matched URLs in the order they were encountered;
        duplicates are kept.
    """
    return [
        match.group()[:char_limit]
        for line in lines
        for match in pattern.finditer(line)
    ]


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH, char_limit=CHAR_LIMIT):
    """Read *input_path*, collect its URLs and append them to *output_path*.

    The input is read completely before the output is opened, so a missing or
    unreadable input file never creates or touches the output file.
    """
    with open(input_path) as source:
        urls = extract_urls(source, char_limit)

    # Append mode: results from earlier runs are preserved.
    with open(output_path, "a") as sink:
        sink.writelines(url + "\n" for url in urls)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement