Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import urllib
import urllib.request

import requests
# Scan explainxkcd.com comic pages and list the ones whose HTML contains the
# word "crap" unusually often.  Flagged comics are written to
# crapped_comics.txt and a summary line is printed at the end.

# Browser-like request headers sent with every page fetch.
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                  'AppleWebKit/537.11 (KHTML, like Gecko) '
                  'Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    # NOTE(review): presumably asks the server not to compress the body so it
    # can be searched as plain text -- confirm the server honours it.
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

# Page URL template; {0} is replaced by the comic number.
url = "https://www.explainxkcd.com/{0}"

# Report lines for flagged comics, filled in by main().
crapped_comics = []

FIRST_COMIC = 1
LAST_COMIC = 2612  # inclusive; the same comics as the original range(1, 2613)
CRAP_THRESHOLD = 10  # a page is flagged when it has MORE than this many hits
OUTPUT_PATH = "crapped_comics.txt"
FETCH_TIMEOUT = 30  # seconds; keeps one stalled connection from hanging the run


def fetch_html(page_url, timeout=FETCH_TIMEOUT):
    """Download *page_url* and return its body as text.

    The body is decoded as UTF-8 with undecodable bytes replaced, so the
    search runs on the page text rather than on the repr of a bytes object.
    Network and HTTP errors propagate to the caller.
    """
    req = urllib.request.Request(url=page_url, headers=header)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode("utf-8", errors="replace")


def count_craps(html):
    """Return the number of non-overlapping occurrences of "crap" in *html*."""
    # str.count gives the true number of hits; len(html.split("crap")) is
    # always one more than that.
    return html.count("crap")


def format_entry(number, page_url):
    """Return the report line for comic *number* located at *page_url*."""
    return f"Comic number: {number} | url={page_url}\n"


def percent_of(found, total):
    """Return *found* as a percentage of *total*, rounded to one decimal.

    Returns 0.0 when *total* is not positive so an empty scan cannot divide
    by zero.
    """
    if total <= 0:
        return 0.0
    # Round after scaling; rounding first and multiplying by 100 afterwards
    # re-introduces float noise such as 5.699999999999999.
    return round(100 * found / total, 1)


def main():
    """Scan every comic page, write the flagged ones to disk, print a summary."""
    comic_numbers = range(FIRST_COMIC, LAST_COMIC + 1)
    for i in comic_numbers:
        page_url = url.format(i)
        print(page_url)
        try:
            html = fetch_html(page_url)
        except Exception:
            # Best-effort scan: one unreachable page must not abort the run.
            # Catching Exception (not a bare except) still lets Ctrl-C stop it.
            print("Skipping.")
            # Without this the previous page's HTML would be analysed again.
            continue
        crap_number = count_craps(html)
        if crap_number > CRAP_THRESHOLD:
            print(f"This comic is crapped ({crap_number} craps found).")
            crapped_comics.append(format_entry(i, page_url))
        else:
            print("Comic likely not crapped.")

    with open(OUTPUT_PATH, "w", encoding="utf-8") as cc_txt:
        cc_txt.writelines(crapped_comics)

    # The percentage is relative to the number of comics actually scanned.
    percent = percent_of(len(crapped_comics), len(comic_numbers))
    print(f"{len(crapped_comics)} found, corresponding to {percent} percent.")


if __name__ == "__main__":
    main()
Add Comment
Please, Sign In to add comment