Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re  # for regular expressions
import urllib2  # needed to open URLs
from urllib import quote_plus  # escapes story titles for the search query string

from BeautifulSoup import BeautifulSoup  # used to parse HTML
# Cross-references the DLP Library with the KIA recommendation list: every
# Library title is fed into a search on the KIA site, and each title that comes
# back as an exact (case-insensitive) hit is collected and printed.

LIBRARY_LIST_URL = "http://forums.darklordpotter.net/library_list.php"
KIA_SEARCH_PREFIX = "http://mujaji.net/kia/?s="
KIA_SEARCH_SUFFIX = "&key=Story+Title"

# Matches hrefs that start with the DLP url plus "show" (for "showthread"),
# which indicates that the link is to a thread; the only threads on the
# Library list page are those of Library fics.
THREAD_LINK_RE = re.compile(r"^http://forums\.darklordpotter\.net/show")

# Matches the title attribute of the story links on a KIA results page.
PERMALINK_RE = re.compile(r"^Permanent")

# Right double quotation mark that closes the story title in a KIA result
# link.  Written as an escape so the source file stays pure ASCII.
CLOSING_QUOTE = u"\u201d"


def library_title(link_text):
    """Return the story title from a Library link text.

    The surrounding whitespace is removed and everything from the first
    " by" onwards is cut off.  A text without " by" is returned whole
    (after stripping) instead of losing its last character.
    """
    text = link_text.strip()
    cut = text.find(" by")  # measured on the stripped text it is applied to
    if cut == -1:
        return text
    return text[:cut]


def result_title(link_text):
    """Return the story title embedded in a KIA result link text.

    The title is the text after the first ";" and before the following
    closing curly quote.  Without a ";" the title starts at the beginning
    of the text; without a closing quote it runs to the end of the text.
    """
    start = link_text.find(";") + 1  # find() == -1 conveniently yields 0
    end = link_text.find(CLOSING_QUOTE, start)
    if end == -1:
        end = len(link_text)
    return link_text[start:end]


def titles_match(first, second):
    """Return True when two titles are equal ignoring letter case."""
    return first.lower() == second.lower()


def build_search_url(title):
    """Return the KIA "Story Title" search URL for a unicode *title*.

    The title is UTF-8 encoded and fully escaped, so characters such as
    "&", "#" or "+" cannot break the query string.
    """
    query = quote_plus(title.encode("utf-8"))
    return KIA_SEARCH_PREFIX + query + KIA_SEARCH_SUFFIX


def fetch_soup(url):
    """Open *url*, parse the response with BeautifulSoup and close it."""
    page = urllib2.urlopen(url)
    try:
        return BeautifulSoup(page)
    finally:
        page.close()


def main():
    """Print the Library titles that also show up in a KIA title search."""
    library = fetch_soup(LIBRARY_LIST_URL)

    # Titles are kept as unicode while they are being compared and are only
    # encoded at the boundaries (query string, printed result).
    dlp = []  # titles from the DLP Library
    for tag in library.findAll(href=THREAD_LINK_RE):
        if tag.contents:  # skip links without any text
            dlp.append(library_title(tag.contents[0]))

    final = []  # matches between the Library and the KIA recs
    for title in dlp:
        results = fetch_soup(build_search_url(title))
        hits = results.findAll(title=PERMALINK_RE)
        # A title is recorded once, however many result links match it.
        if any(
            link.contents and titles_match(result_title(link.contents[0]), title)
            for link in hits
        ):
            final.append(title.encode("utf-8"))

    # All of the matches are printed, although there is still some human work
    # to do to rule out false positives.
    print(final)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement