# -*- coding: utf-8 -*-  # needed so the curly-quote literal below is legal Python 2 source

import urllib2  # needed to open URLs
import re  # for regular expressions
from BeautifulSoup import BeautifulSoup  # used to parse HTML

# Library List URL is opened and the page is stored
page = urllib2.urlopen("http://forums.darklordpotter.net/library_list.php")
soup = BeautifulSoup(page)  # the page is handed to BeautifulSoup, which lets us do a bunch of cool shit

dlp = []    # will store the list of titles from the DLP Library
final = []  # will store the matches between the Library and the KIA recs

# Loops through all the tags from the library list page whose href matches the DLP URL plus "show" (for "showthread",
# which indicates that the link is to a thread; the only threads on this page are those of Library fics).
for tag in soup.findAll(href=re.compile(r"^http://forums\.darklordpotter\.net/show")):
    contents = tag.contents[0].strip()
    # takes off the part of the string starting with " by" and keeps just the title
    dlp.append(contents[:contents.find(" by")].encode("utf-8"))

# Each Library entry is fed into a search query on the KIA site.
for i in range(len(dlp)):
    page = urllib2.urlopen("http://mujaji.net/kia/?s=" + dlp[i].replace(" ", "+") + "&key=Story+Title")
    soup = BeautifulSoup(page)
    # Compares the titles on the results page to the Library title in question, and adds it to the final list if there is a match.
    for x in soup.findAll(title=re.compile("^Permanent")):
        text = x.contents[0]
        # the result's title sits between the first ";" and the closing curly quote
        if text[text.find(";") + 1:text.find("”")].lower() == dlp[i].lower():
            final.append(dlp[i])

print final  # all of the matches are printed, although there is still some human work to do to rule out false positives
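
For anyone trying this today, here is a rough Python 3 sketch of the same two-step approach (scrape the Library titles, then search KIA for each one). It assumes the third-party requests and beautifulsoup4 packages and that both sites still serve the same markup, none of which is guaranteed; the match test is also looser than the original's exact slice-and-compare, so expect more false positives.

import re
import urllib.parse

import requests
from bs4 import BeautifulSoup

DLP_LIBRARY = "http://forums.darklordpotter.net/library_list.php"
KIA_SEARCH = "http://mujaji.net/kia/?s={query}&key=Story+Title"


def dlp_titles():
    """Scrape fic titles (minus the ' by Author' suffix) from the DLP Library list."""
    soup = BeautifulSoup(requests.get(DLP_LIBRARY).text, "html.parser")
    titles = []
    for tag in soup.find_all(href=re.compile(r"^http://forums\.darklordpotter\.net/show")):
        text = tag.get_text().strip()
        titles.append(text[:text.find(" by")] if " by" in text else text)
    return titles


def on_kia(title):
    """Search KIA by story title and report whether any result link text contains it."""
    url = KIA_SEARCH.format(query=urllib.parse.quote_plus(title))
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    for link in soup.find_all(title=re.compile("^Permanent")):
        # looser check than the original exact comparison: substring match on the link text
        if title.lower() in link.get_text().lower():
            return True
    return False


if __name__ == "__main__":
    matches = [t for t in dlp_titles() if on_kia(t)]
    print(matches)  # still needs a manual pass to weed out false positives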