#!/usr/bin/python3.2
import os, sys
import requests
from requests.exceptions import RequestException
import re
import html
import codecs
from time import sleep
#The main RSS feed of MangaUpdates, probably won't change
MU_RSS_URL = "http://www.mangaupdates.com/rss.php"
#Maximum items to keep in the feed
MAX_ITEMS = 20
#Time (in seconds) between updates
Refresh_Time = 900
#Public URL of the list to filter releases with
List_URL = "https://www.mangaupdates.com/mylist.html?id=408818&list=read"
#Prune items with no associated MU page?
Prune_NoLink = False
#Folder to place the output file in
Output_Folder = "/var/www/"
#File to write the resulting RSS-formatted XML out to
Output_File = "shukaro.xml"
def getListInfo(listURL):
    #Scrape the list owner's name and the list's name from the public list page
    try:
        listPage = requests.get(listURL)
        result = re.search(r"You are viewing (.*?)'s (.*?) List", listPage.text)
        if result is None:
            return []
        return result.group(1), result.group(2)
    except RequestException:
        print("Scraping of List Info failed")
        return []
def getSeriesURLs(listURL):
    #Scrape the info-page URL of every series on the followed list
    try:
        print("Scraping followed series from " + listURL)
        listPage = requests.get(listURL)
        urls = re.findall(r'<a href="(.*?)" title="Series Info"><u>.*?</u>', listPage.text)
        listInfo = getListInfo(listURL)
        if len(listInfo) == 2:
            print("Scraped " + str(len(urls)) + " series URLs from " + listInfo[0] + "'s " + listInfo[1] + " List")
        return urls
    except RequestException:
        print("Scraping of series failed")
        return []
def finishedPruning(items, seriesURLs):
    #The feed is fully pruned when every remaining item either links to a
    #followed series, or has no link while Prune_NoLink is disabled
    for item in items:
        if Prune_NoLink and item[2] is None:
            return False
        elif item[2] is not None and item[2] not in seriesURLs:
            return False
    return True
listInfo = ["???", "???"]
os.chdir(Output_Folder)
#Main loop: scrape, filter, and rewrite the feed every Refresh_Time seconds
while True:
    seriesURLs = getSeriesURLs(List_URL)
    if len(seriesURLs) == 0:
        sleep(Refresh_Time)
        continue
    try:
        rssPage = requests.get(MU_RSS_URL)
    except RequestException:
        print("Failed to scrape MU rss")
        sleep(Refresh_Time)
        continue
    print("Pruning series which aren't followed from " + MU_RSS_URL)
    muFeed = []
    #Title/Description/Link
    for match in re.findall(r'<item>\s*<title>(.*?)</title>\s*<description>(.*?)</description>\s*<link>(.*?)</link>', rssPage.text):
        muFeed.append([match[0], match[1], match[2]])
        #The MU feed links are http, the scraped list links are https
        muFeed[-1][2] = muFeed[-1][2].replace('http://', 'https://')
    while not finishedPruning(muFeed, seriesURLs):
        print(str(len(muFeed)) + " items left in feed...")
        #Removing items while iterating can skip entries, so keep sweeping
        #until finishedPruning() confirms nothing else needs to go
        for item in muFeed:
            if Prune_NoLink and item[2] is None:
                muFeed.remove(item)
            elif item[2] is not None and item[2] not in seriesURLs:
                muFeed.remove(item)
    print(str(len(muFeed)) + " items left in feed...")
    existingFeed = []
    try:
        #Carry forward items from the previous run so the feed keeps some history
        f = codecs.open(Output_File, "r", "ISO-8859-1")
        for match in re.findall(r'<item>\s*<title>(.*?)</title>\s*<description>(.*?)</description>\s*<link>(.*?)</link>', f.read()):
            existingFeed.append([match[0], match[1], match[2]])
        f.close()
    except IOError:
        print("Couldn't open " + Output_File + " in " + Output_Folder)
    print("Writing feed out to " + Output_File + " in " + Output_Folder)
    scrapedList = getListInfo(List_URL)
    if len(scrapedList) > 0:
        listInfo = scrapedList
    f = codecs.open(Output_File, "w", "ISO-8859-1")
    f.write("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n")
    f.write("<rss version=\"2.0\" xmlns:content=\"http://purl.org/rss/1.0/modules/content/\">\n")
    f.write("<channel>\n")
    f.write(" <title>" + listInfo[0] + "'s " + listInfo[1] + " List Feed</title>\n")
    f.write(" <link>" + html.escape(List_URL) + "</link>\n")
    f.write(" <description>Parsed from " + html.escape(MU_RSS_URL) + "</description>\n")
    added = []
    for item in muFeed:
        added.append(item[0])
        f.write(" <item>\n")
        f.write(" <title>" + item[0] + "</title>\n")
        f.write(" <description>" + item[1] + "</description>\n")
        f.write(" <link>" + item[2] + "</link>\n")
        f.write(" </item>\n")
    f.close()
    #Append items from the previous feed that weren't just re-added,
    #keeping the feed capped at MAX_ITEMS entries
    f = codecs.open(Output_File, "a", "ISO-8859-1")
    for item in existingFeed:
        if item[0] in added or len(added) >= MAX_ITEMS:
            continue
        added.append(item[0])
        f.write(" <item>\n")
        f.write(" <title>" + item[0] + "</title>\n")
        f.write(" <description>" + item[1] + "</description>\n")
        f.write(" <link>" + item[2] + "</link>\n")
        f.write(" </item>\n")
    f.write("</channel>\n")
    f.write("</rss>\n")
    f.close()
    print("Sleeping " + str(Refresh_Time) + " seconds")
    sleep(Refresh_Time)
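
The loop above leaves behind a plain RSS 2.0 file, so it can be inspected with nothing more than the standard library. The snippet below is a minimal sketch, separate from the script itself, that parses the generated file and lists the surviving releases; the /var/www/shukaro.xml path simply mirrors the Output_Folder and Output_File settings above.

#Minimal sketch (not part of the script above): list the items in the generated feed
import xml.etree.ElementTree as ET

tree = ET.parse("/var/www/shukaro.xml")
for item in tree.getroot().findall("./channel/item"):
    print(item.findtext("title"), "->", item.findtext("link"))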