#!/usr/bin/python3.2

import os
import requests
from requests.exceptions import RequestException
import re
import html
import codecs
from time import sleep

#The main RSS feed of MangaUpdates, probably won't change
MU_RSS_URL = "http://www.mangaupdates.com/rss.php"
#Maximum items to keep in the feed (defined here but not currently enforced)
MAX_ITEMS = 20
#Time (in seconds) between updates
Refresh_Time = 900
#Public URL of the list to filter releases with
List_URL = "https://www.mangaupdates.com/mylist.html?id=408818&list=read"
#Prune items with no associated MU page?
Prune_NoLink = False
#Folder to place the output file in
Output_Folder = "/var/www/"
#File to write the resulting RSS-formatted XML out to
Output_File = "shukaro.xml"

def getListInfo(listURL):
    #Scrape the list owner's name and the list's name from the public list page
    try:
        listPage = requests.get(listURL)
        result = re.search(r"You are viewing (.*?)'s (.*?) List", listPage.text)
        if result is None:
            print("Scraping of List Info failed")
            return []
        return result.group(1), result.group(2)
    except RequestException:
        print("Scraping of List Info failed")
        return []

def getSeriesURLs(listURL):
    #Scrape the series-info URL of every series on the public list page
    try:
        print("Scraping followed series from " + listURL)
        listPage = requests.get(listURL)
        urls = re.findall(r'<a href="(.*?)" title="Series Info"><u>.*?</u>', listPage.text)
        listInfo = getListInfo(listURL)
        if len(listInfo) > 0:
            print("Scraped " + str(len(urls)) + " series URLs from " + listInfo[0] + "'s " + listInfo[1] + " List")
        return urls
    except RequestException:
        print("Scraping of series failed")
        return []

def finishedPruning(items, seriesURLs):
    #Return True once every remaining feed item belongs to a followed series
    for item in items:
        if Prune_NoLink and item[2] is None:
            return False
        elif item[2] is not None and item[2] not in seriesURLs:
            return False
    return True

listInfo = ["???", "???"]
os.chdir(Output_Folder)

while True:
    #Get the list of followed series; if the scrape failed, wait and retry
    seriesURLs = getSeriesURLs(List_URL)
    if len(seriesURLs) == 0:
        sleep(Refresh_Time)
        continue
    try:
        rssPage = requests.get(MU_RSS_URL)
    except RequestException:
        print("Failed to scrape MU rss")
        sleep(Refresh_Time)
        continue
    print("Pruning series which aren't followed from " + MU_RSS_URL)
    muFeed = []
    #Each feed entry is stored as [title, description, link]
    for match in re.findall(r'<item>\s*<title>(.*?)</title>\s*<description>(.*?)</description>\s*<link>(.*?)</link>', rssPage.text):
        muFeed.append([match[0], match[1], match[2]])
        muFeed[-1][2] = muFeed[-1][2].replace('http://', 'https://')

    #Repeatedly sweep the feed until nothing is left to prune
    #(removing items while iterating can skip entries, hence the outer loop)
    while not finishedPruning(muFeed, seriesURLs):
        print(str(len(muFeed)) + " items left in feed...")
        for item in muFeed:
            if Prune_NoLink and item[2] is None:
                muFeed.remove(item)
            elif item[2] is not None and item[2] not in seriesURLs:
                muFeed.remove(item)
    print(str(len(muFeed)) + " items left in feed...")

    #Re-read the previously written feed so older items can be carried over
    existingFeed = []
    try:
        f = codecs.open(Output_File, "r", "ISO-8859-1")
        for match in re.findall(r'<item>\s*<title>(.*?)</title>\s*<description>(.*?)</description>\s*<link>(.*?)</link>', f.read()):
            existingFeed.append([match[0], match[1], match[2]])
        f.close()
    except IOError:
        print("Couldn't open " + Output_File + " in " + Output_Folder)

    print("Writing feed out to " + Output_File + " in " + Output_Folder)
    scrapedList = getListInfo(List_URL)
    if len(scrapedList) > 0:
        listInfo = scrapedList

    #Write the channel header and the freshly scraped items
    f = codecs.open(Output_File, "w", "ISO-8859-1")
    f.write("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n")
    f.write("<rss version=\"2.0\" xmlns:content=\"http://purl.org/rss/1.0/modules/content/\">\n")
    f.write("<channel>\n")
    f.write(" <title>" + listInfo[0] + "'s " + listInfo[1] + " List Feed</title>\n")
    f.write(" <link>" + html.escape(List_URL) + "</link>\n")
    f.write(" <description>Parsed from " + html.escape(MU_RSS_URL) + "</description>\n")
    added = []
    for item in muFeed:
        added.append(item[0])
        f.write(" <item>\n")
        f.write(" <title>" + item[0] + "</title>\n")
        f.write(" <description>" + item[1] + "</description>\n")
        f.write(" <link>" + item[2] + "</link>\n")
        f.write(" </item>\n")
    f.close()

    #Append any previously written items that weren't in this scrape
    for item in existingFeed:
        if item[0] in added:
            continue
        f = codecs.open(Output_File, "a", "ISO-8859-1")
        f.write(" <item>\n")
        f.write(" <title>" + item[0] + "</title>\n")
        f.write(" <description>" + item[1] + "</description>\n")
        f.write(" <link>" + item[2] + "</link>\n")
        f.write(" </item>\n")
        f.close()

    #Close out the channel and sleep until the next refresh
    f = codecs.open(Output_File, "a", "ISO-8859-1")
    f.write("</channel>\n")
    f.write("</rss>\n")
    f.close()
    print("Sleeping " + str(Refresh_Time) + " seconds")
    sleep(Refresh_Time)