#!/usr/bin/python3.2

import os
import requests
from requests.exceptions import RequestException
import re
import html
import codecs
from time import sleep

#The main RSS feed of MangaUpdates, probably won't change
MU_RSS_URL = "http://www.mangaupdates.com/rss.php"
#Maximum items to keep in the feed (defined here but not currently enforced)
MAX_ITEMS = 20
#Time (in seconds) between updates
Refresh_Time = 900
#Public URL of the list to filter releases with
List_URL = "https://www.mangaupdates.com/mylist.html?id=408818&list=read"
#Prune items with no associated MU page?
Prune_NoLink = False
#Folder to place the output file in
Output_Folder = "/var/www/"
#File to write the resulting RSS-formatted XML out to
Output_File = "shukaro.xml"

def getListInfo(listURL):
    #Scrape the list owner's name and the list's name from the public list page
    try:
        listPage = requests.get(listURL)
        result = re.search(r"You are viewing (.*?)'s (.*?) List", listPage.text)
        if result is None:
            print("Scraping of List Info failed")
            return []
        return result.group(1), result.group(2)
    except RequestException:
        print("Scraping of List Info failed")
        return []

def getSeriesURLs(listURL):
    #Scrape the series-info URL of every series on the public list page
    try:
        print("Scraping followed series from " + listURL)
        listPage = requests.get(listURL)
        urls = re.findall(r'<a href="(.*?)" title="Series Info"><u>.*?</u>', listPage.text)
        listInfo = getListInfo(listURL)
        if len(listInfo) > 0:
            print("Scraped " + str(len(urls)) + " series URLs from " + listInfo[0] + "'s " + listInfo[1] + " List")
        return urls
    except RequestException:
        print("Scraping of series failed")
        return []

def finishedPruning(items, seriesURLs):
    #Return True once every remaining feed item belongs to a followed series
    for item in items:
        if Prune_NoLink and item[2] is None:
            return False
        elif item[2] is not None and item[2] not in seriesURLs:
            return False
    return True

listInfo = ["???", "???"]
os.chdir(Output_Folder)

while True:
    #Get the list of followed series; if the scrape failed, wait and retry
    seriesURLs = getSeriesURLs(List_URL)
    if len(seriesURLs) == 0:
        sleep(Refresh_Time)
        continue
    try:
        rssPage = requests.get(MU_RSS_URL)
    except RequestException:
        print("Failed to scrape MU rss")
        sleep(Refresh_Time)
        continue
    print("Pruning series which aren't followed from " + MU_RSS_URL)
    muFeed = []
    #Each feed entry is stored as [title, description, link]
    for match in re.findall(r'<item>\s*<title>(.*?)</title>\s*<description>(.*?)</description>\s*<link>(.*?)</link>', rssPage.text):
        muFeed.append([match[0], match[1], match[2]])
        muFeed[-1][2] = muFeed[-1][2].replace('http://', 'https://')

    #Repeatedly sweep the feed until nothing is left to prune
    #(removing items while iterating can skip entries, hence the outer loop)
    while not finishedPruning(muFeed, seriesURLs):
        print(str(len(muFeed)) + " items left in feed...")
        for item in muFeed:
            if Prune_NoLink and item[2] is None:
                muFeed.remove(item)
            elif item[2] is not None and item[2] not in seriesURLs:
                muFeed.remove(item)
    print(str(len(muFeed)) + " items left in feed...")

    #Re-read the previously written feed so older items can be carried over
    existingFeed = []
    try:
        f = codecs.open(Output_File, "r", "ISO-8859-1")
        for match in re.findall(r'<item>\s*<title>(.*?)</title>\s*<description>(.*?)</description>\s*<link>(.*?)</link>', f.read()):
            existingFeed.append([match[0], match[1], match[2]])
        f.close()
    except IOError:
        print("Couldn't open " + Output_File + " in " + Output_Folder)

    print("Writing feed out to " + Output_File + " in " + Output_Folder)
    scrapedList = getListInfo(List_URL)
    if len(scrapedList) > 0:
        listInfo = scrapedList

    #Write the channel header and the freshly scraped items
    f = codecs.open(Output_File, "w", "ISO-8859-1")
    f.write("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n")
    f.write("<rss version=\"2.0\" xmlns:content=\"http://purl.org/rss/1.0/modules/content/\">\n")
    f.write("<channel>\n")
    f.write(" <title>" + listInfo[0] + "'s " + listInfo[1] + " List Feed</title>\n")
    f.write(" <link>" + html.escape(List_URL) + "</link>\n")
    f.write(" <description>Parsed from " + html.escape(MU_RSS_URL) + "</description>\n")
    added = []
    for item in muFeed:
        added.append(item[0])
        f.write(" <item>\n")
        f.write(" <title>" + item[0] + "</title>\n")
        f.write(" <description>" + item[1] + "</description>\n")
        f.write(" <link>" + item[2] + "</link>\n")
        f.write(" </item>\n")
    f.close()

    #Append any previously written items that weren't in this scrape
    for item in existingFeed:
        if item[0] in added:
            continue
        f = codecs.open(Output_File, "a", "ISO-8859-1")
        f.write(" <item>\n")
        f.write(" <title>" + item[0] + "</title>\n")
        f.write(" <description>" + item[1] + "</description>\n")
        f.write(" <link>" + item[2] + "</link>\n")
        f.write(" </item>\n")
        f.close()

    #Close out the channel and sleep until the next refresh
    f = codecs.open(Output_File, "a", "ISO-8859-1")
    f.write("</channel>\n")
    f.write("</rss>\n")
    f.close()
    print("Sleeping " + str(Refresh_Time) + " seconds")
    sleep(Refresh_Time)