#!/usr/bin/python3.5
# -*- coding: utf-8 -*-

# Create an RSS .xml feed listing the latest films posted on cpasbien
# Can be refreshed periodically via cron
# Python 3 dependencies:
# python3-pyrss2gen, python3-wget, python3-bs4, python3-requests, python3-lxml
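
# A minimal setup sketch (package names and paths below are assumptions, adjust to your system):
#   pip3 install PyRSS2Gen wget beautifulsoup4 requests lxml
# Example hourly crontab entry (hypothetical script path):
#   0 * * * * /usr/bin/python3.5 /home/user/cpasbien_rss.py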

import datetime
import glob
import os

import PyRSS2Gen
import requests
import wget
from bs4 import BeautifulSoup

""" Classes """
# Allow raw HTML inside RSS item descriptions (see
# http://stackoverflow.com/questions/5371704/python-generated-rss-outputting-raw-html/7912205#7912205)
class NoOutput:
    """Placeholder that suppresses PyRSS2Gen's default output for an element."""
    def __init__(self):
        pass

    def publish(self, handler):
        pass


class MediaRSS2(PyRSS2Gen.RSSItem):
    def __init__(self, **kwargs):
        PyRSS2Gen.RSSItem.__init__(self, **kwargs)

    def publish(self, handler):
        self.do_not_autooutput_description = self.description
        # Disable PyRSS2Gen's automatic output of the description, which would be HTML-escaped
        self.description = NoOutput()
        PyRSS2Gen.RSSItem.publish(self, handler)

    def publish_extensions(self, handler):
        # Write the description ourselves, wrapped in CDATA so the HTML survives unescaped
        handler._write('<%s><![CDATA[%s]]></%s>' % ("description", self.do_not_autooutput_description, "description"))
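
# With the override above, each item's <description> is emitted inside a CDATA block,
# roughly like this (illustrative only; the actual markup depends on the scraped page):
#   <description><![CDATA[ ...HTML of #textefiche and #bigcover... ]]></description>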
  33. """ rapatriement des articles """
  34.  
  35. def download():
  36.  
  37.     # récupération code HTML pour cpasbien
  38.     resp = requests.get("http://www.cpasbien.cm/view_cat.php?categorie=films")
  39.     encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
  40.     soup = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
  41.  
  42.     # extraction lignes utile (1er filtre)
  43.     liens_brut = soup.findAll(True, {"class":["ligne0", "ligne1"]})
  44.  
  45.     # extraction liens (2ieme filtre) et download des pages
  46.     count = 0
  47.     for x in liens_brut:
  48.         soup = BeautifulSoup(str(x), 'lxml')
  49.         wget.download(soup.a.get('href'),out="page"+str(count))
  50.         count += 1;
  51.  
  52. """ creation rss """
  53.  
  54. def makerss():
  55.  
  56.     # generation items_list
  57.     items_list = []
  58.     for y in range(0, 29):
  59.         soup = BeautifulSoup(open("page"+str(y)), 'lxml')
  60.  
  61.         items_list.append(
  62.             MediaRSS2(
  63.                 title = str(soup.h2.a.contents),
  64.                 description = str(soup.find(id="textefiche"))+str(soup.find(id="bigcover")),
  65.                 link = str(soup.h2.a.get('href')),
  66.             ),
  67.         )
  68.  
  69.     # generation du rss
  70.     rss = PyRSS2Gen.RSS2(
  71.         title = "Cpasbien",
  72.         link = "http://www.cpasbien.cm",
  73.         description = "Sorties films cpasbien",
  74.         lastBuildDate = datetime.datetime.now(),
  75.         items =
  76.             items_list
  77.     )
  78.  
  79.     # ecriture du xml
  80.     rss.rss_attrs["xmlns:media"] = "http://search.yahoo.com/mrss/"
  81.     rss.write_xml(open("Cpasbien.xml", "w"), "utf-8");

download()
makerss()
  88. """ netoyage fichier temporaire """
  89. for filename in glob.glob('page*') :
  90.     os.remove( filename )