Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python3.5
- # -*- coding: utf-8 -*-
- # Creates an RSS .xml feed listing the latest films posted on cpasbien
- # Can be kept up to date by cron
- # Python 3 dependencies:
- # python3-pyrss2gen, python3-wget, python3-bs4, python3-requests, python3-lxml
- from bs4 import BeautifulSoup
- import os,glob,datetime,PyRSS2Gen,requests,wget
- """" class """
- # Allows raw HTML to be embedded in the RSS items: http://stackoverflow.com/questions/5371704/python-generated-rss-outputting-raw-html/7912205#7912205
class NoOutput:
    """Placeholder object whose ``publish`` method writes nothing.

    It can stand in for a publishable RSS element so that nothing is
    emitted for that element.
    """

    def __init__(self):
        """Create the placeholder; there is no state to initialise."""
        return None

    def publish(self, handler):
        """Accept *handler* and deliberately emit nothing."""
        return None
class MediaRSS2(PyRSS2Gen.RSSItem):
    """RSS item whose description is written as raw HTML in a CDATA section.

    The regular description output is disabled during publishing (it would
    be escaped) and the description is written by hand inside
    ``<![CDATA[...]]>`` instead.
    """

    def __init__(self, **kwargs):
        """Forward every keyword argument to ``PyRSS2Gen.RSSItem``."""
        PyRSS2Gen.RSSItem.__init__(self, **kwargs)

    def publish(self, handler):
        """Write the item to *handler* with a CDATA-wrapped description.

        The real description is stashed in ``do_not_autooutput_description``
        and replaced by a ``NoOutput`` placeholder only while the base class
        publishes. It is restored afterwards, so the item can be published
        again and ``self.description`` stays usable for callers.
        """
        self.do_not_autooutput_description = self.description
        # Disables the PyRSS2Gen automatic (escaped) output of the description.
        self.description = NoOutput()
        try:
            PyRSS2Gen.RSSItem.publish(self, handler)
        finally:
            self.description = self.do_not_autooutput_description

    def publish_extensions(self, handler):
        """Emit the stashed description as a ``<description>`` CDATA element.

        Nothing is written when the description is ``None``.
        """
        description = self.do_not_autooutput_description
        if description is None:
            return
        # "]]>" would terminate the CDATA section early; split it across two
        # adjacent CDATA sections so the payload stays intact.
        safe = str(description).replace("]]>", "]]]]><![CDATA[>")
        handler._write('<%s><![CDATA[%s]]></%s>' % ("description", safe, "description"))
- """ rapatriement des articles """
def download():
    """Fetch the cpasbien film listing and save every linked page to disk.

    The listing is parsed for elements carrying the ``ligne0`` or ``ligne1``
    class. The first link of each such element is downloaded into the current
    directory as ``page0``, ``page1``, ... in listing order. Elements without
    a usable link are skipped and do not consume a page number.

    Raises:
        requests.RequestException: if the listing cannot be fetched, times
            out, or the server answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    # Fetch the HTML of the cpasbien film category.
    resp = requests.get(
        "http://www.cpasbien.cm/view_cat.php?categorie=films",
        timeout=30,
    )
    resp.raise_for_status()
    # Only trust the HTTP encoding when the server declared a charset;
    # otherwise let BeautifulSoup detect it from the document.
    encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    soup = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)

    # First filter: keep only the listing rows.
    rows = soup.find_all(True, {"class": ["ligne0", "ligne1"]})

    # Second filter: extract each row's link and download the target page.
    count = 0
    for row in rows:
        # The matched element may itself be the link, or merely contain it.
        anchor = row if row.name == "a" else row.find("a")
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        # Resolve relative links against the URL that was actually fetched.
        wget.download(urljoin(resp.url, href), out="page" + str(count))
        count += 1
- """ creation rss """
def makerss():
    """Build ``Cpasbien.xml`` from the ``page<N>`` files in the current directory.

    Every file named ``page`` followed by a number is parsed in numeric
    order. For each page the title and link come from the first ``<h2><a>``
    element, and the description is the HTML of the ``textefiche`` and
    ``bigcover`` elements. Pages without an ``<h2><a>`` heading are skipped.
    The feed is written UTF-8 encoded.
    """
    prefix = "page"
    # Use the pages that actually exist rather than a fixed count, so fewer
    # (or more) downloaded pages than expected neither crash nor get dropped.
    page_files = sorted(
        (name for name in glob.glob(prefix + "*") if name[len(prefix):].isdecimal()),
        key=lambda name: int(name[len(prefix):]),
    )

    # Build the list of items.
    items_list = []
    for path in page_files:
        # Binary mode lets BeautifulSoup detect the page encoding itself.
        with open(path, "rb") as page:
            soup = BeautifulSoup(page, 'lxml')
        heading = soup.h2.a if soup.h2 is not None else None
        if heading is None:
            continue
        # Keep only the parts that exist; str(None) would inject "None".
        parts = (soup.find(id="textefiche"), soup.find(id="bigcover"))
        href = heading.get('href')
        items_list.append(
            MediaRSS2(
                # get_text() yields the visible title, not the repr of a list.
                title=heading.get_text(strip=True),
                description="".join(str(part) for part in parts if part is not None),
                link=str(href) if href is not None else None,
            )
        )

    # Build the feed.
    rss = PyRSS2Gen.RSS2(
        title="Cpasbien",
        link="http://www.cpasbien.cm",
        description="Sorties films cpasbien",
        # Dates are emitted as GMT, so supply a UTC timestamp.
        lastBuildDate=datetime.datetime.now(datetime.timezone.utc),
        items=items_list,
    )

    # Write the XML; the file encoding must match the declared "utf-8".
    rss.rss_attrs["xmlns:media"] = "http://search.yahoo.com/mrss/"
    with open("Cpasbien.xml", "w", encoding="utf-8") as output:
        rss.write_xml(output, "utf-8")
# Download the pages and build the feed. The temporary files are removed
# even when one of the two steps fails, so a failed run leaves nothing behind.
try:
    download()
    makerss()
finally:
    # Clean up temporary files.
    # NOTE(review): this removes every entry matching 'page*' in the current
    # directory, not only files created by download(); confirm the script
    # always runs in a dedicated working directory.
    for filename in glob.glob('page*'):
        os.remove(filename)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement