#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja '
'''
Muy Interesante
'''

from calibre.web.feeds.news import BasicNewsRecipe


class General(BasicNewsRecipe):
    """calibre news recipe for the Spanish edition of Muy Interesante.

    Articles are taken from the FeedBurner feed listed in ``feeds``. The
    class attributes below configure which parts of each page are kept or
    dropped; ``preprocess_html`` flattens table markup and
    ``get_cover_url`` looks up the cover image on the magazine page.
    """

    title = 'Muy Interesante'
    __author__ = 'Gustavo Azambuja'
    description = 'Revista Muy Interesante, Edicion Espanola'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 1
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}

    oldest_article = 180
    max_articles_per_feed = 100

    # Only these containers are kept from each article page.
    keep_only_tags = [
        dict(id=['contenido']),
        dict(name='td', attrs={'class': 'contentheading'}),
        dict(name='td', attrs={'class': 'txt_articulo'}),
    ]

    # Elements stripped from the kept content.
    remove_tags = [
        dict(name='div', attrs={'class': 'breadcrumb'}),
        dict(name='div', attrs={'class': 'bloque1'}),
        dict(name='div', attrs={'class': 'article'}),
        dict(name='div', attrs={'class': 'bajo_title'}),
        dict(name='div', attrs={'class': 'tags_articles'}),
        dict(name='div', attrs={'id': 'comment'}),
        dict(name='table', attrs={'class': 'pagenav'}),
        dict(name=['object', 'link']),
    ]

    remove_attributes = ['width', 'height', 'font']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        img {float:left; clear:both; margin:10px}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''

    feeds = [
        (u'Articulos', u'http://feeds.feedburner.com/Muyinteresantees')
    ]

    def preprocess_html(self, soup):
        """Turn every table-related element into a plain ``div``.

        Each matching tag under ``<body>`` is renamed to ``div`` and has
        its layout/presentation attributes removed.

        :param soup: parsed article page.
        :return: the same ``soup`` object, modified in place.
        """
        attribs = (
            'style', 'font', 'valign',
            'colspan', 'width', 'height',
            'rowspan', 'summary', 'align',
            'cellspacing', 'cellpadding',
            'frames', 'rules', 'border',
        )
        table_tags = [
            'table', 'td', 'tr', 'th', 'caption',
            'thead', 'tfoot', 'tbody', 'colgroup', 'col',
        ]

        body = soup.body
        if body is None:
            # Nothing to flatten on a page without a <body>.
            return soup

        for item in body.findAll(name=table_tags):
            item.name = 'div'
            # Newer BeautifulSoup exposes has_attr(); older releases only
            # have has_key(). Prefer the former, fall back to the latter.
            has_attr = getattr(item, 'has_attr', None) or item.has_key
            for attrib in attribs:
                if has_attr(attrib):
                    del item[attrib]
        return soup

    def get_cover_url(self):
        """Return the absolute URL of the current cover image.

        The magazine page is fetched and searched for an ``img`` tag with
        class ``img_portada``.

        :return: the cover URL, or ``None`` when no such image is found.
        """
        index = 'http://www.muyinteresante.es/revista'
        soup = self.index_to_soup(index)
        link_item = soup.find('img', attrs={'class': 'img_portada'})

        # Start from None so a missing cover image cannot leave the name
        # unbound at the return statement.
        cover_url = None
        if link_item is not None:
            cover_url = "http://www.muyinteresante.es" + link_item['src']
        return cover_url