Want more features on Pastebin? Sign Up, it's FREE!
Guest

Untitled

By: a guest on Nov 21st, 2010  |  syntax: None  |  size: 2.89 KB  |  views: 106  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
  5. '''
  6. Muy Interesante
  7. '''
  8.  
  9. from calibre.web.feeds.news import BasicNewsRecipe
  10.  
  11. class General(BasicNewsRecipe):
  12.     title                 = 'Muy Interesante'
  13.     __author__            = 'Gustavo Azambuja'
  14.     description           = 'Revista Muy Interesante, Edicion Espanola'
  15.     language       = 'es'
  16.     timefmt        = '[%a, %d %b, %Y]'
  17.     use_embedded_content  = False
  18.     recursion             = 1
  19.     encoding = 'utf8'
  20.     remove_javascript = True
  21.     no_stylesheets = True
  22.     conversion_options = {'linearize_tables': True}
  23.  
  24.     oldest_article        = 180
  25.     max_articles_per_feed = 100
  26.     keep_only_tags = [
  27.              dict(id=['contenido']),
  28.              dict(name='td', attrs={'class':'contentheading'}),
  29.              dict(name='td', attrs={'class':'txt_articulo'})
  30.                      ]
  31.     remove_tags = [
  32.              dict(name='div', attrs={'class':'breadcrumb'}),
  33.              dict(name='div', attrs={'class':'bloque1'}),
  34.              dict(name='div', attrs={'class':'article'}),
  35.              dict(name='div', attrs={'class':'bajo_title'}),
  36.              dict(name='div', attrs={'class':'tags_articles'}),
  37.              dict(name='div', attrs={'id':'comment'}),
  38.              dict(name='table', attrs={'class':'pagenav'}),
  39.              dict(name=['object','link'])
  40.                   ]
  41.     remove_attributes = ['width','height', 'font']
  42.    
  43.     extra_css = '''
  44.                 h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
  45.                 h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
  46.                 h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
  47.                 img {float:left; clear:both; margin:10px}
  48.                 p {font-family:Arial,Helvetica,sans-serif;}
  49.                 '''
  50.     feeds = [
  51.                   (u'Articulos', u'http://feeds.feedburner.com/Muyinteresantees')
  52.     ]
  53.  
  54.     def preprocess_html(self, soup):
  55.         attribs = [  'style','font','valign'
  56.                     ,'colspan','width','height'
  57.                     ,'rowspan','summary','align'
  58.                     ,'cellspacing','cellpadding'
  59.                     ,'frames','rules','border'
  60.                   ]
  61.         for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
  62.             item.name = 'div'
  63.             for attrib in attribs:
  64.                 if item.has_key(attrib):
  65.                    del item[attrib]
  66.         return soup
  67.  
  68.     def get_cover_url(self):
  69.                 index = 'http://www.muyinteresante.es/revista'
  70.                 soup = self.index_to_soup(index)
  71.                 link_item = soup.find('img',attrs={'class':'img_portada'})
  72.                 if link_item:
  73.                         cover_url = "http://www.muyinteresante.es"+link_item['src']
  74.                 return cover_url
clone this paste RAW Paste Data