Advertisement
Guest User

fogus

a guest
Apr 9th, 2009
218
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.39 KB | None | 0 0
  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
  5. '''
  6. en.wikipedia.org
  7. '''
  8.  
  9. class Wikipedia(BasicNewsRecipe):
  10.     title                 = 'Wikipedia'
  11.     __author__            = 'Darko Miletic'
  12.     description           = 'Wikipedia articles'
  13.     category              = 'data, world'
  14.     oldest_article        = 7
  15.     max_articles_per_feed = 100
  16.     publisher             = 'Wiki'
  17.     no_stylesheets        = True
  18.     use_embedded_content  = False
  19.     encoding              = 'utf-8'
  20.     remove_javascript     = True
  21.     language              = _('English')
  22.     INDEX                 = 'http://en.wikipedia.org/'
  23.  
  24.     html2lrf_options = ['--comment', description, '--category', category, '--publisher', publisher]
  25.  
  26.     html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
  27.  
  28.     keep_only_tags = [dict(name='h1', attrs={'id':'firstHeading'}) ,dict(name='div', attrs={'id':'bodyContent'})]
  29.  
  30.     remove_tags = [dict(name=['link','sup']),dict(name='div',attrs={'id':['printfooter','catlinks','footer']}),dict(name='div',attrs={'class':['thumb left','thumb right']})]
  31.  
  32.  
  33. # add entries in here, for example:
  34. '''
  35. def parse_index(self):
  36.     articles = []
  37.     articles.append({
  38.         'title':'Tree',
  39.         'date':'',
  40.         'url':self.INDEX + 'wiki/Tree',
  41.         'description':'Tree'
  42.         })
  43.     articles.append({
  44.         'title':'Tiger',
  45.         'date':'',
  46.         'url':self.INDEX + 'wiki/Tiger',
  47.         'description':'Tiger'
  48.         })
  49.  
  50.     return [('Articles', articles)]
  51. '''
  52.  
  53.     def parse_index(self):
  54.         articles = [{'title':'Viking','date':'','url':self.INDEX + 'wiki/Viking','description':'Viking'},
  55.         {'title':'Viking_Age','date':'','url':self.INDEX + 'wiki/Viking_Age','description':'Viking_Age'},
  56.         {'title':'History_of_Scandinavia','date':'','url':self.INDEX + 'wiki/History_of_Scandinavia','description':'History_of_Scandinavia'},
  57.         {'title':'History_of_Denmark','date':'','url':self.INDEX + 'wiki/History_of_Denmark','description':'History_of_Denmark'},
  58.         {'title':'History_of_Sweden','date':'','url':self.INDEX + 'wiki/History_of_Sweden','description':'History_of_Sweden'},
  59.         {'title':'History_of_Norway','date':'','url':self.INDEX + 'wiki/History_of_Norway','description':'History_of_Norway'},
  60.         {'title':'History_of_Iceland','date':'','url':self.INDEX + 'wiki/History_of_Iceland','description':'History_of_Iceland'},
  61.         {'title':'History_of_Greenland','date':'','url':self.INDEX + 'wiki/History_of_Greenland','description':'History_of_Greenland'},
  62.         {'title':'Viking_Age_arms_and_armour','date':'','url':self.INDEX + 'wiki/Viking_Age_arms_and_armour','description':'Viking_Age_arms_and_armour'},
  63.         {'title':'Viking_ship', 'date':'','url':self.INDEX + 'wiki/Viking_ship','description':'Viking_ship'},
  64.         {'title':'Skuldelev_ships','date':'','url':self.INDEX + 'wiki/Skuldelev_ships','description':'Skuldelev_ships'},
  65.         {'title':'Gokstad_ship','date':'','url':self.INDEX + 'wiki/Gokstad_ship','description':'Gokstad_ship'},
  66.         {'title':'Oseberg_ship','date':'','url':self.INDEX + 'wiki/Oseberg_ship','description':'Oseberg_ship'},
  67.         {'title':'Tune_ship','date':'','url':self.INDEX + 'wiki/Tune_ship','description':'Tune_ship'},
  68.         {'title':'Norse_funeral','date':'','url':self.INDEX + 'wiki/Norse_funeral','description':'Norse_funeral'},
  69.         {'title':'Norse_pagan_worship','date':'','url':self.INDEX + 'wiki/Norse_pagan_worship','description':'Norse_pagan_worship'},
  70.         {'title':'Runestone','date':'','url':self.INDEX + 'wiki/Runestone','description':'Runestone'},
  71.         {'title':'Norse_mythology','date':'','url':self.INDEX + 'wiki/Norse_mythology','description':'Norse_mythology'},
  72.         {'title':'Borre_mound_cemetery','date':'','url':self.INDEX + 'wiki/Borre_mound_cemetery','description':'Borre_mound_cemetery'},
  73.         {'title':'Saga','date':'','url':self.INDEX + 'wiki/Saga','description':'Saga'},
  74.         {'title':'''L'Anse_aux_Meadows''','date':'','url':self.INDEX + '''wiki/L'Anse_aux_Meadows''','description':'''L'Anse_aux_Meadows'''},
  75.         {'title':'Dorset_culture','date':'','url':self.INDEX + 'wiki/Dorset_culture','description':'Dorset_culture'},
  76.         {'title':'Danelaw','date':'','url':self.INDEX + 'wiki/Danelaw','description':'Danelaw'},
  77.         {'title':'Oland','date':'','url':self.INDEX + 'wiki/Oland','description':'Oland'},
  78.         {'title':'Jelling','date':'','url':self.INDEX + 'wiki/Jelling','description':'Jelling'},
  79.         {'title':'Gamla_Uppsala','date':'','url':self.INDEX + 'wiki/Gamla_Uppsala','description':'Gamla_Uppsala'},
  80.         {'title':'Adelso','date':'','url':self.INDEX + 'wiki/Adelso','description':'Adelso'},
  81.         {'title':'Alby','date':'','url':self.INDEX + 'wiki/Alby','description':'Alby'},
  82.         {'title':'Birka','date':'','url':self.INDEX + 'wiki/Birka','description':'Birka'},
  83.         {'title':'Gene_fornby','date':'','url':self.INDEX + 'wiki/Gene_fornby','description':'Gene_fornby'},
  84.         {'title':'Helgo','date':'','url':self.INDEX + 'wiki/Helgo','description':'Helgo'},
  85.         {'title':'Sigtuna','date':'','url':self.INDEX + 'wiki/Sigtuna','description':'Sigtuna'},
  86.         {'title':'Uppakra','date':'','url':self.INDEX + 'wiki/Uppakra','description':'Uppakra'},
  87.         {'title':'Valsgarde','date':'','url':self.INDEX + 'wiki/Valsgarde','description':'Valsgarde'},
  88.         {'title':'Vendel','date':'','url':self.INDEX + 'wiki/Vendel','description':'Vendel'},
  89.         {'title':'Roskilde','date':'','url':self.INDEX + 'wiki/Roskilde','description':'Roskilde'},
  90.         {'title':'Lindholm_Hoje','date':'','url':self.INDEX + 'wiki/Lindholm_Hoje','description':'Lindholm_Hoje'},
  91.         {'title':'Hedeby','date':'','url':self.INDEX + 'wiki/Hedeby','description':'Hedeby'},
  92.         {'title':'Ribe','date':'','url':self.INDEX + 'wiki/Ribe','description':'Ribe'},
  93.         {'title':'Aarhus','date':'','url':self.INDEX + 'wiki/Aarhus','description':'Aarhus'},
  94.         {'title':'Viborg,_Denmark','date':'','url':self.INDEX + 'wiki/Viborg,_Denmark','description':'Viborg,_Denmark'},
  95.         {'title':'Lund','date':'','url':self.INDEX + 'wiki/Lund','description':'Lund'},
  96.         {'title':'Kaupang','date':'','url':self.INDEX + 'wiki/Kaupang','description':'Kaupang'},
  97.         {'title':'Jorvik','date':'','url':self.INDEX + 'wiki/Jorvik','description':'Jorvik'},
  98.         {'title':'History_of_Dublin','date':'','url':self.INDEX + 'wiki/History_of_Dublin','description':'History_of_Dublin'},
  99.         {'title':'Serkland','date':'','url':self.INDEX + 'wiki/Serkland','description':'Serkland'},
  100.         {'title':'History_of_the_Faroe_Islands','date':'','url':self.INDEX + 'wiki/History_of_the_Faroe_Islands','description':'History_of_the_Faroe_Islands'},
  101.         {'title':'Vinland','date':'','url':self.INDEX + 'wiki/Vinland','description':'Vinland'},
  102.         {'title':'Markland','date':'','url':self.INDEX + 'wiki/Markland','description':'Markland'},
  103.         {'title':'Helluland','date':'','url':self.INDEX + 'wiki/Helluland','description':'Helluland'},
  104.         {'title':'Bjarmaland','date':'','url':self.INDEX + 'wiki/Bjarmaland','description':'Bjarmaland'}]
  105.         return [('Articles', articles)]
  106.        
  107.     def print_version(self, url):
  108.         rest, sep, article_id  = url.rpartition('/')
  109.         return self.INDEX + 'w/index.php?title=' + article_id + '&printable=yes'
  110.  
  111.     def preprocess_html(self, soup):
  112.         mtag = '<meta http-equiv="Content-Language" content="en"/><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
  113.         soup.head.insert(0,mtag)
  114.         btag = soup.find('div',attrs={'id':'bodyContent'})
  115.         for item in btag.findAll('div'):
  116.             item.extract()
  117.         for item in soup.findAll(style=True):
  118.             del item['style']
  119.         for item in soup.findAll(font=True):
  120.             del item['font']
  121.         return soup
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement