Guest User

Calibre Recipe for http://media.daum.net/

a guest
Jul 22nd, 2010
128
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.98 KB | None | 0 0
  1. import re
  2. from datetime import date, timedelta
  3.  
  4. from calibre.web.feeds.recipes import BasicNewsRecipe
  5. from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString ,Comment
  6.  
  class MediaDaumRecipe(BasicNewsRecipe):
      """Calibre recipe for the "today's main news" list on media.daum.net.

      The index is built by walking the site's dated list pages for today and
      the two preceding days until ``max_articles`` entries were collected.
      Article pages are cleaned with ``preprocess_regexps`` before parsing and
      with ``strip_anchors`` after parsing.
      """

      title = u'\uBBF8\uB514\uC5B4 \uB2E4\uC74C \uC624\uB298\uC758 \uC8FC\uC694 \uB274\uC2A4'
      language = 'ko'
      # Upper bound on the number of articles gathered across all list pages.
      max_articles = 100

      # List page for one day; ``date`` is YYYYMMDD and ``page`` is 1-based.
      INDEX_URL = ('http://media.daum.net/primary/total/list.html'
                   '?cateid=100044&date=%(date)s&page=%(page)d')
      # Prefix prepended to the href of every headline link.
      ARTICLE_BASE_URL = 'http://media.daum.net/primary/total/'
      # Number of list pages requested per day (pages 1..9).
      MAX_LIST_PAGES = 9
      # Number of days, counting back from today, whose lists are read.
      DAYS_TO_FETCH = 3

      timefmt = ''
      masthead_url = 'http://img-media.daum-img.net/2010ci/service_news.gif'
      cover_margins = (18, 18, 'grey99')
      no_stylesheets = True
      remove_tags_before = dict(id='GS_con')
      remove_tags_after = dict(id='GS_con')
      remove_tags = [
          dict(attrs={'class': [
              'bline',
              'GS_vod',
          ]}),
          dict(id=[
              'GS_swf_poll',
              'ad250',
          ]),
          dict(name=['script', 'noscript', 'style', 'object']),
      ]

      # (pattern, replacement) pairs; the order of the entries is significant
      # because each substitution sees the output of the previous one.
      preprocess_regexps = [
          # A '<' followed by whitespace is not a tag start: escape it.
          (re.compile(r'<\s+', re.DOTALL | re.IGNORECASE),
           lambda match: '&lt; '),
          # Drop runs of three or more <br> tags.
          (re.compile(r'(<br[^>]*>[ \t\r\n]*){3,}', re.DOTALL | re.IGNORECASE),
           lambda match: ''),
          # Drop <br> tags that directly precede a closing tag.
          (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</div>', re.DOTALL | re.IGNORECASE),
           lambda match: '</div>'),
          (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</p>', re.DOTALL | re.IGNORECASE),
           lambda match: '</p>'),
          (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</td>', re.DOTALL | re.IGNORECASE),
           lambda match: '</td>'),
          (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</strong>', re.DOTALL | re.IGNORECASE),
           lambda match: '</strong>'),
          (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</b>', re.DOTALL | re.IGNORECASE),
           lambda match: '</b>'),
          (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</em>', re.DOTALL | re.IGNORECASE),
           lambda match: '</em>'),
          (re.compile(r'(<br[^>]*>[ \t\r\n]*)*</i>', re.DOTALL | re.IGNORECASE),
           lambda match: '</i>'),
          # Cut everything from the "(\uB05D)" marker followed by <br> up to
          # the last </div> (greedy '.*' with DOTALL).  Regex backslashes are
          # doubled because this cannot be a raw string (it needs \u escapes).
          (re.compile(u'\\(\uB05D\\)[ \t\r\n]*<br[^>]*>.*</div>',
                      re.DOTALL | re.IGNORECASE),
           lambda match: '</div>'),
          # Drop <br> tags that directly precede an opening block tag.
          (re.compile(r'(<br[^>]*>[ \t\r\n]*)*<div', re.DOTALL | re.IGNORECASE),
           lambda match: '<div'),
          (re.compile(r'(<br[^>]*>[ \t\r\n]*)*<p', re.DOTALL | re.IGNORECASE),
           lambda match: '<p'),
          (re.compile(r'(<br[^>]*>[ \t\r\n]*)*<table', re.DOTALL | re.IGNORECASE),
           lambda match: '<table'),
          # Drop <br> tags that directly follow an opening inline tag.
          (re.compile(r'<strong>(<br[^>]*>[ \t\r\n]*)*', re.DOTALL | re.IGNORECASE),
           lambda match: '<strong>'),
          (re.compile(r'<b>(<br[^>]*>[ \t\r\n]*)*', re.DOTALL | re.IGNORECASE),
           lambda match: '<b>'),
          (re.compile(r'<em>(<br[^>]*>[ \t\r\n]*)*', re.DOTALL | re.IGNORECASE),
           lambda match: '<em>'),
          (re.compile(r'<i>(<br[^>]*>[ \t\r\n]*)*', re.DOTALL | re.IGNORECASE),
           lambda match: '<i>'),
          # Cut everything from a bracketed "[...]" trailer that contains a
          # copyright sign, "(c)", "\uAE30\uC0AC" or "\uC778\uAE30...\uB274\uC2A4"
          # up to the last </div>.  Same pattern text as before, with the
          # regex backslashes doubled to avoid invalid string escapes.
          (re.compile(u'(<br[^>]*>[ \t\r\n]*)*'
                      u'(\u25B6|\u25CF|\u261E|\u24D2|\\(c\\))*'
                      u'\\[[^\\]]*'
                      u'(\u24D2|\\(c\\)|\uAE30\uC0AC|\uC778\uAE30[^\\]]*\uB274\uC2A4)'
                      u'[^\\]]*\\].*</div>',
                      re.DOTALL | re.IGNORECASE),
           lambda match: '</div>'),
      ]

      def parse_index(self):
          """Return a single feed holding the collected articles.

          The list pages of today and of the preceding days are read in that
          order.  The feed is named with ``self.title`` so that it is a
          unicode string on every Python version.
          """
          today = date.today()
          articles = []
          for days_back in range(self.DAYS_TO_FETCH):
              articles = self.parse_list_page(articles,
                                              today - timedelta(days_back))
          return [(self.title, articles)]

      def parse_list_page(self, articles, date):
          """Append the articles listed for ``date`` to ``articles``.

          ``articles`` is extended in place and also returned.  ``date`` is a
          ``datetime.date``; inside this method the name hides the imported
          ``date`` class, which is not needed here.

          Paging stops as soon as ``max_articles`` is reached or a page adds
          no article at all.
          """
          if len(articles) >= self.max_articles:
              return articles

          day = date.strftime('%Y%m%d')
          for page in range(1, self.MAX_LIST_PAGES + 1):
              soup = self.index_to_soup(
                  self.INDEX_URL % {'date': day, 'page': page})
              # Stays True when the page yields no article, which ends paging.
              done = True
              for item in soup.findAll('dl'):
                  dt = item.find('dt', {'class': 'tit'})
                  dd = item.find('dd', {'class': 'txt'})
                  if dt is None:
                      # A <dl> without a headline ends this page (unchanged
                      # from the original behaviour).
                      break
                  a = dt.find('a', href=True)
                  if a is None:
                      # Headline without a link: nothing to download.
                      continue
                  url = self.ARTICLE_BASE_URL + a['href']
                  title = self.tag_to_string(dt)
                  if dd is None:
                      description = ''
                  else:
                      description = self.tag_to_string(dd)
                  articles.append(dict(title=title, description=description,
                                       url=url, content=''))
                  done = len(articles) >= self.max_articles
                  if done:
                      break
              if done:
                  break
          return articles

      def preprocess_html(self, soup):
          """Return ``soup`` after removing its text-only hyperlinks."""
          return self.strip_anchors(soup)

      def strip_anchors(self, soup):
          """Replace every <a> that holds no <img> by its rendered contents.

          Anchors wrapping an image are kept.  The anchors are collected in
          one pass over the document instead of once per tag.
          """
          for a in soup.findAll('a'):
              if a.img is None:
                  a.replaceWith(a.renderContents().decode('utf-8', 'replace'))
          return soup
Advertisement
Add Comment
Please, Sign In to add comment