#!/usr/bin/env python

u'''
Ведомости
'''

from calibre.web.feeds.feedparser import parse
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe

class VedomostiRecipe(BasicNewsRecipe):
    title = u'Ведомости'
    __author__ = 'Nikolai Kotchetkov'
    publisher = 'vedomosti.ru'
    category = 'press, Russia'
    description = u'Ежедневная деловая газета'  # "Daily business newspaper"
    oldest_article = 3
    max_articles_per_feed = 100

    masthead_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'
    cover_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'

    # Add feed names here if you want them sorted (feeds in this list appear
    # first, in this order)
    sortOrder = [u'_default', u'Первая полоса', u'Власть и деньги']
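    # '_default' collects untagged articles; 'Первая полоса' translates to
    # "Front page" and 'Власть и деньги' to "Power and money"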

    encoding = 'cp1251'
    language = 'ru'
    no_stylesheets = True
    remove_javascript = True
    recursions = 0

    conversion_options = {
        'comment':   description,
        'tags':      category,
        'publisher': publisher,
        'language':  language,
    }

    #keep_only_tags = [dict(name='td', attrs={'class': ['second_content']})]

    #remove_tags_after = [dict(name='div', attrs={'class': 'article_text'})]

    #remove_tags = [dict(name='div', attrs={'class': ['sep', 'choice', 'articleRightTbl']})]
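    # The tag-based cleanup above is disabled: postprocess_html below extracts
    # the article markup and rebuilds the page itself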

    feeds = [u'http://www.vedomosti.ru/newspaper/out/rss.xml']

    # Base URL for resolving relative links
    base_url = u'http://www.vedomosti.ru'

    extra_css = 'h1 {font-size: 1.5em; margin: 0; text-align: center;}' \
        'h2 {font-size: 1.0em; margin: 0;}' \
        'h3 {font-size: 0.8em; margin: 0;}' \
        '.article_date {font-size: 0.5em; color: gray; font-family: monospace; text-align: right;}' \
        '.article_authors {font-size: 0.5em; color: gray; font-family: monospace; text-align: right;}' \
        '.article_img {width: 100%; text-align: center; padding: 3px;}' \
        '.article_img_desc {width: 100%; text-align: center; font-size: 0.5em; color: gray; font-family: monospace;}' \
        '.article_desc {font-size: 1em; font-style: italic;}'

    def parse_index(self):
        try:
            feedData = parse(self.feeds[0])
            if not feedData:
                raise NotImplementedError
            self.log("parse_index: Feed loaded successfully.")
            if 'title' in feedData.feed:
                self.title = feedData.feed.title
                self.log("parse_index: Title updated to: ", self.title)
            if 'description' in feedData.feed:
                self.description = feedData.feed.description
                self.log("parse_index: Description updated to: ", self.description)

            feeds = {}

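            # Return the article list of the named virtual feed, creating the
            # feed on first use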
            def get_virtual_feed_articles(feed):
                if feed in feeds:
                    return feeds[feed][1]
                self.log("Adding new feed: ", feed)
                articles = []
                feeds[feed] = (feed, articles)
                return articles

            # Iterate over the feed items and distribute the articles among
            # the virtual feeds using the RSS category tags
            for item in feedData.entries:
                link = item.get('link', '')
                title = item.get('title', '')
                if not link or not title:
                    continue
                article = {'title': title, 'url': link,
                           'description': item.get('description', ''),
                           'date': item.get('date', ''), 'content': ''}
                if 'tags' not in item:
                    get_virtual_feed_articles('_default').append(article)
                    continue
                # An item may carry several tags: file the article under each
                # named tag, and under '_default' (at most once) when a tag
                # has no term
                addedToDefault = False
                for tag in item.tags:
                    term = tag.get('term', '')
                    if not term:
                        if not addedToDefault:
                            get_virtual_feed_articles('_default').append(article)
                            addedToDefault = True
                        continue
                    get_virtual_feed_articles(term).append(article)

            # Build the final feed list: feeds named in sortOrder come first
            # (in that order), the remaining feeds follow in arbitrary order
            result = []
            for feedName in self.sortOrder:
                if feedName not in feeds:
                    continue
                result.append(feeds[feedName])
                del feeds[feedName]
            result.extend(feeds.values())

            return result

        except Exception as err:
            self.log(err)
            # Raising NotImplementedError makes calibre fall back to its
            # standard feed handling
            raise NotImplementedError

    def preprocess_html(self, soup):
        # adeify_images adjusts <img> tags so that they render correctly in
        # Adobe Digital Editions based readers
        return self.adeify_images(soup)

    def postprocess_html(self, soup, first_fetch):
        #self.log('Original: ', soup.prettify())

        # Find the article body
        contents = soup.find('div', {'class': ['article_text']})
        if not contents:
            self.log('postprocess_html: article div not found!')
            return soup
        contents.extract()

        # Find the title
        title = soup.find('h1')
        if title:
            contents.insert(0, title)

        # Find the article image
        newstop = soup.find('div', {'class': ['newstop']})
        if newstop:
            img = newstop.find('img')
            if img:
                imgDiv = Tag(soup, 'div')
                imgDiv['class'] = 'article_img'

                # Drop the fixed dimensions so that the reader scales the
                # image through the CSS above
                if img.has_key('width'):
                    del img['width']
                if img.has_key('height'):
                    del img['height']

                # The image description lives in the <p> siblings that follow
                # the image's parent
                element = img.parent.nextSibling

                img.extract()
                imgDiv.insert(0, img)

                while element:
                    # Advance before extracting; otherwise non-Tag siblings
                    # (plain text nodes) would make this loop spin forever
                    nextElement = element.nextSibling
                    if isinstance(element, Tag) and 'p' == element.name:
                        element.extract()
                        element['class'] = 'article_img_desc'
                        imgDiv.insert(len(imgDiv.contents), element)
                    element = nextElement

                contents.insert(1, imgDiv)

        # Find the article abstract
        abstract = soup.find('p', {'class': ['subhead']})
        if abstract:
            abstract['class'] = 'article_desc'
            contents.insert(2, abstract)

        # Find the article authors
        authorsDiv = soup.find('div', {'class': ['autors']})
        if authorsDiv:
            authorsP = authorsDiv.find('p')
            if authorsP:
                authorsP['class'] = 'article_authors'
                contents.insert(len(contents.contents), authorsP)

        # Make relative links absolute
        urls = contents.findAll('a')
        if urls:
            for url in urls:
                if not url.has_key('href'):
                    continue
                if '/' == url['href'][0]:
                    url['href'] = self.base_url + url['href']

        # Replace the original page body with the extracted article
        body = soup.find('td', {'class': ['second_content']})
        if body:
            body.replaceWith(contents)

        self.log('Result: ', soup.prettify())
        return soup
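
# A typical way to test a recipe like this from the command line, assuming it
# is saved as vedomosti.recipe (the file name is just an example), is calibre's
# ebook-convert tool:
#
#   ebook-convert vedomosti.recipe output.epub --test -vv
#
# With --test only a couple of articles per feed are downloaded, which keeps
# the edit/run cycle short.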