View the difference between paste WWYP39U0 and paste 0T9w8YSM
#!/usr/bin/env python

u'''
Ведомости (Vedomosti), the Russian business daily
'''

from calibre.web.feeds.feedparser import parse
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe

class VedomostiRecipe(BasicNewsRecipe):
    title = u'Ведомости'
    __author__ = 'Nikolai Kotchetkov'
    publisher = 'vedomosti.ru'
    category = 'press, Russia'
    description = u'Ежедневная деловая газета'
    oldest_article = 3
    max_articles_per_feed = 100

    masthead_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'
    cover_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'

    #Add feed names here if you want them sorted (feeds in this list appear first)
    sortOrder = [u'_default', u'Первая полоса', u'Власть и деньги']

    encoding = 'cp1251'
    language = 'ru'
    no_stylesheets = True
    remove_javascript = True
    recursions = 0

    conversion_options = {
                          'comment'   : description
                        , 'tags'      : category
                        , 'publisher' : publisher
                        , 'language'  : language
                        }

-    keep_only_tags = [dict(name='td', attrs={'class' : ['second_content']})]
+    #keep_only_tags = [dict(name='td', attrs={'class' : ['second_content']})]

-    remove_tags_after = [dict(name='div', attrs={'class' : 'article_text'})]
+    #remove_tags_after = [dict(name='div', attrs={'class' : 'article_text'})]

-    remove_tags = [dict(name='div', attrs={'class' : ['sep', 'choice', 'articleRightTbl']})]
+    #remove_tags = [dict(name='div', attrs={'class' : ['sep', 'choice', 'articleRightTbl']})]

    feeds = [u'http://www.vedomosti.ru/newspaper/out/rss.xml']

    #base URL for relative links
    base_url = u'http://www.vedomosti.ru'

    extra_css = 'h1 {font-size: 1.5em; margin: 0em 0em 0em 0em; text-align: center;}'\
                'h2 {font-size: 1.0em; margin: 0em 0em 0em 0em;}'\
                'h3 {font-size: 0.8em; margin: 0em 0em 0em 0em;}'\
                '.article_date {font-size: 0.5em; color: gray; font-family: monospace; text-align:right;}'\
                '.article_authors {font-size: 0.5em; color: gray; font-family: monospace; text-align:right;}'\
                '.article_img {width:100%; text-align: center; padding: 3px 3px 3px 3px;}'\
                '.article_img_desc {width:100%; text-align: center; font-size: 0.5em; color: gray; font-family: monospace;}'\
                '.article_desc {font-size: 1em; font-style:italic;}'

    def parse_index(self):
        try:
            feedData = parse(self.feeds[0])
            if not feedData:
                raise NotImplementedError
            self.log("parse_index: Feed loaded successfully.")
            if feedData.feed.has_key('title'):
                self.title = feedData.feed.title
                self.log("parse_index: Title updated to: ", self.title)
            if feedData.feed.has_key('description'):
                self.description = feedData.feed.description
                self.log("parse_index: Description updated to: ", self.description)

            #Return the article list of an existing virtual feed or create a new one
            def get_virtual_feed_articles(feed):
                if feeds.has_key(feed):
                    return feeds[feed][1]
                self.log("Adding new feed: ", feed)
                articles = []
                feeds[feed] = (feed, articles)
                return articles

            feeds = {}

            #Iterate feed items and distribute articles using tags
            for item in feedData.entries:
                link = item.get('link', '')
                title = item.get('title', '')
                if '' == link or '' == title:
                    continue
                article = {'title':title, 'url':link, 'description':item.get('description', ''), 'date':item.get('date', ''), 'content':''}
                if not item.has_key('tags'):
                    get_virtual_feed_articles('_default').append(article)
                    continue
                #Track the default feed across tags so an article with several empty tags is added only once
                addedToDefault = False
                for tag in item.tags:
                    term = tag.get('term', '')
                    if '' == term:
                        if not addedToDefault:
                            get_virtual_feed_articles('_default').append(article)
                            addedToDefault = True
                        continue
                    get_virtual_feed_articles(term).append(article)

            #Build the feed list, selecting the sorted feeds first of all
            result = []
            for feedName in self.sortOrder:
                if not feeds.has_key(feedName): continue
                result.append(feeds[feedName])
                del feeds[feedName]
            result = result + feeds.values()

            return result

        except Exception, err:
            self.log(err)
            raise NotImplementedError

    def preprocess_html(self, soup):
        return self.adeify_images(soup)

    def postprocess_html(self, soup, first_fetch):
        #self.log('Original: ', soup.prettify())

        #Find the article body
        contents = soup.find('div', {'class':['article_text']})
        if not contents:
            self.log('postprocess_html: article div not found!')
            return soup
        contents.extract()

        #Find the title
        title = soup.find('h1')
        if title:
            contents.insert(0, title)

        #Find the article image
        newstop = soup.find('div', {'class':['newstop']})
        if newstop:
            img = newstop.find('img')
            if img:
                imgDiv = Tag(soup, 'div')
                imgDiv['class'] = 'article_img'

                if img.has_key('width'):
                    del img['width']
                if img.has_key('height'):
                    del img['height']

                #Move the image into its own div, followed by any <p> description that trails it
                element = img.parent.nextSibling

                img.extract()
                imgDiv.insert(0, img)

                while element:
                    nextElement = element.nextSibling
                    if isinstance(element, Tag) and 'p' == element.name:
                        element.extract()
                        element['class'] = 'article_img_desc'
                        imgDiv.insert(len(imgDiv.contents), element)
                    element = nextElement

                contents.insert(1, imgDiv)

        #Find the article abstract
        abstract = soup.find('p', {'class':['subhead']})
        if abstract:
            abstract['class'] = 'article_desc'
            contents.insert(2, abstract)

        #Find the article authors ('autors' is the class name used in the page markup)
        authorsDiv = soup.find('div', {'class':['autors']})
        if authorsDiv:
            authorsP = authorsDiv.find('p')
            if authorsP:
                authorsP['class'] = 'article_authors'
                contents.insert(len(contents.contents), authorsP)

        #Fix urls that use a relative path
        urls = contents.findAll('a')
        if urls:
            for url in urls:
                if not url.has_key('href'):
                    continue
                if '/' == url['href'][0]:
                    url['href'] = self.base_url + url['href']

        #Replace the page body with the rebuilt article
        body = soup.find('td', {'class':['second_content']})
        if body:
            body.replaceWith(contents)

        self.log('Result: ', soup.prettify())
        return soup
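
A quick way to try either version of the recipe is to pass it straight to calibre's converter. This is only a suggested invocation: the file name vedomosti.recipe is an assumption (save the code above under any name ending in .recipe), and the output name is arbitrary.

    ebook-convert vedomosti.recipe vedomosti.epub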