Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- '''
- Created on Mar 31, 2010
- @author: Damjan
- '''
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import urllib2, re
- from xml.dom import minidom
class RssReader(object):
    """Download an RSS feed and expose its items as three parallel lists.

    After construction:

    * ``title[i]``       -- text of the i-th ``<item>``'s ``<title>``
    * ``links[i]``       -- text of that item's ``<link>``
    * ``description[i]`` -- that item's ``<description>`` with markup
      stripped and runs of whitespace collapsed to one space

    Every ``<item>`` contributes exactly one entry to each list (an empty
    string when the element is missing or empty), so a given index always
    refers to the same item in all three lists.
    """

    # Compiled once at class creation instead of on every call.
    _WHITESPACE_RUN = re.compile(r'\s+')
    _HTML_TAG = re.compile(r'<.*?>')

    # Child elements of <item> that are collected.
    _ITEM_FIELDS = ("title", "link", "description")

    def __init__(self, url):
        """Fetch ``url``, parse it as XML and collect the feed items.

        Errors raised by ``urllib2.urlopen`` or ``minidom.parse`` propagate
        to the caller. Both calls signal failure by raising, so their
        results are not truth-tested.
        """
        self.xml = urllib2.urlopen(url)
        self.title = []
        self.links = []
        self.description = []
        try:
            xmldoc = minidom.parse(self.xml)
        finally:
            # The response has been fully consumed (or parsing failed);
            # release the connection. The attribute itself is kept because
            # it was part of the object's public state.
            self.xml.close()
        self._collect_items(xmldoc.documentElement)

    def _collect_items(self, root):
        """Append one title/link/description triple per ``<item>`` under ``root``."""
        for channel in root.childNodes:
            if channel.nodeName != "channel":
                continue
            for entry in channel.childNodes:
                # Only <item> elements are feed entries. Other children of
                # <channel> (e.g. <image>) carry their own <title>/<link>
                # and must not be mixed into the lists.
                if entry.nodeName != "item":
                    continue
                fields = {}
                for child in entry.childNodes:
                    if child.nodeName in self._ITEM_FIELDS:
                        # First occurrence wins if an element is repeated.
                        fields.setdefault(child.nodeName, self._text_of(child))
                self.title.append(fields.get("title", ""))
                self.links.append(fields.get("link", ""))
                self.description.append(
                    self.remove_html_tags(fields.get("description", "")))

    @staticmethod
    def _text_of(element):
        """Return the concatenated text and CDATA content directly under ``element``."""
        # CDATA sections have their own node type in minidom, so they have
        # to be accepted explicitly alongside plain text nodes.
        wanted = (element.TEXT_NODE, element.CDATA_SECTION_NODE)
        return "".join(
            node.nodeValue for node in element.childNodes
            if node.nodeType in wanted)

    def remove_extra_spaces(self, data):
        """Return ``data`` with every run of whitespace replaced by one space."""
        return self._WHITESPACE_RUN.sub(' ', data)

    def remove_html_tags(self, data):
        """Return ``data`` with ``<...>`` tags removed and whitespace collapsed."""
        return self.remove_extra_spaces(self._HTML_TAG.sub('', data))
# Script section: fetch the CNN top-stories feed and render it as a minimal
# HTML page (file.html) with one link and one paragraph per feed entry.
import io
from xml.sax.saxutils import escape, quoteattr

R = RssReader("http://rss.cnn.com/rss/cnn_topstories.rss")

# Diagnostic output: the sizes of the three lists read from the feed.
print(len(R.title))
print(len(R.links))
print(len(R.description))

# io.open with an explicit encoding so non-ASCII feed text is written as
# UTF-8 instead of failing on the platform/ASCII default; the `with` block
# closes the file even if writing raises.
with io.open("file.html", "w", encoding="utf-8") as f:
    f.write(u'<html><body>')
    # zip() walks the lists in lockstep and stops at the shortest one, so a
    # length mismatch can no longer raise IndexError.
    for ttl, link, desc in zip(R.title, R.links, R.description):
        # Feed content is external input: escape it before embedding it in
        # markup. quoteattr() returns the value already wrapped in quotes.
        f.write(u'<a href=' + quoteattr(link) + u'>' + escape(ttl)
                + u'</a><p>' + escape(desc) + u'</p>')
    f.write(u'</body></html>')
Advertisement
Add Comment
Please, Sign In to add comment