# VMR

'''
Created on Mar 31, 2010

@author: Damjan
'''

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import io
import re
try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved urlopen() here; bind it to the name the code already uses.
    import urllib.request as urllib2
from xml.dom import minidom
from xml.sax.saxutils import escape, quoteattr

class RssReader(object):
    """Fetch an RSS 2.0 feed and expose the text of its ``<item>`` entries.

    The feed is downloaded and parsed once, in the constructor.  Afterwards
    the instance carries three parallel lists; index ``i`` of each list
    describes the same feed item:

    * ``title``       -- item titles
    * ``links``       -- item URLs
    * ``description`` -- item descriptions with markup removed and runs of
      whitespace collapsed to a single space

    An item that lacks one of these elements contributes an empty string to
    the corresponding list, so the three lists always have equal length.
    """

    # Compiled once at class creation instead of on every helper call.
    _WHITESPACE_RE = re.compile(r'\s+')
    _TAG_RE = re.compile(r'<.*?>')

    def __init__(self, url):
        """Download the feed at *url* and fill the three result lists.

        Args:
            url: Address of the RSS document; handed unchanged to
                ``urllib2.urlopen``.

        Raises:
            Exceptions raised by ``urllib2.urlopen`` (URL cannot be fetched)
            or ``minidom.parse`` (payload is not well-formed XML) propagate
            to the caller unchanged.
        """
        self.title = []
        self.links = []
        self.description = []

        self.xml = urllib2.urlopen(url)
        try:
            xmldoc = minidom.parse(self.xml)
        finally:
            # The parser has consumed the stream; release the connection.
            self.xml.close()

        for channel in self._children_named(xmldoc.documentElement, "channel"):
            # Only <item> elements are feed entries.  Other channel children
            # such as <image> also carry <title>/<link> and would otherwise
            # shift the lists out of step with each other.
            for item in self._children_named(channel, "item"):
                self.title.append(self._first_child_text(item, "title"))
                self.links.append(self._first_child_text(item, "link"))
                self.description.append(
                    self.remove_html_tags(
                        self._first_child_text(item, "description")))

    @staticmethod
    def _children_named(parent, name):
        """Return the direct child elements of *parent* whose tag is *name*."""
        return [child for child in parent.childNodes
                if child.nodeType == child.ELEMENT_NODE
                and child.nodeName == name]

    def _first_child_text(self, parent, name):
        """Return the text of the first *name* child of *parent*, or ``""``."""
        matches = self._children_named(parent, name)
        if not matches:
            return ""
        element = matches[0]
        # Feeds commonly wrap descriptions in CDATA sections, which minidom
        # reports as a node type distinct from plain text.
        text_types = (element.TEXT_NODE, element.CDATA_SECTION_NODE)
        return "".join(node.nodeValue for node in element.childNodes
                       if node.nodeType in text_types)

    def remove_extra_spaces(self, data):
        """Return *data* with every run of whitespace replaced by one space."""
        return self._WHITESPACE_RE.sub(' ', data)

    def remove_html_tags(self, data):
        """Return *data* with ``<...>`` tags removed and whitespace collapsed.

        The tag pattern does not span line breaks, so a tag split across
        lines is left in place.
        """
        return self.remove_extra_spaces(self._TAG_RE.sub('', data))

# Download the CNN top-stories feed and render it as a minimal HTML page.
R = RssReader("http://rss.cnn.com/rss/cnn_topstories.rss")

# Report how many values of each kind were extracted.
print(len(R.title))
print(len(R.links))
print(len(R.description))

# Write UTF-8 explicitly so non-ASCII feed text neither raises on write nor
# depends on the platform default encoding; the <meta> tag tells the browser
# which encoding was used.  The "with" block closes the file even if a write
# fails part-way through.
with io.open("file.html", "w", encoding="utf-8") as f:
    f.write(u'<html><head><meta charset="utf-8"></head><body>')
    # zip() stops at the shortest list, so an entry that is missing a field
    # cannot cause an IndexError.
    for ttl, link, summary in zip(R.title, R.links, R.description):
        # Feed content is untrusted: escape text nodes and let quoteattr()
        # quote and escape the attribute value so the markup stays valid.
        f.write(u'<a href=' + quoteattr(link) + u'>' + escape(ttl)
                + u'</a><p>' + escape(summary) + u'</p>')
    f.write(u'</body></html>')