Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
#!/usr/bin/python3
###
### TODO: clean_text: NOT WORKING WELL
### TODO: \x7f problem needs tests
###
import codecs
import html.entities
import os
import re
import sys
import types
import urllib.error
import urllib.parse
import urllib.request

import feedparser
from bs4 import BeautifulSoup
from html.parser import HTMLParser
# Substrings marking a text node as HTML scaffolding (nav/markup fragments)
# rather than article text; matching nodes are dropped from "fdump" output.
stop_string_list = (
    "li><a",
    "<div",
    "href",
    "<par",
    "</div",
)
def clean_text(html_text):
    """Convert an HTML fragment to readable plain text.

    Strips HTML comments and tags, turns opening ``<p>`` tags into
    paragraph breaks, decodes numeric (``&#65;``) and named (``&amp;``)
    character references, collapses whitespace-only and repeated blank
    lines, and removes stray DEL (U+007F) control characters.

    Parameters:
        html_text (str): raw HTML/summary text.

    Returns:
        str: cleaned plain text, stripped of leading/trailing whitespace.
    """
    def char_from_entity(match):
        # Map a named entity (e.g. "amp") to its character;
        # unknown names become U+FFFD (replacement character).
        code = html.entities.name2codepoint.get(match.group(1), 0xFFFD)
        return chr(code)

    text = re.sub(r"<!--(?:.|\n)*?-->", "", html_text)                # drop comments
    text = re.sub(r"<[Pp][^>]*?(?!</)>", "\n\n", text)                # <p> -> paragraph break
    text = re.sub(r"<[^>]*?>", "", text)                              # drop remaining tags
    text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text)  # numeric refs
    text = re.sub(r"&([A-Za-z]+);", char_from_entity, text)           # named refs
    text = re.sub(r"\n(?:[ \xA0\t]+\n)+", "\n", text)                 # whitespace-only lines
    text = re.sub(r"\n\n+", "\n\n", text.strip())                     # collapse blank runs
    # BUG FIX: the old clean_str() skipped a DEL at index 0 (find(...) > 0)
    # and its splice [0:pos-1] also deleted the character *before* each DEL.
    # Simply strip every DEL character instead.
    return text.replace("\x7f", "")
def visible(element):
    """Return True when a BeautifulSoup text node should appear in output.

    Nodes inside non-content elements (scripts, styles, head, title, the
    document wrapper, and 'dates' containers) are hidden, as is anything
    whose string form looks like an HTML comment.
    """
    hidden_parents = ('style', 'script', '[document]', 'head', 'title', 'dates')
    if element.parent.name in hidden_parents:
        return False
    if re.match('<!--.*-->', str(element), re.UNICODE):
        return False
    return True
def _entry_date(entry):
    """Return the entry's updated date as zero-padded YYYY-MM-DD.

    BUG FIX: the original built "%s-%s-%s" with str(), producing e.g.
    "2020-1-2", contradicting the YYYY-MM-DD format promised by the usage
    text; zero-pad to match it.
    """
    t = entry.updated_parsed
    return "%04d-%02d-%02d" % (t.tm_year, t.tm_mon, t.tm_mday)


def _entry_file_name(entry):
    """Per-entry output file name: date + first 8 non-space title chars."""
    return _entry_date(entry) + entry.title.replace(" ", "")[0:8]


if not sys.argv[1:]:
    # No arguments: show usage and exit normally.
    print("Simple RSS Parser")
    print("")
    print("Usage:")
    print("")
    print(" rss_parser (rss_link)(mode) [encoding]")
    print("")
    print("Possible modes:")
    print("")
    print(" line - line by line output(format: YYYY-MM-DD|header|body|link)")
    print(" pandoc - document compatible with Pandoc")
    print(" plain - plain text output")
    print(" dump - dump HTML content using the specified encoding")
    print(" fplain - dump date, header, body and link to separate files using the specified encoding")
    print(" fdump - dump HTML content to separate files using the specified encoding")
    print("")
else:
    try:
        rss_link = sys.argv[1]
        out_format = sys.argv[2]
    except IndexError:  # was a bare except: — catch only the missing-argument case
        sys.exit("ERROR: Some argument is missing.")
    try:
        d = feedparser.parse(rss_link)
    except Exception:  # was a bare except: — never swallow SystemExit/KeyboardInterrupt
        sys.exit("ERROR: Unable to parse RSS file. Check the RSS link and internet connection.")

    if out_format == "line":
        # One pipe-separated line per feed entry.
        for entry in d["entries"]:
            print(_entry_date(entry) + " | " + entry.title + " | " + clean_text(entry.summary) + " | " + entry.link)
    elif out_format == "pandoc":
        # Pandoc-style document: "# date  title #" headers with blank-line spacing.
        for entry in d["entries"]:
            print("#", _entry_date(entry), " ", entry.title, "#")
            print("")
            print(clean_text(entry.summary))
            print("")
            print(entry.link)
            print("")
            print("")
    elif out_format == "plain":
        for entry in d["entries"]:
            print(_entry_date(entry), " ", entry.title)
            print(clean_text(entry.summary))
            print(entry.link)
            print("")
            print("")
    elif out_format == "dump":
        try:
            page_enc = sys.argv[3]
        except IndexError:
            sys.exit("ERROR: Encoding not specified.")
        for entry in d["entries"]:
            print(_entry_date(entry), " ", entry.title)
            print(clean_text(entry.summary))
            print(entry.link)
            # Context manager closes the HTTP response (original leaked it).
            with urllib.request.urlopen(entry.link) as f:
                print(clean_text(f.read().decode(page_enc)))
            print("")
            print("")
    elif out_format == "fplain":
        wrk_dir = os.getcwd()
        for entry in d["entries"]:
            f_path = wrk_dir + "/" + _entry_file_name(entry)
            # Only write entries not fetched before. The original probed with
            # open() and never closed the handle; os.path.exists() avoids the leak.
            if not os.path.exists(f_path):
                with codecs.open(f_path, "w", "utf-8") as f_hndl:
                    f_hndl.write(_entry_date(entry) + " " + entry.title + "\n\n")
                    f_hndl.write(clean_text(entry.summary) + "\n\n")
                    f_hndl.write(entry.link + "\n")
    elif out_format == "fdump":
        try:
            page_enc = sys.argv[3]
        except IndexError:
            sys.exit("ERROR: Encoding not specified.")
        wrk_dir = os.getcwd()
        for entry in d["entries"]:
            f_path = wrk_dir + "/" + _entry_file_name(entry)
            if not os.path.exists(f_path):
                with codecs.open(f_path, "w", "utf-8") as f_hndl:
                    f_hndl.write(_entry_date(entry) + " " + entry.title + "\n\n")
                    f_hndl.write(clean_text(entry.summary) + "\n\n")
                    f_hndl.write(entry.link + "\n\n")
                    with urllib.request.urlopen(entry.link) as web_page:
                        page = web_page.read().decode(page_enc)
                    soup = BeautifulSoup(page)
                    texts = soup.findAll(text=True)
                    out_file = ""
                    for item in filter(visible, texts):
                        st_item = str(item)
                        # Skip nodes containing any stop-list markup fragment.
                        if all(st_item.find(stop_item) < 0 for stop_item in stop_string_list):
                            out_file += item
                    # BUG FIX: the original called out_file.replace("\t", "")
                    # and discarded the result, so tabs were never removed.
                    out_file = out_file.replace("\t", "")
                    out_file = re.sub(r"\n\s*\n*", "\n", out_file)
                    f_hndl.write(out_file)
    else:
        print("ERROR: Argument not defined.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement