Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
#!/usr/bin/python3
###
### TODO: clean_text: NOT WORKING WELL
### TODO: \x7f problem needs tests
###
import codecs
import html.entities
import os
import re
import sys
import types
import urllib.error
import urllib.parse
import urllib.request

import feedparser
from bs4 import BeautifulSoup
from html.parser import HTMLParser
# Substrings marking a text node as HTML scaffolding (nav/markup fragments)
# rather than article text; matching nodes are dropped from "fdump" output.
stop_string_list = (
    "li><a",
    "<div",
    "href",
    "<par",
    "</div",
)
def clean_text(html_text):
    """Convert an HTML fragment to readable plain text.

    Strips HTML comments and tags, turns opening ``<p>`` tags into
    paragraph breaks, decodes numeric (``&#65;``) and named (``&amp;``)
    character references, collapses whitespace-only and repeated blank
    lines, and removes stray DEL (U+007F) control characters.

    Parameters:
        html_text (str): raw HTML/summary text.

    Returns:
        str: cleaned plain text, stripped of leading/trailing whitespace.
    """
    def char_from_entity(match):
        # Map a named entity (e.g. "amp") to its character;
        # unknown names become U+FFFD (replacement character).
        code = html.entities.name2codepoint.get(match.group(1), 0xFFFD)
        return chr(code)

    text = re.sub(r"<!--(?:.|\n)*?-->", "", html_text)                # drop comments
    text = re.sub(r"<[Pp][^>]*?(?!</)>", "\n\n", text)                # <p> -> paragraph break
    text = re.sub(r"<[^>]*?>", "", text)                              # drop remaining tags
    text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text)  # numeric refs
    text = re.sub(r"&([A-Za-z]+);", char_from_entity, text)           # named refs
    text = re.sub(r"\n(?:[ \xA0\t]+\n)+", "\n", text)                 # whitespace-only lines
    text = re.sub(r"\n\n+", "\n\n", text.strip())                     # collapse blank runs
    # BUG FIX: the old clean_str() skipped a DEL at index 0 (find(...) > 0)
    # and its splice [0:pos-1] also deleted the character *before* each DEL.
    # Simply strip every DEL character instead.
    return text.replace("\x7f", "")
def visible(element):
    """Return True when a BeautifulSoup text node should appear in output.

    Nodes inside non-content elements (scripts, styles, head, title, the
    document wrapper, and 'dates' containers) are hidden, as is anything
    whose string form looks like an HTML comment.
    """
    hidden_parents = ('style', 'script', '[document]', 'head', 'title', 'dates')
    if element.parent.name in hidden_parents:
        return False
    if re.match('<!--.*-->', str(element), re.UNICODE):
        return False
    return True
def _entry_date(entry):
    """Return the entry's updated date as zero-padded YYYY-MM-DD.

    BUG FIX: the original built "%s-%s-%s" with str(), producing e.g.
    "2020-1-2", contradicting the YYYY-MM-DD format promised by the usage
    text; zero-pad to match it.
    """
    t = entry.updated_parsed
    return "%04d-%02d-%02d" % (t.tm_year, t.tm_mon, t.tm_mday)


def _entry_file_name(entry):
    """Per-entry output file name: date + first 8 non-space title chars."""
    return _entry_date(entry) + entry.title.replace(" ", "")[0:8]


if not sys.argv[1:]:
    # No arguments: show usage and exit normally.
    print("Simple RSS Parser")
    print("")
    print("Usage:")
    print("")
    print(" rss_parser (rss_link)(mode) [encoding]")
    print("")
    print("Possible modes:")
    print("")
    print(" line - line by line output(format: YYYY-MM-DD|header|body|link)")
    print(" pandoc - document compatible with Pandoc")
    print(" plain - plain text output")
    print(" dump - dump HTML content using the specified encoding")
    print(" fplain - dump date, header, body and link to separate files using the specified encoding")
    print(" fdump - dump HTML content to separate files using the specified encoding")
    print("")
else:
    try:
        rss_link = sys.argv[1]
        out_format = sys.argv[2]
    except IndexError:  # was a bare except: — catch only the missing-argument case
        sys.exit("ERROR: Some argument is missing.")
    try:
        d = feedparser.parse(rss_link)
    except Exception:  # was a bare except: — never swallow SystemExit/KeyboardInterrupt
        sys.exit("ERROR: Unable to parse RSS file. Check the RSS link and internet connection.")

    if out_format == "line":
        # One pipe-separated line per feed entry.
        for entry in d["entries"]:
            print(_entry_date(entry) + " | " + entry.title + " | " + clean_text(entry.summary) + " | " + entry.link)
    elif out_format == "pandoc":
        # Pandoc-style document: "# date  title #" headers with blank-line spacing.
        for entry in d["entries"]:
            print("#", _entry_date(entry), " ", entry.title, "#")
            print("")
            print(clean_text(entry.summary))
            print("")
            print(entry.link)
            print("")
            print("")
    elif out_format == "plain":
        for entry in d["entries"]:
            print(_entry_date(entry), " ", entry.title)
            print(clean_text(entry.summary))
            print(entry.link)
            print("")
            print("")
    elif out_format == "dump":
        try:
            page_enc = sys.argv[3]
        except IndexError:
            sys.exit("ERROR: Encoding not specified.")
        for entry in d["entries"]:
            print(_entry_date(entry), " ", entry.title)
            print(clean_text(entry.summary))
            print(entry.link)
            # Context manager closes the HTTP response (original leaked it).
            with urllib.request.urlopen(entry.link) as f:
                print(clean_text(f.read().decode(page_enc)))
            print("")
            print("")
    elif out_format == "fplain":
        wrk_dir = os.getcwd()
        for entry in d["entries"]:
            f_path = wrk_dir + "/" + _entry_file_name(entry)
            # Only write entries not fetched before. The original probed with
            # open() and never closed the handle; os.path.exists() avoids the leak.
            if not os.path.exists(f_path):
                with codecs.open(f_path, "w", "utf-8") as f_hndl:
                    f_hndl.write(_entry_date(entry) + " " + entry.title + "\n\n")
                    f_hndl.write(clean_text(entry.summary) + "\n\n")
                    f_hndl.write(entry.link + "\n")
    elif out_format == "fdump":
        try:
            page_enc = sys.argv[3]
        except IndexError:
            sys.exit("ERROR: Encoding not specified.")
        wrk_dir = os.getcwd()
        for entry in d["entries"]:
            f_path = wrk_dir + "/" + _entry_file_name(entry)
            if not os.path.exists(f_path):
                with codecs.open(f_path, "w", "utf-8") as f_hndl:
                    f_hndl.write(_entry_date(entry) + " " + entry.title + "\n\n")
                    f_hndl.write(clean_text(entry.summary) + "\n\n")
                    f_hndl.write(entry.link + "\n\n")
                    with urllib.request.urlopen(entry.link) as web_page:
                        page = web_page.read().decode(page_enc)
                    soup = BeautifulSoup(page)
                    texts = soup.findAll(text=True)
                    out_file = ""
                    for item in filter(visible, texts):
                        st_item = str(item)
                        # Skip nodes containing any stop-list markup fragment.
                        if all(st_item.find(stop_item) < 0 for stop_item in stop_string_list):
                            out_file += item
                    # BUG FIX: the original called out_file.replace("\t", "")
                    # and discarded the result, so tabs were never removed.
                    out_file = out_file.replace("\t", "")
                    out_file = re.sub(r"\n\s*\n*", "\n", out_file)
                    f_hndl.write(out_file)
    else:
        print("ERROR: Argument not defined.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement