Advertisement
Guest User

RSS Parser

a guest
May 22nd, 2012
191
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.13 KB | None | 0 0
  1. #!/usr/bin/python3
  2.  
  3. ###
  4. ###  TODO: clean_text: NOT WORKING WELL
  5. ###  TODO: \x7f problem needs tests
  6. ###
  7.  
  8. import codecs
  9. import sys
  10. import os
  11. import feedparser
  12. import urllib.request, urllib.error, urllib.parse
  13. import re
  14. import types
  15. from bs4 import BeautifulSoup
  16. from html.parser import HTMLParser
  17.  
  18. stop_string_list=("li><a","<div","href","<par","</div")
  19.  
  20. def clean_text(html_text):
  21.     def char_from_entity(match):
  22.         code = html.entities.name2codepoint.get(match.group(1), 0xFFFD)
  23.         return chr(code)
  24.  
  25.     def clean_str(tmp_str):
  26.         while tmp_str.find("\x7f")>0:
  27.             pos=tmp_str.find("\x7f")
  28.             tmp_str=tmp_str[0:pos-1]+tmp_str[pos+1:]    
  29.         return tmp_str    
  30.  
  31.     text = re.sub(r"<!--(?:.|\n)*?-->", "", html_text)
  32.     text = re.sub(r"<[Pp][^>]*?(?!</)>", "\n\n", text)
  33.     text = re.sub(r"<[^>]*?>", "", text)
  34.     text = re.sub(r"&#(\d+);", lambda m: chr(int(m.group(1))), text)
  35.     text = re.sub(r"&([A-Za-z]+);", char_from_entity, text)
  36.     text = re.sub(r"\n(?:[ \xA0\t]+\n)+", "\n", text)
  37.     t = re.sub(r"\n\n+", "\n\n", text.strip())
  38.     return clean_str(t)
  39.  
  40. def visible(element):
  41.     if element.parent.name in ['style', 'script', '[document]', 'head', 'title', 'dates']:
  42.         return False
  43.     elif re.match('<!--.*-->', str(element),re.UNICODE):
  44.         return False
  45.     return True
  46.  
  47. if(sys.argv[1:] == []):
  48.  
  49.     print("Simple RSS Parser")
  50.     print("")
  51.     print("Usage:")
  52.     print("")
  53.     print("    rss_parser (rss_link)(mode) [encoding]")
  54.     print("")
  55.     print("Possible modes:")
  56.     print("")
  57.     print("    line    -  line by line output(format: YYYY-MM-DD|header|body|link)")
  58.     print("    pandoc  -  document compatible with Pandoc")
  59.     print("    plain   -  plain text output")
  60.     print("    dump    -  dump HTML content using the specified encoding")
  61.     print("    fplain  -  dump date, header, body and link to separate files using the specified encoding")
  62.     print("    fdump   -  dump HTML content to separate files using the specified encoding")
  63.     print("")
  64.    
  65. else:
  66.  
  67.     try:
  68.         rss_link = sys.argv[1]    
  69.         out_format = sys.argv[2]
  70.     except:
  71.         sys.exit("ERROR: Some argument is missing.")
  72.        
  73.     try:
  74.         d = feedparser.parse(rss_link)
  75.     except:
  76.         sys.exit("ERROR: Unable to parse RSS file. Check the RSS link and internet connection.")
  77.  
  78.     if(out_format == "line"):
  79.  
  80.         for entry in d["entries"]:
  81.             time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
  82.             print(time_stamp + " | " + entry.title + " | " + clean_text(entry.summary) + " | " + entry.link)
  83.  
  84.     elif(out_format == "pandoc"):
  85.  
  86.         for entry in d["entries"]:
  87.             time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
  88.             print("#", time_stamp, " ",entry.title, "#")
  89.             print("")
  90.             print(clean_text(entry.summary))
  91.             print("")
  92.             print(entry.link)
  93.             print("")
  94.             print("")
  95.  
  96.     elif(out_format == "plain"):
  97.  
  98.         for entry in d["entries"]:
  99.             time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
  100.             print(time_stamp, " ", entry.title)
  101.             print(clean_text(entry.summary))
  102.             print(entry.link)
  103.             print("")
  104.             print("")
  105.  
  106.     elif(out_format == "dump"):
  107.  
  108.         try:
  109.             page_enc = sys.argv[3]
  110.         except:
  111.             sys.exit("ERROR: Encoding not specified.")
  112.  
  113.         for entry in d["entries"]:                                        
  114.             time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
  115.             print(time_stamp, " ", entry.title)
  116.             print(clean_text(entry.summary))
  117.             print(entry.link)
  118.             f=urllib.request.urlopen(entry.link)
  119.             print(clean_text(f.read().decode(page_enc)))
  120.             print("")
  121.             print("")      
  122.    
  123.     elif(out_format == "fplain"):
  124.  
  125.         wrk_dir = os.getcwd()
  126.  
  127.         for entry in d["entries"]:                                        
  128.             time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
  129.             # prepare file name for file: date + 8 chars of title
  130.             f_name = entry.title
  131.             f_name = time_stamp + f_name.replace(" ", "")[0:8]
  132.                    
  133.             not_exist=False        
  134.             try:
  135.                 open(wrk_dir + "/" + f_name)
  136.             except IOError:
  137.                 not_exist=True
  138.            
  139.             if (not_exist):
  140.                 f_hndl = codecs.open(wrk_dir + "/" + f_name,"w","utf-8")
  141.                
  142.                 f_hndl.write(time_stamp + " " + entry.title + "\n\n")
  143.                 f_hndl.write(clean_text(entry.summary) + "\n\n")
  144.                 f_hndl.write(entry.link + "\n")
  145.                 f_hndl.close()
  146.  
  147.     elif(out_format == "fdump"):
  148.  
  149.         try:
  150.             page_enc = sys.argv[3]
  151.         except:
  152.             sys.exit("ERROR: Encoding not specified.")
  153.  
  154.         wrk_dir = os.getcwd()
  155.  
  156.         for entry in d["entries"]:                                        
  157.             time_stamp = str(entry.updated_parsed.tm_year) + "-" + str(entry.updated_parsed.tm_mon) + "-" + str(entry.updated_parsed.tm_mday)
  158.             # prepare file name for file: date + 8 chars of title
  159.             f_name = entry.title
  160.             f_name = time_stamp + f_name.replace(" ", "")[0:8]
  161.            
  162.             not_exist=False        
  163.             try:
  164.                 open(wrk_dir + "/" + f_name)
  165.             except IOError:
  166.                 not_exist=True
  167.            
  168.             if (not_exist):
  169.                 f_hndl = codecs.open(wrk_dir + "/" + f_name,"w","utf-8")
  170.                
  171.                 f_hndl.write(time_stamp + " " + entry.title + "\n\n")
  172.                 f_hndl.write(clean_text(entry.summary) + "\n\n")
  173.                 f_hndl.write(entry.link + "\n\n")
  174.                
  175.                 web_page=urllib.request.urlopen(entry.link)
  176.                 page=web_page.read().decode(page_enc)
  177.                
  178.                 soup=BeautifulSoup(page)
  179.                 texts=soup.findAll(text=True)
  180.                 visible_texts = filter(visible, texts)
  181.                
  182.                 out_file = ""
  183.                 for item in visible_texts:
  184.                     st_item=str(item)
  185.                     not_in_stop=True
  186.                     for stop_item in stop_string_list:
  187.                         not_in_stop=not_in_stop and (st_item.find(stop_item)<0)
  188.                     if not_in_stop:
  189.                         out_file+=item
  190.                
  191.                 out_file.replace("\t","")
  192.                 out_file=re.sub("\n\s*\n*", "\n", out_file)
  193.                
  194.                 f_hndl.write(out_file)
  195.                 f_hndl.close()
  196.            
  197.     else:
  198.         print("ERROR: Argument not defined.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement