venkasub
Jan 2nd, 2011

from BeautifulSoup import BeautifulSoup, Comment
import re

# tags that are unwrapped: the tag itself is dropped but its contents are kept
invalid_tags = ['b', 'i', 'u', 'link', 'em', 'small', 'span', 'blockquote', 'strong', 'abbr', 'ol', 'h1', 'h2', 'h3', 'h4', 'font', 'tr', 'td', 'center', 'tbody', 'table']

# tags that are removed entirely, contents included
not_allowed_tags = ['script', 'noscript', 'img', 'object', 'meta', 'code', 'pre', 'br', 'hr', 'form', 'input', 'iframe', 'style', 'dl', 'dt', 'sup', 'head', 'acronym']

# substrings checked against a tag's class/id attributes - if any of them matches, the tag is removed
unwanted_tags = ["tags", "breadcrumbs", "disqus", "boxy", "popular", "recent", "feature_title", "logo", "leaderboard", "widget", "neighbor", "dsq", "announcement", "button", "more", "categories", "blogroll", "cloud", "related", "tab"]

def unwanted(tag_class):
  for each_class in unwanted_tags:
    if each_class in tag_class:
      return True
  return False
# from http://stackoverflow.com/questions/1765848/remove-a-tag-using-beautifulsoup-but-keep-its-contents
def remove_tag(tag):
  for i, x in enumerate(tag.parent.contents):
    if x == tag: break
  else:
    print "Can't find", tag, "in", tag.parent
    return
  for r in reversed(tag.contents):
    tag.parent.insert(i, r)
  tag.extract()
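
# for example (a made-up snippet, not from the original input): calling remove_tag on the
# <b> element of <p>some <b>bold</b> text</p> leaves <p>some bold text</p>, i.e. the tag
# is dropped but its children stay in place
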
def strip_tags(html):
  soup = BeautifulSoup(html)

  # remove the doctype declaration
  doctype = soup.findAll(text=re.compile("DOCTYPE"))
  [tree.extract() for tree in doctype]

  # remove all text nodes that contain a URL
  links = soup.findAll(text=re.compile("http://"))
  [tree.extract() for tree in links]

  # remove all comments
  comments = soup.findAll(text=lambda text: isinstance(text, Comment))
  [comment.extract() for comment in comments]

  for tag in soup.findAll(True):
    # remove the tags that are not allowed, contents included
    if tag.name in not_allowed_tags:
      tag.extract()
      continue

    # replace the tag with the content of the tag
    if tag.name in invalid_tags:
      remove_tag(tag)

    # similar to not_allowed_tags, but checks the class/id attributes before removing the tag
    if unwanted(tag.get('class', '')) or unwanted(tag.get('id', '')):
      tag.extract()
      continue

    # special case for lists - lists can also be part of navbars/side headings,
    # hence check the length of their content before removing them
    if tag.name == 'li':
      tagc = strip_tags(str(tag.contents))
      if len(str(tagc).split()) < 3:
        tag.extract()
        continue

    # finally, unwrap the remaining structural tags and replace them with their content
    if tag.name in ['div', 'a', 'p', 'ul', 'li', 'html', 'body']:
      remove_tag(tag)

  return soup

# open the file which contains the html
# this step can be replaced with reading directly from the url, but it is
# usually better to store the html locally for any later processing
html = open("techcrunch.html").read()
soup = strip_tags(html)
content = str(soup.prettify())

# write the stripped content into another file
outfile = open("tech.txt", "w")
outfile.write(content)
outfile.close()
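
The comments above note that the local-file step can be replaced with reading the page directly from its URL. A minimal sketch of that variant, assuming Python 2 and the standard urllib2 module (the URL is only a placeholder):

import urllib2

url = "http://techcrunch.com/"  # placeholder - substitute the page to be stripped
html = urllib2.urlopen(url).read()
soup = strip_tags(html)
open("tech.txt", "w").write(str(soup.prettify()))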