Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from BeautifulSoup import BeautifulSoup,Comment
import re
# Inline/formatting tags: these are *unwrapped* (tag removed, contents kept).
invalid_tags = ['b', 'i', 'u','link','em','small','span','blockquote','strong','abbr','ol','h1', 'h2', 'h3','h4','font','tr','td','center','tbody','table']
# Tags removed *entirely*, contents included (scripts, forms, media, etc.).
not_allowed_tags = ['script','noscript','img','object','meta','code','pre','br','hr','form','input','iframe' ,'style','dl','dt','sup','head','acronym']
# Substrings looked for inside a tag's class/id attribute value - if any is
# present, the whole tag is removed (navigation, widgets, ads, comments...).
unwanted_tags=["tags","breadcrumbs","disqus","boxy","popular","recent","feature_title","logo","leaderboard","widget","neighbor","dsq","announcement","button","more","categories","blogroll","cloud","related","tab"]
def unwanted(tag_class):
    """Return True when the class/id string contains any blacklisted marker.

    tag_class is a tag's class or id attribute value (may be ""); it is
    checked by substring match against every entry of unwanted_tags.
    """
    return any(marker in tag_class for marker in unwanted_tags)
- #from http://stackoverflow.com/questions/1765848/remove-a-tag-using-beautifulsoup-but-keep-its-contents
- def remove_tag(tag):
- for i, x in enumerate(tag.parent.contents):
- if x == tag: break
- else:
- print "Can't find", tag, "in", tag.parent
- return
- for r in reversed(tag.contents):
- tag.parent.insert(i, r)
- tag.extract()
def strip_tags(html):
    """Strip boilerplate from an HTML string and return the cleaned soup.

    Removes doctype text nodes, bare-URL text nodes, comments, blacklisted
    tags (not_allowed_tags, unwanted class/id markers), unwraps formatting
    tags (invalid_tags), and drops short <li> items that look like nav links.
    Returns the pruned BeautifulSoup tree (callers prettify()/str() it).
    """
    tags = ""  # NOTE(review): never used below - looks like dead code
    soup = BeautifulSoup(html)
    #remove doctype
    doctype = soup.findAll(text=re.compile("DOCTYPE"))
    [tree.extract() for tree in doctype]
    #remove all links (text nodes that contain a raw http:// URL)
    links = soup.findAll(text=re.compile("http://"))
    [tree.extract() for tree in links]
    #remove all comments
    comments = soup.findAll(text=lambda text:isinstance(text, Comment) )
    [comment.extract() for comment in comments]
    # findAll(True) materializes the tag list up front, so tags extracted
    # below are still visited later in this loop - hence the `continue`s.
    for tag in soup.findAll(True):
        #remove all the tags that are not allowed.
        if tag.name in not_allowed_tags :
            tag.extract()
            continue
        #replace the tags with the content of the tag
        if tag.name in invalid_tags:
            remove_tag(tag)
        # similar to not_allowed_tags but does a check for the attribute-class/id before removing it
        if unwanted(tag.get('class','')) or unwanted(tag.get('id','')) :
            tag.extract()
            continue
        # special case of lists - the lists can be part of navbars/sideheadings too,
        # hence check length before removing them
        # (recurses on the item's contents; fewer than 3 words => nav link)
        if tag.name =='li':
            tagc = strip_tags(str(tag.contents))
            if len(str(tagc).split()) < 3:
                tag.extract()
                continue
        #finally remove all empty and spurious tags and replce it with its content
        if tag.name in ['div','a','p','ul','li','html','body'] :
            remove_tag(tag)
    return soup
#open the file which contains the html
#this step can be replaced with reading directly from the url
#however, i think its always better to store the html in the
# local storage for any later processing.
# `with` guarantees both handles are closed even on error (the original
# left the input file open and closed the output manually).
with open("techcrunch.html") as infile:
    html = infile.read()
soup = strip_tags(html)
content = str(soup.prettify())
#write the stripped content into another file.
with open("tech.txt","w") as outfile:
    outfile.write(content)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement