Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- --- html2text.py 2011-12-09 13:37:15.000000000 +0100
- +++ html2text-improved.py 2011-12-12 21:43:56.000000000 +0100
- @@ -31,6 +31,11 @@
- except:
- import urllib
- import optparse, re, sys, codecs, types
- +from BeautifulSoup import BeautifulSoup
- +#html = "<html><p>Para 1<p>Para 2<blockquote>Quote 1<blockquote>Quote 2"
- +#soup = BeautifulSoup(html)
- +#print soup.prettify()
- +
- try: from textwrap import wrap
- except: pass
- @@ -222,7 +227,8 @@
- try: del unifiable_n[name2cp('nbsp')]
- except KeyError: pass
- - unifiable['nbsp'] = ' _place_holder;'
- +# unifiable['nbsp'] = ' _place_holder;'
- + unifiable['nbsp'] = ' '
- def feed(self, data):
- @@ -695,7 +701,7 @@
- newlines = 2
- else:
- if not onlywhite(para):
- - result += para + "\n"
- + result += para + "\n"
- newlines = 1
- else:
- if newlines < 2:
- @@ -736,7 +742,7 @@
- (options, args) = p.parse_args()
- # process input
- - encoding = "utf-8"
- + encoding = None;
- if len(args) > 0:
- file_ = args[0]
- if len(args) == 2:
- @@ -767,6 +773,14 @@
- else:
- data = sys.stdin.read()
- + # Preprocessing
- + soup = BeautifulSoup(data)
- + # Remove quoted text (blockquote)
- + subtree = soup.blockquote
- + if subtree is not None:
- + subtree.extract()
- + data = soup.prettify()
- + # End of preprocessing
- data = data.decode(encoding)
- h = HTML2Text(baseurl=baseurl)
- # handle option
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement