Advertisement
josacar

html2text remove blockquote

Mar 21st, 2012
51
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 1.61 KB | None | 0 0
  1. --- html2text.py        2011-12-09 13:37:15.000000000 +0100
  2. +++ html2text-improved.py       2011-12-12 21:43:56.000000000 +0100
  3. @@ -31,6 +31,11 @@
  4.  except:
  5.      import urllib
  6.  import optparse, re, sys, codecs, types
  7. +from BeautifulSoup import BeautifulSoup
  8. +#html = "<html><p>Para 1<p>Para 2<blockquote>Quote 1<blockquote>Quote 2"
  9. +#soup = BeautifulSoup(html)
  10. +#print soup.prettify()
  11. +
  12.  
  13.  try: from textwrap import wrap
  14.  except: pass
  15. @@ -222,7 +227,8 @@
  16.  
  17.          try: del unifiable_n[name2cp('nbsp')]
  18.          except KeyError: pass
  19. -        unifiable['nbsp'] = '&nbsp_place_holder;'
  20. +#        unifiable['nbsp'] = '&nbsp_place_holder;'
  21. +        unifiable['nbsp'] = ' '
  22.  
  23.  
  24.      def feed(self, data):
  25. @@ -695,7 +701,7 @@
  26.                      newlines = 2
  27.                  else:
  28.                      if not onlywhite(para):
  29. -                        result += para + "\n"
  30. +                        result +=  para + "\n"
  31.                          newlines = 1
  32.              else:
  33.                  if newlines < 2:
  34. @@ -736,7 +742,7 @@
  35.      (options, args) = p.parse_args()
  36.  
  37.      # process input
  38. -    encoding = "utf-8"
  39. +    encoding = None;
  40.      if len(args) > 0:
  41.          file_ = args[0]
  42.          if len(args) == 2:
  43. @@ -767,6 +773,14 @@
  44.      else:
  45.          data = sys.stdin.read()
  46.  
  47. +    # Preprocessing
  48. +    soup = BeautifulSoup(data)
  49. +    # Remove the quoted text (the first <blockquote> element)
  50. +    subtree = soup.blockquote
  51. +    if subtree is not None:
  52. +       subtree.extract()
  53. +    data = soup.prettify()
  54. +    # End of preprocessing
  55.      data = data.decode(encoding)
  56.      h = HTML2Text(baseurl=baseurl)
  57.      # handle option
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement