Advertisement
Guest User

Untitled

a guest
Oct 13th, 2015
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.79 KB | None | 0 0
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. from bs4 import BeautifulSoup, Comment, UnicodeDammit
  4. import glob
  5. from os.path import isfile, join
  6. import html2text
  7.  
  8. #Dos pasos para codificar correctamente la pagina
  9. def lector(sopa):
  10. fuente = sopa.prettify()
  11. print fuente.encode('utf8')
  12.  
  13. def get_mission_title(sections):
  14. if (sections[1].span.span != None):
  15. print sections[1].span.span.string.encode('utf8')
  16.  
  17. def get_mission_number(sections):
  18. if (sections[0] != None):
  19. print sections[0].span.string.encode('utf8')
  20.  
  21. def get_mission_location(sections):
  22. if (sections[3] != None):
  23. print sections[3].get_text().encode('utf8').strip()
  24.  
  25. def get_mission_subtitle(sections):
  26. if (sections[2].span.span != None):
  27. print sections[2].span.span.string.encode('utf8')
  28.  
  29. def table_text(sopa):
  30. sections = sopa.find_all("div")
  31. for section in sections:
  32. section.decompose()
  33. lines = sopa.find_all("hr")
  34. broes = sopa.find_all("br")
  35. for x in range(0, 2):
  36. lines[x].decompose()
  37. broes[x].decompose()
  38. for link in sopa.find_all("a"):
  39. link.decompose()
  40. for tag in sopa.find_all(text=lambda text:isinstance(text, Comment)):
  41. tag.extract()
  42.  
  43. for elem in sopa.find_all("td"):
  44. if (elem.get_text().strip() == ""):
  45. prox = elem.next_element
  46. if prox != None:
  47. if prox.strip() == "":
  48. otro_prox = elem.next_element.next_element
  49. if otro_prox != None:
  50. if str(otro_prox).strip() != "":
  51. elem.string = "wedd"
  52. else:
  53. elem.string = "wedd"
  54.  
  55. res = str(sopa.encode('utf8'))
  56. print sopa.get_text().replace("wedd","-").encode('utf8')
  57. #print sopa.prettify().encode('utf8')
  58. h = html2text.HTML2Text()
  59. whous = h.handle(res.decode('utf-8-sig'))
  60. #print type(res.decode('utf-8-sig'))
  61. #print whous.replace("wedd","-").encode('utf8')
  62. #print(markup.unicode_markup)
  63. #print sopa.get_text().encode('utf8')
  64.  
  65. def get_basic_info(sopa):
  66. main_section = sopa.find("div", id="body")
  67. sections = main_section.find_all("div")
  68. table_text(main_section)
  69. #get_mission_number(sections)
  70. #get_mission_title(sections)
  71. #get_mission_location(sections)
  72.  
  73. def run_single():
  74. soup = BeautifulSoup(open('origen/11.html'), from_encoding="utf-8")
  75. get_basic_info(soup)
  76.  
  77. def run_multiple():
  78. mypath = '../Applications/sphere/origen/';
  79. for f in sorted(glob.glob(mypath+'*.html')):
  80. if isfile(join(mypath,f)) and f != '../Applications/sphere/origen/4.html':
  81. soup = BeautifulSoup(open(f))
  82. get_basic_info(soup)
  83.  
  84. if __name__ == "__main__":
  85. #run_multiple()
  86. run_single()
  87. #lector(soup)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement