Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- from bs4 import BeautifulSoup, Comment, UnicodeDammit
- import glob
- from os.path import isfile, join
- import html2text
# Two steps to encode the page correctly: prettify it, then encode as UTF-8.
def lector(sopa):
    """Print the prettified markup of *sopa*, encoded as UTF-8.

    *sopa* is any object exposing ``prettify()`` (a BeautifulSoup document
    or tag). Nothing is returned.
    """
    # Single-argument parenthesised form: identical to the Python 2
    # ``print expr`` statement used throughout this script.
    print(sopa.prettify().encode('utf8'))
def get_mission_title(sections):
    """Print the mission title found in the second entry of *sections*.

    The title is read from ``sections[1].span.span.string`` and printed
    encoded as UTF-8. Nothing is printed (and nothing is raised) when the
    list is too short, when either nested ``span`` is missing, or when the
    inner span has no single string.

    :param sections: sequence of tag-like objects (the ``<div>`` list built
        by ``get_basic_info``).
    """
    if len(sections) < 2:
        return
    outer = sections[1].span
    # Walk the chain one link at a time: the original only tested the last
    # link, so a missing outer <span> raised AttributeError.
    inner = outer.span if outer is not None else None
    if inner is None or inner.string is None:
        return
    print(inner.string.encode('utf8'))
def get_mission_number(sections):
    """Print the mission number found in the first entry of *sections*.

    The number is read from ``sections[0].span.string`` and printed encoded
    as UTF-8. Nothing is printed (and nothing is raised) when *sections* is
    empty, when the entry has no ``span``, or when that span has no single
    string.

    :param sections: sequence of tag-like objects (the ``<div>`` list built
        by ``get_basic_info``).
    """
    if not sections:
        return
    label = sections[0].span
    # The original compared sections[0] itself against None, which never
    # protected the .span / .string accesses that can actually be missing.
    if label is None or label.string is None:
        return
    print(label.string.encode('utf8'))
def get_mission_location(sections):
    """Print the mission location found in the fourth entry of *sections*.

    The text of ``sections[3]`` is stripped of surrounding whitespace and
    printed encoded as UTF-8. Nothing is printed (and nothing is raised)
    when *sections* has fewer than four entries.

    :param sections: sequence of tag-like objects (the ``<div>`` list built
        by ``get_basic_info``).
    """
    if len(sections) < 4:
        return
    # Strip while the value is still text and encode last, so the trimming
    # is whitespace-aware for Unicode (e.g. non-breaking spaces) and never
    # operates on the raw UTF-8 bytes.
    print(sections[3].get_text().strip().encode('utf8'))
def get_mission_subtitle(sections):
    """Print the mission subtitle found in the third entry of *sections*.

    The subtitle is read from ``sections[2].span.span.string`` and printed
    encoded as UTF-8. Nothing is printed (and nothing is raised) when the
    list is too short, when either nested ``span`` is missing, or when the
    inner span has no single string.

    :param sections: sequence of tag-like objects (the ``<div>`` list built
        by ``get_basic_info``).
    """
    if len(sections) < 3:
        return
    outer = sections[2].span
    # Check each link of the chain; the original only tested the innermost
    # span and raised AttributeError when the outer one was absent.
    inner = outer.span if outer is not None else None
    if inner is None or inner.string is None:
        return
    print(inner.string.encode('utf8'))
def table_text(sopa):
    """Strip layout markup from *sopa* in place and print its remaining text.

    Steps, in order:
      1. Remove every <div>, the first two <hr> and <br>, and every <a>.
      2. Remove comment nodes.
      3. Give blank <td> cells the placeholder text "wedd" (conditions are
         noted inline below).
      4. Print the text of *sopa* with "wedd" replaced by "-", UTF-8 encoded.

    *sopa* is mutated; nothing is returned. Uses the Python 2 print
    statement.

    NOTE(review): the indentation of this function was lost in the source
    this was recovered from; the nesting below is the most plausible
    reading and should be confirmed against the original script.
    """
    # Drop every <div> under sopa, contents included.
    sections = sopa.find_all("div")
    for section in sections:
        section.decompose()
    lines = sopa.find_all("hr")
    broes = sopa.find_all("br")
    # Drop the first two <hr> and the first two <br>.
    # NOTE(review): indexes 0 and 1 are used unconditionally; presumably
    # raises IndexError on a page with fewer than two of either -- confirm
    # the inputs always contain them.
    for x in range(0, 2):
        lines[x].decompose()
        broes[x].decompose()
    # Drop every link, contents included.
    for link in sopa.find_all("a"):
        link.decompose()
    # Detach comment nodes so they do not show up in the extracted text.
    for tag in sopa.find_all(text=lambda text:isinstance(text, Comment)):
        tag.extract()
    # Mark blank table cells with the placeholder "wedd".
    for elem in sopa.find_all("td"):
        if (elem.get_text().strip() == ""):
            prox = elem.next_element
            if prox != None:
                # NOTE(review): this assumes prox is a text node; if it is
                # a tag, .strip is presumably not the string method --
                # verify.
                if prox.strip() == "":
                    # Next node is blank too: look one node further and
                    # mark the cell only if that node is non-blank.
                    otro_prox = elem.next_element.next_element
                    if otro_prox != None:
                        if str(otro_prox).strip() != "":
                            elem.string = "wedd"
                # NOTE(review): which `if` this `else` belongs to cannot be
                # recovered from the source; pairing it with the
                # `prox.strip() == ""` test is an assumption -- confirm.
                else:
                    elem.string = "wedd"
    res = str(sopa.encode('utf8'))
    # Emit the plain text, turning each placeholder into "-".
    print sopa.get_text().replace("wedd","-").encode('utf8')
    #print sopa.prettify().encode('utf8')
    # html2text conversion of the serialized markup; the result `whous` is
    # only referenced by the commented-out print below.
    h = html2text.HTML2Text()
    whous = h.handle(res.decode('utf-8-sig'))
    # Leftover debugging output, disabled:
    #print type(res.decode('utf-8-sig'))
    #print whous.replace("wedd","-").encode('utf8')
    #print(markup.unicode_markup)
    #print sopa.get_text().encode('utf8')
def get_basic_info(sopa):
    """Locate the ``<div id="body">`` container of *sopa* and print its text.

    The container is handed to ``table_text``, which cleans it in place and
    prints the result. If the page has no such container the function
    returns without printing anything (the original raised AttributeError
    on ``None``).

    :param sopa: parsed document exposing ``find``.
    """
    main_section = sopa.find("div", id="body")
    if main_section is None:
        # Page without the expected container: nothing to extract.
        return
    # Gathered before table_text() runs, because table_text() removes every
    # <div> under main_section. Only the disabled extractors below use it.
    sections = main_section.find_all("div")
    table_text(main_section)
    # Disabled field extractors, kept for reference:
    #get_mission_number(sections)
    #get_mission_title(sections)
    #get_mission_location(sections)
def run_single():
    """Parse the sample page ``origen/11.html`` and print its extracted text.

    The path is relative to the current working directory. The file is read
    as bytes and decoded by BeautifulSoup using UTF-8.
    """
    # BeautifulSoup consumes the file object in its constructor, so the
    # handle can be closed as soon as the soup is built (the original never
    # closed it).
    with open('origen/11.html', 'rb') as source:
        soup = BeautifulSoup(source, from_encoding="utf-8")
    get_basic_info(soup)
def run_multiple():
    """Parse every ``*.html`` page of the source directory, in sorted order.

    Each page is passed to ``get_basic_info``. ``4.html`` is skipped, as in
    the original (NOTE(review): the reason for the exclusion is not
    recorded -- confirm it is still needed).
    """
    mypath = '../Applications/sphere/origen/'
    skipped = mypath + '4.html'
    for f in sorted(glob.glob(mypath + '*.html')):
        # glob already returns paths prefixed with mypath. The original
        # tested isfile(join(mypath, f)), i.e. a doubled path that does not
        # exist, so no page was ever processed.
        if not isfile(f) or f == skipped:
            continue
        # Close each handle once the soup is built instead of leaking it.
        with open(f, 'rb') as source:
            soup = BeautifulSoup(source)
        get_basic_info(soup)
if __name__ == "__main__":
    # Script entry point: only the single sample page is processed. The
    # batch run and the raw-markup dump below are left disabled.
    #run_multiple()
    run_single()
    #lector(soup)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement