# Untitled

#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup, Comment, UnicodeDammit
import glob
from os.path import isfile, join
import html2text

# Two steps to encode the page correctly
def lector(sopa):
    """Print the prettified markup of ``sopa`` to stdout, encoded as UTF-8.

    Args:
        sopa: object exposing ``prettify()`` (e.g. a BeautifulSoup tree).
    """
    # A single parenthesised argument prints identically to the bare
    # Python 2 ``print`` statement.
    print(sopa.prettify().encode('utf8'))

def get_mission_title(sections):
    """Print the mission title held in the second section.

    The title is read from ``sections[1].span.span.string`` and written to
    stdout encoded as UTF-8.  Nothing is printed when any step of that path
    is missing: fewer than two sections, no ``span`` at either nesting
    level, or a ``.string`` that is None.

    Args:
        sections: sequence of tag-like objects exposing ``.span``.
    """
    if len(sections) < 2:
        return
    outer = sections[1].span
    inner = outer.span if outer is not None else None
    title = inner.string if inner is not None else None
    if title is not None:
        print(title.encode('utf8'))

def get_mission_number(sections):
    """Print the mission number held in the first section.

    The number is read from ``sections[0].span.string`` and written to
    stdout encoded as UTF-8.  Nothing is printed when ``sections`` is empty,
    its first item is None, that item has no ``span``, or the span's
    ``.string`` is None.

    Args:
        sections: sequence of tag-like objects exposing ``.span``.
    """
    if len(sections) < 1 or sections[0] is None:
        return
    span = sections[0].span
    number = span.string if span is not None else None
    if number is not None:
        print(number.encode('utf8'))

def get_mission_location(sections):
    """Print the text of the fourth section, trimmed and encoded as UTF-8.

    Nothing is printed when ``sections`` has fewer than four items or the
    fourth item is None.

    Args:
        sections: sequence of tag-like objects exposing ``get_text()``.
    """
    if len(sections) < 4 or sections[3] is None:
        return
    # Strip while the value is still text: stripping the encoded bytes only
    # removes ASCII whitespace, leaving e.g. non-breaking spaces in place.
    print(sections[3].get_text().strip().encode('utf8'))

def get_mission_subtitle(sections):
    """Print the mission subtitle held in the third section.

    The subtitle is read from ``sections[2].span.span.string`` and written
    to stdout encoded as UTF-8.  Nothing is printed when any step of that
    path is missing: fewer than three sections, no ``span`` at either
    nesting level, or a ``.string`` that is None.

    Args:
        sections: sequence of tag-like objects exposing ``.span``.
    """
    if len(sections) < 3:
        return
    outer = sections[2].span
    inner = outer.span if outer is not None else None
    subtitle = inner.string if inner is not None else None
    if subtitle is not None:
        print(subtitle.encode('utf8'))

def table_text(sopa):
    """Strip non-table markup from ``sopa`` and print the remaining text.

    ``sopa`` is modified in place:

    * every ``<div>`` and ``<a>`` is decomposed and every comment extracted;
    * the first two ``<hr>`` and the first two ``<br>`` are decomposed
      (fewer are tolerated);
    * ``<td>`` cells with no visible text get a placeholder string, which
      is shown as ``-`` in the printed output.

    The text of what is left is printed to stdout encoded as UTF-8.

    Args:
        sopa: BeautifulSoup tag or document holding the table markup.
    """
    # Local import: bs4 is already a dependency of this file, and the
    # module-level import line does not bring NavigableString into scope.
    from bs4 import NavigableString

    # Marker written into empty cells and replaced by "-" when printing.
    placeholder = "wedd"

    def is_blank(node):
        # Only whitespace-only text nodes count as blank.  A Tag never does;
        # calling .strip() on a Tag is a child lookup that returns None.
        return isinstance(node, NavigableString) and node.strip() == ""

    for section in sopa.find_all("div"):
        section.decompose()

    lines = sopa.find_all("hr")
    broes = sopa.find_all("br")
    # Drop up to two of each, in the order hr, br, hr, br; the bounds check
    # keeps pages with fewer than two separators from raising IndexError.
    for x in range(2):
        for found in (lines, broes):
            if x < len(found):
                found[x].decompose()

    for link in sopa.find_all("a"):
        link.decompose()
    for comment in sopa.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    for elem in sopa.find_all("td"):
        if elem.get_text().strip() != "":
            continue
        prox = elem.next_element
        if prox is None:
            continue
        if not is_blank(prox):
            elem.string = placeholder
            continue
        # The next node is blank text: mark the cell only when the node
        # after it exists and is not blank as well.
        otro_prox = prox.next_element
        if otro_prox is not None and not is_blank(otro_prox):
            elem.string = placeholder

    print(sopa.get_text().replace(placeholder, "-").encode('utf8'))

def get_basic_info(sopa):
    """Locate ``<div id="body">`` in ``sopa`` and print its table text.

    Returns without printing anything when the document has no such div.

    Args:
        sopa: parsed document exposing ``find()``.
    """
    main_section = sopa.find("div", id="body")
    if main_section is None:
        # find() returns None when nothing matches; there is nothing to print.
        return
    # Currently unused.  Collected before table_text() runs because that
    # call decomposes every <div> inside main_section.
    sections = main_section.find_all("div")
    table_text(main_section)
    #get_mission_number(sections)
    #get_mission_title(sections)
    #get_mission_location(sections)

def run_single(path='origen/11.html'):
    """Parse one HTML file as UTF-8 and print its basic info.

    Args:
        path: file to process; defaults to 'origen/11.html'.
    """
    # Binary mode hands BeautifulSoup raw bytes, which is what
    # from_encoding applies to.  The markup is read during construction, so
    # the handle can be closed as soon as the soup exists.
    with open(path, 'rb') as source:
        soup = BeautifulSoup(source, from_encoding="utf-8")
    get_basic_info(soup)

def run_multiple(mypath='../Applications/sphere/origen/'):
    """Print the basic info of every ``*.html`` file in ``mypath``.

    Files are visited in sorted (lexicographic) path order and ``4.html``
    is skipped.

    Args:
        mypath: directory to scan; defaults to
            '../Applications/sphere/origen/'.
    """
    skipped = join(mypath, '4.html')
    for f in sorted(glob.glob(join(mypath, '*.html'))):
        # glob already returns paths that include mypath; joining mypath on
        # again would point at a non-existent nested location.
        if not isfile(f) or f == skipped:
            continue
        # Raw bytes let BeautifulSoup work out the encoding itself; the
        # with-block closes each handle once the markup has been read.
        with open(f, 'rb') as source:
            soup = BeautifulSoup(source)
        get_basic_info(soup)

# Script entry point: runs only when the file is executed directly.
if __name__ == "__main__":
    # Exactly one call is active; the commented-out lines are alternatives.
    #run_multiple()
    run_single()
    # NOTE(review): no module-level `soup` is visible in this file, so the
    # line below would need one created first before it can be re-enabled.
    #lector(soup)