Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def truncus03a(url='http://legis.senado.leg.br/legislacao/ListaTextoSigen.action?norma=483900&id=14310946&idBinario=15795498&mime=application/rtf',
               path='', filename=''):
    """Download a page, clean its markup and save it as a standalone HTML file.

    The page at ``url`` is fetched, inline ``style`` attributes and bare
    ``<span>`` tags are removed, and a handful of source class names are
    renamed. The third ``<div>`` found under the element whose id is
    ``conteudoPrincipal`` is then wrapped in a fixed header and footer,
    pretty-printed, echoed to stdout and written to disk.

    Args:
        url: Address of the page to download.
        path: Prefix prepended verbatim to ``filename`` (no separator is
            inserted, so include a trailing slash when naming a directory).
            A falsy value means the current directory.
        filename: Name of the output file. When empty, the name of this
            function plus ``.html`` is used.

    Raises:
        requests.RequestException: If the download fails, times out or
            returns an HTTP error status.
        IndexError, AttributeError: If the page does not have the expected
            structure.

    Failures while writing the file are reported on stdout and do not raise.
    """
    header = '<!DOCTYPE html>' \
             '<html lang="pt-br">' \
             '<head>' \
             '<meta http-equiv="Content-Type" content="text/html; charset=iso8859-1"/>' \
             '<link rel="stylesheet" href="../../../saj_projects/view/legis_3.css">' \
             '</head>' \
             '<body>' \
             '<header> ' \
             '<h1>Presidência da República</h1> ' \
             '<h2>Casa Civil</h2> ' \
             '<h3>Subchefia para Assuntos Jurídicos</h3> ' \
             '</header>'
    # NOTE: the publication date in the footer is hard-coded.
    footer = '<p class="dou">Este texto não substitui o publicado no DOU de 17.1.2018</p></body></html>'

    # Applied in order: (pattern, replacement).
    substitutions = (
        (r' style="[^"]+"', ''),
        (r'<span>|</span>', ''),
        (r'EpgrafeAlt1|Epgrafe', 'epigrafe'),
        (r'EmentaAlt2|Ementa', 'ementa'),
        (r'Assinatura1Alt7|Assinatura1', 'presidente'),
        (r'Assinatura2Alt8|Assinatura2', 'ministro'),
    )

    if not filename:
        # inspect.stack()[0][3] is the name of the currently running function.
        filename = '{}.html'.format(inspect.stack()[0][3])
    path = path or ''
    target = '{}{}'.format(path, filename)

    # A timeout keeps an unresponsive server from hanging the script, and an
    # HTTP error status is surfaced instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    page = response.text
    for pattern, replacement in substitutions:
        page = re.sub(pattern, replacement, page)

    matches = BeautifulSoup(page, 'html.parser').find_all(id='conteudoPrincipal')
    caterva = matches[0].div.html.body
    container = caterva.find_all('div')[2]
    print(container)

    # Parse and pretty-print the assembled document once, then reuse it.
    document = BeautifulSoup(header + container.prettify() + footer,
                             'html.parser').prettify()
    print(document)

    try:
        # The <meta> tag above declares ISO-8859-1, so the file is written in
        # that encoding; characters outside it become numeric character
        # references, which remain valid HTML.
        with open(target, 'w', encoding='iso-8859-1',
                  errors='xmlcharrefreplace') as output:
            output.write(document)
    except OSError as err:
        # Best-effort write: report the failure rather than crash.
        print('Erro ao gravar {}: {}'.format(target, err))
    print('Saindo do programa')
# Script entry point: run the scraper with its default arguments.
if __name__ == "__main__":
    truncus03a()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement