Advertisement
britodfbr

truncus03.py

Feb 28th, 2018
109
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.92 KB | None | 0 0
  1. #coding: utf-8
  2. __author__ = 'britodfbr'
  3.  
  4. from bs4 import BeautifulSoup
  5. import inspect
  6. import re
  7. import requests
  8. import os
  9. import sys
  10.  
  11. def truncus03a(url='http://legis.senado.leg.br/legislacao/ListaTextoSigen.action?norma=483900&id=14310946&idBinario=15795498&mime=application/rtf',
  12.                path='', filename=''):
  13.     header = '<!DOCTYPE html>' \
  14.              '<html lang="pt-br">' \
  15.              '<head>' \
  16.              '<meta http-equiv="Content-Type" content="text/html; charset=iso8859-1"/>' \
  17.              '<link rel="stylesheet" href="../../../saj_projects/view/legis_3.css">' \
  18.              '</head>' \
  19.              '<body>' \
  20.              '<header> ' \
  21.                 '<h3>Subchefia para Assuntos Jurídicos</h3> ' \
  22.              '</header>'
  23.     footer='<p class="dou">Este texto não substitui o publicado no DOU de 17.1.2018</p></body></html>'
  24.     if not filename:
  25.         filename = '{}.html'.format(inspect.stack()[0][3])
  26.  
  27.     if not path:
  28.         path = ''
  29.         #os.mkdir(path, 0o755)
  30.  
  31.     page = re.sub(r' style="[^"]+"', '', requests.get(url).text)
  32.     page = re.sub(r'<span>|</span>', '', page)
  33.     page = re.sub(r'EpgrafeAlt1|Epgrafe', 'epigrafe', page)
  34.     page = re.sub(r'EmentaAlt2|Ementa', 'ementa', page)
  35.     page = re.sub(r'Assinatura1Alt7|Assinatura1', 'presidente', page)
  36.     page = re.sub(r'Assinatura2Alt8|Assinatura2', 'ministro', page)
  37.     soup = BeautifulSoup(page, 'html.parser').find_all(id='conteudoPrincipal')
  38.     caterva = soup[0].div.html.body
  39.  
  40.     container = caterva.find_all('div')[2]
  41.     print(container)
  42.     result = header + container.prettify() + footer
  43.     print(BeautifulSoup(result, 'html.parser').prettify())
  44.     #os.mkdir(path, 0o755)
  45.     with open("{}{}".format(path, filename), 'w') as file:
  46.         file.write(BeautifulSoup(result, 'html.parser').prettify())
  47.         pass
  48.     print('-' * 20)
  49.     sys.exit(0)
  50.  
  51.  
  52. if __name__ == '__main__':
  53.     truncus03a()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement