Advertisement
Guest User

Terzo.py

a guest
Apr 29th, 2017
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.14 KB | None | 0 0
  1. import urllib.request
  2. from bs4 import BeautifulSoup
  3. import csv
  4.  
  5. def find_between( s, first, last ):
  6.     try:
  7.         start = s.index( first ) + len( first )
  8.         end = s.index( last, start )
  9.         return s[start:end]
  10.     except ValueError:
  11.         return ""
  12.  
  13. pagina = "https://sites.google.com/site/compendiumofphysicalactivities/Activity-Categories/"
  14. web_page = urllib.request.urlopen(pagina).read()
  15. soup = BeautifulSoup(web_page)
  16.  
  17. allul = soup.findAll("ul")
  18. links = []
  19.  
  20. for i in range(1,len(allul)):
  21.     temp = str(allul[i])
  22.     if temp.find("a href") != -1 and temp.find("new-activity-updates") == -1:
  23.         link = find_between(temp,'<a href="','">')
  24.         link.strip()
  25.         links.append(link)
  26.  
  27. for l in links:
  28.     web_page = urllib.request.urlopen(l).read()
  29.     soup = BeautifulSoup(web_page)
  30.     nome_categoria = l[l.rfind("/")+1:]
  31.     nome_categoria = nome_categoria.replace("-"," ")
  32.  
  33.     alltd = soup.findAll("td")
  34.  
  35.     f = open("terzo2.csv","a",encoding="utf8")
  36.     c = csv.writer(f,delimiter=';')
  37.  
  38.     for i in range(1,len(alltd)):
  39.         temp = str(alltd[i])
  40.  
  41.         if temp.find(".") != -1 and temp.find("<b>") != -1 and len(temp) < 100:
  42.             codice = find_between(str(alltd[i-1]),"<b>","</b>").strip().replace("<br/>","").replace("<br/>","")
  43.             if codice.find("</font>") != -1:
  44.                 codice = find_between(codice,'">','</font>').strip()
  45.             valore = find_between(temp,"<b>","</b>").replace(" <br/>","").replace("<br/>","").strip()
  46.             if valore.find("</font>") != -1:
  47.                 valore = find_between(valore,'">','</font>').strip()
  48.             attivita = find_between(str(alltd[i+1]),">","</td>").replace(" <br/>","").replace("<br/>","").replace("<sup>TM</sup>","")
  49.             if attivita.find("</font>") != -1:
  50.                 attivita = find_between(attivita,'">','</font>').strip()
  51.  
  52.             codice = codice.strip()
  53.             valore = valore.strip()
  54.             attivita = attivita.strip()
  55.            
  56.             print(codice+":"+valore+":"+attivita)
  57.             c.writerow([nome_categoria,attivita,"0",valore,codice])
  58.            
  59.     f.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement