Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import urllib.request
- from bs4 import BeautifulSoup
- import csv
def find_between(s: str, first: str, last: str) -> str:
    """Return the text of *s* between the first *first* and the next *last*.

    The search for *last* begins right after the end of the first
    occurrence of *first*. Neither delimiter is included in the result.

    Args:
        s: Text to search.
        first: Opening delimiter.
        last: Closing delimiter.

    Returns:
        The enclosed substring, or an empty string when either delimiter
        cannot be found.
    """
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
    except ValueError:
        # A missing delimiter is reported as "nothing found", not an error.
        return ""
    return s[start:end]
# Scrape the category pages linked from the index page below and append one
# CSV row per table entry found on them to terzo2.csv.
# NOTE(review): the nesting in this script was reconstructed from a paste that
# had lost its indentation -- confirm against the author's original.


def _unwrap_font(fragment):
    """Return the text inside a ``<font ...>...</font>`` wrapper, if present.

    Fragments that contain no ``</font>`` are returned unchanged.
    """
    if "</font>" in fragment:
        return find_between(fragment, '">', "</font>").strip()
    return fragment


pagina = "https://sites.google.com/site/compendiumofphysicalactivities/Activity-Categories/"
with urllib.request.urlopen(pagina) as response:
    web_page = response.read()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(web_page, "html.parser")
allul = soup.find_all("ul")

links = []
# The first <ul> is skipped: the original loop started at index 1.
for ul in allul[1:]:
    temp = str(ul)
    if "a href" in temp and "new-activity-updates" not in temp:
        # Keep the stripped value (the original discarded the result of strip()).
        link = find_between(temp, '<a href="', '">').strip()
        if link:
            links.append(link)

for category_url in links:
    with urllib.request.urlopen(category_url) as response:
        web_page = response.read()
    soup = BeautifulSoup(web_page, "html.parser")
    # The category name is the last URL path segment with dashes as spaces.
    nome_categoria = category_url[category_url.rfind("/") + 1:].replace("-", " ")
    alltd = soup.find_all("td")
    # newline="" is what the csv module requires to avoid doubled line endings.
    with open("terzo2.csv", "a", encoding="utf8", newline="") as f:
        c = csv.writer(f, delimiter=";")
        # Each row uses the cells before and after the matching one, so the
        # last cell can never be a match (it has no following cell).
        for i in range(1, len(alltd) - 1):
            temp = str(alltd[i])
            if not ("." in temp and "<b>" in temp and len(temp) < 100):
                continue
            codice = find_between(str(alltd[i - 1]), "<b>", "</b>").strip().replace("<br/>", "")
            codice = _unwrap_font(codice).strip()
            valore = find_between(temp, "<b>", "</b>").replace(" <br/>", "").replace("<br/>", "").strip()
            valore = _unwrap_font(valore).strip()
            attivita = (
                find_between(str(alltd[i + 1]), ">", "</td>")
                .replace(" <br/>", "")
                .replace("<br/>", "")
                .replace("<sup>TM</sup>", "")
            )
            attivita = _unwrap_font(attivita).strip()
            print(codice, valore, attivita, sep=":")
            c.writerow([nome_categoria, attivita, "0", valore, codice])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement