Iskon Scrape
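
A quick BeautifulSoup + requests + xlwt scraper: it walks the first four pages of http://directory.krishna.com/temples and writes each page's temple listings (title, city, phone, temple name, country and website) to its own sheet of learn.xls.
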
from bs4 import BeautifulSoup
import requests
import xlwt

BASE_URL = "http://directory.krishna.com/temples?page={0}"
COLUMNS = ("Title", "City", "Phone", "Temple Name", "Country", "Website")


def extract_details(row):
    """Pull the six listing fields out of one 'views-row' div."""

    def field_text(suffix):
        # Plain-text fields live in <div class="views-field views-field-SUFFIX">
        # wrapping a <span class="field-content">; return "" if either is missing
        # instead of crashing on an incomplete listing.
        div = row.find("div", class_="views-field views-field-" + suffix)
        span = div.find("span", class_="field-content") if div else None
        if span and span.contents:
            return span.contents[0].strip()
        return ""

    # The title field wraps its text in a link rather than a field-content span.
    title = row.find("div", class_="views-field views-field-title")
    details = [title.a.get_text(strip=True) if title and title.a else ""]
    for suffix in ("city", "phone", "name", "country"):
        details.append(field_text(suffix))

    # The website field holds a link rather than plain text, and not every
    # temple has one.
    website = row.find("div", class_="views-field views-field-field-website-url")
    link = website.find("a") if website else None
    if link is not None and link.has_attr("href"):
        details.append(link["href"])
    else:
        details.append("No website URL found!")
    return details


def write_row(ws, row_num, values):
    # Write one spreadsheet row: one column per value.
    for col, value in enumerate(values):
        ws.write(row_num, col, value)


def main():
    wb = xlwt.Workbook()
    for pg_num in range(0, 4):
        ws = wb.add_sheet("page_{0}".format(pg_num + 1), cell_overwrite_ok=True)
        write_row(ws, 0, COLUMNS)

        req = requests.get(BASE_URL.format(pg_num))
        req.raise_for_status()
        soup = BeautifulSoup(req.content, "lxml")

        # Every temple listing is a div whose class list contains "views-row"
        # (alongside views-row-N and odd/even/first/last markers); matching on
        # the shared class replaces separate first/odd/even/last lookups and
        # copes with pages that hold fewer than 30 temples.
        for row_num, temple in enumerate(soup.find_all("div", class_="views-row"), start=1):
            write_row(ws, row_num, extract_details(temple))

    wb.save("learn.xls")


if __name__ == "__main__":
    main()
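
A minimal smoke test of extract_details against a hand-written snippet of the "views" markup the scraper expects. The sample HTML and its field values are placeholders modeled on the class names used above, not a capture of the live directory page, and the snippet assumes extract_details from the script is already defined in the session.

from bs4 import BeautifulSoup

SAMPLE = """
<div class="views-row views-row-1 views-row-odd views-row-first">
  <div class="views-field views-field-title"><a href="/temple/1">Example Temple</a></div>
  <div class="views-field views-field-city"><span class="field-content">Example City</span></div>
  <div class="views-field views-field-phone"><span class="field-content">+1 555 0100</span></div>
  <div class="views-field views-field-name"><span class="field-content">ISKCON Example</span></div>
  <div class="views-field views-field-country"><span class="field-content">Exampleland</span></div>
  <div class="views-field views-field-field-website-url">
    <span class="field-content"><a href="http://example.org">website</a></span>
  </div>
</div>
"""

row = BeautifulSoup(SAMPLE, "lxml").find("div", class_="views-row")
print(extract_details(row))
# -> ['Example Temple', 'Example City', '+1 555 0100', 'ISKCON Example', 'Exampleland', 'http://example.org']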