Advertisement
Typhoon

Scrape Z*** Faktury Loop 4

May 19th, 2015
307
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.86 KB | None | 0 0
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. import scraperwiki
  4. import sys
  5. from bs4 import BeautifulSoup
  6. import re
  7. import urllib2
  8. import time
  9. import string
  10.  
  # Discovery pass.
  #
  # Starting from the invoice id stored under the "lastnumber" variable (or 1
  # when nothing is stored yet), probe invoice pages one id at a time to find
  # out how many ids the scrape pass should cover.  A page counts as "bad"
  # when its price cell reads "0.00", when the page has no price cell at all,
  # or when it cannot be fetched; probing stops after 15 bad results in total.
  #
  # Results left for the rest of the script:
  #   lastinvoicenumber -- first id to scrape
  #   newmax_val        -- number of ids to scrape, counted from
  #                        lastinvoicenumber up to the highest good id found
lastinvoicenumber = scraperwiki.sqlite.get_var("lastnumber")
if not lastinvoicenumber:
    lastinvoicenumber = 1
# Later arithmetic (lastinvoicenumber + newmax_val) needs a real integer.
lastinvoicenumber = int(lastinvoicenumber)

checknumber = lastinvoicenumber
counter_bad = 0
counter_good = 0
highest_good = None  # highest id seen so far with a non-zero price

# NOTE(review): counter_bad is cumulative, not "consecutive bad pages" --
# it is never reset after a good page.  Kept as in the original; confirm
# that this is the intended stop condition.
while counter_bad < 15:
    # Built outside the try so the error message can always show it.
    checkurl = 'http://www.zvjs.sk/index.php?fa_obj&type=fa&id=' + str(checknumber)

    try:
        checkpage = urllib2.urlopen(checkurl)
        try:
            checksoup = BeautifulSoup(checkpage.read(), from_encoding="windows-1252")
        finally:
            checkpage.close()
    except Exception as exc:
        # Fetch/parse failure: count it as bad and try the same id again on
        # the next iteration.  `Exception` (not a bare except) lets Ctrl-C
        # and sys.exit() through.
        print("Error checking available sites :  %s (%r)" % (checkurl, exc))
        counter_bad = counter_bad + 1
        print("BAD :  %d" % counter_bad)
        continue

    cells = checksoup.find_all('td')
    if len(cells) > 9:
        # Cell 9 is the price; the last character is dropped (presumably a
        # currency sign -- confirm), decimal comma becomes a dot and spaces
        # used as thousands separators are removed.
        price_val = cells[9].text[:-1].replace(",", ".").replace(" ", "") or "null"
    else:
        # No price cell on this page.  Re-fetching the same id cannot help,
        # so treat it like a zero-price page and move on to the next id.
        price_val = "0.00"

    print(checknumber)
    print(checkurl)
    print(price_val)

    if price_val == "0.00":
        counter_bad = counter_bad + 1
        print("BAD : %d" % counter_bad)
    else:
        counter_good = counter_good + 1
        highest_good = checknumber
        print("GOOD : %d" % counter_good)

    checknumber = checknumber + 1
    #time.sleep(1)

# Span from the start id to the highest good id, inclusive.  Using the plain
# count of good pages would stop the scrape pass short whenever zero-price
# ids sit between good ones; without such gaps both values are equal.
if highest_good is None:
    newmax_val = 0
else:
    newmax_val = highest_good - lastinvoicenumber + 1

print("NEW MAX INVOICE NO :")
print(newmax_val)
print("####################")
  46.  
  # Scrape pass.
  #
  # Fetch every invoice id in [lastinvoicenumber, lastinvoicenumber + newmax_val),
  # pull the fields out of the page's <td> cells and store one row per invoice
  # in the ScraperWiki sqlite store, keyed on "invoice_id".  The id being
  # processed is also written to the "lastnumber" variable.
  # A failure on one invoice is reported and that id is skipped; successful
  # fetches are followed by a one-second pause.
number = lastinvoicenumber
maxval = lastinvoicenumber + newmax_val
while number < maxval:
    # Built outside the try so the error report can always name the page.
    url = 'http://www.zvjs.sk/index.php?fa_obj&type=fa&id=' + str(number)
    try:
        print(number)
        page = urllib2.urlopen(url)
        try:
            soup = BeautifulSoup(page.read(), from_encoding="windows-1252")
        finally:
            page.close()

        # Values sit in the odd-numbered cells (3, 5, ... 29); look the cell
        # list up once instead of re-scanning the document for every field.
        cells = soup.find_all('td')

        evidence_no_val = cells[3].text.strip() or "null"
        paired_with_val = cells[5].text.strip() or "null"
        invoice_desc_val = cells[7].text.replace('\"', '') or "null"
        # Drop the trailing character (presumably a currency sign -- confirm),
        # turn the decimal comma into a dot and remove spaces.
        invoice_price_val = cells[9].text[:-1].replace(",", ".").replace(" ", "") or "null"
        date_received_val = cells[11].text or "null"
        date_payment_val = cells[13].text or "null"
        pay_form_val = cells[15].text or "null"
        trade_name_val = cells[17].text or "null"
        company_location_val = cells[19].text or "null"
        trade_form_val = cells[21].text or "null"
        court_val = cells[23].text or "null"
        ico_val = cells[25].text or "null"
        dic_val = cells[27].text or "null"
        accout_no_val = cells[29].text or "null"

        # First link on the page is the attachment.  Default to "null" so a
        # page without any <a>, or with an <a> lacking href, neither crashes
        # nor reuses the previous invoice's attachment.
        attachment_url = "null"
        first_link = soup.find("a")
        if first_link is not None:
            href = first_link.get('href')
            # '/index.php?fa_obj' is treated as "no attachment".
            if href and href != '/index.php?fa_obj':
                attachment_url = "http://www.zvjs.sk" + href

        scraperwiki.sqlite.save_var('lastnumber', number)

        output = trade_name_val + " | " + invoice_price_val + " | " + date_received_val + " | " + date_payment_val
        print(output.encode("utf-8"))

        scraperwiki.sqlite.save(unique_keys=["invoice_id"],
                                data={"invoice_id": number,
                                      "invoice_price": invoice_price_val,
                                      "evidence_no": evidence_no_val,
                                      "paired_with": paired_with_val,
                                      "invoice_desc": invoice_desc_val,
                                      "date_received": date_received_val,
                                      "date_payment": date_payment_val,
                                      "pay_form": pay_form_val,
                                      "trade_name": trade_name_val,
                                      "trade_form": trade_form_val,
                                      "company_location": company_location_val,
                                      "court": court_val,
                                      "ico": ico_val,
                                      "dic": dic_val,
                                      "accout_no": accout_no_val,
                                      "invoice_attachment": attachment_url,
                                      "invoice_url": url})
        number = number + 1
        time.sleep(1)
    except Exception as exc:
        # Best effort: report the failure and move on to the next invoice.
        # `Exception` (not a bare except) lets Ctrl-C and sys.exit() through.
        print("Error Scraping Data")
        print("%s (%r)" % (url, exc))
        number = number + 1

sys.exit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement