Advertisement
Typhoon

Scrape Z*** Faktury Loop 3

Feb 11th, 2015
334
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.57 KB | None | 0 0
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. import scraperwiki
  4. import sys
  5. from bs4 import BeautifulSoup
  6. import re
  7. import urllib2
  8. import time
  9. import string
  10.  
  11. lastinvoicenumber = scraperwiki.sqlite.get_var("lastnumber")
  12. if not lastinvoicenumber:
  13.     lastinvoicenumber = 1
  14.  
  15.  
  16.  
  17. checknumber = lastinvoicenumber
  18. counter_bad = 0
  19. counter_good = 0
  20. while counter_bad<50:
  21.    
  22.     checknumber = str(checknumber)
  23.     checkurl= 'http://www.zvjs.sk/index.php?fa_obj&type=fa&id=' + checknumber
  24.     checkpage = urllib2.urlopen(checkurl)
  25.     checksoup = BeautifulSoup(checkpage.read(), from_encoding="windows-1252")
  26.     price_val = checksoup.find_all('td')[9].text[:-1].replace(",", ".").replace(" ", "") or "null"
  27.     print checknumber
  28.     print checkurl
  29.     print price_val
  30.     if price_val == "0.00":
  31.         counter_bad = counter_bad + 1
  32.         print "BAD : " , counter_bad
  33.     else:
  34.         counter_good = counter_good + 1
  35.         print "GOOD : " , counter_good
  36.     checknumber = int(checknumber) 
  37.     checknumber = checknumber + 1
  38.     #time.sleep(1)
  39. newmax_val = counter_good
  40. print "NEW MAX INVOICE NO :"
  41. print newmax_val
  42. print "####################"
  43.  
  44. number = lastinvoicenumber
  45. maxval = lastinvoicenumber + newmax_val
  46. while number<maxval:
  47.     print number
  48.         number = str(number)
  49.         url= 'http://www.zvjs.sk/index.php?fa_obj&type=fa&id=' + number
  50.         page = urllib2.urlopen(url)
  51.         soup = BeautifulSoup(page.read(), from_encoding="windows-1252")
  52.  
  53.         evidence_no_val = soup.find_all('td')[3].text.strip() or "null"
  54.         paired_with_val = soup.find_all('td')[5].text.strip() or "null"
  55.     invoice_desc_val = soup.find_all('td')[7].text.replace('\"', '' ) or "null"
  56.     invoice_price_val = soup.find_all('td')[9].text[:-1].replace(",", ".").replace(" ", "") or "null"
  57.         date_received_val = soup.find_all('td')[11].text or "null"
  58.         date_payment_val = soup.find_all('td')[13].text or "null"
  59.         pay_form_val = soup.find_all('td')[15].text or "null"
  60.         trade_name_val = soup.find_all('td')[17].text or "null"
  61.         company_location_val = soup.find_all('td')[19].text or "null"
  62.         trade_form_val = soup.find_all('td')[21].text or "null"
  63.         court_val = soup.find_all('td')[23].text or "null"
  64.         ico_val = soup.find_all('td')[25].text or "null"
  65.         dic_val = soup.find_all('td')[27].text or "null"
  66.         accout_no_val = soup.find_all('td')[29].text or "null"
  67.         for link in soup.find_all("a", limit=1):
  68.             attachment_url = (link.get('href'))
  69.         if ( attachment_url == '/index.php?fa_obj'):
  70.                          attachment_url="null"
  71.                 else:
  72.                          attachment_url = "http://www.zvjs.sk" + attachment_url
  73.         number = int(number)              
  74.         scraperwiki.sqlite.save_var('lastnumber', number)
  75.     output = trade_name_val + " | " + invoice_price_val + " | " + date_received_val + " | " + date_payment_val
  76.     print output.encode("utf-8")
  77.     scraperwiki.sqlite.save(unique_keys=["invoice_id"],
  78.                     data={  "invoice_id":number,
  79.                         "invoice_price":invoice_price_val,
  80.                         "evidence_no":evidence_no_val,
  81.                         "paired_with":paired_with_val,
  82.                         "invoice_desc":invoice_desc_val,
  83.                         "date_received":date_received_val,
  84.                         "date_payment":date_payment_val,
  85.                         "pay_form":pay_form_val,
  86.                         "trade_name":trade_name_val,
  87.                         "trade_form":trade_form_val,
  88.                         "company_location":company_location_val,
  89.                         "court":court_val,
  90.                         "ico":ico_val,
  91.                         "dic":dic_val,
  92.                         "accout_no":accout_no_val,
  93.                         "invoice_attachment":attachment_url,
  94.                         "invoice_url":url})
  95.         number = number +1
  96.         time.sleep(1)
  97.  
  98. sys.exit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement