Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Scrape invoice pages from www.zvjs.sk into the ScraperWiki datastore.

The script runs in two passes:

1. Probe: starting at the invoice id stored in the ``lastnumber`` datastore
   variable (1 when unset), request consecutive ids until MAX_BAD_PAGES pages
   report a price of "0.00", remembering the highest id whose price differs.
2. Scrape: fetch every id from the starting id up to that highest id, save
   one row per invoice (keyed on ``invoice_id``) and update ``lastnumber``.

Targets Python 2 (``urllib2``).  Every ``print(...)`` call passes exactly one
argument so its output is the same as the Python 2 print statement's.
"""
import re
import string
import sys
import time
import urllib2

import scraperwiki
from bs4 import BeautifulSoup

SITE_ROOT = "http://www.zvjs.sk"
INVOICE_URL = 'http://www.zvjs.sk/index.php?fa_obj&type=fa&id='
# href of the first link on a page that carries no attachment.
NO_ATTACHMENT_HREF = '/index.php?fa_obj'
# Price text that marks a page as "bad" during the probe pass.
EMPTY_PRICE = "0.00"
# Number of "bad" pages after which the probe pass stops.
MAX_BAD_PAGES = 50


def _fetch_invoice_page(invoice_url):
    """Download *invoice_url* and return ``(soup, td_cells)``.

    The HTTP response is always closed, even when reading fails.
    """
    response = urllib2.urlopen(invoice_url)
    try:
        markup = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(markup, from_encoding="windows-1252")
    # Look the cells up once; every field below indexes into this list.
    return soup, soup.find_all('td')


def _cell_text(cells, index):
    """Return the raw text of ``cells[index]``, or "null" when it is empty."""
    return cells[index].text or "null"


def _price(cells):
    """Return the price cell with its last character dropped, "," turned
    into "." and spaces removed; "null" when nothing is left."""
    return cells[9].text[:-1].replace(",", ".").replace(" ", "") or "null"


def _attachment_url(soup):
    """Return the absolute URL of the first link on the page, or "null".

    "null" is returned when the page has no link, the link has no href, or
    the href is the placeholder NO_ATTACHMENT_HREF.
    """
    link = soup.find("a")
    href = link.get('href') if link is not None else None
    if not href or href == NO_ATTACHMENT_HREF:
        return "null"
    return SITE_ROOT + href


def _probe_highest_invoice(first_id):
    """Return the highest id >= *first_id* whose price is not "0.00".

    Returns ``first_id - 1`` when no such page is seen before the probe
    stops, which makes the scrape range empty.
    """
    counter_bad = 0
    counter_good = 0
    highest_good = first_id - 1
    checknumber = first_id
    # NOTE(review): bad pages are counted cumulatively, not consecutively,
    # exactly as before -- confirm whether "50 in a row" was intended.
    while counter_bad < MAX_BAD_PAGES:
        checkurl = INVOICE_URL + str(checknumber)
        _, cells = _fetch_invoice_page(checkurl)
        price_val = _price(cells)
        print(checknumber)
        print(checkurl)
        print(price_val)
        if price_val == EMPTY_PRICE:
            counter_bad += 1
            print("BAD :  %d" % counter_bad)
        else:
            counter_good += 1
            # Track the id itself: a count of good pages under-estimates
            # the range as soon as a bad page sits between two good ones.
            highest_good = checknumber
            print("GOOD :  %d" % counter_good)
        checknumber += 1
    return highest_good


def _scrape_invoice(number):
    """Fetch invoice *number*, record progress and save it to the datastore."""
    url = INVOICE_URL + str(number)
    soup, cells = _fetch_invoice_page(url)
    # Values live in the odd-numbered cells 3..29.
    # NOTE(review): assumes every page has at least 30 <td> cells.
    record = {
        "invoice_id": number,
        "invoice_price": _price(cells),
        "evidence_no": cells[3].text.strip() or "null",
        "paired_with": cells[5].text.strip() or "null",
        "invoice_desc": cells[7].text.replace('\"', '') or "null",
        "date_received": _cell_text(cells, 11),
        "date_payment": _cell_text(cells, 13),
        "pay_form": _cell_text(cells, 15),
        "trade_name": _cell_text(cells, 17),
        "trade_form": _cell_text(cells, 21),
        "company_location": _cell_text(cells, 19),
        "court": _cell_text(cells, 23),
        "ico": _cell_text(cells, 25),
        "dic": _cell_text(cells, 27),
        "accout_no": _cell_text(cells, 29),
        "invoice_attachment": _attachment_url(soup),
        "invoice_url": url,
    }

    # Progress is stored before the row is written; the next run starts at
    # this id again and the unique key turns the repeat into an overwrite.
    scraperwiki.sqlite.save_var('lastnumber', number)
    output = " | ".join([
        record["trade_name"],
        record["invoice_price"],
        record["date_received"],
        record["date_payment"],
    ])
    print(output.encode("utf-8"))
    scraperwiki.sqlite.save(unique_keys=["invoice_id"], data=record)


lastinvoicenumber = scraperwiki.sqlite.get_var("lastnumber")
if not lastinvoicenumber:
    lastinvoicenumber = 1

# Pass 1: find how far the published invoices currently reach.
newmax_val = _probe_highest_invoice(lastinvoicenumber)
print("NEW MAX INVOICE NO :")
print(newmax_val)
print("####################")

# Pass 2: scrape everything from the resume point up to that id.
number = lastinvoicenumber
maxval = newmax_val + 1
while number < maxval:
    print(number)
    _scrape_invoice(number)
    number = number + 1
    # Throttle the scrape pass to one request per second.
    time.sleep(1)

sys.exit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement