Advertisement
Typhoon

Scrape Z*** Faktury Loop 2

Feb 4th, 2015
288
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.75 KB | None | 0 0
  1. {-
  2.    All code is under the following license unless otherwise noted:
  3.    This program is free software; you can redistribute it and/or modify
  4.    it under the terms of the GNU General Public License as published by
  5.    the Free Software Foundation; either version 2 of the License, or
  6.    (at your option) any later version.
  7.  
  8.    This program is distributed in the hope that it will be useful,
  9.    but WITHOUT ANY WARRANTY; without even the implied warranty of
  10.    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11.    GNU General Public License for more details.
  12.  
  13.    You should have received a copy of the GNU General Public License
  14.    along with this program; if not, write to the Free Software
  15.    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  16.  
  17. The GNU General Public License is available in the file COPYING in the source
  18. distribution.  Debian GNU/Linux users may find this in
  19. /usr/share/common-licenses/GPL-2.
  20.  
  21. If the GPL is unacceptable for your uses, please e-mail me; alternative
  22. terms can be negotiated for your project.
  23. -}
  24.  
  25. #!/usr/bin/python
  26. # -*- coding: utf-8 -*-
  27. import sys
  28. from bs4 import BeautifulSoup
  29. from bs4 import NavigableString
  30. import re
  31. import urllib2
  32. import json
  33. import time
  34. import string
  35.  
  # Scrape invoice detail pages from www.zvjs.sk and append, for every invoice
  # id, a two-line entry (action line + JSON document line, the layout used by
  # the Elasticsearch bulk API) to OUTPUT_PATH.
  #
  # The syntax below parses under both Python 2 and Python 3; main() itself
  # still relies on the Python 2 ``urllib2`` module imported at the top of the
  # file, exactly like the original script.

FIRST_ID = 115000
END_ID = 119420  # exclusive, matching the original ``while number<119420``
SITE_ROOT = "http://www.zvjs.sk"
DETAIL_URL = SITE_ROOT + "/index.php?fa_obj&type=fa&id="
OUTPUT_PATH = "/home/pi/scrapy/ZVJS_115000_119420.JSON"

# Action line written before every document, kept byte-for-byte.
ACTION_LINE = '{ "index" : { "_index" : "zvjs" , "_type" : "ZVJS"}}'

MISSING = "NULL"           # placeholder for an empty text field
MISSING_DATE = "1.1.2016"  # placeholder the script uses for an empty date
LAST_CELL = 29             # highest <td> index that is read

# (output key, <td> index, fallback) for the fields copied without cleaning.
PLAIN_FIELDS = (
    ("datum_dorucenia_faktury", 11, MISSING_DATE),
    ("datum_zaplatenia_faktury", 13, MISSING_DATE),
    ("forma_zaplatenia", 15, MISSING),
    ("obchodne_meno", 17, MISSING),
    ("sidlo_pravnickej_osoby", 19, MISSING),
    ("pravna_forma", 21, MISSING),
    ("sud_registracie", 23, MISSING),
    ("ico", 25, MISSING),
    ("dic", 27, MISSING),
    ("cislo_uctu", 29, MISSING),
)


def text_or_default(text, default=MISSING):
    """Return ``text`` unless it is empty, in which case return ``default``."""
    return text if text else default


def parse_amount(raw):
    """Normalise the invoiced-amount cell.

    Drops the last character (presumably the currency sign -- TODO confirm),
    turns the decimal comma into a dot and removes spaces.
    """
    return raw[:-1].replace(",", ".").replace(" ", "")


def build_fields(cells, number, url, pdf_href):
    """Map the text of a page's ``<td>`` cells to ordered (key, value) pairs.

    Args:
        cells: texts of all ``<td>`` elements on the page, in document order;
            must contain at least ``LAST_CELL + 1`` items.
        number: invoice id as a string (stored under ``web_id_fa``).
        url: address of the scraped detail page.
        pdf_href: ``href`` of the first link on the page, or ``None``/empty
            when the page has no link.

    Returns:
        A list of ``(key, value)`` tuples in the order they are written out.
    """
    fields = [
        ("web_id_fa", number),
        ("evidencne_cislo_faktury", text_or_default(cells[3].strip())),
        ("prinalezi_k", text_or_default(cells[5].strip())),
        # Double quotes are stripped as before, so new records stay consistent
        # with the ones already collected.
        ("popis_fakturovaneho_plnenia",
         text_or_default(cells[7].replace('"', ''))),
        ("hodnota_fakturovaneho_plnenia_s_dph", parse_amount(cells[9])),
    ]
    fields.extend(
        (key, text_or_default(cells[index], fallback))
        for key, index, fallback in PLAIN_FIELDS
    )
    fields.append(("url_stranky_faktury", url))
    # Previously a page without a link raised NameError on the first pass and
    # silently reused the previous invoice's link afterwards.
    fields.append(
        ("url_faktura_pdf", SITE_ROOT + pdf_href if pdf_href else MISSING)
    )
    return fields


def format_bulk_entry(fields):
    """Render ``fields`` as the action line plus one JSON document line.

    Keys and values go through ``json.dumps`` so quotes, backslashes and
    control characters are escaped and the document is always valid JSON.
    Non-ASCII characters are emitted as ``\\uXXXX`` escapes, which decode to
    the same text.
    """
    document = "{" + ",".join(
        json.dumps(key) + ":" + json.dumps(value) for key, value in fields
    ) + "}"
    return ACTION_LINE + "\n" + document + "\n"


def main():
    """Fetch every invoice page in [FIRST_ID, END_ID) and append its entry."""
    # One handle for the whole run, closed on exit; the original re-opened the
    # file for every invoice and never closed it.
    with open(OUTPUT_PATH, 'a') as outfile:
        for number in range(FIRST_ID, END_ID):
            print(number)
            url = DETAIL_URL + str(number)
            page = urllib2.urlopen(url)
            try:
                soup = BeautifulSoup(page.read(), from_encoding="windows-1252")
            finally:
                page.close()

            # Collect the cells once instead of re-running find_all per field.
            cells = [cell.text for cell in soup.find_all('td')]
            links = soup.find_all("a", limit=1)
            pdf_href = links[0].get('href') if links else None

            if len(cells) <= LAST_CELL:
                # A page without the full table used to abort the run with an
                # IndexError; report it and carry on with the next id.
                sys.stderr.write(
                    "skipping %d: only %d <td> cells\n" % (number, len(cells))
                )
            else:
                output = format_bulk_entry(
                    build_fields(cells, str(number), url, pdf_href)
                )
                outfile.write(output)
                outfile.flush()  # keep finished records on disk if a later fetch fails
                print(output)

            time.sleep(1)  # one request per second, as before


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement