Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {-
- All code is under the following license unless otherwise noted:
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- The GNU General Public License is available in the file COPYING in the source
- distribution. Debian GNU/Linux users may find this in
- /usr/share/common-licenses/GPL-2.
- If the GPL is unacceptable for your uses, please e-mail me; alternative
- terms can be negotiated for your project.
- -}
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- import sys
- from bs4 import BeautifulSoup
- from bs4 import NavigableString
- import re
- import urllib2
- import json
- import time
- import string
# Scrape invoice detail pages from www.zvjs.sk and append each one to a file
# in Elasticsearch bulk format: one action line followed by one document line.

FIRST_ID = 115000
END_ID = 119420  # exclusive upper bound of the scraped id range
SITE_ROOT = 'http://www.zvjs.sk'
DETAIL_URL = SITE_ROOT + '/index.php?fa_obj&type=fa&id='
OUTPUT_PATH = '/home/pi/scrapy/ZVJS_115000_119420.JSON'
BULK_ACTION_LINE = '{ "index" : { "_index" : "zvjs" , "_type" : "ZVJS"}}'
MISSING = 'NULL'           # placeholder written for empty text fields
MISSING_DATE = '1.1.2016'  # placeholder written for empty date fields


def keep(text):
    """Return *text* unchanged (fields that get no cleanup)."""
    return text


def strip_text(text):
    """Return *text* without leading/trailing whitespace."""
    return text.strip()


def drop_quotes(text):
    """Return *text* with every double quote removed.

    Escaping is now handled by the serializer; the quotes are still dropped
    so new records stay comparable with records written by earlier runs.
    """
    return text.replace('"', '')


def normalize_amount(text):
    """Turn the displayed amount into a plain decimal string.

    Drops the last character (presumably a trailing currency symbol --
    verify against the page), swaps the decimal comma for a dot and removes
    the spaces used as thousands separators.
    """
    return text[:-1].replace(',', '.').replace(' ', '')


# (index into the page's <td> list, output key, cleanup, value used when the
# cleaned text is empty; None means "leave it empty").
FIELDS = (
    (3, 'evidencne_cislo_faktury', strip_text, MISSING),
    (5, 'prinalezi_k', strip_text, MISSING),
    (7, 'popis_fakturovaneho_plnenia', drop_quotes, MISSING),
    (9, 'hodnota_fakturovaneho_plnenia_s_dph', normalize_amount, None),
    (11, 'datum_dorucenia_faktury', keep, MISSING_DATE),
    (13, 'datum_zaplatenia_faktury', keep, MISSING_DATE),
    (15, 'forma_zaplatenia', keep, MISSING),
    (17, 'obchodne_meno', keep, MISSING),
    (19, 'sidlo_pravnickej_osoby', keep, MISSING),
    (21, 'pravna_forma', keep, MISSING),
    (23, 'sud_registracie', keep, MISSING),
    (25, 'ico', keep, MISSING),
    (27, 'dic', keep, MISSING),
    (29, 'cislo_uctu', keep, MISSING),
)
REQUIRED_CELLS = max(field[0] for field in FIELDS) + 1


def build_record(number, url, cell_texts, pdf_href):
    """Return the document for one invoice as an ordered list of (key, text).

    number     -- invoice id used in the detail URL
    url        -- address of the detail page
    cell_texts -- text of every <td> on the page, in document order
    pdf_href   -- href of the first link on the page, or None if absent

    Raises IndexError when cell_texts has fewer than REQUIRED_CELLS items.
    """
    record = [('web_id_fa', str(number))]
    for index, key, clean, fallback in FIELDS:
        value = clean(cell_texts[index])
        if not value and fallback is not None:
            value = fallback
        record.append((key, value))
    record.append(('url_stranky_faktury', url))
    # A page without a link gets the placeholder; it must not crash or
    # inherit the PDF address of the previously scraped invoice.
    record.append(('url_faktura_pdf',
                   SITE_ROOT + pdf_href if pdf_href else MISSING))
    return record


def format_bulk_entry(record):
    """Serialize (key, text) pairs as a bulk action line plus document line.

    Keys and values go through json.dumps so quotes, backslashes and control
    characters are escaped; the compact '"key":"value"' layout and the key
    order of the hand-built original are kept.
    """
    body = u','.join(
        u'%s:%s' % (json.dumps(key), json.dumps(value, ensure_ascii=False))
        for key, value in record)
    return BULK_ACTION_LINE + u'\n{' + body + u'}\n'


def main():
    """Fetch every invoice in [FIRST_ID, END_ID) and append it to OUTPUT_PATH."""
    # One handle for the whole run, closed deterministically on exit.
    with open(OUTPUT_PATH, 'ab') as outfile:
        for number in range(FIRST_ID, END_ID):
            print(number)
            url = DETAIL_URL + str(number)
            page = urllib2.urlopen(url)
            try:
                html = page.read()
            finally:
                page.close()
            # NOTE(review): the site is Slovak; confirm windows-1252 (rather
            # than windows-1250) really is the page encoding.
            soup = BeautifulSoup(html, from_encoding="windows-1252")
            cell_texts = [cell.text for cell in soup.find_all('td')]
            if len(cell_texts) < REQUIRED_CELLS:
                # An unexpected page layout should cost one record, not the run.
                print('skipping %s: expected at least %d <td> cells, found %d'
                      % (number, REQUIRED_CELLS, len(cell_texts)))
                time.sleep(1)
                continue
            link = soup.find('a')
            pdf_href = link.get('href') if link is not None else None
            record = build_record(number, url, cell_texts, pdf_href)
            output = format_bulk_entry(record).encode('utf-8')
            outfile.write(output)
            # Flush per record so an interrupted run keeps what it scraped.
            outfile.flush()
            print(output)
            # Be polite to the server: one request per second.
            time.sleep(1)


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement