Advertisement
jmacura

volby2wiki.py

Oct 19th, 2018
208
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.05 KB | None | 0 0
  1. # This Source Code Form is subject to the terms of the Mozilla Public
  2. # License, v. 2.0. If a copy of the MPL was not distributed with this
  3. # file, You can obtain one at http://mozilla.org/MPL/2.0/.
  4. # @author jmacura 2018
  5. from bs4 import BeautifulSoup
  6. import datetime
  7. from pprint import pprint
  8. #import mwparserfromhell as mwp
  9. import ssl
  10. import sys
  11. import time
  12. import urllib.request, urllib.parse, urllib.error
  13.  
  14. # Global variable
  15. nuts = None
  16. obec = None
  17. zkr = None
  18.  
  19. # Parse command line params
  20. usage = """Usage: volby2wiki.py -nuts kodNuts -obec kodObec [-zkr zkratkaObce]
  21. Options:
  22.  -nuts  Kód NUTS kraje
  23.  -obec  Kód obce
  24.  -zkr   Zkratka obce pro použití v názvu reference
  25. """
  26. if len(sys.argv) > 4 and len(sys.argv) < 8:
  27.     nuts = sys.argv[2]
  28.     obec = sys.argv[4]
  29.     if len(sys.argv) == 7:
  30.         zkr = sys.argv[6]
  31. else:
  32.     print(usage)
  33.     quit()
  34.  
# Ignore SSL certificate errors
# NOTE(security): with these two settings neither the host name nor the
# certificate of the server is verified for requests that use ``ctx``.
ctx = ssl.create_default_context()
# Keep this order: check_hostname has to be disabled before verify_mode
# can be set to CERT_NONE.
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Base URL of the results page; getPageContent() appends the URL-encoded
# query parameters to it. Hard-coded for now, may become configurable later.
#serviceurl = 'https://volby.cz/pls/kv2018/kv1111?xjazyk=CZ&xid=1&xdz=3&xnumnuts=3203&xobec=554791&xstat=0&xvyber=0'
serviceurl = 'https://volby.cz/pls/kv2018/kv1111?' #?xjazyk=CZ&xid=1&xdz=3&xnumnuts=4102&xobec=554961&xstat=0&xvyber=0
  43. def getPageContent():
  44.     params = {'xjazyk': 'CZ',
  45.        'xid': '1',
  46.        'xdz': '3',
  47.        'xnumnuts': nuts if nuts is not None else 3203,
  48.        'xobec': obec if obec is not None else 554791,
  49.        'xstat': 0,
  50.        'xvyber': 0}
  51.     url = serviceurl + urllib.parse.urlencode(params)
  52.     req = urllib.request.Request(url)
  53.     req.add_header('User-Agent', "volby2wiki/dev (jan.macura@wikimedia.cz)")
  54.     print('Retrieving', url)
  55.     try:
  56.         data = urllib.request.urlopen(req, context=ctx).read()
  57.     except (TimeoutError, urllib.error.URLError):
  58.         print("Request timed out")
  59.         return "timeout", url
  60.     print('Retrieved', len(data), 'characters')
  61.     return data, url
  62.  
  63. def parsePage(text):
  64.     content = soup.find(id="publikace").find_all('table')
  65.     data_row = content[0].find_all('tr')[2].find_all('td')
  66.     #(pocet clenu, volebni ucast, platne hlasy)
  67.     stats = (int(data_row[0].string), float(data_row[7].string.replace(",", '.')), int(str(data_row[9].string).replace("\xa0", '')))
  68.     pprint(stats)
  69.  
  70.     data_block = content[1].find_all('tr')
  71.     party_data = []
  72.     for i, row in enumerate(data_block):
  73.         if i > 1:
  74.             data_row = row.find_all('td')
  75.             #(nazev, hlasy absolutne, hlasy procenta, mandaty)
  76.             party_data.append( (
  77.                 data_row[1].string,
  78.                 int(data_row[2].string.replace("\xa0", '')),
  79.                 float(data_row[3].string.replace(",", '.')),
  80.                 int(data_row[7].string)
  81.             ) )
  82.     pprint(party_data)
  83.     return stats, party_data
  84.  
  85. print("Dialing the page...")
  86. html, fullurl = getPageContent()
  87. print("Parsing the page...")
  88. soup = BeautifulSoup(html, 'html.parser')
  89. stats, party_data = parsePage(soup)
  90. print()
  91. party_data.sort(reverse = True, key = lambda t: t[1])
  92. pprint(party_data)
  93.  
  94. filename = (zkr if zkr is not None else "plzen") + ".txt"
  95. with open(filename, 'w', encoding="utf-8") as fh:
  96.     now = str(datetime.datetime.today())
  97.     fh.write(
  98.     """Volební účast v [[Plzeň|Plzni]] činila {0} %.<ref name="{1}18">{{{{Citace elektronické monografie
  99. | titul = Volby do zastupitelstev obcí 05.10. - 06.10.2018: Zastupitelstvo statutárního města: Plzeň
  100. | url = {2}
  101. | vydavatel = Český statistický úřad
  102. | datum přístupu = {3}
  103. }}}}</ref>
  104. \n""".format(str(stats[1]).replace(".", ','), zkr if zkr is not None else "plzen", fullurl, now.split()[0])
  105.     )
  106.     fh.write(
  107. """{{| class="wikitable sortable" style="text-align: right;"
  108. |-
  109. ! rowspan="2" width="250" | Volební strana
  110. ! colspan="2" style="padding: 0 50px;" | Hlasy
  111. ! colspan="3" style="padding: 0 50px;" | Mandáty
  112. |-
  113. ! počet
  114. ! v %
  115. ! data-sort-type="number" | 2018<ref name="{0}18" />
  116. ! data-sort-type="number" | [[Volby do zastupitelstev obcí v Česku 2014|2014]]
  117. ! data-sort-type="number" | bilance
  118. """.format(zkr if zkr is not None else "plzen"))
  119.     no_rest = 0
  120.     rest = 0
  121.     rest_p = 0.00
  122.     for party in party_data:
  123.         if party[2] < 3.00:
  124.             no_rest += 1
  125.             rest += party[1]
  126.             rest_p += party[2]
  127.         else:
  128.             fh.write(
  129. """|-
  130. | align="left" | [[{0}]]
  131. | {{{{formatnum:{1}}}}}
  132. | {2}
  133. | {3}
  134. |
  135. |
  136. """.format(party[0], party[1], str(party[2]).replace(".", ','), party[3])
  137.             )
  138.     fh.write(
  139.     """|- class="sortbottom"
  140. | align="left" | Ostatní celkem ({0} subjektů)
  141. | {{{{formatnum:{1}}}}}
  142. | {2}
  143. | –
  144. | –
  145. | –
  146. """.format(no_rest, rest, str(rest_p).replace(".", ','))
  147.     )
  148.     fh.write(
  149.     """|- class="sortbottom"
  150. ! Celkem
  151. ! {{{{formatnum:{0}}}}}
  152. ! 100
  153. ! {1}
  154. !
  155. ! –
  156. |}}
  157. """.format(stats[2], stats[0])
  158.     )
  159. print("File \"{}\" succesfully created".format(filename))
  160.  
  161. # fails = 0
  162. # timeouts = 0
  163. # t = time.time()
  164. # if mod < 2:
  165. #     i = 0
  166. #         for row in reader(fh):
  167. #             pageid = row[1] #56336 #29498 #4936 #67350
  168. #             if pageid == "pageid":
  169. #                 continue
  170. #             i += 1
  171. #             #if i > 1:
  172. #                 #break # at least for testing
  173. #             k = row[0][4:] #"amenity"
  174. #             print("Item No. {}: {}".format(i, k))
  175. #             if not rdflib.Literal(k) in g.objects(None, OSM.key):
  176. #                 print("key {} is not in graph!".format(k))
  177. #                 continue
  178. #             print("Processing {}...".format(k))
  179. #             key = rdflib.URIRef(OSMK + k.replace(':', "--"))
  180. #             wikitext = getWikiInfo(pageid)
  181. #             if not wikitext:
  182. #                 fails += 1
  183. #                 print("No wikitext to parse")
  184. #                 continue
  185. #             if wikitext == "timeout":
  186. #                 timeouts += 1
  187. #                 continue
  188. #             req, group, wikidata, status, cartopic = parseWikitext(wikitext)
  189. #             to_graph(key, req, group, wikidata, status, cartopic)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement