Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import sys
- import bs4
- import csv
- from operator import methodcaller
- from collections import OrderedDict
- import lxml.html
- import re
- import sys
def escstr(s):
    r"""Escape *s* for use as one field of a ``;``-terminated CSV row.

    Surrounding whitespace is stripped, line feeds and carriage returns are
    turned into the two-character sequences ``\n`` and ``\r``, and double
    quotes and semicolons are prefixed with a backslash.

    Returns the escaped text encoded as UTF-8.
    """
    # Every replacement spells the backslash explicitly ("\\").  The previous
    # '\"' literal was just '"', so quotes were silently left unescaped.
    # NOTE(review): a literal backslash in the input is passed through
    # unchanged, exactly as before.
    return (
        s.strip()
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace('"', '\\"')
        .replace(";", "\\;")
        .encode("utf-8")
    )
def arr2csvR(arr):
    """Serialise the items of *arr* as one CSV row.

    Each item is passed through ``escstr`` and followed by a ``;`` -- the
    last field is terminated as well -- and the row ends with a newline.
    """
    cells = [escstr(value) + ";" for value in arr]
    return "".join(cells) + '\n'
def textof(elem):
    """Return whatever ``elem.text_content()`` yields.

    The elements handled by this script come from ``lxml.html``.
    """
    # An earlier variant (disabled in the original) first walked down to the
    # innermost first child; the text is now taken from *elem* itself.
    content = elem.text_content()
    return content
def tohdr(s):
    """Turn a label into a lowercase, underscore-separated header name.

    Every character outside ``[a-zA-Z0-9]`` acts as a separator: runs of
    separators collapse into a single ``_`` and leading/trailing ones are
    dropped.  The result is returned encoded as UTF-8.
    """
    spaced = re.sub(r'[^a-zA-Z0-9]', ' ', s.lower())
    joined = re.sub(r' +', '_', spaced.strip())
    return joined.encode("utf-8")
def _register_labels(cols, rows, skip_blank=False):
    """Give every not-yet-seen row label the next free column index.

    ``cols`` maps label text -> column index and keeps its running counter
    under the reserved key ``'next'``; it is updated in place.  The label of
    a row is the text content of the row's first child.  With ``skip_blank``
    set, rows whose label is empty or whitespace-only are ignored.
    """
    for row in rows:
        label = row[0].text_content()
        if skip_blank and not label.strip():
            continue
        if label not in cols:
            cols[label] = cols['next']
            cols['next'] += 1


# Column registries, one per output table: label -> column index, plus the
# reserved 'next' key holding the next index to hand out.
physicalcols = {'next': 0}
admissioncols = {'next': 0}
statuscols = {'next': 0}
sentencingcols = {'next': 0}

# First pass: discover which labelled columns each table needs.
# The [:30000] slice caps how many files are scanned (lower it to debug
# faster); a label that only appears in a file beyond the cap is unknown to
# the row-writing pass further down.
for fn in sorted(os.listdir("inmates"))[:30000]:
    if not fn.endswith(".html"):
        continue
    r = lxml.html.parse(os.path.join("inmates/", fn)).getroot()
    main = r[1][0][4][0][0][0]  # fixed path to the page's main container
    if len(main) == 1:
        continue  # this inmate doesn't exist

    # Position, among main's children, of each section's data table.
    indices = {'physical': None, 'admission': None, 'status': None, 'sentencing': None}
    for i, item in enumerate(main):
        text = item.text_content()
        if len(item) == 1:
            # A single-child item with a known title is a section header;
            # the section's table is the item right after it.
            if text == "PHYSICAL PROFILE":
                indices["physical"] = i + 1
            elif text == "ADMISSION / RELEASE / DISCHARGE INFO":
                indices["admission"] = i + 1
            elif text == "SENTENCING INFORMATION":
                indices["sentencing"] = i + 1
        # Deliberately a plain `if`, as in the original (which left open
        # whether it should be an `elif`): the status block is recognised
        # by how its text starts and is its own table.
        if text.startswith(("Parent Institution: ", "Alias: ")):
            indices['status'] = i

    if None in indices.values():
        print("Failed at " + fn)
        print(indices)
        # sys.exit rather than quit(): quit is injected by the `site` module
        # for interactive use and is not guaranteed to exist in a script.
        sys.exit(1)

    _register_labels(physicalcols, main[indices["physical"]])
    _register_labels(admissioncols, main[indices["admission"]])
    _register_labels(statuscols, main[indices["status"]], skip_blank=True)
    # NOTE(review): the original guarded sentencing registration with a
    # `first_sentencing` flag that was reset to True for every file, so (as
    # far as the indentation-stripped paste can be read) its assert branch
    # never ran; every non-blank label is simply registered.
    _register_labels(sentencingcols, main[indices['sentencing']], skip_blank=True)

print(physicalcols)
print(admissioncols)
print(statuscols)
print(sentencingcols)
def _header_fields(cols):
    """Return ``['id', <header>, ...]`` for *cols*, in column-index order.

    ``cols`` maps label -> column index; its ``'next'`` bookkeeping entry is
    not a column and is left out.  Labels are normalised with ``tohdr``.
    """
    labels = sorted((name for name in cols if name != 'next'), key=cols.get)
    return ['id'] + [tohdr(name) for name in labels]


# One output file per table.  The warrant / warning / captured outputs of
# the original are still disabled.
files = {
    'physical': open("csv/physical.csv", 'w'),
    'admission': open("csv/admission.csv", 'w'),
    'sentencing': open("csv/sentencing.csv", 'w'),
    'marks': open("csv/marks.csv", 'w'),
    'status': open("csv/status.csv", 'w'),
}

# Header rows: marks has a fixed layout, the others follow their registry.
files['marks'].write(arr2csvR(['id', 'mark']))
for _section, _cols in (
    ('status', statuscols),
    ('physical', physicalcols),
    ('admission', admissioncols),
    ('sentencing', sentencingcols),
):
    files[_section].write(arr2csvR(_header_fields(_cols)))
def _field_value(section, field):
    """Return the CSV cell text for one label/value row of a section table.

    ``field[0]`` holds the label and ``field[1]`` (when present) the value.
    In the physical section, weights lose their " lbs." suffix and heights
    are converted to total inches; "Not Available" becomes "N/A".  An
    unrecognised weight or height is reported and ends the program (exit
    status 4 or 5).
    """
    label = field[0].text_content()
    if label == 'Sex Offender Registry Required':
        return "true"  # flag-only row: there is no value cell to read
    if section == 'physical' and label == "Weight: ":
        txt = field[1].text_content()
        if txt[-5:] == " lbs.":
            return txt[0:-5]  # "111 lbs." -> "111"
        if txt == "Not Available":
            return "N/A"
        print("unknown weight " + txt)
        sys.exit(4)
    if section == 'physical' and label == "Height: ":
        txt = field[1].text_content()
        if re.match(". ft. .. in.", txt):
            # one character of feet, inches in characters 6-7
            return str(int(txt[0]) * 12 + int(txt[6:8]))
        if txt == "Not Available":
            return "N/A"
        print("unknown height " + txt)  # used to say "unknown weight"
        sys.exit(5)
    if not len(field) > 1:
        return ''
    return field[1].text_content()


# Second pass: write one row per inmate (and per sentence / mark) using the
# columns and open files prepared above.
for fn in sorted(os.listdir("inmates")):
    if not fn.endswith(".html"):
        continue
    r = lxml.html.parse(os.path.join("inmates/", fn)).getroot()
    main = r[1][0][4][0][0][0]  # fixed path to the page's main container
    if len(main) == 1:
        continue  # this inmate doesn't exist

    inmate_id = ''
    indices = {'physical': None, 'admission': None, 'sentencing': None, 'status': None}
    classified = {}  # positions in main that are recognised and accounted for

    for i, item in enumerate(main):
        text = item.text_content()
        if len(item) == 1:
            if text == "PHYSICAL PROFILE":  # header; its table follows
                indices['physical'] = i + 1
                classified[i] = True
                classified[i + 1] = True
            elif text == "ADMISSION / RELEASE / DISCHARGE INFO":
                indices['admission'] = i + 1
                classified[i] = True
                classified[i + 1] = True
            elif text == "SENTENCING INFORMATION":
                indices["sentencing"] = i + 1
                classified[i] = True
                classified[i + 1] = True
            elif re.match("[A-Za-z][0-9]{5} - ", text):
                inmate_id = text[0:6]
                classified[i] = True
            elif text.startswith("Warrant Information"):
                classified[i] = True
            elif text.startswith("Date of Birth: "):
                # Nothing to extract: the physical profile is handled on its
                # own.  Marked as classified so that this single-child row
                # no longer trips `assert len(item) != 1` in the loop below.
                classified[i] = True
            elif text.startswith("The information"):  # disclaimer
                classified[i] = True
            elif text.startswith("IDOC TOLL FREE"):  # warning
                classified[i] = True
            elif text == "Captured":
                classified[i] = True
            else:
                print("unexpected text: " + text)
                sys.exit(2)
        elif text.startswith(("Parent Institution: ", "Alias: ")):
            indices['status'] = i
            classified[i] = True

    assert None not in indices.values()
    # position in main -> section name
    section_at = dict((pos, name) for name, pos in indices.items())

    for i, item in enumerate(main):
        if i not in classified:
            if len(item):
                assert len(item) != 1
                text = item.text_content()
                if text.startswith(" MARKS, SCARS, & TATTOOS"):
                    for m in item[1:]:
                        files['marks'].write(arr2csvR([inmate_id, m.text_content()]))
                else:
                    print("unexpected text: " + text)
                    sys.exit(3)
        elif i in section_at:
            section = section_at[i]
            if section == 'sentencing':
                # Sentences come in groups of 9 children; the first 8 are
                # written and the 9th (the spacer) is skipped.
                assert len(item) % 9 == 0
                for rec in range(len(item) // 9):
                    fields = item[9 * rec:9 * rec + 8]
                    values = [f[1].text_content() for f in fields]
                    files['sentencing'].write(arr2csvR([inmate_id] + values))
            else:
                # NOTE(review): the paste lost its indentation; the row is
                # written here, for non-sentencing sections only -- confirm.
                cols = {'physical': physicalcols,
                        'admission': admissioncols,
                        'status': statuscols}[section]
                row = [inmate_id]
                for k in range(cols['next']):
                    found = False
                    for field in item:
                        label = field[0].text_content()
                        # A label absent from the registry raises KeyError
                        # here, as it did before.
                        if len(label.strip()) and cols[label] == k:
                            found = True
                            row.append(_field_value(section, field))
                    if not found:
                        row.append('')  # this page lacks column k
                files[section].write(arr2csvR(row))
        # Any other position was classified above and needs no output.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement