# Untitled

import os
import sys
import bs4
import csv
from operator import methodcaller
from collections import OrderedDict
import lxml.html
import re
import sys

def escstr(s):
    r"""Escape one field for the semicolon-delimited output and UTF-8 encode it.

    Surrounding whitespace is stripped first.  Then newline and carriage
    return become the two-character sequences ``\n`` and ``\r``, and each
    double quote and semicolon is prefixed with a backslash.

    The original spelled the quote replacement as ``'\"'``, which is just
    ``"`` -- a no-op -- so quotes were never escaped; that is fixed here.
    Backslashes already present in the input are NOT escaped.

    :param s: text of a single field.
    :returns: the escaped text encoded as UTF-8.
    """
    replacements = (
        ("\n", "\\n"),
        ("\r", "\\r"),
        ('"', '\\"'),
        (";", "\\;"),
    )
    escaped = s.strip()
    for raw, replacement in replacements:
        escaped = escaped.replace(raw, replacement)
    return escaped.encode("utf-8")

def arr2csvR(arr):
    """Render `arr` as one output row.

    Every value is passed through ``escstr`` and followed by a ``;``
    (including the last one, so rows are always terminated), and the row
    ends with a newline.

    :param arr: iterable of field texts.
    :returns: the finished row, newline included.
    """
    cells = [escstr(value) + ";" for value in arr]
    return "".join(cells) + '\n'

def textof(elem):
    """Return the text of `elem` as reported by its ``text_content()`` method.

    An earlier variant first descended to the innermost first child before
    reading the text; that descent is disabled, so the element is used as is.
    """
    content = elem.text_content()
    return content

def tohdr(s):
    """Turn a raw label into a header name and UTF-8 encode it.

    The label is lower-cased, every character outside ``a-z``, ``A-Z`` and
    ``0-9`` becomes a space, the ends are stripped, and each remaining run
    of spaces collapses into a single underscore.

    :param s: label text.
    :returns: the normalised header name encoded as UTF-8.
    """
    spaced = re.sub(r'[^a-zA-Z0-9]', ' ', s.lower())
    joined = re.sub(r' +', '_', spaced.strip())
    return joined.encode("utf-8")

# Column registries, one per output table.  Each maps a raw label (the text
# of a row's first child) to a 0-based column index in order of first
# appearance; the reserved key 'next' holds the next free index.
physicalcols = {'next': 0}
admissioncols = {'next': 0}
statuscols = {'next': 0}
sentencingcols = {'next': 0}


def _register_column(cols, label):
    """Give `label` the next free index in `cols` unless it already has one."""
    if label not in cols:
        cols[label] = cols['next']
        cols['next'] += 1


# Pass 1: scan the inmate pages and collect every column label per table.
#
# The [:30000] slice is a debugging limit (lower it to go faster).  If it is
# too low, labels that only occur in later files are never registered and the
# second pass will fail when it looks them up.
#
# The flag must live OUTSIDE the loop: it used to be reset to True on every
# iteration, which made the consistency assert below unreachable.
first_sentencing = True
for fn in sorted(os.listdir("inmates"))[:30000]:
    if not fn.endswith(".html"):
        continue
    r = lxml.html.parse(os.path.join("inmates/", fn)).getroot()
    main = r[1][0][4][0][0][0]
    if len(main) == 1:
        continue  # this inmate doesn't exist

    # Position inside `main` of the element holding each section's rows.
    indices = {'physical': None, 'admission': None, 'status': None, 'sentencing': None}
    for i, item in enumerate(main):
        text = item.text_content()
        if len(item) == 1:
            # A single-child item is a section title; the rows follow it.
            if text == "PHYSICAL PROFILE":
                indices["physical"] = i + 1
            elif text == "ADMISSION / RELEASE / DISCHARGE INFO":
                indices["admission"] = i + 1
            elif text == "SENTENCING INFORMATION":
                indices["sentencing"] = i + 1
        # NOTE(review): deliberately a plain `if`, as in the original; the
        # second pass only checks this for items that are not single-child.
        if text.startswith("Parent Institution: ") or text.startswith("Alias: "):
            indices['status'] = i

    if None in indices.values():
        print("Failed at " + fn)
        print(indices)
        sys.exit(1)

    for row in main[indices["physical"]]:
        _register_column(physicalcols, row[0].text_content())
    for row in main[indices["admission"]]:
        _register_column(admissioncols, row[0].text_content())
    for row in main[indices["status"]]:
        label = row[0].text_content()
        if len(label.strip()):  # blank labels are not status columns
            _register_column(statuscols, label)
    for row in main[indices['sentencing']]:
        label = row[0].text_content()
        if not len(label.strip()):
            continue
        if first_sentencing:
            _register_column(sentencingcols, label)
        else:
            # Sentencing rows are written positionally later on, so every
            # file has to use the labels established by the first one.
            assert label in sentencingcols, \
                "unexpected sentencing column %r in %s" % (label, fn)
    first_sentencing = False

print(physicalcols)
print(admissioncols)
print(statuscols)
print(sentencingcols)

def _header_row(cols):
    """Build the header row for one table: 'id' plus the normalised labels.

    `cols` maps label -> column index and keeps its running counter under
    the key 'next'; labels are emitted in index order.
    """
    label_at = {index: label for label, index in cols.items() if label != 'next'}
    return ['id'] + [tohdr(label_at[index]) for index in range(cols['next'])]


# One output file per table, keyed by table name.  No files are opened for
# warrant, warning or captured data.
files = {}
files['physical'] = open("csv/physical.csv", 'w')
files['admission'] = open("csv/admission.csv", 'w')
files['sentencing'] = open("csv/sentencing.csv", 'w')
files['marks'] = open("csv/marks.csv", 'w')
files['status'] = open("csv/status.csv", 'w')

# Header rows: marks has a fixed layout, the others come from the registries.
files['marks'].write(arr2csvR(['id', 'mark']))
for table, cols in (('status', statuscols),
                    ('physical', physicalcols),
                    ('admission', admissioncols),
                    ('sentencing', sentencingcols)):
    files[table].write(arr2csvR(_header_row(cols)))

# Text of each single-child section title -> key in `indices`.
_SECTION_TITLES = {
    "PHYSICAL PROFILE": 'physical',
    "ADMISSION / RELEASE / DISCHARGE INFO": 'admission',
    "SENTENCING INFORMATION": 'sentencing',
}

# Single-child items that are recognised but produce no output: the warrant
# block, the disclaimer and the toll-free warning.
_IGNORED_PREFIXES = ("Warrant Information", "The information", "IDOC TOLL FREE")

# Column registry used to lay out each label/value section.
_SECTION_COLUMNS = {
    'physical': physicalcols,
    'admission': admissioncols,
    'status': statuscols,
}


def _field_value(section, row):
    """Return the cell text for one label/value row of a section.

    :param section: 'physical', 'admission' or 'status'.
    :param row: element whose first child is the label and whose second
        child, when present, is the value.
    :returns: the text to store in the row's column.

    Exits the process with status 4 (weight) or 5 (height) when a physical
    measurement is in an unrecognised format.
    """
    label = row[0].text_content()
    if label == 'Sex Offender Registry Required':
        # This row has no value child; its presence alone is the flag.
        return "true"
    if section == 'physical' and label == "Weight:  ":
        txt = row[1].text_content()
        if txt[-5:] == " lbs.":
            return txt[0:-5]  # "111 lbs." -> "111"
        if txt == "Not Available":
            return "N/A"
        print("unknown weight " + txt)
        sys.exit(4)
    if section == 'physical' and label == "Height: ":
        txt = row[1].text_content()
        if re.match(". ft. .. in.", txt):
            # First character is the feet, characters 6-7 the inches;
            # store the total in inches.
            return str(int(txt[0]) * 12 + int(txt[6:8]))
        if txt == "Not Available":
            return "N/A"
        print("unknown height " + txt)  # used to say "weight" here
        sys.exit(5)
    if not len(row) > 1:
        return ''
    return row[1].text_content()


# Pass 2: write one row per inmate (or per mark / sentence) to the tables.
# The try/finally makes sure the output files are flushed and closed even
# when the run stops early.
try:
    for fn in sorted(os.listdir("inmates")):
        if not fn.endswith(".html"):
            continue
        r = lxml.html.parse(os.path.join("inmates/", fn)).getroot()
        main = r[1][0][4][0][0][0]
        if len(main) == 1:
            continue  # this inmate doesn't exist

        inmate_id = ''
        # Position inside `main` of the element holding each section's rows.
        indices = {'physical': None, 'admission': None, 'sentencing': None, 'status': None}
        # Positions in `main` that have been recognised.
        classified = set()

        # Step 1: recognise every child of `main`.
        for i, item in enumerate(main):
            text = item.text_content()
            if len(item) == 1:
                if text in _SECTION_TITLES:
                    # The rows live in the element right after the title.
                    indices[_SECTION_TITLES[text]] = i + 1
                    classified.update((i, i + 1))
                elif re.match("[A-Za-z][0-9]{5} - ", text):
                    inmate_id = text[0:6]
                    classified.add(i)
                elif text.startswith(_IGNORED_PREFIXES) or text == "Captured":
                    classified.add(i)
                elif text.startswith("Date of Birth: "):
                    # Do nothing, we already have the physical profile.
                    # NOTE(review): presumably this is the physical section
                    # element itself when it has a single row -- confirm.
                    pass
                else:
                    print("unexpected text: " + text)
                    sys.exit(2)
            elif text.startswith("Parent Institution: ") or text.startswith("Alias: "):
                indices['status'] = i
                classified.add(i)

        assert None not in indices.values()

        # Step 2: emit the rows.
        for i, item in enumerate(main):
            if i not in classified:
                if not len(item):
                    continue  # empty element, nothing to export
                assert len(item) != 1
                text = item.text_content()
                if not text.startswith(" MARKS, SCARS, & TATTOOS"):
                    print("unexpected text: " + text)
                    sys.exit(3)
                # The first child is skipped; every other child is one mark.
                for mark in item[1:]:
                    files['marks'].write(arr2csvR([inmate_id, mark.text_content()]))
                continue

            if i not in indices.values():
                continue  # recognised, but not a section element

            section = next(name for name, index in indices.items() if index == i)
            if section == 'sentencing':
                # Children come in groups of 9; the first 8 of each group are
                # written (the original comment calls the remainder a spacer).
                assert len(item) % 9 == 0
                for group in range(len(item) // 9):
                    cells = item[9 * group:9 * group + 8]
                    files['sentencing'].write(
                        arr2csvR([inmate_id] + [cell[1].text_content() for cell in cells]))
                continue

            cols = _SECTION_COLUMNS[section]
            arr = [inmate_id]
            # Fill the columns in registry order; blank-labelled rows are
            # skipped and columns this inmate lacks stay empty.
            for k in range(cols['next']):
                found = False
                for row in item:
                    label = row[0].text_content()
                    if len(label.strip()) and cols[label] == k:
                        found = True
                        arr.append(_field_value(section, row))
                if not found:
                    arr.append('')
            files[section].write(arr2csvR(arr))
finally:
    for handle in files.values():
        handle.close()