# Untitled

import csv

class ThisDialect(csv.excel):
    """Excel-style CSV dialect whose line terminator is a bare carriage return."""

    lineterminator = '\r'


def _read_rows(path):
    """Return the rows of the CSV file at *path* as a list of dicts.

    The file is parsed with ``csv.DictReader`` using ``ThisDialect`` and is
    closed as soon as it has been read, instead of leaking the handle.
    """
    # 'U' opens the file with universal newlines.
    with open(path, 'U') as handle:
        rows = list(csv.DictReader(handle, dialect=ThisDialect()))
    # DictReader already consumes the header line as field names, so this
    # additionally drops the first data row.
    # NOTE(review): presumably the exports carry a second label/description
    # row under the header -- confirm against the input files.
    return rows[1:]


jobnames = _read_rows("job-name.csv")
paircounts = _read_rows("pair-count.csv")

# Lookup table: occupation code -> occupation title, built from the job-name
# rows (a later row with the same code overrides an earlier one).
code_to_name = dict((row['code'], row['occupation']) for row in jobnames)
# Debugging aid (disabled): print every occupation whose title contains
# "Nurse" together with its code, then stop the script with a deliberate 1/0.
#     for row in jobnames:
#         if "Nurse" in row['occupation']:
#             print row['occupation'], row['code']
#     1/0

# Accumulate the `total` column per (spouse sex, spouse occupation code),
# together with one grand total per spouse sex.
# Sex codes: '1' is male, '2' is female.
married_counts = {}
t1 = 0  # sum of `total` over counted rows whose spouse sex is '1'
t2 = 0  # sum of `total` over counted rows whose spouse sex is '2'
for p in paircounts:
    # Rows whose spouse occupation code is '0' or blank are ignored.
    if p['occ_sp'] in ('0', ''):
        continue
    # Only pairs whose two sexes differ are counted.
    if p['sex_sp'] == p['sex']:
        continue
    # Parse the count once and reuse it for every accumulator below.
    weight = float(p['total'])
    key = (p['sex_sp'], p['occ_sp'])
    married_counts[key] = married_counts.get(key, 0) + weight
    if p['sex_sp'] == '1':
        t1 += weight
    elif p['sex_sp'] == '2':
        t2 += weight

print("Most commonly-married female professions:")
# (count, occupation code) for every spouse entry with sex '2', largest first;
# ties on count fall back to comparing the code strings.
most_common_female = sorted(
    [(tally, occ)
     for (spouse_sex, occ), tally in married_counts.items()
     if spouse_sex == '2'],
    reverse=True)
for tally, occ in most_common_female[:10]:
    # Share of all counted female spouses (t2) with this occupation.
    share = 100.0 * int(tally) / t2
    print("%s: %.1f%%" % (code_to_name[occ], share))
print("")  # blank separator line

# Occupation code to analyse. Alternatives that can be swapped in:
#   investigate_code = '10'    # CEOs
#   investigate_code = '3255'  # Registered Nurses
#   investigate_code = '4220'  # Janitors
#   investigate_code = '5700'  # Secretaries
investigate_code = '1010'  # Programmers

# MARRYING_SEX is the sex of the person holding `investigate_code`;
# MARRIED_SEX is the sex of their spouse ('1' is male, '2' is female).
MARRIED_SEX = '2'
MARRYING_SEX = '1'
assert MARRYING_SEX != MARRIED_SEX, "sorry the script needs to be updated to look at same-sex marriages"

# Grand total for the spouse sex under study, used as the population baseline.
if MARRIED_SEX == '2':
    tx = t2
else:
    tx = t1

# Collect (count, spouse occupation code) for every pair in which a
# MARRYING_SEX person holding `investigate_code` has a MARRIED_SEX spouse
# with a usable occupation code, and sum those counts into `total`.
by_count = []
total = 0
for row in paircounts:
    wanted = (
        row['occ_sp'] not in ('0', '')
        and row['sex'] == MARRYING_SEX
        and row['sex_sp'] == MARRIED_SEX
        and row['occ'] == investigate_code
    )
    if wanted:
        pair_total = float(row['total'])
        by_count.append((pair_total, row['occ_sp']))
        total += pair_total
# Largest counts first.
by_count.sort(reverse=True)

# For each spouse occupation seen with `investigate_code`, compare its share
# among those spouses with its share among all MARRIED_SEX spouses.
seen = set()           # spouse occupation codes encountered
normalized = []        # (share ratio vs. population, count, code)
over_expected = []     # (excess over expected as a fraction of total, count, code)
for pair_total, occ in by_count:
    seen.add(occ)
    share = 100.0 * int(pair_total) / total
    baseline = married_counts[(MARRIED_SEX, occ)] / tx * 100
    # Expected count is total * baseline / 100; express the surplus relative
    # to `total`.
    excess = (pair_total - total * baseline / 100) / total
    ratio = share / baseline
    over_expected.append((excess, pair_total, occ))
    normalized.append((ratio, pair_total, occ))

print("Most common for %s to marry by count:" % code_to_name[investigate_code])
# Top five spouse occupations by raw count: name, count, share among these
# spouses, and share among all MARRIED_SEX spouses.
for pair_total, occ in by_count[:5]:
    share = 100.0 * int(pair_total) / total
    baseline = married_counts[(MARRIED_SEX, occ)] / tx * 100
    print("%s: %d, %.1f%%, %.1f%%" % (code_to_name[occ], int(pair_total), share, baseline))
print("")  # blank separator line

print("Most common for %s to marry, normalized vs the population:" % code_to_name[investigate_code])
# Top five spouse occupations by how over-represented they are relative to
# their share among all MARRIED_SEX spouses.
normalized.sort(reverse=True)
for mult, count, code in normalized[:5]:
    this_perc = 100.0 * int(count) / total
    # Divide by tx (the total for MARRIED_SEX), not a hard-coded t2, so the
    # printed baseline matches the one used to compute `mult` even when
    # MARRIED_SEX is changed.
    global_perc = married_counts[(MARRIED_SEX, code)] / tx * 100
    print("%s: %.1fx (%.1f%% vs %.1f%%; %d found)" % (code_to_name[code], mult, this_perc, global_perc, count))
print("")  # blank separator line

print("Most common for %s to marry, over expected:" % code_to_name[investigate_code])
# Top five spouse occupations by surplus over the count expected from the
# population share, expressed as a percentage of `total`.
over_expected.sort(reverse=True)
for over, count, code in over_expected[:5]:
    this_perc = 100.0 * int(count) / total
    # Divide by tx (the total for MARRIED_SEX), not a hard-coded t2, so the
    # printed baseline matches the one used to compute `over` even when
    # MARRIED_SEX is changed.
    global_perc = married_counts[(MARRIED_SEX, code)] / tx * 100
    print("%s: %.1f%% more than expected (%.1f%% vs %.1f%%)" % (code_to_name[code], 100 * over, this_perc, global_perc))
print("")  # blank separator line

# Disabled diagnostic, kept inside a bare string literal so it never executes:
# it would print every MARRIED_SEX spouse occupation in `married_counts` that
# is absent from `seen`, along with its share of the population.
# NOTE(review): it divides by t2 rather than tx -- confirm before re-enabling.
'''
for sex, code in married_counts:
    if sex != MARRIED_SEX:
        continue
    if code not in seen:
        print "didn't see:", code_to_name[code], married_counts[(MARRIED_SEX, code)] / t2 * 100
'''