Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Report which occupations the members of one occupation tend to marry.

Reads two CSV files from the current directory:

* ``job-name.csv``   -- uses the columns ``code`` and ``occupation``.
* ``pair-count.csv`` -- uses the columns ``sex``, ``occ``, ``sex_sp``,
  ``occ_sp`` and ``total`` (``total`` is parsed as a float and summed).

Sexes: 1 is male, 2 is female.
"""
import csv
import sys
from collections import namedtuple


class ThisDialect(csv.excel):
    """Excel CSV dialect whose line terminator is a bare carriage return."""
    lineterminator = '\r'


JOB_NAME_PATH = "job-name.csv"
PAIR_COUNT_PATH = "pair-count.csv"

# Sexes: 1 is male, 2 is female
MALE = '1'
FEMALE = '2'

# investigate_code = '10'  # CEOs
investigate_code = '1010'  # Programmers
# investigate_code = '3255'  # Registered Nurses
# investigate_code = '4220'  # Janitors
# investigate_code = '5700'  # Secretaries
MARRYING_SEX = '1'
MARRIED_SEX = '2'

# Universal-newline mode has to be requested explicitly on Python 2; on
# Python 3 it is the default for text files and the 'U' flag was removed
# from open() in 3.11.
_READ_MODE = 'U' if sys.version_info[0] < 3 else 'r'

# One spouse-occupation entry for the investigated occupation.
#   count        -- the row's ``total`` as a float
#   code         -- spouse occupation code (``occ_sp``)
#   share        -- percent of the investigated occupation's marriages
#   global_share -- percent of all spouses of that sex with this occupation
#   ratio        -- share / global_share
#   excess       -- (count - expected count) / total for the occupation
Pairing = namedtuple('Pairing', 'count code share global_share ratio excess')


def load_rows(path):
    """Return the rows of the CSV file at *path* as a list of dicts.

    The file is closed before returning.
    """
    with open(path, _READ_MODE) as handle:
        rows = list(csv.DictReader(handle, dialect=ThisDialect()))
    # DictReader already consumes the header line; the original script
    # additionally drops the first parsed row.  Presumably the exports carry
    # a second label row -- TODO confirm against the data files.
    return rows[1:]


def build_code_to_name(job_rows):
    """Map each occupation ``code`` to its ``occupation`` name."""
    return {row['code']: row['occupation'] for row in job_rows}


def tally_spouse_occupations(pair_rows):
    """Sum ``total`` per (spouse sex, spouse occupation) and per spouse sex.

    Rows whose ``occ_sp`` is ``'0'`` or empty, and rows where both partners
    have the same sex, are ignored.

    Returns:
        ``(married_counts, totals)`` where ``married_counts`` maps
        ``(sex_sp, occ_sp)`` to the summed total and ``totals`` maps
        ``sex_sp`` to the sum over all counted rows for that sex.
    """
    married_counts = {}
    totals = {}
    for row in pair_rows:
        spouse_occ = row['occ_sp']
        if spouse_occ in ('0', ''):
            continue
        spouse_sex = row['sex_sp']
        if spouse_sex == row['sex']:
            continue
        weight = float(row['total'])
        key = (spouse_sex, spouse_occ)
        married_counts[key] = married_counts.get(key, 0) + weight
        totals[spouse_sex] = totals.get(spouse_sex, 0) + weight
    return married_counts, totals


def most_married_occupations(married_counts, totals, sex, limit=10):
    """Return the *limit* largest ``(count, code, percent)`` for spouses of *sex*.

    ``percent`` is the occupation's share of all counted spouses of that sex
    (0.0 when that total is zero).
    """
    sex_total = totals.get(sex, 0)
    ranked = sorted(
        ((count, code)
         for (row_sex, code), count in married_counts.items()
         if row_sex == sex),
        reverse=True)
    # int(count) keeps the original report's truncation of fractional totals.
    return [(count, code, 100.0 * int(count) / sex_total if sex_total else 0.0)
            for count, code in ranked[:limit]]


def partner_breakdown(pair_rows, married_counts, totals, occupation,
                      marrying_sex, married_sex):
    """Describe whom people of *marrying_sex* in *occupation* marry.

    Args:
        pair_rows: rows of the pair-count file.
        married_counts, totals: result of ``tally_spouse_occupations``.
        occupation: occupation code (``occ``) to investigate.
        marrying_sex: value of ``sex`` to select.
        married_sex: value of ``sex_sp`` to select.

    Returns:
        A list of ``Pairing`` tuples, one per matching row, sorted by
        ``(count, code)`` descending.

    Raises:
        ValueError: if both sexes are equal; same-sex rows are excluded from
            ``married_counts``, so no population share exists for them.
    """
    if marrying_sex == married_sex:
        raise ValueError("sorry the script needs to be updated to look at "
                         "same-sex marriages")
    # Population total for the sex being married (the original mixed ``tx``
    # and a hard-coded female total here).
    sex_total = totals.get(married_sex, 0)

    by_count = []
    total = 0
    for row in pair_rows:
        if row['occ_sp'] in ('0', ''):
            continue
        if row['sex'] != marrying_sex or row['sex_sp'] != married_sex:
            continue
        if row['occ'] == occupation:
            weight = float(row['total'])
            by_count.append((weight, row['occ_sp']))
            total += weight
    by_count.sort(reverse=True)

    pairings = []
    for count, code in by_count:
        # int(count) keeps the original report's truncation.
        share = 100.0 * int(count) / total if total else 0.0
        if sex_total:
            global_share = married_counts[(married_sex, code)] / sex_total * 100
        else:
            global_share = 0.0
        ratio = share / global_share if global_share else 0.0
        if total:
            excess = (count - total * global_share / 100) / total
        else:
            excess = 0.0
        pairings.append(Pairing(count, code, share, global_share, ratio, excess))
    return pairings


def unseen_occupations(married_counts, totals, pairings,
                       married_sex=MARRIED_SEX):
    """Return ``(code, global_percent)`` for spouse occupations not in *pairings*.

    Optional diagnostic; the report itself does not call it.
    """
    sex_total = totals.get(married_sex, 0)
    seen = set(pairing.code for pairing in pairings)
    return [(code, count / sex_total * 100 if sex_total else 0.0)
            for (sex, code), count in married_counts.items()
            if sex == married_sex and code not in seen]


def report_lines(job_rows, pair_rows, occupation=investigate_code,
                 marrying_sex=MARRYING_SEX, married_sex=MARRIED_SEX):
    """Yield the report, one output line at a time.

    Raises:
        KeyError: if an occupation code has no entry in *job_rows*.
        ValueError: if *marrying_sex* equals *married_sex*.
    """
    code_to_name = build_code_to_name(job_rows)
    married_counts, totals = tally_spouse_occupations(pair_rows)

    yield "Most commonly-married female professions:"
    for _, code, share in most_married_occupations(married_counts, totals,
                                                   FEMALE):
        yield "%s: %.1f%%" % (code_to_name[code], share)
    yield ""

    pairings = partner_breakdown(pair_rows, married_counts, totals,
                                 occupation, marrying_sex, married_sex)
    name = code_to_name[occupation]

    yield "Most common for %s to marry by count:" % name
    for p in pairings[:5]:
        yield "%s: %d, %.1f%%, %.1f%%" % (
            code_to_name[p.code], int(p.count), p.share, p.global_share)
    yield ""

    yield "Most common for %s to marry, normalized vs the population:" % name
    normalized = sorted(pairings, key=lambda p: (p.ratio, p.count, p.code),
                        reverse=True)
    for p in normalized[:5]:
        yield "%s: %.1fx (%.1f%% vs %.1f%%; %d found)" % (
            code_to_name[p.code], p.ratio, p.share, p.global_share, p.count)
    yield ""

    yield "Most common for %s to marry, over expected:" % name
    over_expected = sorted(pairings, key=lambda p: (p.excess, p.count, p.code),
                           reverse=True)
    for p in over_expected[:5]:
        yield "%s: %.1f%% more than expected (%.1f%% vs %.1f%%)" % (
            code_to_name[p.code], 100 * p.excess, p.share, p.global_share)
    yield ""


def main():
    """Load both CSV files and print the report for ``investigate_code``."""
    job_rows = load_rows(JOB_NAME_PATH)
    pair_rows = load_rows(PAIR_COUNT_PATH)
    for line in report_lines(job_rows, pair_rows):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(line)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement