View difference between Paste ID: f239f43a7 and
SHOW:
|
|
- or go back to the newest paste.
1 | - | |
1 | + | #!/usr/bin/python |
2 | ||
3 | #you'll need my library pype; http://arvindn.livejournal.com/68137.html; you can get it with: | |
4 | # $ svn checkout svn://randomwalker.info/pype/pype.py | |
5 | ||
#do this first
# $ wget -qO- http://www.census.gov/genealogy/names/dist.male.first | cut -d ' ' -f 1 > firstnames
# $ wget -qO- http://www.census.gov/genealogy/names/dist.female.first | cut -d ' ' -f 1 >> firstnames
# $ wget -qO- http://www.census.gov/genealogy/names/dist.all.last | cut -d ' ' -f 1 > lastnames
10 | ||
11 | #finally, copy the list of usernames from the csv into "usernames.in", 1 per line | |
12 | ||
13 | import os | |
14 | from pype import * | |
15 | ||
# Reference word collections, each built by piping the output of a shell
# command through pype stages (pStrip / pLower / pSet come from pype).
# The leading "echo;" makes the command print one blank line before the data;
# presumably this puts the empty string into the resulting set so that an
# empty half of a username split still counts as a name -- TODO confirm
# against pype's pStrip/pSet behaviour.

#read in the list of first names
firstnames = os.popen("echo; cat firstnames") | pStrip | pLower | pSet

#this is trickier: we don't want rare (index > 5000) lastnames that are too short;
#because it leads to spurious matches
#(the grep pattern "....." keeps only lines with at least five characters)
lastnames = os.popen("echo; head -5000 lastnames; tail -n +5000 lastnames | grep .....") | pStrip | pLower | pSet

#dictionary words. but not proper nouns.
# NOTE(review): the character class is [A-X], so lines whose only capital
# letters are Y or Z are not filtered out -- confirm whether [A-Z] was meant.
# Unlike the name lists, this one is not passed through pLower.
words = os.popen("grep -v '[A-X]' /usr/share/dict/words") | pStrip | pSet
24 | ||
#check if a pair of words looks like a name
def validName(left, right):
    """Return True if (left, right) looks like a first/last name pair.

    Either order is accepted: first name followed by last name, or last name
    followed by first name. Membership is tested against the module-level
    ``firstnames`` and ``lastnames`` collections.

    A part of length 1 or 2 is rejected outright. Length 0 is allowed (so a
    username may consist of a single name), as is any length of 3 or more.
    """
    if len(left) in (1, 2) or len(right) in (1, 2):
        return False
    return (left in firstnames and right in lastnames) or \
           (left in lastnames and right in firstnames)
31 | ||
#check if a username looks like a first/last name pair
def matchName(username):
    """Return the first (left, right) split of username that looks like a name.

    The username is cut at every index from 0 to len(username) - 1, so the
    left part may be empty while the right part never is. Returns None when
    no split qualifies (including for an empty username).
    """
    # range (rather than xrange) runs on both Python 2 and Python 3
    for i in range(len(username)):
        #cut the string at each possible index and check
        left, right = username[:i], username[i:]
        #we don't allow dictionary words, since that leads to spurious matches
        #compromise: but if only one of the names is a dictionary word it's ok
        #it's surprising how many first & last names are in fact english words
        if validName(left, right) and (left not in words or right not in words):
            return left, right
    return None
42 | ||
def titleCase(word):
    """Return word with its first character upper-cased and the rest lower-cased.

    An empty (falsy) word yields the empty string, which avoids indexing
    word[0] on an empty string.
    """
    if not word:
        return ''
    return word[0].upper() + word[1:].lower()
44 | ||
#read usernames from usernames.in, but strip non-alphabetical characters
#test each username and write found names to realnames.out
(os.popen("sed 's/[^a-zA-Z]//g' usernames.in")
    | pStrip
    | pLower
    | Map(matchName)
    # matchName returns None when nothing matched; the identity predicate is
    # presumably what drops those entries -- TODO confirm pype's Filter
    | Filter(lambda x:x)
    | Map(lambda name:" ".join(titleCase(w) for w in name))
    # when the left part is empty the join above yields a leading space,
    # hence the second pStrip
    | pStrip
    | pWrite("realnames.out"))