Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
# You'll need my library, pype (http://arvindn.livejournal.com/68137.html); you can get it with:
#   $ svn checkout svn://randomwalker.info/pype/pype.py
# Do this first, to build the name lists (first column of each census file):
#   $ wget -qO- http://www.census.gov/genealogy/names/dist.male.first | cut -d ' ' -f 1 > firstnames
#   $ wget -qO- http://www.census.gov/genealogy/names/dist.female.first | cut -d ' ' -f 1 >> firstnames
#   $ wget -qO- http://www.census.gov/genealogy/names/dist.all.last | cut -d ' ' -f 1 > lastnames
# Finally, copy the list of usernames from the csv into "usernames.in", one per line.
- import os
- from pype import *
# NOTE(review): pStrip / pLower / pSet come from pype; from their names and
# use here they presumably strip each line, lower-case it, and collect the
# lines into a set -- confirm against the pype source.

# Read in the list of first names. The leading `echo` emits one blank line
# ahead of the file contents; validName() documents that a zero-length
# fragment "is ok", so this is presumably how the empty string gets into
# the set.
firstnames = os.popen("echo; cat firstnames") | pStrip | pLower | pSet

# Last names are trickier: rare ones (index > 5000) that are too short lead
# to spurious matches. So take the first 5000 lines unconditionally, and from
# the remainder keep only lines with at least five characters (`grep .....`).
# `tail -n +5000` starts at line 5000, which `head -5000` already emitted;
# the duplicate is harmless if the result is a set.
lastnames = (
    os.popen("echo; head -5000 lastnames; tail -n +5000 lastnames | grep .....")
    | pStrip
    | pLower
    | pSet
)

# Dictionary words, but not proper nouns: drop every entry that contains an
# upper-case letter. `[[:upper:]]` is used rather than a letter range because
# the original range `[A-X]` missed Y and Z, and letter ranges are
# locale-dependent in grep.
words = os.popen("grep -v '[[:upper:]]' /usr/share/dict/words") | pStrip | pSet
def validName(left, right):
    """Check whether a pair of strings looks like a name.

    Returns True when one fragment is in ``firstnames`` and the other is in
    ``lastnames``, in either order; False otherwise.

    Fragments of length 1 or 2 are rejected outright. A fragment of length 0
    is deliberately allowed through to the set lookups, as is any fragment of
    length >= 3.
    """
    # Length 0 is ok, length >= 3 is ok; only 1- and 2-letter pieces are refused.
    if len(left) in (1, 2) or len(right) in (1, 2):
        return False
    return (left in firstnames and right in lastnames) or \
           (left in lastnames and right in firstnames)
def matchName(username):
    """Check whether a username looks like a first/last name pair.

    The string is cut at every index from 0 to len(username) - 1 and each
    (left, right) split is tested with validName(). Returns the first
    accepted ``(left, right)`` tuple, or None if no split qualifies. An empty
    username has no splits and yields None.
    """
    # `range` instead of `xrange`: works on both Python 2 and 3, and the
    # sequence is only as long as one username.
    for i in range(len(username)):
        # Cut the string at each possible index and check.
        left, right = username[:i], username[i:]
        if not validName(left, right):
            continue
        # We don't allow dictionary words, since that leads to spurious
        # matches. Compromise: if only one of the two names is a dictionary
        # word it's ok -- it's surprising how many first and last names are
        # in fact English words.
        if left not in words or right not in words:
            return left, right
    return None
def titleCase(word):
    """Return ``word`` with its first character upper-cased and the rest lower-cased.

    A falsy ``word`` (e.g. the empty string) yields ''.
    """
    if not word:
        return ''
    return word[0].upper() + word[1:].lower()
# Main pipeline, one stage per line:
#   1. sed deletes every non-alphabetical character from each line of
#      usernames.in;
#   2. each cleaned username is lower-cased and run through matchName();
#   3. usernames with no match (matchName returned None) are filtered out;
#   4. each (left, right) pair is title-cased and joined with a space;
#   5. the results are written to realnames.out.
# NOTE(review): an empty left fragment joins to " Smith"; the second pStrip
# presumably trims that leading space -- confirm against pype.
(
    os.popen("sed 's/[^a-zA-Z]//g' usernames.in")
    | pStrip
    | pLower
    | Map(matchName)
    | Filter(lambda found: found)
    | Map(lambda parts: " ".join(titleCase(part) for part in parts))
    | pStrip
    | pWrite("realnames.out")
)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement