Pastebin launched a little side project called VERYVIRAL.com, check it out ;-) Want more features on Pastebin? Sign Up, it's FREE!
Guest

Arvind Narayanan

By: a guest on Nov 4th, 2008  |  syntax: Python  |  size: 2.32 KB  |  views: 949  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
This paste has a previous version, view the difference. Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. #!/usr/bin/python
  2.  
  3. #you'll need my library pype; http://arvindn.livejournal.com/68137.html; you can get it with:
  4. # $ svn checkout svn://randomwalker.info/pype/pype.py
  5.  
  6. #do this first
  7. # $ wget -qO- http://www.census.gov/genealogy/names/dist.male.first | cut -d \  -f 1  > firstnames
  8. # $ wget -qO- http://www.census.gov/genealogy/names/dist.female.first | cut -d \  -f 1  >> firstnames
  9. # $ wget -qO- http://www.census.gov/genealogy/names/dist.all.last | cut -d \  -f 1  > lastnames
  10.  
  11. #finally, copy the list of usernames from the csv into "usernames.in", 1 per line
  12.  
  13. import os
  14. from pype import *
  15.  
  16. #read in the list of first names
  17. firstnames = os.popen("echo; cat firstnames") | pStrip | pLower | pSet
  18.  
  19. #this is trickier: we don't want rare (index > 5000) lastnames that are too short;
  20. #because it leads to spurious matches
  21. lastnames = os.popen("echo; head -5000 lastnames; tail -n +5000 lastnames | grep .....") | pStrip | pLower | pSet
  22. #dictionary words. but not proper nouns.
  23. words = os.popen("grep -v '[A-X]' /usr/share/dict/words") | pStrip | pSet
  24.  
  25. #check if a pair of words looks like a name
  26. def validName(left, right):
  27.     if len(left) in [1,2]: return False #length 0 is ok. length >=3 is ok.
  28.     if len(right) in [1,2]: return False
  29.     return (left in firstnames and right in lastnames) or \
  30.             (left in lastnames and right in firstnames)
  31.  
  32. #check if a username looks like a first/last name pair
  33. def matchName(username):
  34.     for i in xrange(len(username)):
  35.         #cut the string at each possible index and check
  36.         left, right = username[:i], username[i:]
  37.         #we don't allow dictionary words, since that leads to spurious matches
  38.         #compromise: but if only one of the names is a dictionary word it's ok
  39.         #it's surprising how many first & last names are in fact english words
  40.         if validName(left, right) and (left not in words or right not in words):
  41.             return left, right
  42.  
  43. titleCase = lambda word: word[0].upper() + word[1:].lower() if word else ''
  44.  
  45. #read usernames from usernames.in, but strip non-alphabetical characters
  46. #test each username and write found names to realnames.out
  47. os.popen("sed 's/[^a-zA-Z]//g'  usernames.in") | pStrip | pLower | Map(matchName) | Filter(lambda x:x) | Map(lambda name:" ".join(titleCase(w) for w in name)) | pStrip | pWrite("realnames.out")