View difference between Paste ID: <a href="/f239f43a7">f239f43a7</a> and <a href="/post/view"></a>

View difference between Paste ID: f239f43a7 and

SHOW: | | - or go back to the newest paste.


#!/usr/bin/python

#you'll need my library pype; http://arvindn.livejournal.com/68137.html; you can get it with:
# $ svn checkout svn://randomwalker.info/pype/pype.py

#do this first
# $ wget -qO- http://www.census.gov/genealogy/names/dist.male.first | cut -d \  -f 1  > firstnames
# $ wget -qO- http://www.census.gov/genealogy/names/dist.female.first | cut -d \  -f 1  >> firstnames
# $ wget -qO- http://www.census.gov/genealogy/names/dist.all.last | cut -d \  -f 1  > lastnames 

#finally, copy the list of usernames from the csv into "usernames.in", 1 per line

import os
from pype import *

#known first names, loaded from the "firstnames" file built by the wget commands above.
#"echo" emits one blank line ahead of the file contents -- presumably so that the
#empty string ends up in the collection and one half of a split may be empty (confirm).
#pStrip / pLower / pSet are pype helpers; presumably strip, lowercase, collect into a set.
firstnames = (
    os.popen("echo; cat firstnames")
    | pStrip
    | pLower
    | pSet
)

#known last names. rare ones (beyond the first 5000 lines of the file) are only
#kept when they have at least five characters, because short rare surnames
#produce spurious matches. as with firstnames, "echo" contributes a blank line first.
lastnames = (
    os.popen(
        "echo; head -5000 lastnames; "           #blank line + the 5000 most common
        "tail -n +5000 lastnames | grep ....."   #the rest, 5+ characters only
    )
    | pStrip
    | pLower
    | pSet
)
#dictionary words, but not proper nouns: every line of the system word list that
#contains an uppercase letter is dropped. the bracket expression used to be
#[A-X], which let entries whose only capitals are Y or Z through;
#[[:upper:]] covers the whole alphabet and does not depend on locale collation.
#note: unlike the name lists this one is not passed through pLower.
words = os.popen("grep -v '[[:upper:]]' /usr/share/dict/words") | pStrip | pSet

#decide whether two strings look like the two halves of a personal name.
#returns True when one half is in firstnames and the other in lastnames
#(either order), False otherwise.
def validName(left, right):
    #a half of one or two characters is rejected outright;
    #an empty half, or one of three or more characters, goes on to the lookup
    for half in (left, right):
        if 1 <= len(half) <= 2:
            return False
    #first-name/last-name order is tried first, then the reverse
    if left in firstnames and right in lastnames:
        return True
    return left in lastnames and right in firstnames

#look for a first/last name pair inside a username.
#returns the first (left, right) split that qualifies, or None when none does.
def matchName(username):
    #try every cut position from 0 (empty left half) up to len-1;
    #the right half is therefore never empty
    for cut, _ in enumerate(username):
        head, tail = username[:cut], username[cut:]
        if not validName(head, tail):
            continue
        #two dictionary words side by side are most likely not a name, so skip them.
        #compromise: a split where only one half is a dictionary word is accepted,
        #since a surprising number of first & last names are also english words
        if head in words and tail in words:
            continue
        return head, tail
    return None

#capitalise a word: first character upper-cased, remainder lower-cased.
#an empty (falsy) word yields ''.
def titleCase(word):
    if not word:
        return ''
    return word[0].upper() + word[1:].lower()

#main pipeline: sed removes every non-alphabetical character from each line of
#usernames.in; every cleaned username is run through matchName, results that
#are falsy (no match) are filtered out, and each remaining pair is title-cased,
#joined with a space and written to realnames.out
(
    os.popen("sed 's/[^a-zA-Z]//g'  usernames.in")
    | pStrip
    | pLower
    | Map(matchName)
    | Filter(lambda match: match)
    | Map(lambda pair: " ".join(titleCase(part) for part in pair))
    | pStrip
    | pWrite("realnames.out")
)

1	-
1	+	#!/usr/bin/python
2
3		#you'll need my library pype; http://arvindn.livejournal.com/68137.html; you can get it with:
4		# $ svn checkout svn://randomwalker.info/pype/pype.py
5
6		#do this first
7		# $ wget -qO- http://www.census.gov/genealogy/names/dist.male.first \| cut -d \ -f 1 > firstnames
8		# $ wget -qO- http://www.census.gov/genealogy/names/dist.female.first \| cut -d \ -f 1 >> firstnames
9		# $ wget -qO- http://www.census.gov/genealogy/names/dist.all.last \| cut -d \ -f 1 > lastnames
10
11		#finally, copy the list of usernames from the csv into "usernames.in", 1 per line
12
13		import os
14		from pype import *
15
16		#read in the list of first names
17		firstnames = os.popen("echo; cat firstnames") \| pStrip \| pLower \| pSet
18
19		#this is trickier: we don't want rare (index > 5000) lastnames that are too short;
20		#because it leads to spurious matches
21		lastnames = os.popen("echo; head -5000 lastnames; tail -n +5000 lastnames \| grep .....") \| pStrip \| pLower \| pSet
22		#dictionary words. but not proper nouns.
23		words = os.popen("grep -v '[A-X]' /usr/share/dict/words") \| pStrip \| pSet
24
25		#check if a pair of words looks like a name
26		def validName(left, right):
27		if len(left) in [1,2]: return False #length 0 is ok. length >=3 is ok.
28		if len(right) in [1,2]: return False
29		return (left in firstnames and right in lastnames) or \
30		(left in lastnames and right in firstnames)
31
32		#check if a username looks like a first/last name pair
33		def matchName(username):
34		for i in xrange(len(username)):
35		#cut the string at each possible index and check
36		left, right = username[:i], username[i:]
37		#we don't allow dictionary words, since that leads to spurious matches
38		#compromise: but if only one of the names is a dictionary word it's ok
39		#it's surprising how many first & last names are in fact english words
40		if validName(left, right) and (left not in words or right not in words):
41		return left, right
42
43		titleCase = lambda word: word[0].upper() + word[1:].lower() if word else ''
44
45		#read usernames from usernames.in, but strip non-alphabetical characters
46		#test each username and write found names to realnames.out
47		os.popen("sed 's/[^a-zA-Z]//g' usernames.in") \| pStrip \| pLower \| Map(matchName) \| Filter(lambda x:x) \| Map(lambda name:" ".join(titleCase(w) for w in name)) \| pStrip \| pWrite("realnames.out")