Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- ## word_lookup.py
- #!/usr/bin/python
- """lookup the catches to words in official lists, or specific lists
- words are simultaneously loaded into a database, where the "catch" probabilities and point-values of child nodes are summed and stored
- other scripts would be used for sorting an loading catches from the DB
- This was a learning project. Please DO send me feedback / style suggestions to darius dot roberts at google's mail.com.
- """
- import sys
- sys.path.insert(0,'words')
- from word import Word
- del sys.path[0]
- import pdb
- from helpers import validate_argv_words
- words = validate_argv_words(sys.argv)
- print "\n","--------------------"*3,"\n"
- print "Words to look up:", words,
- initial = Word.count()
- print "\n","--------------------"*3,"\n"
- # MYTODO: flag "newly created" words inline... if they have a created_at datetime?
- for i, w in enumerate(Word.find_or_create_all_by_name(words)):
- if i==0: print "Words not in db will be created."
- print "{0.name:{1}} => {0.catches}".format(w,(10-len(str(w.name)))*" ")
- else:
- print "Nope. None of those words have catches."
- print "\n","--------------------"*3,"\n"
- final = Word.count()
- delta = final - initial
- print "New words: {0}\nTotal words in db: {1}".format(delta, final)
- print "\n","--------------------"*3,"\n"
## helpers.py
def validate_argv_words(word_list):
    """Validate an argv-style list and return a tuple of words to look up.

    word_list is sys.argv: word_list[0] is the script name, the remaining
    entries are words (or, unsupported, '.txt' word-source file names).

    Raises ValueError when there are no arguments, more than 40 words,
    or a '.txt' source is given.
    """
    # BUG FIX: `raise "some string"` (string exceptions) is itself a
    # TypeError on Python >= 2.6; raise real exception instances instead.
    if len(word_list) < 2:
        raise ValueError("this script requires arguments. 'foo.txt' word "
                         "source or 'bar baz' words are acceptable.")
    if len(word_list) > 40:
        raise ValueError("you know this will be slow, right? "
                         "that's a LOT of words to process.")
    words = word_list[1:]
    if any('.txt' in arg for arg in words):
        # BUG FIX: the old branch opened every argument as a file (leaking
        # the handles) and then raised '"{f} ...".format(f)', which itself
        # fails (named field, positional argument).  .txt sources were
        # never actually implemented, so reject them explicitly.
        raise ValueError("'.txt' word sources are not supported yet")
    if len(set(word_list)) < 3:
        # trivial case (a single distinct word): also include its plural so
        # downstream find_all gets a multi-element tuple for its IN clause
        return tuple(set(words) | set([words[0] + "s"]))
    return tuple(words)
- ## words/word.py
- import scriptutil as SU
- import re
- import psycopg2
- from psycopg2.extras import DictCursor
- from psycopg2.extensions import adapt
- try:
- db = psycopg2.connect(database="scrabble", user="python", password="python")
- cur = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
- # cur.execute ("CREATE TABLE words (name varchar, probability int, frequency int, catches varchar, hangs varchar);")
- except:
- print "I am unable to connect to the database"
- sys.ext()
- try:
- "trying to find a wordlist reference file"
- except:
- "failing to find a wordlist reference file. You're on your own, you database-dependent chump!"
class Word:
    """Legal scrabble words.

    1) in official lists, and
    2) have point/frequency attributes that are derived --- not from the
       word's own letters --- but rather from the point/prob sums of all
       the possible _derivative_ scrabble-legal words (the "catches").

    Raw letter data comes from official scrabble lists (downloadable from
    hasbro).
    """

    # "_" is the blank tile
    letters = "_ a b c d e f g h i j k l m n o p q r s t u v w x y z".split()
    frequencies = (2, 9, 2, 2, 4, 12, 2, 3, 2, 9, 1, 1, 4, 2, 6, 8, 2, 1, 6, 4, 6, 4, 2, 2, 1, 2, 1)
    points = (0, 1, 3, 3, 2, 1, 4, 2, 4, 1, 8, 5, 1, 3, 1, 1, 3, 10, 1, 1, 1, 1, 4, 4, 8, 4, 10)
    letter_frequencies = dict(zip(letters, frequencies))
    # BUG FIX: was dict(zip(letters, frequencies)) -- every point value was wrong
    letter_points = dict(zip(letters, points))

    def __init__(self, name, points=None, frequency=None, catches=None, hangs=None):
        """Plain attribute container; rows from find_all() map positionally.

        BUG FIX: the original guards were inverted (`if catches is None:
        self.catches = catches`), so a Word built from a DB row ended up
        with NO catches/points/frequency attributes at all.
        """
        self.name = name
        self.points = points
        self.frequency = frequency
        self.catches = catches
        # NOTE(review): this instance attribute shadows the hangs() method,
        # exactly as the original default-argument path already did.
        self.hangs = hangs

    def calculate_probability(self):
        """Sum of letter point-values over self.catches (spaces skipped).

        BUG FIX: referenced the bare name `letter_points` (NameError) and
        KeyError'd on the spaces separating catches.
        NOTE(review): "probability" summing *points* looks like a naming
        swap with calculate_frequencies -- confirm against the DB schema.
        """
        return sum(self.letter_points[c] for c in self.catches if c != " ")

    def calculate_frequencies(self):
        """Sum of letter tile-frequencies over self.catches (spaces skipped)."""
        return sum(self.letter_frequencies[c] for c in self.catches if c != " ")

    @staticmethod
    def count(finder_sql="", params=None):
        """Rails-style counter: number of rows matching finder_sql.

        `params`, if given, are passed to cur.execute for placeholder
        substitution (new, optional -- existing callers unaffected).
        """
        cur.execute("select * from words {0}".format(finder_sql), params)
        return cur.rowcount

    def hangs(self):
        """The word minus its last letter.

        NOTE(review): shadowed by the self.hangs attribute assigned in
        __init__, so this is effectively unreachable on instances.
        """
        return self.name[0:-1]

    @staticmethod
    def find_or_create_all_by_name(names):
        """Return Words for every name: DB matches plus newly created ones.

        Names not found in the local word lists are silently dropped.
        MYTODO: hose invalid words over to the output somehow -- through a
        logger, if nothing else.  MYTODO: transactions?
        (A leftover pdb.set_trace() was removed -- pdb was never imported.)
        """
        # BUG FIX: parameterized IN clause.  The old
        # "in {0}".format(tuple(names)) was injectable and produced invalid
        # SQL for a single name (trailing comma); psycopg2 adapts a Python
        # tuple to a proper SQL value list.
        matches = Word.find_all("where words.name in %s", (tuple(names),))
        unmatched = set(names) - set(w.name for w in matches)
        invalid_words = []
        created_words = []
        for n in unmatched:
            w = Word(n)
            try:
                w.new()
                created_words.append(w)
            except NameError:
                invalid_words.append(n)
        if created_words:
            db.commit()
        # BUG FIX: list.extend() returns None, so the original
        # `return created_words.extend(matches) or []` always returned [].
        created_words.extend(matches)
        return created_words

    def new(self):
        """Vaguely rails-AR-like new().

        Validates, find-greps for catches, and pre-commits this instance to
        the db (the caller commits).  Raises NameError when the word is not
        in the local lists.
        MYTODO: profiling.  Is it worth it to split up the two grep searches?
        """
        self.validate_against_local_lists()
        grepd_catches = self.fgrep_catches_in_directories(("./words",))
        flat_catches = []
        for c in grepd_catches:
            flat_catches.extend(c)
        # de-duplicate and store as a space-separated string
        self.catches = " ".join(set(flat_catches))
        # BUG FIX: parameterized insert.  The old "VALUES {0}".format(tuple)
        # leaned on Python's tuple repr as SQL: injectable, and broken by
        # any quote in the data (see the MYTODO about escaping names).
        cur.execute(
            "INSERT INTO words VALUES (%s, %s, %s, %s, %s)",
            (self.name,
             self.calculate_probability(),
             self.calculate_frequencies(),
             self.catches,
             self.name[1:] + " " + self.name[:-1]),  # hangs
        )

    def validate_against_local_lists(self, lists=(".",)):
        """If not found in any local .txt list => not a legal word.

        This also catches all the weird things people might throw, like
        numbers.  Raises NameError on failure.
        """
        # (py2-only `raise NameError, "..."` replaced with the portable form)
        if [self.name] not in self.fgrep_in_directories(lists):
            raise NameError("not in ./words/*.txt. Look again, shall we?")

    def fgrep_in_directories(self, directories=(".",), search_string=None):
        """Grep the .txt files under each directory.

        With no search_string, searches for this word as a whole line,
        case-insensitively; otherwise search_string is used as a multiline
        regex.  Returns one entry per directory that produced matches.
        """
        if search_string is None:
            search_tuple = (("^{0}$".format(self.name), re.I),)
        else:
            search_tuple = ((search_string, re.M),)
        result = [
            list(SU.ffindgrep(directory,
                              namefs=(lambda s: s.endswith('.txt'),),
                              regexl=search_tuple).values())
            for directory in directories
        ]
        # BUG FIX: `len(catch) is not 0` compared object identity, not value
        return [catch[0] for catch in result if catch]

    def fgrep_catches_in_directories(self, directories=("./words",)):
        """Find all _catches_: legal words one letter longer than this one,
        extended at either the front or the back.

        BUG FIX: the directories parameter was ignored (hard-coded
        "./words"); the default now matches the old hard-coding, so the
        only visible caller (new()) behaves identically.
        """
        temp = []
        temp.extend(self.fgrep_in_directories(directories, "^{0}.$".format(self.name)))
        temp.extend(self.fgrep_in_directories(directories, "^.{0}$".format(self.name)))
        return temp

    @staticmethod
    def find_all(finder_sql="", params=None):
        """Rails-style finder returning Word instances.

        `params`, if given, are passed to cur.execute for placeholder
        substitution (new, optional -- existing callers unaffected).
        """
        cur.execute("select * from words {0}".format(finder_sql), params)
        return [Word(*row) for row in cur.fetchall()]
def flatten(l):
    """Recursively flatten arbitrarily nested lists into one flat list.

    Non-list input is wrapped: flatten(7) == [7].

    BUG FIX: the original compared `l is []` (always False -- identity, not
    equality) and returned sum(map(flatten, l)), which *adds the numbers
    up* (sum starts at 0) instead of flattening, and crashes on any
    non-numeric element.  No callers are visible in this file.
    """
    if not isinstance(l, list):
        return [l]
    flat = []
    for item in l:
        flat.extend(flatten(item))
    return flat
Add Comment
Please sign in to add a comment.