Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- ## word_lookup.py
- #!/usr/bin/python
- """lookup the catches to words in official lists, or specific lists
- words are simultaneously loaded into a database, where the "catch" probabilities and point-values of child nodes are summed and stored
- other scripts would be used for sorting an loading catches from the DB
- This was a learning project. Please DO send me feedback / style suggestions to darius dot roberts at google's mail.com.
- """
- import sys
- sys.path.insert(0,'words')
- from word import Word
- del sys.path[0]
- import pdb
- from helpers import validate_argv_words
- words = validate_argv_words(sys.argv)
- print "\n","--------------------"*3,"\n"
- print "Words to look up:", words,
- initial = Word.count()
- print "\n","--------------------"*3,"\n"
- # MYTODO: flag "newly created" words inline... if they have a created_at datetime?
- for i, w in enumerate(Word.find_or_create_all_by_name(words)):
- if i==0: print "Words not in db will be created."
- print "{0.name:{1}} => {0.catches}".format(w,(10-len(str(w.name)))*" ")
- else:
- print "Nope. None of those words have catches."
- print "\n","--------------------"*3,"\n"
- final = Word.count()
- delta = final - initial
- print "New words: {0}\nTotal words in db: {1}".format(delta, final)
- print "\n","--------------------"*3,"\n"
## helpers.py
def validate_argv_words(word_list):
    """Validate an argv-style list and return a tuple of words to look up.

    word_list is sys.argv: word_list[0] is the script name, the remaining
    entries are words (or, unsupported, '.txt' word-source file names).

    Raises ValueError when there are no arguments, more than 40 words,
    or a '.txt' source is given.
    """
    # BUG FIX: `raise "some string"` (string exceptions) is itself a
    # TypeError on Python >= 2.6; raise real exception instances instead.
    if len(word_list) < 2:
        raise ValueError("this script requires arguments. 'foo.txt' word "
                         "source or 'bar baz' words are acceptable.")
    if len(word_list) > 40:
        raise ValueError("you know this will be slow, right? "
                         "that's a LOT of words to process.")
    words = word_list[1:]
    if any('.txt' in arg for arg in words):
        # BUG FIX: the old branch opened every argument as a file (leaking
        # the handles) and then raised '"{f} ...".format(f)', which itself
        # fails (named field, positional argument).  .txt sources were
        # never actually implemented, so reject them explicitly.
        raise ValueError("'.txt' word sources are not supported yet")
    if len(set(word_list)) < 3:
        # trivial case (a single distinct word): also include its plural so
        # downstream find_all gets a multi-element tuple for its IN clause
        return tuple(set(words) | set([words[0] + "s"]))
    return tuple(words)
- ## words/word.py
- import scriptutil as SU
- import re
- import psycopg2
- from psycopg2.extras import DictCursor
- from psycopg2.extensions import adapt
- try:
- db = psycopg2.connect(database="scrabble", user="python", password="python")
- cur = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
- # cur.execute ("CREATE TABLE words (name varchar, probability int, frequency int, catches varchar, hangs varchar);")
- except:
- print "I am unable to connect to the database"
- sys.ext()
- try:
- "trying to find a wordlist reference file"
- except:
- "failing to find a wordlist reference file. You're on your own, you database-dependent chump!"
class Word:
    """Legal scrabble words.

    1) in official lists, and
    2) have point/frequency attributes that are derived --- not from the
       word's own letters --- but rather from the point/prob sums of all
       the possible _derivative_ scrabble-legal words (the "catches").

    Raw letter data comes from official scrabble lists (downloadable from
    hasbro).
    """

    # "_" is the blank tile
    letters = "_ a b c d e f g h i j k l m n o p q r s t u v w x y z".split()
    frequencies = (2, 9, 2, 2, 4, 12, 2, 3, 2, 9, 1, 1, 4, 2, 6, 8, 2, 1, 6, 4, 6, 4, 2, 2, 1, 2, 1)
    points = (0, 1, 3, 3, 2, 1, 4, 2, 4, 1, 8, 5, 1, 3, 1, 1, 3, 10, 1, 1, 1, 1, 4, 4, 8, 4, 10)
    letter_frequencies = dict(zip(letters, frequencies))
    # BUG FIX: was dict(zip(letters, frequencies)) -- every point value was wrong
    letter_points = dict(zip(letters, points))

    def __init__(self, name, points=None, frequency=None, catches=None, hangs=None):
        """Plain attribute container; rows from find_all() map positionally.

        BUG FIX: the original guards were inverted (`if catches is None:
        self.catches = catches`), so a Word built from a DB row ended up
        with NO catches/points/frequency attributes at all.
        """
        self.name = name
        self.points = points
        self.frequency = frequency
        self.catches = catches
        # NOTE(review): this instance attribute shadows the hangs() method,
        # exactly as the original default-argument path already did.
        self.hangs = hangs

    def calculate_probability(self):
        """Sum of letter point-values over self.catches (spaces skipped).

        BUG FIX: referenced the bare name `letter_points` (NameError) and
        KeyError'd on the spaces separating catches.
        NOTE(review): "probability" summing *points* looks like a naming
        swap with calculate_frequencies -- confirm against the DB schema.
        """
        return sum(self.letter_points[c] for c in self.catches if c != " ")

    def calculate_frequencies(self):
        """Sum of letter tile-frequencies over self.catches (spaces skipped)."""
        return sum(self.letter_frequencies[c] for c in self.catches if c != " ")

    @staticmethod
    def count(finder_sql="", params=None):
        """Rails-style counter: number of rows matching finder_sql.

        `params`, if given, are passed to cur.execute for placeholder
        substitution (new, optional -- existing callers unaffected).
        """
        cur.execute("select * from words {0}".format(finder_sql), params)
        return cur.rowcount

    def hangs(self):
        """The word minus its last letter.

        NOTE(review): shadowed by the self.hangs attribute assigned in
        __init__, so this is effectively unreachable on instances.
        """
        return self.name[0:-1]

    @staticmethod
    def find_or_create_all_by_name(names):
        """Return Words for every name: DB matches plus newly created ones.

        Names not found in the local word lists are silently dropped.
        MYTODO: hose invalid words over to the output somehow -- through a
        logger, if nothing else.  MYTODO: transactions?
        (A leftover pdb.set_trace() was removed -- pdb was never imported.)
        """
        # BUG FIX: parameterized IN clause.  The old
        # "in {0}".format(tuple(names)) was injectable and produced invalid
        # SQL for a single name (trailing comma); psycopg2 adapts a Python
        # tuple to a proper SQL value list.
        matches = Word.find_all("where words.name in %s", (tuple(names),))
        unmatched = set(names) - set(w.name for w in matches)
        invalid_words = []
        created_words = []
        for n in unmatched:
            w = Word(n)
            try:
                w.new()
                created_words.append(w)
            except NameError:
                invalid_words.append(n)
        if created_words:
            db.commit()
        # BUG FIX: list.extend() returns None, so the original
        # `return created_words.extend(matches) or []` always returned [].
        created_words.extend(matches)
        return created_words

    def new(self):
        """Vaguely rails-AR-like new().

        Validates, find-greps for catches, and pre-commits this instance to
        the db (the caller commits).  Raises NameError when the word is not
        in the local lists.
        MYTODO: profiling.  Is it worth it to split up the two grep searches?
        """
        self.validate_against_local_lists()
        grepd_catches = self.fgrep_catches_in_directories(("./words",))
        flat_catches = []
        for c in grepd_catches:
            flat_catches.extend(c)
        # de-duplicate and store as a space-separated string
        self.catches = " ".join(set(flat_catches))
        # BUG FIX: parameterized insert.  The old "VALUES {0}".format(tuple)
        # leaned on Python's tuple repr as SQL: injectable, and broken by
        # any quote in the data (see the MYTODO about escaping names).
        cur.execute(
            "INSERT INTO words VALUES (%s, %s, %s, %s, %s)",
            (self.name,
             self.calculate_probability(),
             self.calculate_frequencies(),
             self.catches,
             self.name[1:] + " " + self.name[:-1]),  # hangs
        )

    def validate_against_local_lists(self, lists=(".",)):
        """If not found in any local .txt list => not a legal word.

        This also catches all the weird things people might throw, like
        numbers.  Raises NameError on failure.
        """
        # (py2-only `raise NameError, "..."` replaced with the portable form)
        if [self.name] not in self.fgrep_in_directories(lists):
            raise NameError("not in ./words/*.txt. Look again, shall we?")

    def fgrep_in_directories(self, directories=(".",), search_string=None):
        """Grep the .txt files under each directory.

        With no search_string, searches for this word as a whole line,
        case-insensitively; otherwise search_string is used as a multiline
        regex.  Returns one entry per directory that produced matches.
        """
        if search_string is None:
            search_tuple = (("^{0}$".format(self.name), re.I),)
        else:
            search_tuple = ((search_string, re.M),)
        result = [
            list(SU.ffindgrep(directory,
                              namefs=(lambda s: s.endswith('.txt'),),
                              regexl=search_tuple).values())
            for directory in directories
        ]
        # BUG FIX: `len(catch) is not 0` compared object identity, not value
        return [catch[0] for catch in result if catch]

    def fgrep_catches_in_directories(self, directories=("./words",)):
        """Find all _catches_: legal words one letter longer than this one,
        extended at either the front or the back.

        BUG FIX: the directories parameter was ignored (hard-coded
        "./words"); the default now matches the old hard-coding, so the
        only visible caller (new()) behaves identically.
        """
        temp = []
        temp.extend(self.fgrep_in_directories(directories, "^{0}.$".format(self.name)))
        temp.extend(self.fgrep_in_directories(directories, "^.{0}$".format(self.name)))
        return temp

    @staticmethod
    def find_all(finder_sql="", params=None):
        """Rails-style finder returning Word instances.

        `params`, if given, are passed to cur.execute for placeholder
        substitution (new, optional -- existing callers unaffected).
        """
        cur.execute("select * from words {0}".format(finder_sql), params)
        return [Word(*row) for row in cur.fetchall()]
def flatten(l):
    """Recursively flatten arbitrarily nested lists into one flat list.

    Non-list input is wrapped: flatten(7) == [7].

    BUG FIX: the original compared `l is []` (always False -- identity, not
    equality) and returned sum(map(flatten, l)), which *adds the numbers
    up* (sum starts at 0) instead of flattening, and crashes on any
    non-numeric element.  No callers are visible in this file.
    """
    if not isinstance(l, list):
        return [l]
    flat = []
    for item in l:
        flat.extend(flatten(item))
    return flat
Add Comment
Please sign in to add a comment.