Guest User

Untitled

a guest
Jul 13th, 2018
96
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.03 KB | None | 0 0
  1. ## word_lookup.py
  2. #!/usr/bin/python
  3. """lookup the catches to words in official lists, or specific lists
  4.  
  5. words are simultaneously loaded into a database, where the "catch" probabilities and point-values of child nodes are summed and stored
  6.  
  7. other scripts would be used for sorting an loading catches from the DB
  8.  
  9. This was a learning project. Please DO send me feedback / style suggestions to darius dot roberts at google's mail.com.
  10. """
  11.  
  12. import sys
  13. sys.path.insert(0,'words')
  14. from word import Word
  15. del sys.path[0]
  16.  
  17. import pdb
  18.  
  19. from helpers import validate_argv_words
  20. words = validate_argv_words(sys.argv)
  21.  
  22. print "\n","--------------------"*3,"\n"
  23.  
  24. print "Words to look up:", words,
  25. initial = Word.count()
  26.  
  27. print "\n","--------------------"*3,"\n"
  28.  
  29. # MYTODO: flag "newly created" words inline... if they have a created_at datetime?
  30. for i, w in enumerate(Word.find_or_create_all_by_name(words)):
  31. if i==0: print "Words not in db will be created."
  32. print "{0.name:{1}} => {0.catches}".format(w,(10-len(str(w.name)))*" ")
  33. else:
  34. print "Nope. None of those words have catches."
  35.  
  36. print "\n","--------------------"*3,"\n"
  37.  
  38. final = Word.count()
  39. delta = final - initial
  40. print "New words: {0}\nTotal words in db: {1}".format(delta, final)
  41.  
  42. print "\n","--------------------"*3,"\n"
  43.  
  44.  
  45. ##helpers.py
  46. def validate_argv_words(word_list):
  47. if len(word_list) < 2:
  48. raise "this script requires argments. 'foo.txt' word source or 'bar baz' words are acceptable."
  49. elif len(word_list) > 40: raise "you know this will be slow, right? that's a LOT of words to process."
  50. elif any(filter(lambda a: '.txt' in a, word_list[1:])):
  51. print "found a .txt argument"
  52. files = map(lambda f: open(f,"r"), word_list[1:])
  53. for f in files:
  54. raise "{f} filesize too large".format(f)
  55. if len(set(word_list)) < 3:
  56. #trivial case... corrects find_all syntax
  57. return tuple(set(word_list[1:]) | set([word_list[1:][0]+"s"]))
  58. else:
  59. return tuple(word_list[1:])
  60.  
  61.  
  62. ## words/word.py
  63.  
import re
import sys

import psycopg2
from psycopg2.extras import DictCursor
from psycopg2.extensions import adapt

import scriptutil as SU
  70.  
  71. try:
  72. db = psycopg2.connect(database="scrabble", user="python", password="python")
  73. cur = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
  74. # cur.execute ("CREATE TABLE words (name varchar, probability int, frequency int, catches varchar, hangs varchar);")
  75. except:
  76. print "I am unable to connect to the database"
  77. sys.ext()
  78.  
  79. try:
  80. "trying to find a wordlist reference file"
  81. except:
  82. "failing to find a wordlist reference file. You're on your own, you database-dependent chump!"
  83.  
  84.  
  85. class Word:
  86. """legal scrabble words
  87.  
  88. 1) in official lists, and
  89. 2) have point/frequency attributes that are derived --- not from it's own letters ---
  90. but rather from the point/prob sums of all the possible _derivative_ scrabble-legal words
  91.  
  92. # raw data from official scrabble lists. Can be downloaded from hasbro
  93. """
  94.  
  95. letters = "_ a b c d e f g h i j k l m n o p q r s t u v w x y z".split()
  96. frequencies = (2, 9, 2, 2, 4, 12, 2, 3, 2, 9, 1, 1, 4, 2, 6, 8, 2, 1, 6, 4, 6, 4, 2, 2, 1, 2, 1)
  97. points = (0, 1, 3, 3, 2, 1, 4, 2, 4, 1, 8, 5, 1, 3, 1, 1, 3, 10, 1, 1, 1, 1, 4, 4, 8, 4, 10)
  98.  
  99.  
  100. letter_frequencies = dict(zip(letters,frequencies))
  101. letter_points = dict(zip(letters,frequencies))
  102.  
  103. def calculate_probability(self):
  104. return sum(map(lambda letter: letter_points[letter], self.catches))
  105. def calculate_frequencies(self):
  106. return sum(map(lambda letter: letter_frequencies[letter], self.catches))
  107.  
  108. def __init__(self,name,points=None,frequency=None,catches=None,hangs=None):
  109. self.name = name
  110.  
  111. if catches is None: self.catches = catches
  112. if frequency is None: self.frequency = frequency
  113. if points is None: self.points = points
  114. if hangs is None: self.hangs = hangs
  115.  
  116. @staticmethod
  117. def count(finder_sql = ""):
  118. """rails-style finder
  119. """
  120. cur.execute("select * from words {0}".format(finder_sql))
  121. return cur.rowcount
  122.  
  123.  
  124. def hangs(self):
  125. """ one-lettter shorter
  126. """
  127. return self.name[0:-1]
  128.  
  129. @staticmethod
  130. def find_or_create_all_by_name(names):
  131. """
  132. merge
  133. VS
  134. cur.copy_in( ... scratch ... )
  135. insert into words select * from (select distinct * from scratch) uniq where not exists (select 1 from words where words.name = uniq.name);
  136. """
  137.  
  138. # MYTODO escape names ... learning exercise.
  139. matches = Word.find_all("""where words.name in {0}""".format(tuple(names)))
  140. unmatched = set(names) - set(map(lambda w: w.name, matches))
  141. pdb.set_trace()
  142.  
  143. #MYTODO: transactions?
  144. invalid_words = []
  145. created_words = []
  146. for n in unmatched:
  147. w = Word(n)
  148. try:
  149. w.new()
  150. created_words.append(w)
  151. except NameError:
  152. invalid_words.append(n)
  153. # MYTODO: hose invalid words over to the output somehow ... through a logger, if nothing else
  154.  
  155. if not len(created_words) == 0: db.commit()
  156. return created_words.extend(matches) or []
  157.  
  158.  
  159. def new(self):
  160. """ vaguely rails-AR-like new()
  161.  
  162. validates, find-greps for catches, and pre-commits instance to the db
  163.  
  164. #MYTODO: profiling. Is it worth it to split up the two grep searches? (above)
  165. """
  166. self.validate_against_local_lists()
  167. grepd_catches = self.fgrep_catches_in_directories(("./words",))
  168.  
  169. flat_catches = []
  170. for c in grepd_catches: flat_catches.extend(c) #split()
  171. self.catches = "".join(map(lambda catch: catch+" ", set(flat_catches))).strip()
  172.  
  173. cur.execute("""INSERT INTO words VALUES {0}""".format(
  174. (
  175. self.name,
  176. self.calculate_probability(),
  177. self.calculate_frequencies(),
  178. self.catches,
  179. # hangs
  180. self.name[1:] + " " + self.name[:-1],
  181. )
  182. ))
  183.  
  184.  
  185. def validate_against_local_lists(self, lists=(".",)):
  186. """if not found in any text file => not a legal word!
  187.  
  188. this will also catch all the weird things people might throw. Like numbers.
  189. """
  190. if [self.name] not in self.fgrep_in_directories(lists):
  191. raise NameError, "not in ./words/*.txt. Look again, shall we?"
  192. pass
  193.  
  194. def fgrep_in_directories(self, directories=(".",),search_string=None):
  195. """ grep in dir ("." by default)
  196.  
  197. find a word in local .txt files
  198. """
  199. if search_string is None:
  200. search_tuple = (("^{0}$".format(self.name), re.I),)
  201. else:
  202. search_tuple = ((search_string, re.M),)
  203.  
  204. result = map(lambda directory:
  205. SU.ffindgrep(directory, namefs=(lambda s: s.endswith('.txt'),),
  206. regexl=search_tuple
  207. ).values(),
  208. directories)
  209.  
  210. return [catch[0] for catch in result if len(catch) is not 0]
  211.  
  212. def fgrep_catches_in_directories(self, directories=(".",)):
  213. """find all _catches_
  214.  
  215. find a word in local .txt files
  216. """
  217. temp = []
  218. temp.extend(self.fgrep_in_directories(("./words",), "^{0}.$".format(self.name)))
  219. temp.extend(self.fgrep_in_directories(("./words",), "^.{0}$".format(self.name)))
  220. return temp
  221.  
  222. # raise ArgumentError
  223. @staticmethod
  224. def find_all(finder_sql = ""):
  225. """rails-style finder
  226. """
  227. cur.execute("select * from words {0}".format(finder_sql))
  228. return map(lambda properties: Word(*properties), cur.fetchall())
  229.  
  230.  
  231. def flatten(l):
  232. if l is []:
  233. pass
  234. elif isinstance(l,list):
  235. return sum(map(flatten,l))
  236. else:
  237. return l
Add Comment
Please, Sign In to add comment