Guest User

Untitled

a guest
May 7th, 2017
103
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python
  2. # example_python_n-gram_parser.py
  3. # not as tidy as it might be,
  4. # but you should get the general idea.
  5. # Define:
  6. # master_words, a list of master words to check n-grams against
  7. # LIST_OF_5GRAMS_FROM_LP
  8. # LIST_OF_FILENAMES (google data)
  9. # PATH_TO_COMPRESSED_DATA
  10. # PATH_TO_OUTPUT_FOLDER
  11. import gzip
  12. import time
  13. import csv
  14. import os
  15. import time
  16. from master_words import LIST_OF_FILENAMES
  17. from LP_ngrams import LIST_OF_5GRAMS_FROM_LP
  18.  
  19.  
  20. class get_rune_length_code():
  21. def __init__(self):
  22. self.latin_fragments = ['F', 'U', 'TH', 'O', 'R', 'C', 'G', 'W', 'H', 'N', 'I',
  23. 'J', 'EO', 'P', 'X', 'S', 'T', 'B', 'E', 'M', 'L', 'NG',
  24. 'OE', 'D', 'A', 'AE', 'Y', 'IA', 'EA']
  25. self.period = "."
  26. self.quote = "\""
  27. self.apostrophe = "'"
  28. self.colon = ":"
  29. self.semicolon = ";"
  30. self.comma = ","
  31. self.exclamation = "!"
  32. self.question = "?"
  33. self.bigram = ['TH', 'EO', 'NG', 'OE', 'AE', 'IA', 'IO', 'EA']
  34. self.trgram = 'ING'
  35.  
  36. def translate_to_gematria(self,word):# thanks to 'solvers
  37. res = []
  38. skip = 0
  39. WORD = word.upper()
  40. for i, val in enumerate(WORD):
  41. if skip:
  42. skip -= 1
  43. continue
  44. if WORD[i:i+3] == self.trgram:
  45. res.append(self.trgram)
  46. skip += 2
  47. continue
  48. if WORD[i:i+2] in self.bigram:
  49. res.append(WORD[i:i+2])
  50. skip += 1
  51. continue
  52. res.append(val)
  53. return res
  54.  
  55. def get_code(self,string): #meh
  56. if string == self.period:
  57. return self.period
  58. elif string == self.question:
  59. return self.period
  60. elif string == self.exclamation:
  61. return self.period
  62. elif string == self.quote:
  63. return self.quote
  64. elif string == self.apostrophe:
  65. return self.apostrophe
  66. elif string == self.colon:
  67. return self.comma
  68. elif string == self.semicolon:
  69. return self.comma
  70. elif string == self.comma:
  71. return self.comma
  72. elif string.isalpha():
  73. return len(self.translate_to_gematria(string))
  74. else:
  75. return -4
  76.  
  77. def get_code_list(self,strings):# code is the 5-grams word lengths of string is,
  78. r = [] # and is checked against LP 5-grams
  79. for string in strings:
  80. a = self.get_code(string)
  81. r.append(a)
  82. if a == -4:
  83. return False
  84. return r
  85.  
  86. class compressed_ngram_parser():
  87. def __init__(self):
  88. self.to_rune_latin = get_rune_length_code()
  89. # "_.", "_END_" are because punctuation
  90. # and _end_ is superflous assumming there is always a . (?)
  91. self.google_pos_tags = ["_NOUN","_VERB","_ADJ","_ADV","_PRON",
  92. "_DET","_ADP","_NUM","_CONJ","_PRT","_X","_.", "_END_"]
  93. self.google_pos_word ={"_NOUN_","_VERB_","_ADJ_","_ADV_",
  94. "_PRON_","_DET_","_ADP_","_NUM_","_CONJ_","_PRT_", "_ROOT_","_X_"}
  95. self.punctuation = [",", "_", ":", "!" ,"_",
  96. "\"", "'", "?","-", ")","(", "[","]"]
  97. self.master_words_set = set(master_words)
  98. self.master_words_set.update( self.google_pos_word)
  99. self.words_to_check = []
  100. self.code = None
  101. self.phrases = None
  102. self.valid_characters =
  103. set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,.\";:' ?!")
  104. self.pos_code = {"_NOUN":"N","_VERB":"V","_ADJ":"J","_ADV":"D","_PRON":"P",
  105. "_DET" :"E","_ADP" :"A","_NUM" :"U","_CONJ":"C","_PRT" :"R","_X":"X",
  106. "_.":".","_MASTER":"M","_NOTMASTER":"S","_":"_"}
  107.  
  108. def remove_google_pos_tags(self, word):
  109. for tag in self.google_pos_tags:
  110. word = word.replace(tag,'')
  111. return word.lower()
  112.  
  113. def remove_google_pos_tags_2(self, word):
  114. for tag in self.google_pos_tags:
  115. word = word.replace(tag,'')
  116. return word
  117.  
  118. def string_has_valid_characters(self, string):
  119. return set(string) <= self.valid_characters
  120.  
  121. def string_has_google_pos_word(self, string):
  122. return any(a in string for a in self.google_pos_word)
  123.  
  124. def is_valid_string(self, string):
  125. if self.string_has_google_pos_word(string):
  126. return False
  127. else:
  128. return self.string_has_valid_characters(string)
  129.  
  130. def check_ngram(self, n_gram_phrase):
  131. global grams5
  132. phrase = self.remove_google_pos_tags(n_gram_phrase)
  133. if self.is_valid_string(phrase):
  134. self.phrase = phrase.split()
  135. self.code = self.to_rune_latin.get_code_list(self.phrase)
  136. if self.code == False:
  137. return False
  138. else:
  139. return self.code in grams5
  140. else:
  141. return False
  142.  
  143. def get_data_to_keep(self, previous_line_phrase, total_count):
  144. phrases = previous_line_phrase.split()
  145. to_write = previous_line_phrase + " " + str(total_count) + " "
  146. to_write += ' '.join(str(e) for e in self.code)
  147. if self.are_in_master_words_set(self.phrase):
  148. to_write += ' _MASTER '
  149. else:
  150. to_write += ' _NOTMASTER '
  151. return to_write
  152.  
  153. def are_in_master_words_set(self, words):
  154. lower_words = [x.lower() for x in words]
  155. return set(lower_words) < self.master_words_set
  156.  
  157. def get_data(self, ngram_file, output):# main loop function
  158. count = 0
  159. total_count = 0
  160. previous_line_phrase = None
  161. should_keep = False
  162. with open(output, 'w') as output:
  163. # open compressed file for reading line by line
  164. with gzip.open(ngram_file, 'rb') as input:
  165. # reader object, data is tab delimited
  166. reader = csv.reader(input, delimiter='\t')
  167. for line in reader:
  168. #if same phrase as last, and should_keep
  169. if line[0] == previous_line_phrase and should_keep:
  170. total_count += int( line[-2] )
  171. # or its a first pass
  172. elif None == previous_line_phrase:
  173. should_keep = self.check_ngram(line[0])
  174. total_count = int( line[-2] )
  175. # or we have a new phrase, and maybe we
  176. # write the old phrase to file
  177. elif line[0] != previous_line_phrase:
  178. if should_keep:
  179. output.write(self.get_data_to_keep(
  180. previous_line_phrase,total_count))
  181. output.write("\n")
  182. should_keep = self.check_ngram(line[0])
  183. total_count = int( line[-2] )
  184. previous_line_phrase = line[0]
  185.  
  186. count += 1
  187. if count == 50000:
  188. break
  189.  
  190. def is_in_data_totidy(self, data, datatofind,part):
  191. data_all_part = [item[part] for item in data]
  192. try:
  193. position = data_all_part.index(datatofind)
  194. return position
  195. except ValueError:
  196. return -1
  197.  
  198. def has_pos_tags(self, line):
  199. for tag in self.google_pos_tags:
  200. if tag in line:
  201. return True
  202. return False
  203.  
  204. def get_tags(self,word):
  205. for tag in self.google_pos_tags:
  206. if tag in word:
  207. return tag
  208. return "_"
  209. def get_tags_list(self,wordlist):
  210. r = []
  211. for word in wordlist:
  212. r.append(self.pos_code[self.get_tags(word)])
  213. return r
  214.  
  215. def get_combo_data(self,line):
  216. pos_tags = self.has_pos_tags(line)
  217. line_split = line.split()
  218. words = self.remove_google_pos_tags(line).split()[0:5]
  219. tags = self.get_tags_list(line_split[0:5])
  220. count = int(line_split[5])
  221. other_tags = line_split[6:]
  222. data = [words+tags,pos_tags,count,other_tags,line]
  223. return data
  224.  
  225. # garbage, but it only takes a few minutes to run
  226. def tidy_files(self, inputfile, masteroutputfile, nonmasteroutputfile):
  227. counter = 0
  228. with open(inputfile,'r') as input,open(masteroutputfile,'w') as master,
  229. open(nonmasteroutputfile,'w') as nonmaster:
  230. for line in input:
  231. line_split = line.split()
  232. tags = self.get_tags_list(line_split[0:5])
  233. line_no_tags = self.remove_google_pos_tags_2(line)
  234. line_no_tags_split = line_no_tags.split()
  235. words_string = ' '.join(line_no_tags_split[0:5])
  236. tags_string = ' '.join(tags)
  237. length_string = ' '.join(line_no_tags_split[6:-1])
  238. master_string = self.pos_code[line_no_tags_split[-1]]
  239. count_string = line_no_tags_split[5]
  240. towrite = words_string + ' '
  241. towrite += tags_string + ' '
  242. towrite += length_string + ' '
  243. towrite += master_string + ' '
  244. towrite += count_string + '\n'
  245. if line_no_tags_split[-1] == '_MASTER':
  246. master.write(towrite)
  247. elif line_no_tags_split[-1] == '_NOTMASTER':
  248. nonmaster.write(towrite)
  249. counter += 1
  250. if counter == 5000:
  251. break
  252.  
  253. from files import fns (LIST_OF_FILENAMES raw google data .gz)
  254. in_root = PATH_TO_COMPRESSED_DATA
  255. out_root = PATH_TO_OUTPUT_FOLDER
  256.  
  257. parser = compressed_ngram_parser()
  258. for file in fns:
  259. in_file = in_root+file
  260. out_file = out_root+ file[0:-2]+"txt"
  261.  
  262. print in_file
  263. print out_file
  264. t1= time.time()
  265. parser.get_data( in_file, out_file )
  266.  
  267. t2= time.time()
  268. print fns[0]+ " " + str( (t2 - t1 ) / 60 )
  269.  
  270. # some tidying, and minimising data_string size
  271. in_root = PATH_TO_OUTPUT_FOLDER
  272. fns = os.listdir(in_root)
  273.  
  274. out_rootm = PATH_TO_OUTPUT_FOLDER_M
  275. out_rootn = PATH_TO_OUTPUT_FOLDER_N
  276.  
  277. t1= time.time()
  278. for file in fns:
  279.  
  280. inf = in_root + file
  281. outm = out_rootm + file[-6:-4]+ "_master.txt"
  282. outn = out_rootn + file[-6:-4]+ "_non_master.txt"
  283.  
  284. parser.tidy_files(inf,outm,outn)
  285. print "TIME = " + str( (time.time() - t1 ) / 60 )
RAW Paste Data