SHARE
TWEET

Untitled

a guest May 7th, 2017 70 Never
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
  1. #!/usr/bin/env python
  2. # example_python_n-gram_parser.py
  3. # Not as tidy as it might be,
  4. # but you should get the general idea.
  5. # Define:
  6. # master_words, a list of master words to check n-grams against
  7. # LIST_OF_5GRAMS_FROM_LP
  8. # LIST_OF_FILENAMES (google data)
  9. # PATH_TO_COMPRESSED_DATA
  10. # PATH_TO_OUTPUT_FOLDER
  11. import gzip
  12. import time
  13. import csv
  14. import os
  15. import time
  16. from master_words import LIST_OF_FILENAMES
  17. from LP_ngrams import LIST_OF_5GRAMS_FROM_LP   
  18.  
  19.  
  20. class get_rune_length_code():
  21.     def __init__(self):
  22.         self.latin_fragments = ['F', 'U', 'TH', 'O', 'R', 'C', 'G', 'W', 'H', 'N', 'I',
  23.                                 'J', 'EO', 'P', 'X', 'S', 'T', 'B', 'E', 'M', 'L', 'NG',
  24.                                 'OE', 'D', 'A', 'AE', 'Y', 'IA', 'EA']
  25.         self.period      = "."
  26.         self.quote       = "\""
  27.         self.apostrophe  = "'"
  28.         self.colon       = ":"
  29.         self.semicolon   = ";"
  30.         self.comma       = ","
  31.         self.exclamation = "!"
  32.         self.question    = "?"
  33.         self.bigram = ['TH', 'EO', 'NG', 'OE', 'AE', 'IA', 'IO', 'EA']
  34.         self.trgram = 'ING'
  35.  
  36.     def translate_to_gematria(self,word):# thanks to 'solvers
  37.         res = []
  38.         skip = 0
  39.         WORD = word.upper()
  40.         for i, val in enumerate(WORD):
  41.             if skip:
  42.                 skip -= 1
  43.                 continue
  44.             if WORD[i:i+3] == self.trgram:
  45.                 res.append(self.trgram)
  46.                 skip += 2
  47.                 continue
  48.             if WORD[i:i+2] in self.bigram:
  49.                 res.append(WORD[i:i+2])
  50.                 skip += 1
  51.                 continue
  52.             res.append(val)
  53.         return res
  54.    
  55.     def get_code(self,string): #meh
  56.         if string == self.period:
  57.             return self.period
  58.         elif string == self.question:      
  59.             return self.period
  60.         elif string == self.exclamation:       
  61.             return self.period
  62.         elif string == self.quote:
  63.             return self.quote
  64.         elif string == self.apostrophe:
  65.             return self.apostrophe
  66.         elif string == self.colon:
  67.             return self.comma
  68.         elif string == self.semicolon:
  69.             return self.comma      
  70.         elif string == self.comma:     
  71.             return self.comma
  72.         elif string.isalpha():
  73.             return len(self.translate_to_gematria(string))
  74.         else:
  75.             return -4
  76.        
  77.     def get_code_list(self,strings):# code is the 5-grams word lengths of string is,  
  78.         r = []                      # and is checked against LP 5-grams
  79.         for string in strings:
  80.             a = self.get_code(string)
  81.             r.append(a)
  82.             if a == -4:
  83.                 return False
  84.         return r
  85.  
  86. class compressed_ngram_parser():
  87.     def __init__(self):
  88.         self.to_rune_latin = get_rune_length_code()
  89.         # "_.", "_END_" are because punctuation
  90.         # and _end_ is superflous assumming there is always a . (?)
  91.         self.google_pos_tags = ["_NOUN","_VERB","_ADJ","_ADV","_PRON",
  92.         "_DET","_ADP","_NUM","_CONJ","_PRT","_X","_.", "_END_"]        
  93.         self.google_pos_word ={"_NOUN_","_VERB_","_ADJ_","_ADV_",
  94.         "_PRON_","_DET_","_ADP_","_NUM_","_CONJ_","_PRT_",  "_ROOT_","_X_"}
  95.         self.punctuation = [",", "_", ":", "!" ,"_",
  96.         "\"", "'", "?","-", ")","(", "[","]"]
  97.         self.master_words_set = set(master_words)
  98.         self.master_words_set.update( self.google_pos_word)
  99.         self.words_to_check = []
  100.         self.code = None
  101.         self.phrases = None
  102.         self.valid_characters =
  103.         set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,.\";:' ?!")
  104.         self.pos_code = {"_NOUN":"N","_VERB":"V","_ADJ":"J","_ADV":"D","_PRON":"P",
  105.         "_DET" :"E","_ADP" :"A","_NUM" :"U","_CONJ":"C","_PRT" :"R","_X":"X",
  106.         "_.":".","_MASTER":"M","_NOTMASTER":"S","_":"_"}
  107.        
  108.     def remove_google_pos_tags(self, word):
  109.         for tag in self.google_pos_tags:
  110.             word = word.replace(tag,'')
  111.         return word.lower()    
  112.  
  113.     def remove_google_pos_tags_2(self, word):
  114.         for tag in self.google_pos_tags:
  115.             word = word.replace(tag,'')
  116.         return word
  117.        
  118.     def string_has_valid_characters(self, string):
  119.         return set(string) <= self.valid_characters
  120.  
  121.     def string_has_google_pos_word(self, string):
  122.         return any(a in string for a in self.google_pos_word)
  123.  
  124.     def is_valid_string(self, string):
  125.         if self.string_has_google_pos_word(string):
  126.             return False
  127.         else:
  128.             return self.string_has_valid_characters(string)
  129.            
  130.     def check_ngram(self, n_gram_phrase):
  131.         global grams5
  132.         phrase = self.remove_google_pos_tags(n_gram_phrase)
  133.         if self.is_valid_string(phrase):
  134.             self.phrase = phrase.split()
  135.             self.code =  self.to_rune_latin.get_code_list(self.phrase)
  136.             if self.code == False:
  137.                 return False
  138.             else:
  139.                 return self.code in grams5
  140.         else:
  141.             return False
  142.    
  143.     def get_data_to_keep(self, previous_line_phrase, total_count):
  144.         phrases = previous_line_phrase.split()
  145.         to_write = previous_line_phrase + " " + str(total_count) + " "
  146.         to_write += ' '.join(str(e) for e in self.code)
  147.         if self.are_in_master_words_set(self.phrase):
  148.             to_write += ' _MASTER '
  149.         else:
  150.             to_write += ' _NOTMASTER '
  151.         return to_write
  152.  
  153.     def are_in_master_words_set(self, words):
  154.         lower_words = [x.lower() for x in words]
  155.         return set(lower_words) < self.master_words_set
  156.        
  157.     def get_data(self, ngram_file, output):# main loop function
  158.         count = 0
  159.         total_count = 0
  160.         previous_line_phrase = None
  161.         should_keep = False
  162.         with open(output, 'w') as output:
  163.             # open compressed file for reading line by line
  164.             with gzip.open(ngram_file, 'rb') as input:     
  165.                 # reader object, data is tab delimited
  166.                 reader = csv.reader(input, delimiter='\t')  
  167.                 for line in reader:
  168.                     #if same phrase as last, and should_keep
  169.                     if line[0] == previous_line_phrase and should_keep:
  170.                         total_count += int( line[-2] )
  171.                     # or its a first pass
  172.                     elif None == previous_line_phrase:
  173.                         should_keep = self.check_ngram(line[0])
  174.                         total_count = int( line[-2] )
  175.                     # or we have a new phrase, and maybe we
  176.                     # write the old phrase to file
  177.                     elif line[0] != previous_line_phrase:
  178.                         if should_keep:
  179.                             output.write(self.get_data_to_keep(
  180.                             previous_line_phrase,total_count))
  181.                             output.write("\n")
  182.                         should_keep = self.check_ngram(line[0])
  183.                         total_count = int( line[-2] )
  184.                     previous_line_phrase = line[0]
  185.                    
  186.                     count += 1
  187.                     if count == 50000:
  188.                         break
  189.  
  190.     def is_in_data_totidy(self, data, datatofind,part):
  191.         data_all_part = [item[part] for item in data]
  192.         try:
  193.             position = data_all_part.index(datatofind)
  194.             return position
  195.         except ValueError:
  196.             return -1
  197.    
  198.     def has_pos_tags(self, line):
  199.         for tag in self.google_pos_tags:
  200.             if tag in line:
  201.                 return True
  202.         return False
  203.    
  204.     def get_tags(self,word):
  205.         for tag in self.google_pos_tags:
  206.             if tag in word:
  207.                 return tag
  208.         return "_"
  209.     def get_tags_list(self,wordlist):
  210.         r = []
  211.         for word in wordlist:
  212.             r.append(self.pos_code[self.get_tags(word)])
  213.         return r
  214.    
  215.     def get_combo_data(self,line):
  216.         pos_tags = self.has_pos_tags(line)
  217.         line_split = line.split()
  218.         words = self.remove_google_pos_tags(line).split()[0:5]
  219.         tags  = self.get_tags_list(line_split[0:5])
  220.         count = int(line_split[5])
  221.         other_tags = line_split[6:]
  222.         data = [words+tags,pos_tags,count,other_tags,line]
  223.         return data
  224.        
  225.     # garbage, but it only takes a few minutes to run  
  226.     def tidy_files(self, inputfile, masteroutputfile, nonmasteroutputfile):
  227.         counter = 0
  228.         with open(inputfile,'r') as input,open(masteroutputfile,'w') as master,
  229.         open(nonmasteroutputfile,'w') as nonmaster:
  230.             for line in input:
  231.                 line_split = line.split()
  232.                 tags  = self.get_tags_list(line_split[0:5])
  233.                 line_no_tags  = self.remove_google_pos_tags_2(line)
  234.                 line_no_tags_split = line_no_tags.split()
  235.                 words_string = ' '.join(line_no_tags_split[0:5])
  236.                 tags_string  = ' '.join(tags)
  237.                 length_string = ' '.join(line_no_tags_split[6:-1])
  238.                 master_string = self.pos_code[line_no_tags_split[-1]]
  239.                 count_string = line_no_tags_split[5]
  240.                 towrite = words_string + ' '
  241.                 towrite += tags_string + ' '
  242.                 towrite += length_string + ' '
  243.                 towrite += master_string + ' '
  244.                 towrite += count_string + '\n'
  245.                 if line_no_tags_split[-1] == '_MASTER':
  246.                     master.write(towrite)
  247.                 elif line_no_tags_split[-1] == '_NOTMASTER':
  248.                     nonmaster.write(towrite)
  249.                 counter += 1
  250.                 if counter == 5000:
  251.                     break
  252.  
  253. from files import fns (LIST_OF_FILENAMES raw google data .gz)
  254. in_root = PATH_TO_COMPRESSED_DATA
  255. out_root = PATH_TO_OUTPUT_FOLDER
  256.  
  257. parser = compressed_ngram_parser()
  258. for file in fns:
  259.     in_file = in_root+file
  260.     out_file = out_root+ file[0:-2]+"txt"
  261.  
  262.     print in_file
  263.     print out_file
  264.     t1= time.time()
  265.     parser.get_data( in_file, out_file )
  266.  
  267.     t2= time.time()
  268.     print fns[0]+ " " + str( (t2 - t1 ) / 60 )
  269.  
  270. # some tidying, and minimising data_string size
  271. in_root  = PATH_TO_OUTPUT_FOLDER
  272. fns = os.listdir(in_root)
  273.  
  274. out_rootm = PATH_TO_OUTPUT_FOLDER_M
  275. out_rootn = PATH_TO_OUTPUT_FOLDER_N
  276.  
  277. t1= time.time()
  278. for file in fns:
  279.    
  280.     inf = in_root + file
  281.     outm = out_rootm + file[-6:-4]+ "_master.txt"
  282.     outn = out_rootn + file[-6:-4]+ "_non_master.txt"
  283.    
  284.     parser.tidy_files(inf,outm,outn)
  285. print "TIME = " + str( (time.time() - t1 ) / 60 )
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top