Advertisement
Guest User

Untitled

a guest
Sep 4th, 2015
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.07 KB | None | 0 0
  1. # -*- coding: UTF-8 -*-
  2. import os
  3. import codecs
  4. import re
  5. import json
  6. import time
  7. import sys
  8. vowels = u"уеыаоэяиюё"
  9. delete_trash = re.compile(u'[…()",\.?!*:;\-—»«0123456789]')
  10. user_dictionary_dir = u"accent1.dic"
  11. user_dictionary_codec = "cp1251"
  12.  
  13. def write_data (path, data):
  14. json_data = json.dumps(data, ensure_ascii=False, indent=1)
  15. json_file = codecs.open (path, 'w', 'cp1251')
  16. json_file.write (json_data)
  17. json_file.close()
  18.  
  19. def read_data (path):
  20. data_file = codecs.open(path, 'r', 'utf-8')
  21. data = json.load(data_file)
  22. data_file.close()
  23. return data
  24.  
  25. def return_paths(name):
  26. return os.listdir(name)
  27.  
  28. def read_file(path):
  29. f = codecs.open(path, "r", "utf-8")
  30.  
  31.  
  32. def main():
  33.  
  34. open_dict = codecs.open(user_dictionary_dir, "r", user_dictionary_codec)
  35. dictionary = complect_dict(open_dict)
  36. open_dict.close()
  37. new_dictionary = u""
  38. scanned = {}
  39. interesting = {}
  40.  
  41. f_type, name = input_info()
  42. files = []
  43.  
  44. is_author = re.compile(u"<author.*?")
  45. is_div = re.compile(u"<div.*?")
  46. is_not_div = re.compile(u"</div>")
  47. is_title = re.compile(u"<title.*?")
  48. is_string = re.compile(u"[А-Яа-я]\t")
  49. is_tab = re.compile(u"\t")
  50.  
  51. author = re.compile(u"author=\".*\"?")
  52. nick = re.compile(u"nick=\".*\"?")
  53. s_id = re.compile(u'id="[0-9]*"?')
  54. url = re.compile(u"url=\".*\"?")
  55. title = re.compile(u"title=\".*\"?")
  56. quotes = re.compile(u"\".*\"?")
  57.  
  58. curr_author = u""
  59. curr_nick = u""
  60. curr_id = u""
  61. curr_url = u""
  62. curr_title = u""
  63.  
  64. direct_num = 1
  65. splitter = u""
  66. if f_type == u"dir":
  67. files = return_paths(name)
  68. splitter = u"\\"
  69. elif f_type == u"file":
  70. files = [u""]
  71. direct_num = len(files)
  72. counter = 0
  73. counted = 0
  74. start = time.time()
  75. print "Started at ", time.ctime(), '\n\n'
  76. for i in files:
  77. counter += 1
  78. counted += 1
  79. if counter == 100:
  80. counter = 0
  81. n_sec = ((time.time() - start)/counted)*(direct_num - counted)
  82. n_min = n_sec/60.0
  83. n_hr = n_min/60.0
  84. sys.stdout.write(str(n_hr) + ' hr ||'+ str(n_min) + ' min ||' + str(n_sec) + ' sec || >> ' + str(direct_num - counted) + ' files' + '\n')
  85.  
  86. text_file = codecs.open(name + splitter + i, "r", "utf-8")
  87. stanza_info = {}
  88. div = False
  89. for j in text_file:
  90. if is_author.search(j):
  91. curr_author = author.findall(j)[0].strip(u"\"")
  92. curr_nick = nick.findall(j)[0].strip(u"\"")
  93. elif is_div.search(j):
  94. div = True
  95. curr_id = quotes.findall(s_id.findall(j)[0])[0].strip(u"\"")
  96. curr_url = url.findall(j)[0].strip(u"\"")
  97. elif is_title.search(j):
  98. curr_title = title.findall(j)[0].strip(u"\"")
  99. elif is_not_div.search(j):
  100. div = False
  101. elif div and is_string.search(j):
  102. string = scan_string(j)
  103.  
  104.  
  105. for word in string:
  106. if word[0]:
  107. if word[0] not in scanned:
  108. scanned.update({word[0]:[[word[1]], [word[2]], [curr_id]]})
  109. else:
  110. if word[1] not in scanned[word[0]][0]:
  111. scanned[word[0]][0].append(word[1])
  112. scanned[word[0]][1].append(word[2])
  113. scanned[word[0]][2].append(curr_id)
  114.  
  115. print "\n\nSaving started at ", time.ctime(), "\n\n"
  116. for i in scanned:
  117. if len(scanned[i][0]) == 1:
  118. if i not in dictionary:
  119. new_dictionary += constr_dict_string(i, scanned[i])
  120. else:
  121. interesting.update({i:scanned[i]})
  122.  
  123.  
  124. write_data(name+u"_quest.json", interesting)
  125.  
  126. open_dict = codecs.open(user_dictionary_dir, "r", user_dictionary_codec)
  127. dictionary = open_dict.read()
  128. open_dict.close()
  129. res_dict = dictionary + new_dictionary
  130. open_dict = codecs.open(user_dictionary_dir, "w", user_dictionary_codec)
  131. open_dict.write(res_dict)
  132. open_dict.close()
  133.  
  134. print "Completed at ", time.ctime()
  135.  
  136. def constr_dict_string(word, info):
  137. res = u""
  138. #print info
  139. res = res + u"\r\n" + word + u"\t" + info[0][0] + info[1][0]
  140. return res
  141.  
  142. def input_info():
  143. print u"Инструкция!"
  144. print u"Введите путь к файлу или папке (используя \\)."
  145. print u"Если введен путь к папке, будет предпринята попытка обработать все файлы из нее."
  146. print u"Конец инструкции!\n"
  147. is_end = False
  148. new = input_path()
  149. if file_type(new) == u"file":
  150. return u"file", new
  151. elif file_type(new) == u"dir":
  152. return u"dir", new
  153. else: return False
  154.  
  155. def input_path():
  156. line = raw_input(u"Введите путь к файлу: ")#.decode('cp1251')
  157. #line = u"c:\\daniil\\stihi_ru_clean_m2"#u"test1"
  158. return line.lower()
  159. def file_type(path):
  160. split_path = path.split(u"\\")
  161. if u"." in split_path[-1]:
  162. return u"file"
  163. else:
  164. return u"dir"
  165.  
  166. def scan_string(string): ###Правильность
  167. string_info = []
  168. string = string.split(u"\t")[0]
  169. words = string.split()
  170. is_capital = None
  171. for i in words:
  172. word = clear_word(i)
  173. stress = stress_syll(word)
  174. if word and word[0] != word[0].lower():
  175. is_capital = u"!"
  176. else:
  177. is_capital = u""
  178. word = word.replace(u"`", u"")
  179. string_info.append([word.lower(), unicode(stress), is_capital])
  180. return string_info
  181.  
  182.  
  183. def clear_word(word):
  184. global symbs
  185. return delete_trash.sub(u"", word)
  186.  
  187.  
  188. def stress_syll(word):
  189. global vowels
  190. parts = word.split(u"`")
  191. if len(parts) < 2:
  192. return 0
  193. syll_num = 0
  194. for i in parts[0]:
  195. if i in vowels:
  196. syll_num += 1
  197. if not syll_num:
  198. return 1
  199. return syll_num
  200.  
  201. def complect_dict(path):
  202. dictionary = {}
  203. for i in path:
  204. is_upper = u""
  205. if len(i) > 2 and i[0] != u"#" and len(i.split()) > 1 and not u"(" in i and not u")" in i: #потому что информативная строка словаря содержит слово из >0 букв, таб и номер ударного слога
  206. parts = i.split()
  207. if u"!" in parts[1]:
  208. is_upper = u"!"
  209. dictionary.update({parts[0]:[parts[1].split(u","), is_upper]})
  210.  
  211. elif len(i) > 2 and i[0] != u"#" and len(i.split()) > 1 and u"(" in i and u")" in i:
  212. parts = i.split()
  213. if u"!" in parts[1]:
  214. is_upper = u"!"
  215. pr_string = parts[0].split(u"(")
  216. pr_string[1] = pr_string[1].replace(u")", u"")
  217. ends = pr_string[1].split(u"|")
  218. for j in ends:
  219. for k in j.split(u"/"):
  220. dictionary.update({pr_string[0]+k : [parts[1].split(u","), is_upper]})
  221. return dictionary
  222.  
  223. if __name__ == '__main__':
  224. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement