Advertisement
Guest User

Untitled

a guest
Mar 22nd, 2019
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.24 KB | None | 0 0
  1. """
  2. 比对两个不同OCR软件的识别结果,对于二者有不同的部分,根据汉字总体字频选择频率更高的
  3.  
  4. """
  5.  
  6. import difflib
  7. import collections
  8. import math
  9.  
  10. # 汉字字频文件,出现频率高的字排在前面
  11. chinese_frequency = collections.defaultdict(lambda: math.inf, [(val, idx) for idx, val in enumerate(open("chinese_frequency.txt", encoding="utf-8").read().strip())])
  12.  
  13. def ishan(char):
  14. return '\u4e00' <= char <= '\u9fff'
  15.  
  16. def pick_one(a, b):
  17. # 一个为空,一个不为空,要不为空的
  18. if (a == ""): return b
  19. if (b == ""): return a
  20. hana = list(filter(ishan, a))
  21. hanb = list(filter(ishan, b))
  22. # 一个有汉字一个没汉字,要有汉字的
  23. if (len(hana) == 0): return b
  24. if (len(hanb) == 0): return a
  25. av = sum(map(lambda x: chinese_frequency[x], hana)) / len(hana)
  26. bv = sum(map(lambda x: chinese_frequency[x], hanb)) / len(hanb)
  27. # 要总体汉字优先级高的
  28. return a if av < bv else b
  29.  
  30. # 两个不同软件的OCR识别结果
  31. ocr1 = open("ocr1.txt", encoding="utf-8").read()
  32. ocr2 = open("ocr2.txt", encoding="utf-8").read()
  33.  
  34. print(len(ocr1), len(ocr2))
  35.  
  36. matcher = difflib.SequenceMatcher(lambda x: x in " \t\n\r,。、^“”《》:〉,;", ocr1, ocr2)
  37.  
  38. pointer_a = 0
  39. pointer_b = 0
  40. matching_blocks = matcher.get_matching_blocks()
  41. merged_result = ""
  42. count = 0
  43. for current_block in matching_blocks:
  44. current_block = matching_blocks.pop(0)
  45. mismatch_a = matcher.a[pointer_a:current_block.a]
  46. mismatch_b = matcher.b[pointer_b:current_block.b]
  47. merged_result += pick_one(mismatch_a, mismatch_b)
  48. # 有很多差异是空格造成的
  49. if not str.isspace(pick_one(mismatch_a, mismatch_b)):
  50. count += 1
  51. print(count, mismatch_a, "|", mismatch_b, "|", pick_one(mismatch_a, mismatch_b))
  52. merged_result += matcher.a[current_block.a:current_block.a+current_block.size]
  53. pointer_a = current_block.a + current_block.size
  54. pointer_b = current_block.b + current_block.size
  55.  
  56. # 处理最后一个块(如果有的话)
  57. if (pointer_a != len(matcher.a) or pointer_b != len(matcher.b)):
  58. merged_result += pick_one(matcher.a[pointer_a:len(matcher.a)], matcher.b[pointer_b:len(matcher.b)])
  59.  
  60. with open("out.txt", "w", encoding="utf-8") as f:
  61. f.write(merged_result)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement