Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- 比对两个不同OCR软件的识别结果,对于二者有不同的部分,根据汉字总体字频选择频率更高的
- """
- import difflib
- import collections
- import math
# Chinese character frequency table. The file lists characters with the most
# frequent ones first, so each character's position in the file is used as its
# rank (a lower rank means a more common character). Characters that are not in
# the file fall back to math.inf, i.e. they rank below every listed character.
# The file is read inside a `with` block so the handle is closed right away.
with open("chinese_frequency.txt", encoding="utf-8") as frequency_file:
    chinese_frequency = collections.defaultdict(
        lambda: math.inf,
        ((char, rank) for rank, char in enumerate(frequency_file.read().strip())),
    )
def ishan(char):
    """Return True if *char* falls in the range U+4E00..U+9FFF (inclusive).

    That range is the basic CJK Unified Ideographs block, so this is used as
    the "is this a Chinese character" check.
    """
    lowest, highest = '\u4e00', '\u9fff'
    # Anything sorting below the first ideograph is rejected immediately.
    if char < lowest:
        return False
    return char <= highest
def pick_one(a, b):
    """Choose between two candidate readings of the same text span.

    Rules, applied in order:
      1. If one candidate is the empty string, return the other one.
      2. If one candidate contains no Chinese characters, return the other one.
      3. Otherwise compare the average frequency rank of the Chinese
         characters in each candidate and return the one with the lower
         (more common) average; on a tie, ``b`` is returned.

    Args:
        a: Candidate text from the first source.
        b: Candidate text from the second source.

    Returns:
        Either ``a`` or ``b``, unchanged.
    """
    # Rule 1: an empty candidate always loses.
    if a == "":
        return b
    if b == "":
        return a

    # Rule 2: a candidate without any Chinese characters loses.
    han_in_a = [ch for ch in a if ishan(ch)]
    han_in_b = [ch for ch in b if ishan(ch)]
    if not han_in_a:
        return b
    if not han_in_b:
        return a

    def mean_rank(chars):
        # Average position in the frequency table; unknown characters
        # contribute the table's default value.
        return sum(chinese_frequency[ch] for ch in chars) / len(chars)

    # Rule 3: prefer the candidate whose characters are more common overall.
    rank_a = mean_rank(han_in_a)
    rank_b = mean_rank(han_in_b)
    if rank_a < rank_b:
        return a
    return b
# Load the recognition results produced by two different OCR programs.
# `with` blocks make sure both input files are closed after reading.
with open("ocr1.txt", encoding="utf-8") as ocr1_file:
    ocr1 = ocr1_file.read()
with open("ocr2.txt", encoding="utf-8") as ocr2_file:
    ocr2 = ocr2_file.read()
print(len(ocr1), len(ocr2))

# Align the two texts. The first argument is difflib's "isjunk" predicate:
# whitespace and the listed punctuation marks are treated as junk elements.
# NOTE(review): autojunk is left at its default; per the difflib docs that
# heuristic can also treat very frequent characters as junk on long
# sequences -- confirm this is acceptable for the texts being merged.
matcher = difflib.SequenceMatcher(lambda x: x in " \t\n\r,。、^“”《》:〉,;", ocr1, ocr2)
pointer_a = 0
pointer_b = 0
matching_blocks = matcher.get_matching_blocks()
merged_parts = []  # collected pieces, joined once at the end
count = 0

# Walk every matching block in order. The list must not be mutated while it is
# being iterated (e.g. by popping from it), otherwise blocks get skipped and
# the unprocessed remainder of both texts ends up decided by a single choice.
for current_block in matching_blocks:
    # Text between the previous matching block and this one differs between
    # the two sources; keep whichever candidate pick_one prefers.
    mismatch_a = matcher.a[pointer_a:current_block.a]
    mismatch_b = matcher.b[pointer_b:current_block.b]
    chosen = pick_one(mismatch_a, mismatch_b)
    merged_parts.append(chosen)
    # Many differences are caused purely by whitespace; do not report those.
    # An empty choice means both gaps were empty, i.e. there was no difference.
    if chosen and not chosen.isspace():
        count += 1
        print(count, mismatch_a, "|", mismatch_b, "|", chosen)
    # The matching block itself is identical in both texts; copy it from a.
    merged_parts.append(matcher.a[current_block.a:current_block.a + current_block.size])
    pointer_a = current_block.a + current_block.size
    pointer_b = current_block.b + current_block.size

# No separate tail handling is needed: get_matching_blocks() always ends with a
# dummy block (len(a), len(b), 0), so the loop above has already merged any
# trailing mismatch and both pointers now sit at the end of their texts.
merged_result = "".join(merged_parts)
with open("out.txt", "w", encoding="utf-8") as f:
    f.write(merged_result)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement