# Untitled

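"""
Compare the recognition results of two different OCR programs. Wherever the
two disagree, keep the variant whose Chinese characters are more frequent
overall according to a character-frequency table.
"""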

import difflib
import collections
import math

# Character-frequency table. The file is read as one string of characters in
# which more frequent characters come first, so a character's index in that
# string is its rank (smaller rank = more common). If a character occurs more
# than once, its last position wins. Characters missing from the table rank
# as +inf, i.e. "least common".
# The file is read inside a `with` block so the handle is closed right away
# instead of being left to the garbage collector.
with open("chinese_frequency.txt", encoding="utf-8") as _frequency_file:
    chinese_frequency = collections.defaultdict(
        lambda: math.inf,
        ((char, rank) for rank, char in enumerate(_frequency_file.read().strip())),
    )

def ishan(char):
    """Return True when *char* falls in the range U+4E00..U+9FFF.

    That range is the basic CJK Unified Ideographs block; the comparison is a
    plain string comparison against the two boundary characters.
    """
    lowest, highest = '\u4e00', '\u9fff'
    return not (char < lowest or char > highest)

def pick_one(a, b):
    """Choose which of two competing text fragments to keep.

    Decision order:
      1. If one fragment is empty, return the other one.
      2. If one fragment contains no Han characters, return the other one.
      3. Otherwise compare the average frequency rank of the Han characters
         in each fragment (looked up in ``chinese_frequency``; a lower rank
         means a more common character) and return ``a`` only when its
         average is strictly lower; ties go to ``b``.
    """
    # An empty candidate always loses to the other one.
    if "" in (a, b):
        return b if a == "" else a

    han_in_a = [ch for ch in a if ishan(ch)]
    han_in_b = [ch for ch in b if ishan(ch)]

    # A fragment without any Han characters loses to the other one.
    if not han_in_a:
        return b
    if not han_in_b:
        return a

    # Mean rank of the Han characters; lower means more common overall.
    mean_rank_a = sum(chinese_frequency[ch] for ch in han_in_a) / len(han_in_a)
    mean_rank_b = sum(chinese_frequency[ch] for ch in han_in_b) / len(han_in_b)

    if mean_rank_a < mean_rank_b:
        return a
    return b

# Recognition results from two different OCR programs.
# Each file is read inside a `with` block so its handle is closed immediately.
with open("ocr1.txt", encoding="utf-8") as _ocr1_file:
    ocr1 = _ocr1_file.read()
with open("ocr2.txt", encoding="utf-8") as _ocr2_file:
    ocr2 = _ocr2_file.read()

# Print both lengths (in characters) as a quick sanity check.
print(len(ocr1), len(ocr2))

# Align the two texts. The first argument is difflib's "isjunk" predicate:
# whitespace and the listed punctuation marks are treated as junk by the
# matcher.
matcher = difflib.SequenceMatcher(lambda x: x in " \t\n\r，。、^“”《》：〉，；", ocr1, ocr2)

pointer_a = 0
pointer_b = 0
matching_blocks = matcher.get_matching_blocks()
merged_parts = []  # collected pieces; joined once at the end
count = 0
# Walk every matching block in order. The text between the previous block and
# the current one is where the two OCR results disagree; pick one side for it,
# then append the agreed-upon text of the block itself.
# NOTE: the list must not be mutated while iterating over it. Popping from it
# inside the loop makes the loop stop after roughly half of the blocks.
for current_block in matching_blocks:
    mismatch_a = matcher.a[pointer_a:current_block.a]
    mismatch_b = matcher.b[pointer_b:current_block.b]
    chosen = pick_one(mismatch_a, mismatch_b)
    merged_parts.append(chosen)
    # Many differences are caused by whitespace only; do not report those.
    # An empty choice means both gaps were empty, i.e. no difference at all.
    if chosen and not chosen.isspace():
        count += 1
        print(count, mismatch_a, "|", mismatch_b, "|", chosen)
    merged_parts.append(matcher.a[current_block.a:current_block.a + current_block.size])
    pointer_a = current_block.a + current_block.size
    pointer_b = current_block.b + current_block.size

# Handle any trailing text after the last block. get_matching_blocks() is
# documented to end with a zero-length block at (len(a), len(b)), so the loop
# normally consumes the tails already; this guard is kept as a safety net.
if pointer_a != len(matcher.a) or pointer_b != len(matcher.b):
    merged_parts.append(pick_one(matcher.a[pointer_a:], matcher.b[pointer_b:]))

merged_result = "".join(merged_parts)

# Write the merged text to out.txt as UTF-8, overwriting any existing file.
with open("out.txt", "w", encoding="utf-8") as out_file:
    out_file.write(merged_result)