# Untitled

import re

# Drop every line that contains Latin letters. The whole line is discarded
# rather than just the letters, because removing only the English words would
# leave a broken sentence behind.
pattern = re.compile(r'[a-zA-Z]+')
test_text = [line for line in test_text if pattern.search(line) is None]

# Collapse each run of digits into the single placeholder character "L".
pattern = re.compile(r'[0-9]+')
test_text = [pattern.sub("L", line) for line in test_text]

# Trim leading/trailing whitespace from every line, then delete every run of
# ASCII spaces that remains inside the line. A single strip is sufficient:
# str.strip() is idempotent, so repeating it cannot change the result.
pattern = re.compile(r' +')
test_text = [pattern.sub("", line.strip()) for line in test_text]

# Remove bracketed asides together with their brackets: full-width （…） first,
# then ASCII (...). Each pattern only matches a pair with no bracket inside it,
# so one pass removes the innermost level of nesting.
# Raw string literals are used so the regex escapes are not interpreted (and
# warned about) as Python string escapes.
for pattern in (
    re.compile(r"（[^（）]*）"),
    re.compile(r"\([^()]*\)"),
):
    test_text = [pattern.sub("", line) for line in test_text]

# Drop whole lines that are headings or captions rather than sentences:
#   - chapter titles:  start with 第 followed by a Chinese numeral
#   - figure captions: start with 图 and a number, e.g. "图1 金融市场"
#   - numbered items:  start with a number and a dot, e.g. "1.xxx"
# Numbers appear as the placeholder "L" at this point, hence the "L" in the
# last two patterns. Every pattern is anchored with ^, so search() only ever
# matches at the start of the line.
for pattern in (
    re.compile(r"^第[一二三四五六七八九十]"),
    re.compile(r"^图L"),
    re.compile(r"^L\."),
):
    test_text = [line for line in test_text if not pattern.search(line)]


# Repair sentences that were cut in two by page breaks in the source document.
# A line whose last character is a CJK ideograph (U+4E00-U+9FA5) is treated as
# unfinished, and the line that follows it is appended to it.
pattern = re.compile("[\u4e00-\u9fa5]")

# One pass joins a line with at most one following line (the merged line is
# not re-examined in the same pass), so several passes are run to repair
# sentences that were broken into more than two pieces.
deep_num = 2
for _ in range(deep_num):
    idx = 0
    # The list shrinks while it is scanned, so its length is re-read each step.
    while idx < len(test_text):
        text = test_text[idx]
        has_next = idx + 1 < len(test_text)
        # Empty lines are skipped; the last line has nothing to absorb.
        if text and has_next and pattern.match(text[-1]):
            # Merge in place: absorb the next line and remove it from the list.
            test_text[idx] = text + test_text.pop(idx + 1)
        idx += 1

# Remove what is left of unbalanced brackets, full-width first, then ASCII.
# For each bracket style, in order:
#   1. an opening bracket with no closing bracket after it
#      -> delete from that bracket to the end of the line
#   2. a closing bracket with no opening bracket before it
#      -> delete from the start of the line through that bracket
# Raw string literals keep the regex escapes away from Python's own
# string-escape processing.
for pattern in (
    re.compile(r"（[^）]*$"),
    re.compile(r"^[^（]*）"),
    re.compile(r"\([^\)]*$"),
    re.compile(r"^[^\(]*\)"),
):
    test_text = [pattern.sub("", line) for line in test_text]

# Clean up text damaged by the earlier removal of English and numbers: one or
# two Chinese characters stranded between two runs of punctuation are deleted
# together with the punctuation on both sides.
china_end = "！？｡，、"
punct_run = "[" + china_end + "]+"
pattern = re.compile(punct_run + r"[\u4e00-\u9fa5]{1,2}" + punct_run)
test_text = [pattern.sub("", line) for line in test_text]

# Strip annotation markers (the circled numbers ① through ⑩) and then every
# tab character from each line.
for pattern in (
    re.compile(r"[①②③④⑤⑥⑦⑧⑨⑩]"),
    re.compile(r"\t"),
):
    test_text = [pattern.sub("", line) for line in test_text]

# Remove what is left of unbalanced curly double quotes, in order:
#   1. a closing ” with no opening “ before it
#      -> delete from the start of the line through that quote
#   2. an opening “ with no closing ” after it
#      -> delete from that quote to the end of the line
for pattern in (
    re.compile("^[^“]*”"),
    re.compile("“[^”]*$"),
):
    test_text = [pattern.sub("", line) for line in test_text]
# Add the missing full stop: a non-empty line whose last character is a CJK
# ideograph (U+4E00-U+9FA5) absorbs the following line, if there is one, and
# then gets "。" appended.
# NOTE(review): the step is described as only adding a full stop, yet it also
# merges the next line into the current one. That merge is kept as-is here;
# confirm whether it is intended.
pattern = re.compile("[\u4e00-\u9fa5]")
idx = 0
# The list shrinks while it is scanned, so its length is re-read each step.
while idx < len(test_text):
    text = test_text[idx]
    if text and pattern.match(text[-1]):
        # The last line has no successor to absorb; it only gets the stop.
        if idx + 1 < len(test_text):
            text += test_text.pop(idx + 1)
        test_text[idx] = text + "。"
    idx += 1