Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re

# Line-level filtering and normalisation of ``test_text`` (a list of strings
# defined earlier in the script).

# Drop every line that contains Latin letters.  Deleting only the letters
# would leave a broken sentence behind, so the whole line is discarded.
pattern = re.compile(r"[a-zA-Z]+")
test_text = [line for line in test_text if not pattern.search(line)]

# Replace each run of digits with the placeholder "L".  This has to happen
# after the Latin-letter filter above, otherwise the placeholder itself would
# cause the line to be dropped.
pattern = re.compile(r"[0-9]+")
test_text = [pattern.sub("L", line) for line in test_text]

# Trim leading and trailing whitespace from every line.
test_text = [line.strip() for line in test_text]

# Remove the spaces that remain inside each line.
pattern = re.compile(r" +")
test_text = [pattern.sub("", line) for line in test_text]

# Remove bracketed asides.  Both ASCII "()" and full-width brackets
# (U+FF08 / U+FF09, written as escapes so they survive width normalisation of
# this file) are handled.  The pattern only matches an innermost pair, so it
# is applied twice to also clear one level of nesting, as the original two
# consecutive passes did.
pattern = re.compile(r"[(\uff08][^()\uff08\uff09]*[)\uff09]")
for _ in range(2):
    test_text = [pattern.sub("", line) for line in test_text]

# Drop chapter headings: lines starting with "第" followed by a Chinese numeral.
pattern = re.compile(r"^第[一二三四五六七八九十]")
test_text = [line for line in test_text if not pattern.search(line)]

# Drop figure captions such as "图1 金融市场" (the digit is already "L" here).
pattern = re.compile(r"^图L")
test_text = [line for line in test_text if not pattern.search(line)]

# Drop numbered headings such as "1.xxx" (again with the digit replaced by "L").
pattern = re.compile(r"^L\.")
test_text = [line for line in test_text if not pattern.match(line)]
# Repair sentences that a page break in the source document split across
# lines: a non-empty line whose last character is a CJK ideograph (i.e. it has
# no closing punctuation) is joined with the line that follows it.
pattern = re.compile("[\u4e00-\u9fa5]")

# Each pass joins a line with at most one successor and does not re-examine
# the joined result, so a sentence split into more than two pieces needs
# several passes.
deep_num = 2
for _ in range(deep_num):
    index = 0
    # The last line has no successor, so the scan stops one short of the end.
    # (The original guard compared the index with len(test_text), which is
    # never equal, and raised IndexError when the final line ended in a CJK
    # character.)
    while index < len(test_text) - 1:
        text = test_text[index]
        if text and pattern.match(text[-1]):
            # Replace the pair (index, index + 1) with the joined line.
            test_text[index:index + 2] = [text + test_text[index + 1]]
        index += 1
# Clean up unbalanced brackets that are left on a line.
#
# Full-width (Chinese) brackets first.  They are written as \uff08 / \uff09
# escapes so the patterns stay valid even if this file is width-normalised;
# with the brackets folded to ASCII the second pattern became "^[^(]*)",
# which fails to compile (unbalanced parenthesis).

# Opening bracket that is never closed: delete from it to the end of the line.
pattern = re.compile(r"\uff08[^\uff09]*$")
test_text = [pattern.sub("", line) for line in test_text]
# Closing bracket with no opening bracket before it: delete from the start of
# the line through that closing bracket.
pattern = re.compile(r"^[^\uff08]*\uff09")
test_text = [pattern.sub("", line) for line in test_text]

# ASCII (English) parentheses, same two rules.

# Opening parenthesis that is never closed.
pattern = re.compile(r"\([^)]*$")
test_text = [pattern.sub("", line) for line in test_text]
# Closing parenthesis with no opening parenthesis before it.
pattern = re.compile(r"^[^(]*\)")
test_text = [pattern.sub("", line) for line in test_text]
# Punctuation marks used to delimit the short fragments removed below.
china_end = "!?。,、"

# Substitutions applied, in this order, to every line; each match is deleted.
cleanup_patterns = (
    # Text errors caused by the earlier removal of English and digits: one or
    # two CJK characters enclosed by punctuation on both sides, removed
    # together with that punctuation.
    re.compile(rf"[{china_end}]+[\u4e00-\u9fa5]{{1,2}}[{china_end}]+"),
    # Circled-number annotation markers.
    re.compile(r"[①②③④⑤⑥⑦⑧⑨⑩]"),
    # Tab characters.
    re.compile(r"\t"),
    # Closing double quote with no opening quote before it: delete from the
    # start of the line through the closing quote.
    re.compile("^[^“]*”"),
    # Opening double quote that is never closed: delete from it to the end of
    # the line.
    re.compile("“[^”]*$"),
)
for pattern in cleanup_patterns:
    test_text = [pattern.sub("", line) for line in test_text]
# Fill in missing full stops.  A non-empty line that still ends in a CJK
# ideograph is joined with the line after it (when there is one) and then
# closed with "。".
pattern = re.compile("[\u4e00-\u9fa5]")
# Sentence-ending marks (full-width ones as escapes: U+FF01 "!", U+FF1F "?").
# A joined line that already ends with one of these is not given a second one.
sentence_end = ("。", "\uff01", "\uff1f", "!", "?")

index = 0
while index < len(test_text):
    text = test_text[index]
    if text and pattern.match(text[-1]):
        # Only join when a following line exists; the original indexed
        # test_text[id + 1] unconditionally and raised IndexError on the
        # final line.
        if index + 1 < len(test_text):
            text += test_text.pop(index + 1)
        if not text.endswith(sentence_end):
            text += "。"
        test_text[index] = text
    index += 1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement