Advertisement
Guest User

Untitled

a guest
Jul 21st, 2019
109
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.19 KB | None | 0 0
  import re

  # Text-cleaning pipeline for a list of Chinese sentences.
  #
  # Input:  ``test_text`` -- a list of str that must already be defined when
  #         this script runs (where it comes from is not shown here).
  # Output: ``test_text`` rebound to the cleaned list of str.

  # Drop every line that contains Latin letters. Removing only the letters
  # would leave a broken sentence behind, so the whole line is discarded.
  latin_re = re.compile(r'[a-zA-Z]+')
  test_text = [line for line in test_text if not latin_re.search(line)]

  # Replace every run of digits with the placeholder "L".
  digits_re = re.compile(r'[0-9]+')
  test_text = [digits_re.sub("L", line) for line in test_text]

  # Trim leading and trailing whitespace from every line.
  test_text = [line.strip() for line in test_text]

  # Remove the spaces that remain inside each line.
  spaces_re = re.compile(r' +')
  test_text = [spaces_re.sub("", line) for line in test_text]

  # Remove parenthesised asides. The substitution only matches innermost
  # pairs, so it is applied twice to also clear one level of nesting,
  # e.g. "(a(b)c)".
  # NOTE(review): only ASCII parentheses are handled here -- confirm whether
  # full-width （） should be removed as well.
  paren_re = re.compile(r"\([^()]*\)")
  for _ in range(2):
      test_text = [paren_re.sub("", line) for line in test_text]

  # Drop chapter headings: lines starting with "第" plus a Chinese numeral.
  chapter_re = re.compile("^第[一二三四五六七八九十]")
  test_text = [line for line in test_text if not chapter_re.search(line)]

  # Drop figure captions such as "图1 ..." (the digits are already "L").
  figure_re = re.compile("^图L")
  test_text = [line for line in test_text if not figure_re.search(line)]

  # Drop numbered headings such as "1.xxx" (the digits are already "L").
  numbered_re = re.compile(r"^L\.")
  test_text = [line for line in test_text if not numbered_re.match(line)]

  # One character in the range U+4E00-U+9FA5 (Chinese ideographs).
  cjk_re = re.compile("[\u4e00-\u9fa5]")

  # Repair sentences that were split in two by page breaks in the source
  # document: a line whose last character is a Chinese character is joined
  # with the line that follows it. A joined line is not re-examined within
  # the same pass, so ``deep_num`` passes are made.
  deep_num = 2
  for _ in range(deep_num):
      merged = []
      idx = 0
      while idx < len(test_text):
          line = test_text[idx]
          has_next = idx + 1 < len(test_text)
          if line and has_next and cjk_re.match(line[-1]):
              # Consume the following line as the tail of this one.
              merged.append(line + test_text[idx + 1])
              idx += 2
          else:
              merged.append(line)
              idx += 1
      test_text = merged

  # Clean up unbalanced parentheses -- full-width variant.
  # An opening parenthesis that is never closed: drop it and the rest.
  open_fw_re = re.compile("（[^）]*$")
  test_text = [open_fw_re.sub("", line) for line in test_text]

  # A closing parenthesis with no opening one before it: drop everything up
  # to and including it.
  close_fw_re = re.compile("^[^（]*）")
  test_text = [close_fw_re.sub("", line) for line in test_text]

  # Clean up unbalanced parentheses -- ASCII variant.
  # An opening parenthesis that is never closed: drop it and the rest.
  open_ascii_re = re.compile(r"\([^\)]*$")
  test_text = [open_ascii_re.sub("", line) for line in test_text]

  # A closing parenthesis with no opening one before it: drop everything up
  # to and including it.
  close_ascii_re = re.compile(r"^[^\(]*\)")
  test_text = [close_ascii_re.sub("", line) for line in test_text]

  # Remove debris left by deleting English text and digits: one or two
  # Chinese characters stranded between runs of punctuation (the surrounding
  # punctuation is removed together with them).
  china_end = "!?。,、"
  debris_re = re.compile(
      r"[" + china_end + r"]+[\u4e00-\u9fa5]{1,2}[" + china_end + "]+"
  )
  test_text = [debris_re.sub("", line) for line in test_text]

  # Remove circled-number annotation markers.
  marker_re = re.compile(r"[①②③④⑤⑥⑦⑧⑨⑩]")
  test_text = [marker_re.sub("", line) for line in test_text]

  # Remove tab characters.
  tab_re = re.compile(r"\t")
  test_text = [tab_re.sub("", line) for line in test_text]

  # A closing double quote with no opening one before it: drop everything up
  # to and including it.
  close_quote_re = re.compile("^[^“]*”")
  test_text = [close_quote_re.sub("", line) for line in test_text]

  # An opening double quote that is never closed: drop it and the rest.
  open_quote_re = re.compile("“[^”]*$")
  test_text = [open_quote_re.sub("", line) for line in test_text]

  # Fill in missing full stops: a line that still ends with a Chinese
  # character is joined with the following line (when there is one) and
  # terminated with "。".
  # NOTE(review): "。" is appended even when the joined-in line already ends
  # with punctuation -- confirm that this is intended.
  filled = []
  idx = 0
  while idx < len(test_text):
      line = test_text[idx]
      if line and cjk_re.match(line[-1]):
          if idx + 1 < len(test_text):
              # Consume the following line as the tail of this one.
              line += test_text[idx + 1]
              idx += 1
          line += "。"
      filled.append(line)
      idx += 1
  test_text = filled
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement