Guest User

Untitled

a guest
Mar 13th, 2018
90
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.31 KB | None | 0 0
  1. #! /usr/bin/env python
  2. # encoding: utf-8
  3. #
  4.  
  5. import sys
  6. import math
  7. import re
  8. from operator import attrgetter
  9.  
  10. from pdfminer.pdfparser import PDFParser
  11. from pdfminer.pdfdocument import PDFDocument
  12.  
  13. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  14. from pdfminer.converter import TextConverter, PDFPageAggregator
  15. from pdfminer.layout import LAParams
  16. from pdfminer.layout import LTTextContainer, LTTextBox, LTLine
  17. from pdfminer.pdfpage import PDFPage
  18.  
  19. DEBUG = False
  20. QUOTE = False
  21.  
  22. # magick number
  23. range_violation_col = (320,480)
  24. range_first_col = ( 35, 179 )
  25. range_column_last = (705, 790 )
  26. #column_count = 6 # 列の数
  27.  
  28.  
  29.  
  30. rsrcmgr = PDFResourceManager()
  31. laparams = LAParams()
  32.  
  33. # laparams.char_margin = 0.5
  34. laparams.word_margin = 0.07
  35.  
  36.  
  37. #laparams.detect_vertical = True
  38.  
  39.  
  40.  
  41.  
  42. def format_list_cell(node) :
  43. temp_str = node.get_text().rstrip("\n")
  44.  
  45. # 違反した条文を列挙しているカラムの処理
  46. if "条" in temp_str :
  47. src_str = re.sub(r'(?<=(条|\d))\n(?!(の|$))', ',', temp_str)
  48. else:
  49. src_str = temp_str.replace("\n",',')
  50.  
  51.  
  52. str = src_str.replace("\n",'')
  53. return str
  54.  
  55. def remove_returncode(node) :
  56. src_str = node.get_text()
  57.  
  58. if 'H' in src_str :
  59. src_str = re.sub(r'\n(?=H)', ',', src_str)
  60.  
  61.  
  62. str = src_str.replace("\n", '')
  63.  
  64.  
  65. if DEBUG :
  66. print(str, file=sys.stderr)
  67.  
  68. return str
  69.  
  70.  
  71.  
  72. def listup_horizontal_range(nodelist):
  73. horizontal_point_list = [ n.y0 for n in nodelist if (n.y0 == n.y1) ]
  74. if DEBUG :
  75. print(horizontal_point_list, file=sys.stderr)
  76.  
  77. return horizontal_point_list
  78.  
  79.  
  80. def parse_and_output(page, page_number) :
  81.  
  82. if page_number == debug_page :
  83. DEBUG = True
  84. else:
  85. DEBUG = False
  86.  
  87. cells = list()
  88.  
  89. lineobjlist = [ node for node in layout if issubclass(node.__class__, (LTLine) ) ]
  90.  
  91. hpoint_range = listup_horizontal_range(lineobjlist)
  92.  
  93.  
  94. if DEBUG:
  95. print(hpoint_range, file=sys.stderr)
  96.  
  97.  
  98. for node in layout:
  99. if ( not issubclass(node.__class__ ,(LTTextBox, LTTextContainer ) ) ):
  100. continue
  101. else:
  102.  
  103. temp_str = node.get_text().rstrip("\n")
  104. if temp_str.startswith("最終更新日") :
  105. last_modified_date = temp_str[6:]
  106. if DEBUG :
  107. print(last_modified_date, file=sys.stderr)
  108. continue
  109. if temp_str.endswith("労働局") :
  110. dept_labor = temp_str
  111. continue
  112. if temp_str.endswith("公表事案") :
  113. continue
  114.  
  115. cells.append(node)
  116.  
  117. filtered_cells = list(filter(lambda c: not(c.get_text().rstrip() in header_text) , cells))
  118. if DEBUG :
  119. print("filtered:" , filtered_cells, file=sys.stderr)
  120.  
  121. header_cells = list(filter(lambda c: c.get_text().rstrip() in header_text , cells))
  122.  
  123.  
  124. temp_cells = sorted(filtered_cells, key = attrgetter('x0'), reverse=False)
  125.  
  126. node_group = list()
  127.  
  128. range_list = [ i for i in zip(hpoint_range[1:], hpoint_range[2:]) ]
  129.  
  130. for r in range_list:
  131. temp = [ node for node in temp_cells if ( node.y0 < r[0] and node.y1 > r[1]) ]
  132.  
  133. node_group.append(temp)
  134.  
  135.  
  136. if DEBUG :
  137. for n in node_group :
  138. print(n,file=sys.stderr)
  139.  
  140.  
  141. columns = list()
  142.  
  143. for cells in node_group :
  144. for cell in cells :
  145.  
  146. if DEBUG:
  147. print(cell, file=sys.stderr)
  148.  
  149. center_x = ( cell.x0 + cell.x1 ) / 2.0
  150.  
  151. if ( center_x > range_first_col[0] and cell.x0 < range_first_col[1] ) :
  152. # 先頭
  153. col_str = remove_returncode(cell)
  154. if ( cell.x1 < range_first_col[1] and ' ' in col_str ) :
  155. col_str = re.sub(' +', ' ', col_str)
  156. print(col_str, file=sys.stderr)
  157.  
  158. columns.insert(0, col_str)
  159.  
  160. elif ( center_x > range_column_last[0] and center_x < range_column_last[1] ):
  161. # 最終カラム
  162. col_str = remove_returncode(cell)
  163.  
  164. columns.append(col_str)
  165.  
  166.  
  167. # 出力
  168.  
  169. # clean up
  170. purificated = list()
  171. for st in columns:
  172. if DEBUG:
  173. print(st, file=sys.stderr)
  174.  
  175. if st.endswith(" ") :
  176. st = st.rstrip(" ") # 末尾の全角空白を削除
  177.  
  178. if re.search(' ',st):
  179. # 半角空白の連続は全角スペースに
  180. st = st.replace(' ',"\u3000")
  181.  
  182. # 正常にパースできなかったセルを半角スペースで分割
  183. st = st.rstrip(' ')
  184. if (' ' in st ) :
  185. # temp = st.split(' ')
  186. if re.search('(都|道|府|県).+(市|町|村)' ,st) :
  187. if '条' in st :
  188. # カラムが4つ結合しているケース
  189. temp = st.rsplit(" ",3)
  190. elif 'H' in st :
  191. # カラムが3つ結合しているケース
  192. temp = st.rsplit(" ",2)
  193. else:
  194. # カラムが2つ結合しているケース
  195. temp = st.rsplit(" ",1)
  196. else:
  197. temp = re.split('(?<!)) ',st)
  198.  
  199. for ts in temp :
  200. purificated.append(ts)
  201. else:
  202. purificated.append(st)
  203.  
  204.  
  205. output = dept_labor + "\t" + last_modified_date + "\t" + "\t".join(purificated)
  206. if QUOTE:
  207. temp = list(map( lambda x: '"%s"' % x, output.split("\t") ))
  208. output = "\t".join(temp)
  209.  
  210. print(output)
  211. columns = list()
  212.  
  213.  
  214. else:
  215. # それ以外(先頭でも最終カラムでもないセル)
  216. if center_x > range_violation_col[0] and center_x < range_violation_col[1] :
  217. # セルの中身が複数行
  218. col_str = format_list_cell(cell)
  219. else:
  220. col_str = remove_returncode(cell)
  221.  
  222. columns.append(col_str)
  223.  
  224. #if __name__ == '__main__':
  225. # test()
  226.  
  227. device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  228.  
  229. debug_page = None
  230.  
  231. if DEBUG:
  232. print(sys.argv, file=sys.stderr)
  233.  
  234. if ( len(sys.argv) > 1 ):
  235. filename = sys.argv[1]
  236. else:
  237. print("Please specfy input pdf filename", file=sys.stderr)
  238. exit(-1)
  239.  
  240. # 処理するPDFを開く
  241. fp = open(filename, 'rb')
  242. interpreter = PDFPageInterpreter(rsrcmgr, device)
  243.  
  244.  
  245. parser = PDFParser(fp)
  246. document = PDFDocument(parser, '')
  247.  
  248. dept_labor = "" # 都道府県労働局
  249. last_modified_date = ""
  250.  
  251. header_text = ["企業・事業場名称", "所在地", "公表日", "違反法条", "事案概要", "その他参考事項" ]
  252.  
  253. for page in PDFPage.create_pages(document):
  254. interpreter.process_page(page)
  255. # page.contents
  256. layout = device.get_result()
  257.  
  258. if DEBUG :
  259. print(layout.pageid, file=sys.stderr)
  260.  
  261. # 1ページ目をスキップ
  262. page_number = layout.pageid
  263. if (layout.pageid == 1 ) :
  264. continue
  265.  
  266. parse_and_output(page, page_number)
  267.  
  268. fp.close()
  269. device.close()
Add Comment
Please, Sign In to add comment