Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #! /usr/bin/env python
- # encoding: utf-8
- #
- import sys
- import math
- import re
- from operator import attrgetter
- from pdfminer.pdfparser import PDFParser
- from pdfminer.pdfdocument import PDFDocument
- from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
- from pdfminer.converter import TextConverter, PDFPageAggregator
- from pdfminer.layout import LAParams
- from pdfminer.layout import LTTextContainer, LTTextBox, LTLine
- from pdfminer.pdfpage import PDFPage
# Runtime switches: DEBUG gates diagnostic prints to stderr, QUOTE wraps
# every output field in double quotes.
DEBUG = False
QUOTE = False

# Magic numbers: x-coordinate ranges used to decide which table column a
# text cell belongs to (violated-articles column, first column, last column).
range_violation_col = (320, 480)
range_first_col = (35, 179)
range_column_last = (705, 790)
# column_count = 6  # number of columns

# Shared pdfminer objects. Only word_margin is overridden; char_margin and
# detect_vertical are deliberately left at their defaults.
rsrcmgr = PDFResourceManager()
laparams = LAParams(word_margin=0.07)
def format_list_cell(node):
    """Flatten a multi-line cell into a single comma-separated string.

    Args:
        node: Any object whose ``get_text()`` returns the cell text.

    Returns:
        The text with trailing newlines dropped. When the text contains
        "条", a newline that directly follows "条" or a digit becomes ","
        unless the next line starts with "の"; every remaining newline is
        removed so wrapped items are re-joined. When the text does not
        contain "条", every newline becomes ",".
    """
    text = node.get_text().rstrip("\n")
    # Handling for the column that enumerates the violated articles.
    if "条" in text:
        # Only a break after "条"/digit ends an item; a following "の"
        # means the article number continues on the next line.
        text = re.sub(r'(?<=(条|\d))\n(?!(の|$))', ',', text)
        return text.replace("\n", '')
    return text.replace("\n", ',')
def remove_returncode(node):
    """Return the node's text collapsed onto a single line.

    A newline immediately followed by "H" is replaced with "," so that
    entries beginning with "H" stay separated (presumably era-style dates
    such as "H29..." -- confirm against the input PDFs). All other newlines
    are removed.

    Args:
        node: Any object whose ``get_text()`` returns the cell text.

    Returns:
        The single-line text. When the module-level DEBUG flag is truthy
        the result is also printed to stderr.
    """
    text = node.get_text()
    if 'H' in text:
        text = re.sub(r'\n(?=H)', ',', text)
    text = text.replace("\n", '')
    if DEBUG:
        print(text, file=sys.stderr)
    return text
def listup_horizontal_range(nodelist):
    """Collect the y coordinate of every horizontal line in *nodelist*.

    A node is treated as horizontal when its ``y0`` equals its ``y1``.
    The input order is preserved. When the module-level DEBUG flag is
    truthy the collected values are printed to stderr.
    """
    y_positions = []
    for line in nodelist:
        if line.y0 == line.y1:
            y_positions.append(line.y0)
    if DEBUG:
        print(y_positions, file=sys.stderr)
    return y_positions
def _split_joined_columns(st):
    """Split a cell string in which several columns were extracted as one."""
    if ' ' not in st:
        return [st]
    if re.search('(都|道|府|県).+(市|町|村)', st):
        # The text contains an address, which may itself contain spaces,
        # so split from the right and leave the left part intact.
        if '条' in st:
            # Four columns merged into one string.
            return st.rsplit(" ", 3)
        if 'H' in st:
            # Three columns merged into one string.
            return st.rsplit(" ", 2)
        # Two columns merged into one string.
        return st.rsplit(" ", 1)
    # Split on a space unless it directly follows a closing parenthesis
    # (ASCII or full-width). The previous pattern '(?<!)) ' was not a valid
    # regular expression and raised re.error.
    return re.split(r'(?<![)）]) ', st)


def _clean_columns(columns, debug=False):
    """Normalize whitespace in each column and expand merged cells."""
    cleaned = []
    for st in columns:
        if debug:
            print(st, file=sys.stderr)
        # Drop trailing half-width and full-width spaces.
        st = st.rstrip(" \u3000")
        # A run of consecutive half-width spaces becomes one full-width
        # space; single spaces are kept because they mark the boundary
        # between columns that were extracted as one cell.
        # NOTE(review): the earlier code replaced every single space, which
        # made the split below unreachable -- confirm the intended rule.
        st = re.sub(r' {2,}', "\u3000", st)
        # Split cells that could not be parsed properly on half-width spaces.
        cleaned.extend(_split_joined_columns(st))
    return cleaned


def parse_and_output(page, page_number):
    """Print one tab-separated line per table row of the current page.

    The page content is read from the module-level ``layout`` (the result
    of the aggregator device for the page just processed), not from *page*.
    Text cells are grouped into rows using the horizontal lines found in
    the layout, assigned to columns by x position, and a line is printed to
    stdout each time a cell of the last column is reached.

    Module-level names read: ``layout``, ``debug_page``, ``header_text``,
    ``QUOTE`` and the ``range_*`` column ranges. Module-level names updated:
    ``dept_labor`` and ``last_modified_date``.

    Args:
        page: The page object being processed (currently unused).
        page_number: Page number; diagnostics are printed to stderr when it
            equals ``debug_page``.
    """
    # Both values are captured from page headers and must survive across
    # pages. Without this declaration the assignments below made them
    # function locals, so a page lacking either header raised
    # UnboundLocalError when the output line was built.
    global dept_labor, last_modified_date

    # Deliberately a local flag: the module-level DEBUG read by the helper
    # functions is not modified here.
    debug = page_number == debug_page

    hpoint_range = listup_horizontal_range(
        [node for node in layout if isinstance(node, LTLine)]
    )
    if debug:
        print(hpoint_range, file=sys.stderr)

    cells = []
    for node in layout:
        if not isinstance(node, (LTTextBox, LTTextContainer)):
            continue
        temp_str = node.get_text().rstrip("\n")
        if temp_str.startswith("最終更新日"):
            # Skip the five-character label plus the one character after it.
            last_modified_date = temp_str[6:]
            if debug:
                print(last_modified_date, file=sys.stderr)
            continue
        if temp_str.endswith("労働局"):
            dept_labor = temp_str
            continue
        if temp_str.endswith("公表事案"):
            continue
        cells.append(node)

    # Drop the table header cells, then order the rest left to right.
    filtered_cells = [
        c for c in cells if c.get_text().rstrip() not in header_text
    ]
    if debug:
        print("filtered:", filtered_cells, file=sys.stderr)
    temp_cells = sorted(filtered_cells, key=attrgetter('x0'))

    # One group of cells per pair of neighbouring horizontal lines. The
    # first line is skipped (presumably it bounds the header row -- confirm).
    # Assumes the lines are listed top to bottom, i.e. first > second.
    node_group = [
        [node for node in temp_cells if node.y0 < first and node.y1 > second]
        for first, second in zip(hpoint_range[1:], hpoint_range[2:])
    ]
    if debug:
        for group in node_group:
            print(group, file=sys.stderr)

    columns = []
    for row_cells in node_group:
        for cell in row_cells:
            if debug:
                print(cell, file=sys.stderr)
            center_x = (cell.x0 + cell.x1) / 2.0
            if center_x > range_first_col[0] and cell.x0 < range_first_col[1]:
                # First column.
                col_str = remove_returncode(cell)
                if cell.x1 < range_first_col[1] and ' ' in col_str:
                    # Cell lies entirely inside the first column: collapse
                    # runs of spaces and report the result on stderr.
                    col_str = re.sub(' +', ' ', col_str)
                    print(col_str, file=sys.stderr)
                columns.insert(0, col_str)
            elif range_column_last[0] < center_x < range_column_last[1]:
                # Last column: the row is complete, so clean up and emit it.
                columns.append(remove_returncode(cell))
                purificated = _clean_columns(columns, debug)
                output = (dept_labor + "\t" + last_modified_date + "\t"
                          + "\t".join(purificated))
                if QUOTE:
                    output = "\t".join(
                        '"%s"' % field for field in output.split("\t")
                    )
                print(output)
                columns = []
            else:
                # Any other cell (neither first nor last column).
                if range_violation_col[0] < center_x < range_violation_col[1]:
                    # The cell content spans multiple lines.
                    col_str = format_list_cell(cell)
                else:
                    col_str = remove_returncode(cell)
                columns.append(col_str)
- #if __name__ == '__main__':
- # test()
# Script entry: convert the PDF named on the command line to tab-separated
# lines on stdout. Kept as flat module-level code because parse_and_output
# reads `layout`, `debug_page` and `header_text` as module globals.
debug_page = None  # page number for which parse_and_output prints diagnostics
if DEBUG:
    print(sys.argv, file=sys.stderr)
if len(sys.argv) <= 1:
    print("Please specfy input pdf filename", file=sys.stderr)
    # sys.exit instead of the site-provided exit(), which is not guaranteed
    # to exist (e.g. when Python runs with -S).
    sys.exit(-1)
filename = sys.argv[1]

dept_labor = ""  # name of the prefectural labour bureau
last_modified_date = ""
header_text = ["企業・事業場名称", "所在地", "公表日", "違反法条", "事案概要", "その他参考事項" ]

device = PDFPageAggregator(rsrcmgr, laparams=laparams)
try:
    # Open the PDF to process; the file is closed even if parsing fails.
    with open(filename, 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        parser = PDFParser(fp)
        document = PDFDocument(parser, '')
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            if DEBUG:
                print(layout.pageid, file=sys.stderr)
            page_number = layout.pageid
            # Skip the first page.
            if page_number == 1:
                continue
            parse_and_output(page, page_number)
finally:
    device.close()
Add Comment
Please, Sign In to add comment