# -*- coding: utf-8 -*-
from cpy.parser import Parser
from cpy.table import *
from xlrd import *
from xlrd.compdoc import *
from xlrd.book import *

# Explicit imports for names the star imports above may not provide.
import gc
import mmap
import pprint
import re
import sys
import time
import zipfile
from datetime import date

import xlrd
from xlrd import timemachine
from xlrd.xlsx import X12Book

parser = Parser()


class MyCompDoc(CompDoc):
    """CompDoc with the 'seen' corruption check in _locate_stream disabled,
    so that workbooks with slightly inconsistent sector chains still load."""

    def _locate_stream(self, mem, base, sat, sec_size, start_sid, expected_stream_size, qname, seen_id):
        s = start_sid
        if s < 0:
            raise CompDocError("_locate_stream: start_sid (%d) is -ve" % start_sid)
        p = -99  # dummy previous SID
        start_pos = -9999
        end_pos = -8888
        slices = []
        tot_found = 0
        found_limit = (expected_stream_size + sec_size - 1) // sec_size
        while s >= 0:
            # The upstream corruption check is intentionally disabled here:
            # if self.seen[s]:
            #     print("_locate_stream(%s): seen" % qname, file=self.logfile); dump_list(self.seen, 20, self.logfile)
            #     raise CompDocError("%s corruption: seen[%d] == %d" % (qname, s, self.seen[s]))
            self.seen[s] = seen_id
            tot_found += 1
            if tot_found > found_limit:
                raise CompDocError(
                    "%s: size exceeds expected %d bytes; corrupt?"
                    % (qname, found_limit * sec_size)
                )  # Note: expected size rounded up to a whole sector
            if s == p + 1:
                # contiguous sectors
                end_pos += sec_size
            else:
                # start a new slice
                if p >= 0:
                    # not the first time
                    slices.append((start_pos, end_pos))
                start_pos = base + s * sec_size
                end_pos = start_pos + sec_size
            p = s
            s = sat[s]
        assert s == EOCSID
        assert tot_found == found_limit
        if not slices:
            # The stream is contiguous ... just what we like!
            return (mem, start_pos, expected_stream_size)
        slices.append((start_pos, end_pos))
        # Fragmented stream: join the slices into one contiguous bytes object.
        return (b''.join(mem[start_pos:end_pos] for start_pos, end_pos in slices), 0, expected_stream_size)
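

# A minimal sketch of using the patched class directly; 'corrupt.xls' is a
# hypothetical path. locate_named_stream returns (mem, base, stream_length),
# as seen in _locate_stream above.
def _demo_locate_workbook(path='corrupt.xls'):
    with open(path, 'rb') as f:
        data = f.read()
    cd = MyCompDoc(data, logfile=sys.stdout)
    return cd.locate_named_stream(UNICODE_LITERAL('Workbook'))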


class MyBook(Book):
    """Book whose BIFF loader builds MyCompDoc instead of xlrd's CompDoc."""

    def __init__(self):
        super(MyBook, self).__init__()

    def biff2_8_load(self, filename=None, file_contents=None,
                     logfile=sys.stdout, verbosity=0, use_mmap=USE_MMAP,
                     encoding_override=None,
                     formatting_info=False,
                     on_demand=False,
                     ragged_rows=False,
                     ):
        self.logfile = logfile
        self.verbosity = verbosity
        self.use_mmap = use_mmap and MMAP_AVAILABLE
        self.encoding_override = encoding_override
        self.formatting_info = formatting_info
        self.on_demand = on_demand
        self.ragged_rows = ragged_rows

        if not file_contents:
            with open(filename, "rb") as f:
                f.seek(0, 2)  # EOF
                size = f.tell()
                f.seek(0, 0)  # BOF
                if size == 0:
                    raise XLRDError("File size is 0 bytes")
                if self.use_mmap:
                    self.filestr = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
                    self.stream_len = size
                else:
                    self.filestr = f.read()
                    self.stream_len = len(self.filestr)
        else:
            self.filestr = file_contents
            self.stream_len = len(file_contents)

        self.base = 0
        if self.filestr[:8] != SIGNATURE:
            # No OLE2 signature: treat the raw bytes as the workbook stream.
            self.mem = self.filestr
        else:
            cd = MyCompDoc(self.filestr, logfile=self.logfile)
            if USE_FANCY_CD:
                for qname in ['Workbook', 'Book']:
                    self.mem, self.base, self.stream_len = \
                        cd.locate_named_stream(UNICODE_LITERAL(qname))
                    if self.mem: break
                else:
                    raise XLRDError("Can't find workbook in OLE2 compound document")
            else:
                for qname in ['Workbook', 'Book']:
                    self.mem = cd.get_named_stream(UNICODE_LITERAL(qname))
                    if self.mem: break
                else:
                    raise XLRDError("Can't find workbook in OLE2 compound document")
                self.stream_len = len(self.mem)
            del cd
            if self.mem is not self.filestr:
                if hasattr(self.filestr, "close"):
                    self.filestr.close()
                self.filestr = b''
        self._position = self.base
        if DEBUG:
            print("mem: %s, base: %d, len: %d" % (type(self.mem), self.base, self.stream_len), file=self.logfile)
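

# Loading through MyBook directly (hypothetical 'legacy.xls'); this is what
# open_workbook_xls below does, minus the timing and BIFF-version checks.
def _demo_load_book(path='legacy.xls'):
    bk = MyBook()
    bk.biff2_8_load(filename=path)
    return bk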


def open_workbook_xls(filename=None,
                      logfile=sys.stdout, verbosity=0, use_mmap=USE_MMAP,
                      file_contents=None,
                      encoding_override=None,
                      formatting_info=False, on_demand=False, ragged_rows=False,
                      ):
    # Copy of xlrd's open_workbook_xls, except that it instantiates MyBook.
    t0 = time.clock()
    if TOGGLE_GC:
        orig_gc_enabled = gc.isenabled()
        if orig_gc_enabled:
            gc.disable()
    bk = MyBook()
    try:
        bk.biff2_8_load(
            filename=filename, file_contents=file_contents,
            logfile=logfile, verbosity=verbosity, use_mmap=use_mmap,
            encoding_override=encoding_override,
            formatting_info=formatting_info,
            on_demand=on_demand,
            ragged_rows=ragged_rows,
        )
        t1 = time.clock()
        bk.load_time_stage_1 = t1 - t0
        biff_version = bk.getbof(XL_WORKBOOK_GLOBALS)
        if not biff_version:
            raise XLRDError("Can't determine file's BIFF version")
        if biff_version not in SUPPORTED_VERSIONS:
            raise XLRDError(
                "BIFF version %s is not supported"
                % biff_text_from_num[biff_version]
            )
        bk.biff_version = biff_version
        if biff_version <= 40:
            # no workbook globals, only 1 worksheet
            if on_demand:
                fprintf(bk.logfile,
                        "*** WARNING: on_demand is not supported for this Excel version.\n"
                        "*** Setting on_demand to False.\n")
                bk.on_demand = on_demand = False
            bk.fake_globals_get_sheet()
        elif biff_version == 45:
            # worksheet(s) embedded in global stream
            bk.parse_globals()
            if on_demand:
                fprintf(bk.logfile,
                        "*** WARNING: on_demand is not supported for this Excel version.\n"
                        "*** Setting on_demand to False.\n")
                bk.on_demand = on_demand = False
        else:
            bk.parse_globals()
            bk._sheet_list = [None for sh in bk._sheet_names]
            if not on_demand:
                bk.get_sheets()
        bk.nsheets = len(bk._sheet_list)
        if biff_version == 45 and bk.nsheets > 1:
            fprintf(bk.logfile,
                    "*** WARNING: Excel 4.0 workbook (.XLW) file contains %d worksheets.\n"
                    "*** Book-level data will be that of the last worksheet.\n",
                    bk.nsheets
                    )
        if TOGGLE_GC:
            if orig_gc_enabled:
                gc.enable()
        t2 = time.clock()
        bk.load_time_stage_2 = t2 - t1
    except:
        bk.release_resources()
        raise
    # normal exit
    if not on_demand:
        bk.release_resources()
    return bk
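

# Usage sketch for the patched loader (hypothetical 'report.xls'). With
# on_demand=True, sheets are parsed lazily, so release_resources() should be
# called when done.
def _demo_open_xls(path='report.xls'):
    bk = open_workbook_xls(filename=path, on_demand=True)
    try:
        return bk.sheet_by_index(0).nrows
    finally:
        bk.release_resources()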


def open_workbook(filename=None,
                  logfile=sys.stdout,
                  verbosity=0,
                  use_mmap=USE_MMAP,
                  file_contents=None,
                  encoding_override=None,
                  formatting_info=False,
                  on_demand=False,
                  ragged_rows=False,
                  ):
    # Copy of xlrd.open_workbook that routes .xls files through the patched
    # loader above; .xlsx files still go to xlrd's own XML parser.
    peeksz = 4
    if file_contents:
        peek = file_contents[:peeksz]
    else:
        with open(filename, "rb") as f:
            peek = f.read(peeksz)
    if peek == b"PK\x03\x04":  # a ZIP file
        if file_contents:
            zf = zipfile.ZipFile(timemachine.BYTES_IO(file_contents))
        else:
            zf = zipfile.ZipFile(filename)

        # Workaround for some third-party files that use forward slashes and
        # lower-case names. We map the expected name in lower case to the
        # actual filename in the ZIP container.
        component_names = dict([(X12Book.convert_filename(name), name)
                                for name in zf.namelist()])

        if verbosity:
            logfile.write('ZIP component_names:\n')
            pprint.pprint(component_names, logfile)
        if 'xl/workbook.xml' in component_names:
            from xlrd import xlsx
            bk = xlsx.open_workbook_2007_xml(
                zf,
                component_names,
                logfile=logfile,
                verbosity=verbosity,
                use_mmap=use_mmap,
                formatting_info=formatting_info,
                on_demand=on_demand,
                ragged_rows=ragged_rows,
            )
            return bk
        if 'xl/workbook.bin' in component_names:
            raise XLRDError('Excel 2007 xlsb file; not supported')
        if 'content.xml' in component_names:
            raise XLRDError('Openoffice.org ODS file; not supported')
        raise XLRDError('ZIP file contents not a known type of workbook')

    bk = open_workbook_xls(
        filename=filename,
        logfile=logfile,
        verbosity=verbosity,
        use_mmap=use_mmap,
        file_contents=file_contents,
        encoding_override=encoding_override,
        formatting_info=formatting_info,
        on_demand=on_demand,
        ragged_rows=ragged_rows,
    )
    return bk
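

# The dispatch above keys off the first four bytes: b"PK\x03\x04" is the ZIP
# local-file-header magic (xlsx/ods containers); anything else falls through
# to the OLE2/BIFF path. A standalone sketch of that check:
def _demo_detect_format(path):
    with open(path, 'rb') as f:
        magic = f.read(4)
    return 'zip container' if magic == b"PK\x03\x04" else 'ole2/raw biff'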


class MyExcelTableRows(TableField):
    """Extract all cell values from a sheet as a list of rows of strings."""

    def __init__(self, formatting_info=True, populate_merged=True):
        self.formatting_info = formatting_info
        self.populate_merged = populate_merged

    def __call__(self, table):
        return self._extract_rows(table)

    def _extract_rows(self, table):
        filename = table.prdata['filename']

        book = open_workbook(
            filename,
            on_demand=True,
            formatting_info=self.formatting_info
        )

        table_identifier = table.prdata['table_identifier']
        sheet = book.sheet_by_index(table_identifier)

        table.prdata['sheet_name'] = sheet.name

        book_date_mode = book.datemode

        rows = []
        for row in range(sheet.nrows):
            row_list = []
            for col in range(sheet.ncols):
                try:
                    cell = sheet.cell(row, col)
                    value = cell.value
                    cell_type = cell.ctype
                except IndexError:
                    # Ragged rows can be shorter than sheet.ncols.
                    cell_type = XL_CELL_ERROR

                if cell_type == XL_CELL_ERROR:
                    value = ""
                elif cell_type == XL_CELL_DATE:
                    try:
                        date_value = xldate_as_tuple(value, book_date_mode)
                        try:
                            value = str(date(*date_value[:3]))
                        except ValueError:
                            pass
                    except xlrd.xldate.XLDateError:
                        pass

                if not isinstance(value, (str, unicode)):
                    # Numeric values: format with up to 15 significant digits,
                    # which trims float noise without losing precision Excel
                    # can represent.
                    value = "%0.15g" % value

                row_list.append(value)

            rows.append(row_list)

        if self.populate_merged:
            # Copy the top-left value of each merged range into every cell
            # the range covers.
            for crange in sheet.merged_cells:
                rlo, rhi, clo, chi = crange
                value = rows[rlo][clo]
                for rowx in xrange(rlo, rhi):
                    for colx in xrange(clo, chi):
                        rows[rowx][colx] = value

        return rows
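

# Merged-cell fill, in isolation: xlrd's sheet.merged_cells yields half-open
# (rlo, rhi, clo, chi) ranges where only the top-left cell carries the value;
# the loop above copies it across the range. For example,
# _demo_fill_merged([['A', ''], ['', '']], [(0, 2, 0, 2)]) returns
# [['A', 'A'], ['A', 'A']].
def _demo_fill_merged(rows, merged):
    for rlo, rhi, clo, chi in merged:
        value = rows[rlo][clo]
        for rowx in xrange(rlo, rhi):
            for colx in xrange(clo, chi):
                rows[rowx][colx] = value
    return rows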


class MyTableIdentifierExcel(TableField):
    """Return the indices of all sheets whose name matches table_descr."""

    def __init__(self, formatting_info=True, cleaner=lambda v: v):
        self.formatting_info = formatting_info
        self.cleaner = cleaner

    def __call__(self, table):
        filename = table.prdata['filename']
        table_descr = self.cleaner(table.prdata['table_descr'])
        table_regex = getre(table_descr, re.I | re.S | re.U)

        matched_sheets = []

        book = open_workbook(
            filename,
            on_demand=True,
            formatting_info=self.formatting_info
        )

        for index, name in enumerate(book.sheet_names()):
            if table_regex.search(name):
                matched_sheets.append(index)

        return matched_sheets
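

# Sheet matching in isolation (sheet names are hypothetical):
# _demo_match_sheets(['Laporan Harian', 'Summary'], r'Laporan') -> [0]
def _demo_match_sheets(sheet_names, pattern):
    rx = re.compile(pattern, re.I | re.U)
    return [i for i, name in enumerate(sheet_names) if rx.search(name)]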


@parser.source('file', r"Laporan\s*Harian")  # "Laporan Harian" = "Daily Report" (Indonesian)
class TableOne(AutoDateFinder.params(data_frequency='daily',
                                     priority_regexes=[r'(?P<day>\d{2})\W*(?P<month>\d{2})\W*(?P<year>(?:19|20)\d{2})']),
               Rows, ExcelTable):
    normalizer = CleanAllNonAlphaNumUnicode()

    duplicate_suffix = '_dummy'

    table_identifier = MyTableIdentifierExcel()

    @prdata
    def rows(self):
        rows = MyExcelTableRows()(self)

        # Many description cells are duplicated; suffix the repeats so that
        # downstream keys stay unique.
        found_descriptions = {}
        descr_col_index = None

        for row_idx, row in enumerate(rows):
            if descr_col_index is None:
                # "Provinsi" is Indonesian for "Province"; its header cell
                # marks the description column.
                match_descr_col = match_forward(regex=getre(r'Provinsi', re.I | re.U), iterable=row)

                if match_descr_col[0] is not None:
                    descr_col_index = match_descr_col[1]
                    continue
            else:
                orig_descr = row[descr_col_index]
                descr = self.normalizer(orig_descr)

                if not descr:
                    continue

                descr_count = found_descriptions.setdefault(descr, 0)
                descr_count += 1
                found_descriptions[descr] = descr_count

                if descr_count == 2:
                    rows[row_idx][descr_col_index] = orig_descr + self.duplicate_suffix
                elif descr_count > 2:
                    rows[row_idx][descr_col_index] = orig_descr + self.duplicate_suffix + str(descr_count - 1)

        return rows
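

# The dedup rule in TableOne.rows, in isolation: the second occurrence of a
# description gets '_dummy', the nth (n > 2) gets '_dummy' + str(n - 1).
# _demo_dedup(['a', 'a', 'a']) -> ['a', 'a_dummy', 'a_dummy2']
def _demo_dedup(values, suffix='_dummy'):
    counts = {}
    out = []
    for v in values:
        n = counts.setdefault(v, 0) + 1
        counts[v] = n
        if n == 1:
            out.append(v)
        elif n == 2:
            out.append(v + suffix)
        else:
            out.append(v + suffix + str(n - 1))
    return out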