Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python3
import argparse
import json
import os.path
import re
import struct
import sys
from binascii import hexlify
# Number of bytes consumed by each header parser.  The section header lengths
# include the leading 32-bit entry count of the section body.
MSBT_HEADER_LEN = 0x20
LBL1_HEADER_LEN = 0x14
ATR1_HEADER_LEN = 0x14
TXT2_HEADER_LEN = 0x14

# Magic bytes identifying the file and each known section.
MSBT_MAGIC = b'MsgStdBn'
LBL1_MAGIC = b'LBL1'
ATR1_MAGIC = b'ATR1'
TXT2_MAGIC = b'TXT2'

# struct formats.  The file header is read with native byte order ('=');
# the section formats take the byte-order prefix ('<' or '>') via '%s'.
MSBT_HEADER_STRUCT = '=8s2H2B2HI10s'
LBL1_HEADER_STRUCT = '%s4sI8sI'
ATR1_HEADER_STRUCT = '%s4s4I'
TXT2_HEADER_STRUCT = '%s4s4I'

# Filler byte written after a section up to the next 16-byte boundary.
SECTION_END_MAGIC = b'\xAB'
# Byte sequence after which the next four bytes are treated as a colour value.
COLOR_ESCAPE = b'\x03\x00\x04\x00'

# Encoding codes stored in the file header and their display names.
ENCODING_UTF8 = 0x00
ENCODING_UTF16 = 0x01
ENCODINGS = {
    ENCODING_UTF8: "UTF-8",
    ENCODING_UTF16: "UTF-16",
}
class Msbt:
    """Parse, edit and rebuild MsgStdBn (MSBT) message files (Python 3).

    An instance is filled either by :meth:`read` (from a binary MSBT file) or
    by :meth:`from_json` (from a JSON document written by :meth:`to_json`) and
    can then be written out again with :meth:`to_json` or :meth:`save`.

    Parsed state:

    * ``order`` -- ``'<'`` or ``'>'``, the struct byte-order prefix selected
      from the header's byte-order marker.
    * ``sections`` -- maps a section name (``'LBL1'``, ``'ATR1'``, ``'TXT2'``)
      to a dict with a ``'header'`` entry and, for LBL1 and TXT2, a ``'data'``
      entry.
    * ``section_order`` -- section names in the order they occur in the file.
    * ``header_unknowns`` -- the four undeciphered MSBT header fields, kept so
      that they can be written back unchanged.

    Raw byte fields (label names, unknown blobs, section magics) are stored as
    Latin-1 text: that mapping is lossless, so the state is JSON-serialisable
    and still round-trips byte-for-byte.

    NOTE: the file header is (un)packed with ``MSBT_HEADER_STRUCT``, which
    uses native byte order.  Its multi-byte fields are carried through
    unchanged and therefore round-trip, but for a file whose byte order
    differs from the host's the *displayed* values (e.g. section count) are
    byte-swapped.  Only the file size is handled with the file's own order.

    NOTE: text is always treated as UTF-16; the header's encoding code is
    stored and written back but never used to pick a codec.
    """

    # Part of a section header common to every section: 4-byte magic,
    # 4-byte size and 8 further bytes.  The section body follows it.
    _SECTION_HEADER_LEN = 0x10
    # Offset of the 32-bit file size inside the MSBT header.
    _FILE_SIZE_OFFSET = 0x12
    # Textual colour tag produced/consumed when ``colors`` is enabled.
    _COLOR_TAG = re.compile(r'\[#([a-fA-F0-9]{8})\]')

    order = None
    invalid = False
    filename = ''
    file_size = 0
    section_count = 0
    encoding = ENCODING_UTF16

    def __init__(self, verbose=False, debug=False, colors=False):
        """Create an empty container.

        :param verbose: stored on the instance; not consulted by this class.
        :param debug: print the parsed/serialised header fields.
        :param colors: translate colour escapes to/from ``[#rrggbbaa]`` tags.
        """
        self.verbose = verbose
        self.debug = debug
        self.colors = colors
        # Mutable state is per instance; as class attributes these containers
        # would be shared by every Msbt object.
        self.header_unknowns = []
        self.sections = {}
        self.section_order = []

    # ------------------------------------------------------------------
    # small helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _to_text(value):
        """Return bytes as Latin-1 text; pass text through unchanged."""
        if isinstance(value, (bytes, bytearray)):
            return bytes(value).decode('latin-1')
        return value

    @staticmethod
    def _to_bytes(value):
        """Return Latin-1 text as bytes; pass bytes through unchanged."""
        if isinstance(value, (bytes, bytearray)):
            return bytes(value)
        return value.encode('latin-1')

    def _utf16_codec(self):
        """Return the UTF-16 codec name matching ``self.order``."""
        return {'<': 'utf-16-le', '>': 'utf-16-be'}.get(self.order, 'utf-16')

    def _unpack_section_header(self, name, struct_format, expected_magic, data):
        """Unpack a section header, or flag the file invalid and return None."""
        fmt = struct_format % self.order
        if len(data) != struct.calcsize(fmt):
            print('Truncated %s header: got %d bytes' % (name, len(data)))
            self.invalid = True
            return None
        fields = struct.unpack(fmt, data)
        if fields[0] != expected_magic:
            print('Invalid %s magic bytes: %s (expected %s)' % (name, fields[0], expected_magic))
            self.invalid = True
            return None
        return fields

    def _read_section(self, data, position, name, header_len, parse_header, parse_data):
        """Parse one known section starting at *position*.

        Returns the position just past the section body.  When the header is
        rejected ``self.invalid`` is set and the body is not touched.
        """
        parse_header(data[position:position + header_len])
        position += header_len
        if self.invalid:
            return position
        size = self.sections[name]['header']['size']
        if parse_data is not None:
            parse_data(data[position:position + size])
        return position + size

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------

    def read(self, filename):
        """Load and parse the MSBT file *filename*.

        On a malformed header ``self.invalid`` is set and parsing stops; the
        caller is expected to check that flag.
        """
        self.filename = filename
        with open(filename, 'rb') as handle:
            data = handle.read()
        self.file_size = len(data)

        # start from a clean slate so an instance can be reused
        self.invalid = False
        self.sections = {}
        self.section_order = []

        self._parse_header(data[:MSBT_HEADER_LEN])
        if self.invalid:
            return

        position = MSBT_HEADER_LEN
        sections_left = self.section_count
        while sections_left > 0 and position < self.file_size:
            magic = data[position:position + 4]
            if magic == LBL1_MAGIC:
                position = self._read_section(data, position, 'LBL1', LBL1_HEADER_LEN,
                                              self._parse_lbl1_header, self._parse_lbl1_data)
            elif magic == ATR1_MAGIC:
                # TODO: parse ATR1 data?
                position = self._read_section(data, position, 'ATR1', ATR1_HEADER_LEN,
                                              self._parse_atr1_header, None)
            elif magic == TXT2_MAGIC:
                position = self._read_section(data, position, 'TXT2', TXT2_HEADER_LEN,
                                              self._parse_txt2_header, self._parse_txt2_data)
            # TODO:
            # elif magic == NLI1_MAGIC:
            else:
                if len(data) - position < 8:
                    print('Truncated section header at offset 0x%x' % position)
                    self.invalid = True
                    return
                size = struct.unpack_from('%sI' % self.order, data, position + 4)[0]
                # The known sections span 0x10 header bytes plus `size` body
                # bytes (0x14 + (size - 4)); skip unknown ones the same way.
                position += self._SECTION_HEADER_LEN + size
                if self.debug:
                    print('\nUnknown section skipped')
                    print('Unknown section Magic bytes\n: %s' % magic)
            if self.invalid:
                return

            sections_left -= 1
            self.section_order.append(self._to_text(magic))

            # skip the filler bytes up to the next section; slicing keeps the
            # comparison bytes-to-bytes and is safe at end of file
            while data[position:position + 1] == SECTION_END_MAGIC:
                position += 1

    def save(self, filename):
        """Serialise the current state to the MSBT file *filename*.

        :raises KeyError: if ``section_order`` names a section that cannot be
            serialised (only LBL1, ATR1 and TXT2 are supported).
        """
        serializers = {
            'LBL1': self._serialize_lbl1,
            'ATR1': self._serialize_atr1,
            'TXT2': self._serialize_txt2,
        }
        section_names = [self._to_text(name) for name in self.section_order]
        # fail before the output file is created/truncated
        unsupported = [name for name in section_names if name not in serializers]
        if unsupported:
            raise KeyError('Cannot serialize unknown section(s): %s' % ', '.join(unsupported))

        bom = {'>': 0xFFFE, '<': 0xFEFF}.get(self.order, 0)
        unknown4 = self._to_bytes(self.header_unknowns[3])

        if self.debug:
            print('\nMSBT Magic: %s' % MSBT_MAGIC)
            print('MSBT Byte-order marker: 0x%x' % bom)
            print('MSBT Unknown1: 0x%x' % self.header_unknowns[0])
            print('MSBT Encoding: %d (%s)' % (self.encoding, ENCODINGS[self.encoding]))
            print('MSBT Unknown2: 0x%x' % self.header_unknowns[1])
            print('MSBT Sections: %d' % self.section_count)
            print('MSBT Unknown3: 0x%x' % self.header_unknowns[2])
            print('MSBT File size: (unknown)')
            print('MSBT Unknown4: 0x%s\n' % hexlify(unknown4).decode('ascii'))

        with open(filename, 'wb') as output:
            # the file size is not known yet; write 0 and patch it below
            output.write(struct.pack(MSBT_HEADER_STRUCT, MSBT_MAGIC, bom, self.header_unknowns[0],
                                     self.encoding, self.header_unknowns[1], self.section_count,
                                     self.header_unknowns[2], 0, unknown4))

            for name in section_names:
                output.write(serializers[name]())
                # write the section end bytes until the next 0x10 alignment
                padding = (-output.tell()) % 16
                output.write(SECTION_END_MAGIC * padding)

            # update the size in the header with the final size, using the
            # file's byte order rather than the host's
            size = output.tell()
            output.seek(self._FILE_SIZE_OFFSET)
            output.write(struct.pack('%sI' % self.order, size))

        print('\nMSBT File size: %d' % size)

    def to_json(self, filename):
        """Write strings and structural metadata to the JSON file *filename*.

        With an LBL1 section, ``strings`` maps each label name to its TXT2
        entry; without one it is the plain list of TXT2 entries.
        """
        output = {
            'strings': {},
            'structure': {}
        }
        texts = self.sections['TXT2']['data']

        if 'LBL1' in self.sections:
            for label_list in self.sections['LBL1']['data']:
                for id_, name in label_list[0]:
                    output['strings'][name] = texts[id_]
        else:
            output['strings'] = texts

        output['structure']['MSBT'] = {
            'header': {
                'byte_order': self.order,
                'encoding': ENCODINGS[self.encoding],
                'sections': self.section_count,
                'section_order': self.section_order,
                'unknowns': self.header_unknowns
            }
        }
        for section in self.sections:
            output['structure'][section] = {
                'header': self.sections[section]['header']
            }

        if 'LBL1' in self.sections:
            output['structure']['LBL1']['lists'] = self.sections['LBL1']['data']
        else:
            # kept from the original behaviour: label-less files dump the raw
            # section table instead (this drops the 'MSBT' header entry)
            output['structure'] = self.sections

        with open(filename, 'w', encoding='utf-8') as handle:
            json.dump(output, handle, indent=2, sort_keys=True, ensure_ascii=False)

    def from_json(self, filename):
        """Load state from a JSON document produced by :meth:`to_json`.

        The document must contain LBL1 and TXT2 structure entries; ATR1 is
        taken over when present.

        :raises ValueError: if the stored encoding name is not known.
        """
        with open(filename, 'r', encoding='utf-8') as handle:
            json_data = json.load(handle)

        strings = json_data['strings']
        structure = json_data['structure']
        lbl1 = structure['LBL1']

        sections = {
            'LBL1': {
                'header': lbl1['header'],
                'data': lbl1['lists']
            },
            'TXT2': {
                'header': structure['TXT2']['header'],
                # one slot per string; filled in through the label ids below
                'data': [''] * len(strings)
            }
        }
        if 'ATR1' in structure:
            sections['ATR1'] = {
                'header': structure['ATR1']['header']
            }
        self.sections = sections

        msbt_header = structure['MSBT']['header']
        self.order = msbt_header['byte_order']
        self.section_order = msbt_header['section_order']
        self.section_count = msbt_header['sections']
        self.header_unknowns = msbt_header['unknowns']

        # the JSON stores the encoding by name; map it back to its code
        stored_encoding = msbt_header['encoding']
        if stored_encoding in ENCODINGS:
            self.encoding = stored_encoding
        else:
            for code, name in ENCODINGS.items():
                if name == stored_encoding:
                    self.encoding = code
                    break
            else:
                raise ValueError('Unknown encoding in JSON document: %r' % (stored_encoding,))

        texts = self.sections['TXT2']['data']
        for label_list in self.sections['LBL1']['data']:
            for label in label_list[0]:
                id_ = label[0]
                name = label[1]
                texts[id_] = strings[name]

    # ------------------------------------------------------------------
    # parsing
    # ------------------------------------------------------------------

    def _parse_header(self, data):
        """Parse the MSBT file header; sets ``self.invalid`` on failure."""
        if len(data) != MSBT_HEADER_LEN:
            print('Invalid header length: %d (expected %d)' % (len(data), MSBT_HEADER_LEN))
            self.invalid = True
            return

        (magic, bom, unknown1, encoding, unknown2,
         sections, unknown3, file_size, unknown4) = struct.unpack(MSBT_HEADER_STRUCT, data)

        if magic != MSBT_MAGIC:
            print('Invalid header magic bytes: %s (expected %s)' % (magic, MSBT_MAGIC))
            self.invalid = True
            return

        # reset first so a stale value from an earlier read cannot hide a bad marker
        self.order = {0xFFFE: '>', 0xFEFF: '<'}.get(bom)
        if self.order is None:
            print('Invalid byte-order marker: 0x%x (expected either 0xFFFE or 0xFEFF)' % bom)
            self.invalid = True
            return

        # re-read the size with the file's own byte order
        file_size = struct.unpack_from('%sI' % self.order, data, self._FILE_SIZE_OFFSET)[0]
        if file_size != self.file_size:
            print('Invalid file size reported: %d (OS reports %d)' % (file_size, self.file_size))

        self.section_count = sections
        self.encoding = encoding

        # save for repacking
        self.header_unknowns = [
            unknown1,
            unknown2,
            unknown3,
            self._to_text(unknown4)
        ]

        if self.debug:
            print('MSBT Magic bytes: %s' % magic)
            print('MSBT Byte-order: %s' % self.order)
            print('MSBT Sections: %d' % sections)
            print('MSBT File size: %s' % file_size)
            print('\nUnknown1: 0x%x' % unknown1)
            print('Unknown2: 0x%x' % unknown2)
            print('Unknown3: 0x%x' % unknown3)
            print('Unknown4: 0x%s\n' % hexlify(unknown4).decode('ascii'))

    def _parse_lbl1_header(self, data):
        """Parse the LBL1 section header (including its entry count)."""
        fields = self._unpack_section_header('LBL1', LBL1_HEADER_STRUCT, LBL1_MAGIC, data)
        if fields is None:
            return
        magic, size, unknown, entries = fields

        # -4 from size since we're reading the entries as part of the header
        self.sections['LBL1'] = {
            'header': {
                'size': size - 4,
                'entries': entries,
                'unknown': self._to_text(unknown)
            }
        }

        if self.debug:
            print('LBL1 Magic bytes: %s' % magic)
            print('LBL1 Size: %d' % size)
            print('LBL1 Entries: %d' % entries)
            print('\nLBL1 Unknown: 0x%s\n' % hexlify(unknown).decode('ascii'))

    def _parse_lbl1_data(self, data):
        """Parse the LBL1 body into a list of ``(labels, end_offset)`` tuples.

        The body starts with one ``(count, offset)`` pair per entry; each pair
        points at ``count`` consecutive labels stored as a length byte, the
        name bytes and a 32-bit string id.  ``labels`` is a list of
        ``(id, name)`` tuples.
        """
        entries = self.sections['LBL1']['header']['entries']
        lists = []

        if self.debug:
            print('\nLBL1 Entries:')

        for entry in range(entries):
            count, offset = struct.unpack_from('%s2I' % self.order, data, entry * 8)

            if self.debug:
                print('\n#%d' % (entry + 1))
                print('List length: %d' % count)
                print('First offset: 0x%x' % offset)

            # stored offsets count from the entry count word, which the
            # header parser already consumed
            offset -= 4
            labels = []
            for _ in range(count):
                length = struct.unpack_from('B', data, offset)[0]
                name_end = offset + 1 + length
                name = self._to_text(data[offset + 1:name_end])
                id_ = struct.unpack_from('%sI' % self.order, data, name_end)[0]
                labels.append((id_, name))
                offset = name_end + 4

                if self.debug:
                    print('  %d: %s' % (id_, name))

            lists.append((labels, offset))

            if self.debug:
                print('')

        self.sections['LBL1']['data'] = lists

    def _parse_atr1_header(self, data):
        """Parse the ATR1 section header (including its entry count)."""
        fields = self._unpack_section_header('ATR1', ATR1_HEADER_STRUCT, ATR1_MAGIC, data)
        if fields is None:
            return
        magic, size, unknown1, unknown2, entries = fields

        # -4 from size since we're reading the entries as part of the header
        self.sections['ATR1'] = {
            'header': {
                'size': size - 4,
                'entries': entries,
                'unknown1': unknown1,
                'unknown2': unknown2
            }
        }

        if self.debug:
            print('ATR1 Magic bytes: %s' % magic)
            print('ATR1 Size: %d' % size)
            print('ATR1 Entries: %d' % entries)
            print('\nATR1 Unknown1: 0x%x' % unknown1)
            print('ATR1 Unknown2: 0x%x\n' % unknown2)

    def _parse_txt2_header(self, data):
        """Parse the TXT2 section header (including its entry count)."""
        fields = self._unpack_section_header('TXT2', TXT2_HEADER_STRUCT, TXT2_MAGIC, data)
        if fields is None:
            return
        magic, size, unknown1, unknown2, entries = fields

        # -4 from size since we're reading the entries as part of the header
        self.sections['TXT2'] = {
            'header': {
                'size': size - 4,
                'entries': entries,
                'unknown1': unknown1,
                'unknown2': unknown2
            }
        }

        if self.debug:
            print('TXT2 Magic bytes: %s' % magic)
            print('TXT2 Size: %d' % size)
            print('TXT2 Entries: %d' % entries)
            print('\nTXT2 Unknown1: 0x%x' % unknown1)
            print('TXT2 Unknown2: 0x%x\n' % unknown2)

    def _parse_txt2_data(self, data):
        """Parse the TXT2 body into one list of substrings per entry.

        The body starts with one 32-bit offset per entry.  Each entry's bytes
        are split at every 16-bit NUL; bytes after the last NUL of an entry
        are discarded.  With ``colors`` enabled, the four bytes following
        ``COLOR_ESCAPE`` are rendered as a ``[#xxxxxxxx]`` tag.
        """
        entries = self.sections['TXT2']['header']['entries']
        data_len = len(data)
        codec = self._utf16_codec()

        # stored offsets count from the entry count word (see -4 above)
        offsets = [struct.unpack_from('%sI' % self.order, data, index * 4)[0] - 4
                   for index in range(entries)]

        strings = []
        for index, start in enumerate(offsets):
            end = offsets[index + 1] if index < entries - 1 else data_len
            string_data = data[start:end]
            total = len(string_data)

            position = 0
            pending = bytearray()
            substrings = []
            while position < total:
                if self.colors and pending[-4:] == COLOR_ESCAPE and position + 4 <= total:
                    # save color information
                    color = struct.unpack_from('%sI' % self.order, string_data, position)[0]
                    position += 4
                    pending += ('[#%08x]' % color).encode(codec)
                    continue

                utf16char = string_data[position:position + 2]
                if utf16char != b'\x00\x00':
                    pending += utf16char
                else:
                    substrings.append(pending.decode(codec, 'replace'))
                    pending = bytearray()
                position += 2
            strings.append(substrings)

        self.sections['TXT2']['data'] = strings

    # ------------------------------------------------------------------
    # serialising
    # ------------------------------------------------------------------

    def _serialize_lbl1(self):
        """Return the LBL1 section (header, entry table, labels) as bytes."""
        header = self.sections['LBL1']['header']
        entries = header['entries']

        # each section 1 entry is 8 bytes long
        # but we're including the entries data in the header bytes so we need to compensate for that
        section2_offset = (entries * 8) + 4

        table = []
        labels = []
        labels_length = 0
        for label_list in self.sections['LBL1']['data']:
            group = label_list[0]
            table.append(struct.pack('%s2I' % self.order, len(group), section2_offset + labels_length))
            for label in group:
                name = self._to_bytes(label[1])
                packed = struct.pack('%sB%dsI' % (self.order, len(name)), len(name), name, label[0])
                labels.append(packed)
                labels_length += len(packed)

        section1_bytes = b''.join(table)
        section2_bytes = b''.join(labels)
        size = len(section1_bytes) + len(section2_bytes) + 4
        header_bytes = struct.pack(LBL1_HEADER_STRUCT % self.order, LBL1_MAGIC, size,
                                   self._to_bytes(header['unknown']), entries)

        if self.debug:
            print('\nLBL1 Magic: %s' % LBL1_MAGIC)
            print('LBL1 Size: %d' % size)
            print('LBL1 Unknown: 0x%s' % hexlify(self._to_bytes(header['unknown'])).decode('ascii'))
            print('LBL1 Entries: %d\n' % entries)

        return header_bytes + section1_bytes + section2_bytes

    def _serialize_atr1(self):
        """Return the ATR1 section as bytes, with a zero-filled body."""
        # ATR1 is unknown right now so we're going to just pad the section
        # (which is all we've got in Rhythm Tengoku string files)
        header = self.sections['ATR1']['header']

        if self.debug:
            print('\nATR1 Magic: %s' % ATR1_MAGIC)
            print('ATR1 Size: %d' % (header['size'] + 4))
            print('ATR1 Unknown1: 0x%x' % header['unknown1'])
            print('ATR1 Unknown2: 0x%x' % header['unknown2'])
            print('ATR1 Entries: %d\n' % header['entries'])

        header_bytes = struct.pack(ATR1_HEADER_STRUCT % self.order, ATR1_MAGIC, header['size'] + 4,
                                   header['unknown1'], header['unknown2'], header['entries'])

        return header_bytes + b'\0' * header['size']

    def _encode_string(self, string, codec):
        """Encode one substring, turning every colour tag back into 4 bytes."""
        if not self.colors:
            return string.encode(codec)

        # split() with one capture group alternates plain text (even indexes)
        # and the tag's hex digits (odd indexes), so every tag is converted
        pieces = []
        for index, part in enumerate(self._COLOR_TAG.split(string)):
            if index % 2:
                pieces.append(struct.pack('%sI' % self.order, int(part, 16)))
            else:
                pieces.append(part.encode(codec))
        return b''.join(pieces)

    def _serialize_txt2(self):
        """Return the TXT2 section (header, offset table, strings) as bytes."""
        # section 1: offsets for each index to the data section
        # section 2: utf-16 strings with a null terminator
        strings = self.sections['TXT2']['data']
        entries = len(strings)
        header = self.sections['TXT2']['header']
        codec = self._utf16_codec()

        # each entry is a single 32-bit integer representing an offset from the start of section1 to an area in section2
        section1_length = entries * 4

        offset_table = []
        chunks = []
        text_length = 0
        for string_list in strings:
            offset_table.append(struct.pack('%sI' % self.order, section1_length + text_length + 4))
            for string in string_list:
                encoded = self._encode_string(string, codec) + b'\x00\x00'
                chunks.append(encoded)
                text_length += len(encoded)

        section1_bytes = b''.join(offset_table)
        section2_bytes = b''.join(chunks)
        size = len(section1_bytes) + len(section2_bytes) + 4
        header_bytes = struct.pack(TXT2_HEADER_STRUCT % self.order, TXT2_MAGIC, size,
                                   header['unknown1'], header['unknown2'], entries)

        if self.debug:
            print('TXT2 Magic: %s' % TXT2_MAGIC)
            print('TXT2 Size: %d' % size)
            print('TXT2 Unknown1: 0x%x' % header['unknown1'])
            print('TXT2 Unknown2: 0x%x' % header['unknown2'])
            print('TXT2 Entries: %d' % entries)

        return header_bytes + section1_bytes + section2_bytes
def prompt_yes_no(prompt):
    """Ask a yes/no question on the console and return ``'y'`` or ``'n'``.

    The question is repeated until the reply is ``y`` or ``n`` (any case,
    surrounding whitespace ignored).  An empty reply, or end of input, counts
    as ``'n'``.

    :param prompt: text shown to the user before reading the reply.
    :returns: ``'y'`` or ``'n'``.
    """
    try:
        read_line = raw_input  # Python 2, where input() would eval the reply
    except NameError:
        read_line = input

    while True:
        try:
            answer_ = read_line(prompt).strip().lower()
        except EOFError:
            # nothing left to read (e.g. stdin is closed): take the default
            return 'n'
        if not answer_:
            return 'n'
        if answer_ in ('y', 'n'):
            return answer_
        print('Please answer "y" or "n"')
# Command-line entry point: convert an MSBT file to JSON (--extract) or build
# an MSBT file from JSON (--pack).  Exits with status 1 on any failed check.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='MsgStdBn Parser')
    parser.add_argument('-v', '--verbose', help='print more data when working', action='store_true', default=False)
    parser.add_argument('-d', '--debug', help='print debug information', action='store_true', default=False)
    parser.add_argument('-c', '--colors', help='decode colors in strings', action='store_true', default=False)
    # exactly one of --extract / --pack has to be given
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-x', '--extract', help='extract MSBT to plain text', action='store_true', default=False)
    group.add_argument('-p', '--pack', help='pack plain text into an MSBT file', action='store_true', default=False)
    parser.add_argument('-y', '--yes', help='answer "Yes" to any questions (overwriting files)', action='store_true',
                        default=False)
    parser.add_argument('-j', '--json', help='JSON document to read from or write to', required=True)
    parser.add_argument('msbt_file', help='MSBT file to parse')

    args = parser.parse_args()

    # extracting: the MSBT input must exist
    if args.extract and not os.path.exists(args.msbt_file):
        print('MSBT file not found!')
        print(args.msbt_file)
        sys.exit(1)

    # extracting: confirm before replacing an existing JSON document
    if args.extract and os.path.exists(args.json) and not args.yes:
        print('JSON output file exists.')
        answer = prompt_yes_no('Overwrite? (y/N) ')

        if answer == 'n':
            print('Aborted.')
            sys.exit(1)

    # the folder of the JSON document must exist in either mode
    json_dirname = os.path.dirname(args.json)
    if len(json_dirname) > 0 and not os.path.exists(json_dirname):
        print('Folder not found: %s' % json_dirname)
        sys.exit(1)

    # packing: the JSON input must exist
    if args.pack and not os.path.exists(args.json):
        print('JSON file not found!')
        print(args.json)
        sys.exit(1)

    # packing: confirm before replacing an existing MSBT file
    if args.pack and os.path.exists(args.msbt_file) and not args.yes:
        print('MSBT output file exists.')
        answer = prompt_yes_no('Overwrite? (y/N) ')

        if answer == 'n':
            print('Aborted.')
            sys.exit(1)

    msbt = Msbt(verbose=args.verbose, debug=args.debug, colors=args.colors)

    if args.pack:
        msbt.from_json(args.json)
        msbt.save(args.msbt_file)
    elif args.extract:
        msbt.read(args.msbt_file)
        if msbt.invalid:
            print('Invalid MSBT file!')
            sys.exit(1)
        msbt.to_json(args.json)

    print('All good!')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement