Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from lark import Lark, Transformer
- from lark.exceptions import LarkError
class ParsingException(Exception):
    """Raised when a JavaScript object literal cannot be parsed."""
# Lark grammar for JavaScript object literals: JSON values plus single-quoted
# strings, backtick strings and bare identifiers accepted wherever a string is.
# Raw strings are used so the regex backslashes (\s, \[, \]) reach Lark
# untouched instead of relying on Python passing unknown escapes through.
javascript_object_grammar = "".join(
    (
        r"""
    ?start: value
    ?value: object
          | array
          | string
          | SIGNED_NUMBER -> number
          | "true" -> true
          | "false" -> false
          | "null" -> null
    array : _BRACK1 [value (_COMMA value)*] _BRACK2
    object : _CURLY1 [pair (_COMMA pair)*] _CURLY2
    pair : string _COLON value
    _COLON: /\s*:\s*/
    _COMMA: /\s*,\s*/
    _CURLY1: /\s*{\s*/
    _CURLY2: /\s*}\s*/
    _BRACK1: /\s*\[\s*/
    _BRACK2: /\s*\]\s*/
    string : ESCAPED_STRING | BACKTICK_STRING | SINGLE_QUOTE_STRING | javascript_identifier
    """,
        # used this https://stackoverflow.com/a/1661249/1542900 but also see other answers
        # [a-zA-Z_$][0-9a-zA-Z_$]*
        #
        # The identifier is matched as ONE terminal rather than character by
        # character: the previous rule used the literal string "0..9" instead
        # of the range "0".."9", required at least two characters, and let the
        # lexer split identifiers into separate tokens.  The wrapping rule
        # keeps its name so transformer callbacks keyed on
        # ``javascript_identifier`` still fire.
        r"""
    javascript_identifier: JAVASCRIPT_IDENTIFIER
    JAVASCRIPT_IDENTIFIER: /[a-zA-Z_$][0-9a-zA-Z_$]*/
    BACKTICK_STRING_INNER: "\`" | /[^`]/
    BACKTICK_STRING: "`" BACKTICK_STRING_INNER* "`"
    SINGLE_QUOTE_STRING_INNER: "\'" | /[^']/
    SINGLE_QUOTE_STRING: "'" SINGLE_QUOTE_STRING_INNER* "'"
    %import common.ESCAPED_STRING
    %import common.SIGNED_NUMBER
    %import common.WS
    %ignore WS
    """,
    )
)
class TreeToStr(Transformer):
    """Lark transformer that turns the parse tree into plain Python values.

    Each method/attribute is named after the grammar rule or alias it
    handles (``string``, ``javascript_identifier``, ``number``, ``array``,
    ``pair``, ``object``, ``null``, ``true``, ``false``).
    """

    def string(self, s):
        # Single child: a quoted token, or an identifier that
        # javascript_identifier() re-quoted.  Drop the surrounding quotes.
        (s,) = s
        return s[1:-1]

    def javascript_identifier(self, s):
        # Wrap the bare identifier in quotes so string() can strip them the
        # same way it does for real string literals.
        return '"{}"'.format(''.join(s))

    def number(self, n):
        # Every number, including integer literals, is returned as a float.
        (n,) = n
        return float(n)

    # The grammar's rules are called ``array`` and ``object``.  Callbacks
    # bound to the names ``list`` / ``dict`` never match a rule, which left
    # arrays and objects as raw Tree nodes in the output.
    array = list
    pair = tuple
    object = dict

    def null(self, _):
        return None

    def true(self, _):
        return True

    def false(self, _):
        return False
class TreeToJson(Transformer):
    """Lark transformer that turns the parse tree into plain Python values.

    Each method/attribute is named after the grammar rule or alias it
    handles (``string``, ``javascript_identifier``, ``number``, ``array``,
    ``pair``, ``object``, ``null``, ``true``, ``false``).
    """

    def string(self, s):
        # Single child: a quoted token, or an identifier that
        # javascript_identifier() re-quoted.  Drop the surrounding quotes.
        (s,) = s
        return s[1:-1]

    def javascript_identifier(self, s):
        # Wrap the bare identifier in quotes so string() can strip them the
        # same way it does for real string literals.
        return '"{}"'.format(''.join(s))

    def number(self, n):
        # Every number, including integer literals, is returned as a float.
        (n,) = n
        return float(n)

    # The grammar's rules are called ``array`` and ``object``.  Callbacks
    # bound to the names ``list`` / ``dict`` never match a rule, which left
    # arrays and objects as raw Tree nodes in the output.
    array = list
    pair = tuple
    object = dict

    def null(self, _):
        return None

    def true(self, _):
        return True

    def false(self, _):
        return False
# Module-level parser: built once when the module is imported and reused by
# parse_javascript_object() on every call.
_javascript_object_parser = Lark(
    javascript_object_grammar,
    start='value',
    lexer='standard',
    parser='lalr',
    transformer=TreeToJson(),
)
def parse_javascript_object(string):
    """Parse a JavaScript object literal with the module-level Lark parser.

    Trailing ``;`` characters are removed before parsing.

    :param string: source text of the JavaScript value
    :return: whatever the module-level parser's ``parse()`` returns
    :raises: ParsingException carrying the message of the Lark error
    """
    source = string.rstrip(';')
    try:
        parsed = _javascript_object_parser.parse(source)
    except LarkError as err:
        raise ParsingException(str(err))
    return parsed
- import demjson
- import unicodedata
- import re
def parse_json_object(text, array=False):
    """Parse JSON object or JSON array from the string (or byte array).

    Everything before the first ``{`` (or ``[`` when *array* is true) is
    discarded before decoding.  If decoding fails only because of trailing
    text after a complete JSON value, the text is cut at the reported
    position and decoded once more.

    :param text: Takes a string (or byte array) containing JSON object
    :param array: True if expected to parse JSON array, defaults to False
    :return: python dictionary or list
    :raises: ValueError

    Selector can create unparseable JSON, if the original data contains
    HTML entities in an otherwise escaped text (Embedded escaped HTML).
    This can cause parse_json_object to be unable to parse.
    Remove these entities before hand.
    Example: title=\\"What is "Was"?\\"
    """
    # replace non-breaking space
    text = demjson.helpers.unicode_decode(text).string.replace(u'\xa0', u' ')
    text = demjson.helpers.strip_format_control_chars(text)

    start_char = '[' if array else '{'
    start = text.find(start_char)
    if start < 0:
        raise ValueError(
            'Input string does not contain "{}" character'.format(start_char)
        )
    text = text[start:]

    # The error is copied into ``error`` because Python 3 unbinds the
    # ``as`` name when an except block ends; reusing one name for both
    # handlers left nothing to report when the retry failed as well.
    try:
        return demjson.decode(text)
    except demjson.JSONDecodeError as e:
        error = e

    # Workaround for {u'name': u'value (Styles vary}'}
    if (error.args[0] == 'Unexpected text after end of JSON value'
            and error.position.char_position):
        text = text[:error.position.char_position]
        try:
            return demjson.decode(text)
        except demjson.JSONDecodeError as e:
            # Report the failure of the retry, as the most recent error.
            error = e

    raise ValueError(error.pretty_description())
# Sample input for the parsers in this file: a JavaScript-style object literal
# that mixes single- and double-quoted strings, so it is not strict JSON.
test_json = u"""
{
'152065' : {
'canonicalURL': 'https://www.chewy.com/living-world-cuttlebone-bird-treat-2/dp/152065',
'ajaxURL': "/living-world-cuttlebone-bird-treat-2/dp/152065?features",
'sku': 124945,
'images': [
'//img.chewy.com/is/image/catalog/124945_MAIN._AC_SL400_V1495567031_.jpg',
'//img.chewy.com/is/image/catalog/124945_PT2._AC_SL320_V1497994333_.jpg'
],
'price': '$1.69'
},
'131457' : {
'canonicalURL': 'https://www.chewy.com/living-world-cuttlebone-bird-treat/dp/131457',
'ajaxURL': "/living-world-cuttlebone-bird-treat/dp/131457?features",
'sku': 103970,
'images': [
'//img.chewy.com/is/catalog/103970._AC_SL400_V1469015482_.jpg',
'//img.chewy.com/is/image/catalog/103970_PT1._AC_SL320_V1518213672_.jpg'
],
'price': '$5.91'
}
}
"""
def fix_json(script, max_attempt_json_fix=30):
    """Repair unescaped double quotes in a JSON string and re-serialize it.

    Fix broken json Ex. {"description": "There is 3" length"}
    Based on
    http://stackoverflow.com/questions/18514910/how-do-i-automatically-fix-an-invalid-json-string
    with a small modification.

    The text is parsed repeatedly; after each failure the last ``"`` before
    the reported error offset is escaped and parsing is tried again.

    :param script: JSON text, possibly containing unescaped double quotes
    :param max_attempt_json_fix: number of parse failures tolerated; the
        error is re-raised on the failure that reaches this count (so a
        value of 1 or less means no repair is attempted)
    :return: the parsed value serialized again with ``json.dumps``
    :raises: ValueError if the input is blank, or the decode error itself
        when the text cannot be repaired
    """
    if not script.strip():
        raise ValueError(
            'Input string looks empty'
        )

    failures = 0
    while True:
        try:
            result = json.loads(script)
        except ValueError as err:  # json decode errors are ValueErrors
            failures += 1
            # ``>=`` rather than ``==`` so a limit of zero or less stops
            # immediately instead of never matching.
            if failures >= max_attempt_json_fix:
                raise
            # find the character index where it breaks
            match = re.search(r'\(char (\d+)\)', str(err))
            if match is None:
                raise
            unexp_index = int(match.group(1))
            # find the previous character that is problematic
            unescaped = script.rfind(r'"', 0, unexp_index)
            if unescaped == -1:
                raise
            script = script[:unescaped] + r'\"' + script[unescaped + 1:]
        else:
            return json.dumps(result)
- import timeit
- import json
# Ad-hoc benchmarks comparing the three parsing approaches on test_json;
# left disabled, uncomment a line to time that approach.
#print(timeit.repeat("json.loads(fix_json(test_json))", "from __main__ import fix_json, json, test_json", number=1000))
#print(timeit.repeat("parse_json_object(test_json)", "from __main__ import parse_json_object, test_json", number=1000))
#print(timeit.repeat("parse_javascript_object(test_json)", "from __main__ import parse_javascript_object, test_json", number=1000))
# Demo: runs whenever this module is executed or imported; parses the sample
# with the Lark-based parser and prints the result.
print(parse_javascript_object(test_json))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement