Advertisement
Guest User

Untitled

a guest
Apr 8th, 2020
264
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.60 KB | None | 0 0
  1. from lark import Lark, Transformer
  2. from lark.exceptions import LarkError
  3.  
  4. class ParsingException(Exception):
  5.     pass
  6.  
  7. javascript_object_grammar = "".join(
  8.     (
  9.         """
  10.        ?start: value
  11.  
  12.        ?value: object
  13.              | array
  14.              | string
  15.              | SIGNED_NUMBER      -> number
  16.              | "true"             -> true
  17.              | "false"            -> false
  18.              | "null"             -> null
  19.  
  20.        array  : _BRACK1 [value (_COMMA value)*] _BRACK2
  21.        object : _CURLY1 [pair (_COMMA pair)*] _CURLY2
  22.        pair   : string _COLON value
  23.  
  24.        _COLON: /\s*:\s*/
  25.        _COMMA: /\s*,\s*/
  26.        _CURLY1: /\s*{\s*/
  27.        _CURLY2: /\s*}\s*/
  28.        _BRACK1: /\s*\[\s*/
  29.        _BRACK2: /\s*\]\s*/
  30.  
  31.        string : ESCAPED_STRING | BACKTICK_STRING | SINGLE_QUOTE_STRING | javascript_identifier
  32.        """,
  33.         # used this https://stackoverflow.com/a/1661249/1542900 but also see other answers
  34.         # [a-zA-Z_$][0-9a-zA-Z_$]*
  35.         """
  36.        javascript_identifier: ("a".."z"|"A".."Z"|"_"|"$") ("0..9"|"a".."z"|"A".."Z"|"_"|"$")+
  37.  
  38.        BACKTICK_STRING_INNER: "\\`" | /[^`]/
  39.        BACKTICK_STRING: "`" BACKTICK_STRING_INNER* "`"
  40.  
  41.        SINGLE_QUOTE_STRING_INNER: "\\\'" | /[^']/
  42.        SINGLE_QUOTE_STRING: "'" SINGLE_QUOTE_STRING_INNER* "'"
  43.  
  44.        %import common.ESCAPED_STRING
  45.        %import common.SIGNED_NUMBER
  46.        %import common.WS
  47.        %ignore WS
  48.        """
  49.     )
  50. )
  51.  
  52. class TreeToStr(Transformer):
  53.     def string(self, s):
  54.         (s,) = s
  55.         return s[1:-1]
  56.  
  57.     def javascript_identifier(self, s):
  58.         return '"{}"'.format(''.join(s))
  59.  
  60.     def number(self, n):
  61.         (n,) = n
  62.         return float(n)
  63.  
  64.     list = list
  65.     pair = tuple
  66.     dict = dict
  67.  
  68.     null = lambda self, _: None
  69.     true = lambda self, _: True
  70.     false = lambda self, _: False
  71.  
  72. class TreeToJson(Transformer):
  73.     def string(self, s):
  74.         (s,) = s
  75.         return s[1:-1]
  76.  
  77.     def javascript_identifier(self, s):
  78.         return '"{}"'.format(''.join(s))
  79.  
  80.     def number(self, n):
  81.         (n,) = n
  82.         return float(n)
  83.  
  84.     list = list
  85.     pair = tuple
  86.     dict = dict
  87.  
  88.     null = lambda self, _: None
  89.     true = lambda self, _: True
  90.     false = lambda self, _: False
  91.  
  92. _javascript_object_parser = Lark(javascript_object_grammar, start='value', lexer='standard', parser='lalr', transformer=TreeToJson())
  93.  
  94. def parse_javascript_object(string):
  95.     string = string.rstrip(';')
  96.     try:
  97.         return _javascript_object_parser.parse(string)
  98.     except LarkError as e:
  99.         raise ParsingException(str(e))
  100.  
  101. import demjson
  102. import unicodedata
  103. import re
  104.  
  105.  
  106. def parse_json_object(text, array=False):
  107.     """Parse JSON object or JSON array from the string (or byte array).
  108.    :param text: Takes a string (or byte array) containing JSON object
  109.    :param array: True if expected to parse JSON array, defaults to False
  110.    :return: python dictionary or list
  111.    :raises: ValueError
  112.    Selector can create unparseable JSON, if the original data contains
  113.    HTML entities in an otherwise escaped text (Embedded escaped HTML).
  114.    This can cause parse_json_object to be unable to parse.
  115.    Remove these entities before hand.
  116.    Example: title=\\"What is "Was"?\\"
  117.    """
  118.     # replace non-breaking space
  119.     text = demjson.helpers.unicode_decode(text).string.replace(u'\xa0', u' ')
  120.     text = demjson.helpers.strip_format_control_chars(text)
  121.     start_char = '[' if array else '{'
  122.     start = text.find(start_char)
  123.     if start < 0:
  124.         raise ValueError(
  125.             'Input string does not contain "{}" character'.format(start_char)
  126.         )
  127.     text = text[start:]
  128.     try:
  129.         return demjson.decode(text)
  130.     except demjson.JSONDecodeError as e:
  131.         # Workaround for {u'name': u'value (Styles vary}'}
  132.         if e.args[0] == 'Unexpected text after end of JSON value' and e.position.char_position:
  133.             text = text[:e.position.char_position]
  134.             try:
  135.                 return demjson.decode(text)
  136.             except demjson.JSONDecodeError as e:
  137.                 pass
  138.         raise ValueError(e.pretty_description())
  139.  
  140.  
# Sample input for the parsers in this file: a JavaScript-style object literal
# that mixes single- and double-quoted strings, nested objects, arrays and
# bare numbers.
test_json = u"""
{
   '152065' : {
       'canonicalURL': 'https://www.chewy.com/living-world-cuttlebone-bird-treat-2/dp/152065',
       'ajaxURL': "/living-world-cuttlebone-bird-treat-2/dp/152065?features",
       'sku': 124945,
       'images': [
           '//img.chewy.com/is/image/catalog/124945_MAIN._AC_SL400_V1495567031_.jpg',
           '//img.chewy.com/is/image/catalog/124945_PT2._AC_SL320_V1497994333_.jpg'
       ],
       'price': '$1.69'
   },
   '131457' : {
       'canonicalURL': 'https://www.chewy.com/living-world-cuttlebone-bird-treat/dp/131457',
       'ajaxURL': "/living-world-cuttlebone-bird-treat/dp/131457?features",
       'sku': 103970,
       'images': [
           '//img.chewy.com/is/catalog/103970._AC_SL400_V1469015482_.jpg',
           '//img.chewy.com/is/image/catalog/103970_PT1._AC_SL320_V1518213672_.jpg'
       ],
       'price': '$5.91'
   }
}
"""
  165.  
  166. def fix_json(script, max_attempt_json_fix=30):
  167.     # Fix broken json Ex. {"description": "There is 3" length"}
  168.     # http://stackoverflow.com/questions/18514910/how-do-i-automatically-fix-an-invalid-json-string
  169.     # with a bit modify
  170.     count = 0
  171.     if not script.strip():
  172.         raise ValueError(
  173.             'Input string looks empty'
  174.         )
  175.  
  176.     while True:
  177.         try:
  178.             result = json.loads(script)
  179.             break
  180.         except Exception as e:
  181.             count += 1
  182.             if count == max_attempt_json_fix:
  183.                 raise e
  184.  
  185.             # find the character index where it breaks
  186.             unexp_index = re.findall(r'\(char (\d+)\)', str(e))
  187.             if unexp_index:
  188.                 unexp_index = int(unexp_index[0])
  189.             else:
  190.                 raise e
  191.  
  192.             # find the previous character that is problematic
  193.             unescaped = script.rfind(r'"', 0, unexp_index)
  194.             if unescaped == -1:
  195.                 raise e
  196.             script = script[:unescaped] + r'\"' + script[unescaped + 1:]
  197.  
  198.     return json.dumps(result)
  199.  
  200.  
  201. import timeit
  202. import json
  203.  
# Disabled timing comparisons of the three parsing approaches on test_json
# (number=1000 executions per timing run). Uncomment a line to run it.
#print(timeit.repeat("json.loads(fix_json(test_json))", "from __main__ import fix_json, json, test_json", number=1000))
#print(timeit.repeat("parse_json_object(test_json)", "from __main__ import parse_json_object, test_json", number=1000))
#print(timeit.repeat("parse_javascript_object(test_json)", "from __main__ import parse_javascript_object, test_json", number=1000))

# Demo: run the Lark-based parser on the sample data and print the result.
print(parse_javascript_object(test_json))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement