""" Add NaN parsing to normal ijson pure-python parsing backend. This is not RFC 8259, but RFC 8259 is wrong. :Skinner Out Of Touch meme:. References: https://github.com/ICRAR/ijson/issues/33 Example: >>> import ubelt as ub >>> from geowatch.utils import ijson_ext as ijson_ext >>> # >>> import kwcoco >>> dset = kwcoco.CocoDataset.demo() >>> text = dset.dumps() >>> import io >>> file = io.StringIO(text) >>> list(ijson_ext.items(file, 'categories')) >>> # >>> text = ub.codeblock( >>> ''' >>> {"id": 246, "image_id": 123, "flag": true, "empty": null, "category_id": 2, "bbox": [ NaN, 5, 130, 290 ] } >>> ''') >>> import io >>> import json >>> # >>> file = io.StringIO(text) >>> json.load(file) >>> # >>> file = io.StringIO(text) >>> list(ijson_ext.items(file, 'bbox')) >>> import ijson >>> # Regular ijson does not support NaN >>> text = ub.codeblock( >>> ''' >>> {"id": 246, "image_id": 123, "flag": true, "empty": null, "category_id": 2, "bbox": [ null, 5, 130, 290 ] } >>> ''') import timerit ti = timerit.Timerit(100, bestof=10, verbose=2) for timer in ti.reset('time'): with timer: file = io.StringIO(text) list(ijson.items(file, 'bbox')) for timer in ti.reset('time'): with timer: file = io.StringIO(text) list(ijson_ext.items(file, 'bbox')) """ from json.decoder import scanstring import re from ijson import common, utils import codecs LEXEME_RE = re.compile(r'[a-z0-9eEN\.\+-]+|\S') UNARY_LEXEMES = set('[]{},') EOF = -1, None class UnexpectedSymbol(common.JSONError): def __init__(self, symbol, pos): super(UnexpectedSymbol, self).__init__( 'Unexpected symbol %r at %d' % (symbol, pos) ) @utils.coroutine def utf8_encoder(target): decoder = codecs.getincrementaldecoder('utf-8')() decode = decoder.decode send = target.send while True: try: final = False bdata = yield except GeneratorExit: final = True bdata = b'' try: sdata = decode(bdata, final) except UnicodeDecodeError as e: try: target.close() except BaseException: pass raise common.IncompleteJSONError(e) if sdata: send(sdata) elif not bdata: target.close() break @utils.coroutine def Lexer(target): """ Parses lexemes out of the incoming content, and sends them to parse_value. A special EOF result is sent when the data source has been exhausted to give parse_value the possibility of raising custom exceptions due to missing content. """ try: data = yield except GeneratorExit: data = '' buf = data pos = 0 discarded = 0 send = target.send while True: match = LEXEME_RE.search(buf, pos) if match: lexeme = match.group() if lexeme == '"': pos = match.start() start = pos + 1 while True: try: end = buf.index('"', start) escpos = end - 1 while buf[escpos] == '\\': escpos -= 1 if (end - escpos) % 2 == 0: start = end + 1 else: break except ValueError: try: data = yield except GeneratorExit: data = '' if not data: raise common.IncompleteJSONError( 'Incomplete string lexeme') buf += data send((discarded + pos, buf[pos:end + 1])) pos = end + 1 else: while lexeme not in UNARY_LEXEMES and match.end() == len(buf): try: data = yield except GeneratorExit: data = '' if not data: break buf += data match = LEXEME_RE.search(buf, pos) lexeme = match.group() send((discarded + match.start(), lexeme)) pos = match.end() else: # Don't ask data from an already exhausted source if data: try: data = yield except GeneratorExit: data = '' if not data: # Normally should raise StopIteration, but can raise # IncompleteJSONError too, which is the point of sending EOF try: target.send(EOF) except StopIteration: pass break discarded += len(buf) buf = data pos = 0 # Parsing states _PARSE_VALUE = 0 _PARSE_ARRAY_ELEMENT_END = 1 _PARSE_OBJECT_KEY = 2 _PARSE_OBJECT_END = 3 # infinity singleton for overflow checks inf = float("inf") @utils.coroutine def parse_value(target, multivalue, use_float): """ Parses results coming out of the Lexer into ijson events, which are sent to `target`. A stack keeps track of the type of object being parsed at the time (a value, and object or array -- the last two being values themselves). A special EOF result coming from the Lexer indicates that no more content is expected. This is used to check for incomplete content and raise the appropriate exception, which wouldn't be possible if the Lexer simply closed this co-routine (either explicitly via .close(), or implicitly by itself finishing and decreasing the only reference to the co-routine) since that causes a GeneratorExit exception that cannot be replaced with a custom one. """ state_stack = [_PARSE_VALUE] pop = state_stack.pop push = state_stack.append send = target.send prev_pos, prev_symbol = None, None to_number = common.integer_or_float if use_float else common.integer_or_decimal while True: if prev_pos is None: pos, symbol = yield if (pos, symbol) == EOF: if state_stack: raise common.IncompleteJSONError('Incomplete JSON content') break else: pos, symbol = prev_pos, prev_symbol prev_pos, prev_symbol = None, None try: state = state_stack[-1] except IndexError: if multivalue: state = _PARSE_VALUE push(state) else: raise common.JSONError('Additional data found') assert state_stack if state == _PARSE_VALUE: # Simple, common cases if symbol == 'null': send(('null', None)) pop() elif symbol == 'true': send(('boolean', True)) pop() elif symbol == 'false': send(('boolean', False)) pop() elif symbol == 'NaN': send(('number', float('nan'))) pop() elif symbol[0] == '"': send(('string', parse_string(symbol))) pop() # Array start elif symbol == '[': send(('start_array', None)) pos, symbol = yield if (pos, symbol) == EOF: raise common.IncompleteJSONError('Incomplete JSON content') if symbol == ']': send(('end_array', None)) pop() else: prev_pos, prev_symbol = pos, symbol push(_PARSE_ARRAY_ELEMENT_END) push(_PARSE_VALUE) # Object start elif symbol == '{': send(('start_map', None)) pos, symbol = yield if (pos, symbol) == EOF: raise common.IncompleteJSONError('Incomplete JSON content') if symbol == '}': send(('end_map', None)) pop() else: prev_pos, prev_symbol = pos, symbol push(_PARSE_OBJECT_KEY) # A number else: # JSON numbers can't contain leading zeros if ((len(symbol) > 1 and symbol[0] == '0' and symbol[1] not in ('e', 'E', '.')) or (len(symbol) > 2 and symbol[0:2] == '-0' and symbol[2] not in ('e', 'E', '.'))): raise common.JSONError( 'Invalid JSON number: %s' % (symbol,)) # Fractions need a leading digit and must be followed by a # digit if symbol[0] == '.' or symbol[-1] == '.': raise common.JSONError( 'Invalid JSON number: %s' % (symbol,)) try: number = to_number(symbol) if number == inf: raise common.JSONError( "float overflow: %s" % (symbol,)) except BaseException: if 'true'.startswith(symbol) or 'false'.startswith(symbol) or 'null'.startswith(symbol): # or 'NaN'.startswith(symbol): raise common.IncompleteJSONError( 'Incomplete JSON content') raise UnexpectedSymbol(symbol, pos) else: send(('number', number)) pop() elif state == _PARSE_OBJECT_KEY: if symbol[0] != '"': raise UnexpectedSymbol(symbol, pos) send(('map_key', parse_string(symbol))) pos, symbol = yield if (pos, symbol) == EOF: raise common.IncompleteJSONError('Incomplete JSON content') if symbol != ':': raise UnexpectedSymbol(symbol, pos) state_stack[-1] = _PARSE_OBJECT_END push(_PARSE_VALUE) elif state == _PARSE_OBJECT_END: if symbol == ',': state_stack[-1] = _PARSE_OBJECT_KEY elif symbol != '}': raise UnexpectedSymbol(symbol, pos) else: send(('end_map', None)) pop() pop() elif state == _PARSE_ARRAY_ELEMENT_END: if symbol == ',': state_stack[-1] = _PARSE_ARRAY_ELEMENT_END push(_PARSE_VALUE) elif symbol != ']': raise UnexpectedSymbol(symbol, pos) else: send(('end_array', None)) pop() pop() def parse_string(symbol): return scanstring(symbol, 1)[0] def basic_parse_basecoro(target, multiple_values=False, allow_comments=False, use_float=False): ''' Iterator yielding unprefixed events. Parameters: - file: a readable file-like object with JSON input ''' if allow_comments: raise ValueError("Comments are not supported by the python backend") return utf8_encoder(Lexer(parse_value(target, multiple_values, use_float))) common.enrich_backend(globals())