What to do if you have malformed JSON

I had this "JSON" file sent to me:

"{ " key ": " value ", " anotherkey ": \"value" value\ ", " numbe r": 45., " number2 ": nan }"

Let's remove the enclosing quotes and format it for readability:

{ "key": "value", "anotherkey": \"value" value\", "number": 45., "number2": nan }

I have no idea how something like this was generated.

If the file is small enough or the data regular enough, you could fix it by hand with some search & replace.

But the file I had was gigabytes in size and most of it looked fine.

Except for lines like these:

{ "key" : ":\\\\\\\\\\" , "anotherkey" : "value" } { "key" : "Something \\\\" Name\\\\ " something\", " anotherkey ": " value " }

Instead of crafting unreadable regexes, I decided to write my own quick & dirty malformed-JSON parser.

Here is the first version:

# First version: walk the text one character at a time and print every
# string, number and NaN that is recognised.  Anything unexpected is reported
# as UNKNOWN so the next malformed pattern is easy to spot.
# Relies on the is_num() and read_until() helpers shown right after this
# snippet.
with open('input.json') as f:  # close the handle instead of leaking it
    s = f.read()

in_key = False  # True while the next string is expected to be an object key
i = 1  # start after the first character (the quote wrapping the document)
while i < len(s) - 2:  # leave out the last two characters (wrapper tail)
    c = s[i]
    i += 1
    if c == '{':
        in_key = True
    elif c in ('}', '[', ']'):
        pass
    elif c == ':':
        in_key = False
    elif c == ',':
        in_key = True
    elif is_num(c):
        # Collect the whole run of digits, dots and minus signs.
        v = c
        while i < len(s) and is_num(s[i]):
            v += s[i]
            i += 1
        # The character after the number is deliberately NOT consumed here:
        # it is usually a ',' or '}' that the loop must still see.
        print(v)
    elif c == 'n' and s.startswith('an', i):
        i += 2  # skip the remaining "an" of "nan"
        print('NaN')
    elif c.isspace():
        pass
    elif c == '"':
        v = read_until(s, i, '"')
        i += len(v) + 1  # step over the string and its closing quote
        print(v)
    else:
        print('UNKNOWN: ' + c)

with these two helper functions

def is_num(c):
    """Return True if ``c`` may be part of a numeric literal.

    A character qualifies when ``str.isdigit`` accepts it or when it is the
    decimal point or the minus sign.
    """
    return c.isdigit() or c in ('.', '-')


def read_until(s, i, token):
    """Return ``s[i:...]`` up to, but not including, the next unescaped ``token``.

    An occurrence of ``token`` is treated as escaped (and skipped) when the
    character immediately before it is a backslash.  An occurrence located
    exactly at ``i`` always terminates the read, so the result is then ``''``.
    When no unescaped ``token`` exists, everything from ``i`` to the end of
    ``s`` is returned; nothing is cut off at the end.

    Args:
        s: the text being scanned.
        i: index at which reading starts.
        token: terminator to look for (one or more characters).

    Returns:
        The substring that was read, without the terminator.
    """
    start = i
    # str.find does the scanning in C; no character-by-character string
    # concatenation is needed.
    pos = s.find(token, start)
    while pos > start and s[pos - 1] == '\\':
        # Escaped terminator: continue the search one character further.
        pos = s.find(token, pos + 1)
    if pos == -1:
        return s[start:]
    return s[start:pos]

Not elegant but that's not my goal here.

I'm going to get this output:

key value anotherkey UNKNOWN: \

Now I need to parse strings like these:

{ "anotherkey": \"value" value\" }

I am going to add:

in_key = False reverse = False # ... elif c == '\\' and s[i] == '"' : if not in_key: reverse = True elif c == '"' : v = '' if not reverse : v = read_until(s, i, '"' ) i += len(v) + 1 else : v = read_until(s, i, '\\"' ) i += len(v) + 2 reverse = False print (v)

and get this:

key value anotherkey value" value number 45. number2 NaN

Unfortunately, there were even more malformed strings like { "key": "wtf\", "another": "value" } , so I had to add some dirty code like this:

elif c == '"' : v = '' if v.endswith( '", ' ): v = v[:-3] i -= 3 if v.endswith( '\\"}, {' ): v = v[:-6] i -= 5 print (v)

Another option would be to ignore everything until the next } (I'd lose this object literal) but in my case I had to preserve all data.

Finally, let's save the result as a valid JSON file:

def is_num(c):
    """Return True if ``c`` may be part of a numeric literal (digit, '.' or '-')."""
    return c.isdigit() or c in ('.', '-')


def read_until(s, i, token):
    """Return ``s[i:...]`` up to, but not including, the next unescaped ``token``.

    An occurrence of ``token`` counts as escaped when the character right
    before it is a backslash; an occurrence exactly at ``i`` always ends the
    read.  Without any unescaped ``token`` the rest of ``s`` is returned.
    """
    start = i
    pos = s.find(token, start)
    while pos > start and s[pos - 1] == '\\':
        pos = s.find(token, pos + 1)
    if pos == -1:
        return s[start:]
    return s[start:pos]


def repair_json(s):
    """Yield the pieces of a valid JSON document rebuilt from malformed ``s``.

    Handles the defects described in the article:

    * the whole document wrapped in one pair of double quotes;
    * values delimited "in reverse" (``\\"value" value\\"``);
    * values whose closing quote is escaped (``"wtf\\", "another"``);
    * numbers with a trailing dot (``45.``) and ``nan`` literals.

    Characters that match none of the rules are reported on stdout as
    ``UNKNOWN: <char>`` and are not copied to the output.

    Args:
        s: the raw text of the malformed file.

    Yields:
        Output fragments; joined together they form the repaired document.
    """
    text = s.strip()
    # Drop the quote pair wrapping the document, if present.  Checking for it
    # (instead of blindly skipping 1 leading and 2 trailing characters) keeps
    # the final '}' even when the file has no trailing newline.
    if len(text) >= 2 and text[0] == '"' and text[-1] == '"':
        text = text[1:-1]

    in_key = False   # True while the next string is an object key
    reverse = False  # True while the current value was opened with \"
    i = 0
    end = len(text)
    while i < end:
        c = text[i]
        i += 1
        if c == '{':
            in_key = True
            yield '{ '
        elif c == '}':
            yield ' }\n'
        elif c == '[':
            yield '[ '
        elif c == ']':
            yield ' ]'
        elif c == ':':
            in_key = False
            yield ': '
        elif c == ',':
            in_key = True
            yield ', '
        elif is_num(c):
            first = i - 1
            while i < end and is_num(text[i]):
                i += 1
            v = text[first:i]
            # "45." is not valid JSON; complete it to "45.0".  Integers and
            # complete decimals are emitted unchanged.
            if v.endswith('.'):
                v += '0'
            # The delimiter after the number is left for the main loop, so
            # ',' still switches back to key mode and '}' is still written.
            yield v
        elif c == 'n' and text.startswith('an', i):
            i += 2  # skip the remaining "an" of "nan"
            yield 'null'  # JSON has no NaN
        elif c.isspace():
            pass
        elif c == '\\' and text.startswith('"', i):
            # Backslash in front of the opening quote of a value: the string
            # is delimited in reverse and ends at the next \" pair.
            if not in_key:
                reverse = True
        elif c == '"':
            if reverse:
                v = read_until(text, i, '\\"')
                i += len(v) + 2  # step over the string and the closing \"
                reverse = False
            else:
                v = read_until(text, i, '"')
                i += len(v) + 1  # step over the string and its closing quote
            if v.endswith('", '):
                v = v[:-3]
                i -= 3  # go back to the comma
            if v.endswith('\\"}, {'):
                v = v[:-6]
                i -= 5  # go back to the }
            if not in_key:
                # Values: drop stray backslashes, then escape inner quotes.
                v = v.replace('\\', '').replace('"', '\\"')
            yield '"' + v + '"'
        else:
            print('UNKNOWN: ' + c)


def main():
    """Read ``input.json``, repair it and write the result to ``output.json``."""
    with open('input.json') as src:
        s = src.read()
    with open('output.json', 'w') as out:
        out.writelines(repair_json(s))


if __name__ == '__main__':
    main()

Which looks like this:

{ "key": "value", "anotherkey": "value\" value", "number": 45.0, "number2": null }

Took me about 2 hours.