json: Nicer recovery from lexical errors
authorMarkus Armbruster <armbru@redhat.com>
Fri, 31 Aug 2018 07:58:39 +0000 (09:58 +0200)
committerMarkus Armbruster <armbru@redhat.com>
Mon, 24 Sep 2018 16:08:07 +0000 (18:08 +0200)
When the lexer chokes on an input character, it consumes the
character, emits a JSON error token, and enters its start state.  This
can lead to suboptimal error recovery.  For instance, input

    0123 ,

produces the tokens

    JSON_ERROR    01
    JSON_INTEGER  23
    JSON_COMMA    ,

Make the lexer skip characters after a lexical error until a
structural character ('[', ']', '{', '}', ':', ','), an ASCII control
character, or '\xFE', or '\xFF'.

Note that we must not skip ASCII control characters, '\xFE', '\xFF',
because those are documented to force the JSON parser into known-good
state, by docs/interop/qmp-spec.txt.

The lexer now produces

    JSON_ERROR    01
    JSON_COMMA    ,

Update qmp-test for the nicer error recovery: QMP now reports just one
error for input %p instead of two.  Also drop the newline after %p; it
was needed to tease out the second error.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-5-armbru@redhat.com>
[Conflict with commit ebb4d82d888 resolved]

qobject/json-lexer.c
tests/qmp-test.c

index 28582e17d980728638a04d694da335d7ce01e17e..39c7ce7adc271afa1bb028340e239b65835f93ee 100644 (file)
 
 enum json_lexer_state {
     IN_ERROR = 0,               /* must really be 0, see json_lexer[] */
+    IN_RECOVERY,
     IN_DQ_STRING_ESCAPE,
     IN_DQ_STRING,
     IN_SQ_STRING_ESCAPE,
@@ -130,6 +131,28 @@ QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1);
 static const uint8_t json_lexer[][256] =  {
     /* Relies on default initialization to IN_ERROR! */
 
+    /* error recovery */
+    [IN_RECOVERY] = {
+        /*
+         * Skip characters until a structural character, an ASCII
+         * control character other than '\t', or impossible UTF-8
+         * bytes '\xFE', '\xFF'.  Structural characters and line
+         * endings are promising resynchronization points.  Clients
+         * may use the others to force the JSON parser into known-good
+         * state; see docs/interop/qmp-spec.txt.
+         */
+        [0 ... 0x1F] = IN_START | LOOKAHEAD,
+        [0x20 ... 0xFD] = IN_RECOVERY,
+        [0xFE ... 0xFF] = IN_START | LOOKAHEAD,
+        ['\t'] = IN_RECOVERY,
+        ['['] = IN_START | LOOKAHEAD,
+        [']'] = IN_START | LOOKAHEAD,
+        ['{'] = IN_START | LOOKAHEAD,
+        ['}'] = IN_START | LOOKAHEAD,
+        [':'] = IN_START | LOOKAHEAD,
+        [','] = IN_START | LOOKAHEAD,
+    },
+
     /* double quote string */
     [IN_DQ_STRING_ESCAPE] = {
         [0x20 ... 0xFD] = IN_DQ_STRING,
@@ -301,26 +324,18 @@ static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
             /* fall through */
         case JSON_SKIP:
             g_string_truncate(lexer->token, 0);
+            /* fall through */
+        case IN_START:
             new_state = lexer->start_state;
             break;
         case IN_ERROR:
-            /* XXX: To avoid having previous bad input leaving the parser in an
-             * unresponsive state where we consume unpredictable amounts of
-             * subsequent "good" input, percolate this error state up to the
-             * parser by emitting a JSON_ERROR token, then reset lexer state.
-             *
-             * Also note that this handling is required for reliable channel
-             * negotiation between QMP and the guest agent, since chr(0xFF)
-             * is placed at the beginning of certain events to ensure proper
-             * delivery when the channel is in an unknown state. chr(0xFF) is
-             * never a valid ASCII/UTF-8 sequence, so this should reliably
-             * induce an error/flush state.
-             */
             json_message_process_token(lexer, lexer->token, JSON_ERROR,
                                        lexer->x, lexer->y);
+            new_state = IN_RECOVERY;
+            /* fall through */
+        case IN_RECOVERY:
             g_string_truncate(lexer->token, 0);
-            lexer->state = lexer->start_state;
-            return;
+            break;
         default:
             break;
         }
index b3472281ae2c03bdf2732ec55668ae24b45c3982..6c419f6023abdb4e29e47cf49e6d11a46c2c8c5e 100644 (file)
@@ -76,10 +76,7 @@ static void test_malformed(QTestState *qts)
     assert_recovered(qts);
 
     /* lexical error: interpolation */
-    qtest_qmp_send_raw(qts, "%%p\n");
-    /* two errors, one for "%", one for "p" */
-    resp = qtest_qmp_receive(qts);
-    qmp_assert_error_class(resp, "GenericError");
+    qtest_qmp_send_raw(qts, "%%p");
     resp = qtest_qmp_receive(qts);
     qmp_assert_error_class(resp, "GenericError");
     assert_recovered(qts);