fuzz: Add a few more comments

2024-10-26 12:25:09 +03:00 · 2024-04-02 23:19:28 +02:00 · 2024-04-02 23:19:28 +02:00 · 1f18d37798
commit 1f18d37798
parent 5bb84b47b8
1 changed files with 53 additions and 12 deletions
--- a/fuzz/api.c
+++ b/fuzz/api.c
@ -1,22 +1,35 @@
 /*
- * api.c: a libFuzzer target to test all kinds of API functions.
+ * api.c: a libFuzzer target to test node-related API functions.
 *
 * See Copyright for the status of this software.
 *
 * This is a simple virtual machine which runs fuzz data as a program.
+ * An important design goal is to execute as many API calls as possible
+ * per input byte.
 *
- * There is a fixed number of registers for basic types like integers
- * or strings as well as libxml2 objects like xmlNode. An opcode
- * typically results in a call to an API function using the freshest
- * registers for each argument type and storing the result in the
- * stalest register. This can be implemented using a ring buffer.
+ * We use a fixed number of registers for basic types like integers
+ * or strings as well as libxml2 objects like xmlNode. The opcodes are
+ * single bytes which typically result in a call to an API function
+ * using the freshest registers for each argument type and storing the
+ * result in the stalest register. This can be implemented using a ring
+ * buffer.
 *
 * There are a few other opcodes to initialize or duplicate registers,
- * so all kinds of API calls can potentially be generated from
- * fuzz data.
+ * so all kinds of API calls can potentially be generated from fuzz
+ * data.
 *
- * TODO:
- * - Create documents with a dictionary.
+ * This architecture is similar to a stack machine and benefits from
+ * great code density. The main difference is that values aren't
+ * destroyed when popping arguments from the stack and that the bottom
+ * of the stack is eventually overwritten if the ring buffer overflows.
+ *
+ * The main complication is memory management of nodes. Whenever a
+ * reference between two nodes is removed, whether by an API call or
+ * the VM clearing a register, we must check whether this leaves
+ * unreferenced nodes which can then be freed. There are no opcodes
+ * to free a node explicitly. The FIFO patterns generated by
+ * overflowing the ring buffer and freeing the registers at the end of
+ * a program seem to do a good enough job.
 */

 #include <stdlib.h>
@ -672,7 +685,7 @@ dropNode(xmlNodePtr node) {
 /*
 * removeNode and removeChildren remove all references to a node
 * or its children from the registers. These functions should be
- * called in an API function destroys nodes, for example by merging
+ * called if an API function destroys nodes, for example by merging
 * text nodes.
 */

@ -971,10 +984,25 @@ LLVMFuzzerTestOneInput(const char *data, size_t size) {
    maxAlloc = xmlFuzzReadInt(4) % (size * 50 + 10);
    xmlFuzzMemSetLimit(maxAlloc);

+    /*
+     * Interpreter loop
+     *
+     * Processing an opcode typically involves
+     *
+     * - startOp for debugging
+     * - increase output register index if non-void
+     * - get arguments from input registers
+     * - invoke API function
+     * - set oomReport
+     * - set output register
+     * - memory management and other adjustments
+     * - endOp for void functions
+     */
+
    while (xmlFuzzBytesRemaining()) {
        size_t readSize;
        int op = xmlFuzzReadInt(1);
-        int oomReport = -1;
+        int oomReport = -1; /* -1 means unknown */

        vars->opName = "[unset]";

@ -996,6 +1024,14 @@ LLVMFuzzerTestOneInput(const char *data, size_t size) {
                break;

            case OP_PARSE_DOCUMENT:
+                /*
+                 * We don't really want to test the parser but exposing
+                 * xmlReadDoc seems like a useful way to generate or
+                 * round-trip documents.
+                 *
+                 * This also creates documents with a dictionary which
+                 * is crucial to hit some code paths.
+                 */
                startOp("xmlReadDoc");
                incNodeIdx();
                setNode(0, (xmlNodePtr) xmlReadDoc(
@ -1008,6 +1044,11 @@ LLVMFuzzerTestOneInput(const char *data, size_t size) {
            case OP_XML_NEW_DOC: {
                xmlDocPtr doc;

+                /*
+                 * TODO: There's no public API function to generate a
+                 * document with a dictionary. We should add an extra
+                 * opcode that sets doc->dict.
+                 */
                startOp("xmlNewDoc");
                incNodeIdx();
                doc = xmlNewDoc(getStr(0));