Update fuzzing code

- Shorten timeouts
- Align options from Makefile and options files
- Add section headers to Makefile
- Skip invalid UTF-8 in regexp fuzzer
- Update regexp.dict
- Generate HTML seed corpus in correct format
2025-03-09 04:58:16 +03:00 · 2020-07-12 22:59:39 +02:00 · 2020-07-12 22:59:39 +02:00 · 905820a44c
commit 905820a44c
parent 68eadabd00
10 changed files with 233 additions and 22 deletions
--- a/fuzz/.gitignore
+++ b/fuzz/.gitignore
@ -1,8 +1,10 @@
 corpus/
 html
+htmlSeed
 regexp
 schema
 schemaSeed
+seed/html*
 seed/xml*
 seed/schema*
 testFuzzer
--- a/fuzz/Makefile.am
+++ b/fuzz/Makefile.am
@ -1,11 +1,11 @@
-EXTRA_PROGRAMS = html regexp uri schema schemaSeed xml xmlSeed
+EXTRA_PROGRAMS = html htmlSeed regexp uri schema schemaSeed xml xmlSeed
 check_PROGRAMS = testFuzzer
 CLEANFILES = $(EXTRA_PROGRAMS)
 AM_CPPFLAGS = -I$(top_srcdir)/include
 DEPENDENCIES = $(top_builddir)/libxml2.la
 LDADD = $(STATIC_BINARIES) $(top_builddir)/libxml2.la $(THREAD_LIBS) $(Z_LIBS) $(LZMA_LIBS) $(ICONV_LIBS) $(M_LIBS) $(WIN32_EXTRA_LIBADD)

-PARSER_FUZZER_MAX_LEN = 100000
+XML_MAX_LEN = 80000
 XML_SEED_CORPUS_SRC = \
    $(top_srcdir)/test/* \
    $(top_srcdir)/test/errors/*.xml \
@ -16,6 +16,14 @@ XML_SEED_CORPUS_SRC = \
    $(top_srcdir)/test/VC/* \
    $(top_srcdir)/test/VCM/*

+testFuzzer_SOURCES = testFuzzer.c fuzz.c
+
+tests: testFuzzer$(EXEEXT)
+	@echo "## Running fuzzer tests"
+	@./testFuzzer$(EXEEXT)
+
+# XML fuzzer
+
 xmlSeed_SOURCES = xmlSeed.c fuzz.c

 seed/xml.stamp: xmlSeed$(EXEEXT)
@ -28,19 +36,13 @@ seed/xml.stamp: xmlSeed$(EXEEXT)
                pushd $$(dirname $$i) >/dev/null; \
 	        $(abs_builddir)/xmlSeed$(EXEEXT) $$base > $$outfile; \
                popd >/dev/null; \
-	        if [ "$$(wc -c < $$outfile)" -gt $(PARSER_FUZZER_MAX_LEN) ]; then \
+	        if [ "$$(wc -c < $$outfile)" -gt $(XML_MAX_LEN) ]; then \
 	            rm $$outfile; \
 	        fi; \
 	    fi; \
 	done
 	@touch seed/xml.stamp

-testFuzzer_SOURCES = testFuzzer.c fuzz.c
-
-tests: testFuzzer$(EXEEXT)
-	@echo "## Running fuzzer tests"
-	@./testFuzzer$(EXEEXT)
-
 xml_SOURCES = xml.c fuzz.c
 xml_LDFLAGS = -fsanitize=fuzzer

@ -48,20 +50,36 @@ fuzz-xml: xml$(EXEEXT) seed/xml.stamp
 	@mkdir -p corpus/xml
 	./xml$(EXEEXT) \
 	    -dict=xml.dict \
-	    -max_len=$(PARSER_FUZZER_MAX_LEN) \
+	    -max_len=$(XML_MAX_LEN) \
 	    -timeout=20 \
 	    corpus/xml seed/xml

+# HTML fuzzer
+
+htmlSeed_SOURCES = htmlSeed.c fuzz.c
+
+seed/html.stamp: htmlSeed$(EXEEXT)
+	@mkdir -p seed/html
+	@for i in $(top_srcdir)/test/HTML/*; do \
+	    if [ -f $$i ]; then \
+	        echo Processing seed $$i; \
+	        ./htmlSeed$(EXEEXT) $$i > seed/html/$$(basename $$i); \
+	    fi; \
+	done
+	@touch seed/html.stamp
+
 html_SOURCES = html.c fuzz.c
 html_LDFLAGS = -fsanitize=fuzzer

-fuzz-html: html$(EXEEXT)
+fuzz-html: html$(EXEEXT) seed/html.stamp
 	@mkdir -p corpus/html
 	./html$(EXEEXT) \
 	    -dict=html.dict \
 	    -max_len=1000000 \
 	    -timeout=20 \
-	    corpus/html $(top_srcdir)/test/HTML
+	    corpus/html seed/html
+
+# Regexp fuzzer

 regexp_SOURCES = regexp.c fuzz.c
 regexp_LDFLAGS = -fsanitize=fuzzer
@ -70,10 +88,12 @@ fuzz-regexp: regexp$(EXEEXT)
 	@mkdir -p corpus/regexp
 	./regexp$(EXEEXT) \
 	    -dict=regexp.dict \
-	    -max_len=10000 \
-	    -timeout=20 \
+	    -max_len=200 \
+	    -timeout=5 \
 	    corpus/regexp $(srcdir)/seed/regexp

+# URI fuzzer
+
 uri_SOURCES = uri.c fuzz.c
 uri_LDFLAGS = -fsanitize=fuzzer

@ -81,9 +101,11 @@ fuzz-uri: uri$(EXEEXT)
 	@mkdir -p corpus/uri
 	./uri$(EXEEXT) \
 	    -max_len=10000 \
-	    -timeout=2 \
+	    -timeout=5 \
 	    corpus/uri $(srcdir)/seed/uri

+# XML Schema fuzzer
+
 schemaSeed_SOURCES = schemaSeed.c fuzz.c

 seed/schema.stamp: schemaSeed$(EXEEXT)
@ -107,7 +129,7 @@ fuzz-schema: schema$(EXEEXT) seed/schema.stamp
 	@mkdir -p corpus/schema
 	./schema$(EXEEXT) \
 	    -dict=schema.dict \
-	    -max_len=$(PARSER_FUZZER_MAX_LEN) \
+	    -max_len=$(XML_MAX_LEN) \
 	    -timeout=20 \
 	    corpus/schema seed/schema

--- a/fuzz/html.options
+++ b/fuzz/html.options
@ -1,2 +1,3 @@
 [libfuzzer]
 max_len = 1000000
+timeout = 20
--- a/fuzz/htmlSeed.c
+++ b/fuzz/htmlSeed.c
@ -0,0 +1,52 @
+/*
+ * htmlSeed.c: Generate the HTML seed corpus for fuzzing.
+ *
+ * See Copyright for the status of this software.
+ */
+
+#include <stdio.h>
+
+#define SEED_BUF_SIZE 16384
+
+/*
+ * Write one seed corpus entry to stdout: an int-sized options value of
+ * zero followed by the unmodified contents of the file named by argv[1].
+ *
+ * Returns 1 on a usage error or if the file can't be opened, 0 otherwise.
+ */
+int
+main(int argc, char **argv) {
+    int opts = 0;
+    FILE *file;
+    char buf[SEED_BUF_SIZE];
+    size_t size;
+
+    if (argc != 2) {
+        fprintf(stderr, "Usage: htmlSeed [FILE]\n");
+        return(1);
+    }
+
+    /*
+     * Open the input first so that nothing is written to stdout when the
+     * file is missing or unreadable.
+     */
+    file = fopen(argv[1], "rb");
+    if (file == NULL) {
+        fprintf(stderr, "htmlSeed: cannot open %s\n", argv[1]);
+        return(1);
+    }
+
+    /* Options header */
+    fwrite(&opts, sizeof(opts), 1, stdout);
+
+    /* Copy file; a short read means end of file or a read error. */
+    do {
+        size = fread(buf, 1, SEED_BUF_SIZE, file);
+        if (size > 0)
+            fwrite(buf, 1, size, stdout);
+    } while (size == SEED_BUF_SIZE);
+    fclose(file);
+
+    return(0);
+}
+
--- a/fuzz/regexp.c
+++ b/fuzz/regexp.c
@ -23,14 +23,17 @@ LLVMFuzzerTestOneInput(const char *data, size_t size) {

    numStrings = xmlFuzzExtractStrings(data, size, str, 2);

-    regexp = xmlRegexpCompile(BAD_CAST str[0]);
-    /* xmlRegexpExec has pathological performance in too many cases. */
+    /* CUR_SCHAR doesn't handle invalid UTF-8 and may cause infinite loops. */
+    if (xmlCheckUTF8(BAD_CAST str[0]) != 0) {
+        regexp = xmlRegexpCompile(BAD_CAST str[0]);
+        /* xmlRegexpExec has pathological performance in too many cases. */
 #if 0
-    if ((regexp != NULL) && (numStrings >= 2)) {
-        xmlRegexpExec(regexp, BAD_CAST str[1]);
-    }
+        if ((regexp != NULL) && (numStrings >= 2)) {
+            xmlRegexpExec(regexp, BAD_CAST str[1]);
+        }
 #endif
-    xmlRegFreeRegexp(regexp);
+        xmlRegFreeRegexp(regexp);
+    }

    xmlFree(str[0]);
    xmlFree(str[1]);
--- a/fuzz/regexp.dict
+++ b/fuzz/regexp.dict
@ -3,9 +3,19 @@ quant_opt="?"
 quant_some="+"
 quant_num="{1,2}"

+dot="."
 branch="|a"
+parens="()"
+parens_inner=")("
 pos_group="[a]"
 neg_group="[^a]"
+class_subtraction="[a-[b]]"
+
+esc_space="\\s"
+esc_initial="\\i"
+esc_name="\\c"
+esc_digit="\\d"
+esc_word="\\w"

 cat_letter="\\p{L}"
 cat_mark="\\p{M}"
@ -14,3 +24,132 @@ cat_punct="\\p{P}"
 cat_sym="\\p{S}"
 cat_sep="\\p{Z}"
 cat_other="\\p{C}"
+
+block_aegean_numbers="\\p{IsAegeanNumbers}"
+block_alphabetic_presentation_forms="\\p{IsAlphabeticPresentationForms}"
+block_arabic="\\p{IsArabic}"
+block_arabic_presentation_forms_a="\\p{IsArabicPresentationFormsA}"
+block_arabic_presentation_forms_b="\\p{IsArabicPresentationFormsB}"
+block_armenian="\\p{IsArmenian}"
+block_arrows="\\p{IsArrows}"
+block_basic_latin="\\p{IsBasicLatin}"
+block_bengali="\\p{IsBengali}"
+block_block_elements="\\p{IsBlockElements}"
+block_bopomofo="\\p{IsBopomofo}"
+block_bopomofo_extended="\\p{IsBopomofoExtended}"
+block_box_drawing="\\p{IsBoxDrawing}"
+block_braille_patterns="\\p{IsBraillePatterns}"
+block_buhid="\\p{IsBuhid}"
+block_byzantine_musical_symbols="\\p{IsByzantineMusicalSymbols}"
+block_c_j_k_compatibility="\\p{IsCJKCompatibility}"
+block_c_j_k_compatibility_forms="\\p{IsCJKCompatibilityForms}"
+block_c_j_k_compatibility_ideographs="\\p{IsCJKCompatibilityIdeographs}"
+block_c_j_k_compatibility_ideographs_supplement="\\p{IsCJKCompatibilityIdeographsSupplement}"
+block_c_j_k_radicals_supplement="\\p{IsCJKRadicalsSupplement}"
+block_c_j_k_symbolsand_punctuation="\\p{IsCJKSymbolsandPunctuation}"
+block_c_j_k_unified_ideographs="\\p{IsCJKUnifiedIdeographs}"
+block_c_j_k_unified_ideographs_extension_a="\\p{IsCJKUnifiedIdeographsExtensionA}"
+block_cjk_unified_ideographs_extension_b="\\p{IsCJKUnifiedIdeographsExtensionB}"
+block_cherokee="\\p{IsCherokee}"
+block_combining_diacritical_marks="\\p{IsCombiningDiacriticalMarks}"
+block_combining_diacritical_marksfor_symbols="\\p{IsCombiningDiacriticalMarksforSymbols}"
+block_combining_half_marks="\\p{IsCombiningHalfMarks}"
+block_combining_marksfor_symbols="\\p{IsCombiningMarksforSymbols}"
+block_control_pictures="\\p{IsControlPictures}"
+block_currency_symbols="\\p{IsCurrencySymbols}"
+block_cypriot_syllabary="\\p{IsCypriotSyllabary}"
+block_cyrillic="\\p{IsCyrillic}"
+block_cyrillic_supplement="\\p{IsCyrillicSupplement}"
+block_deseret="\\p{IsDeseret}"
+block_devanagari="\\p{IsDevanagari}"
+block_dingbats="\\p{IsDingbats}"
+block_enclosed_alphanumerics="\\p{IsEnclosedAlphanumerics}"
+block_enclosed_cjk_lettersand_months="\\p{IsEnclosedCJKLettersandMonths}"
+block_ethiopic="\\p{IsEthiopic}"
+block_general_punctuation="\\p{IsGeneralPunctuation}"
+block_geometric_shapes="\\p{IsGeometricShapes}"
+block_georgian="\\p{IsGeorgian}"
+block_gothic="\\p{IsGothic}"
+block_greek="\\p{IsGreek}"
+block_greek_extended="\\p{IsGreekExtended}"
+block_greekand_coptic="\\p{IsGreekandCoptic}"
+block_gujarati="\\p{IsGujarati}"
+block_gurmukhi="\\p{IsGurmukhi}"
+block_halfwidthand_fullwidth_forms="\\p{IsHalfwidthandFullwidthForms}"
+block_hangul_compatibility_jamo="\\p{IsHangulCompatibilityJamo}"
+block_hangul_jamo="\\p{IsHangulJamo}"
+block_hangul_syllables="\\p{IsHangulSyllables}"
+block_hanunoo="\\p{IsHanunoo}"
+block_hebrew="\\p{IsHebrew}"
+block_high_private_use_surrogates="\\p{IsHighPrivateUseSurrogates}"
+block_high_surrogates="\\p{IsHighSurrogates}"
+block_hiragana="\\p{IsHiragana}"
+block_ipa_extensions="\\p{IsIPAExtensions}"
+block_ideographic_description_characters="\\p{IsIdeographicDescriptionCharacters}"
+block_kanbun="\\p{IsKanbun}"
+block_kangxi_radicals="\\p{IsKangxiRadicals}"
+block_kannada="\\p{IsKannada}"
+block_katakana="\\p{IsKatakana}"
+block_katakana_phonetic_extensions="\\p{IsKatakanaPhoneticExtensions}"
+block_khmer="\\p{IsKhmer}"
+block_khmer_symbols="\\p{IsKhmerSymbols}"
+block_lao="\\p{IsLao}"
+block_latin1Supplement="\\p{IsLatin1Supplement}"
+block_latin_extended_a="\\p{IsLatinExtendedA}"
+block_latin_extended_b="\\p{IsLatinExtendedB}"
+block_latin_extended_additional="\\p{IsLatinExtendedAdditional}"
+block_letterlike_symbols="\\p{IsLetterlikeSymbols}"
+block_limbu="\\p{IsLimbu}"
+block_linear_b_ideograms="\\p{IsLinearBIdeograms}"
+block_linear_b_syllabary="\\p{IsLinearBSyllabary}"
+block_low_surrogates="\\p{IsLowSurrogates}"
+block_malayalam="\\p{IsMalayalam}"
+block_mathematical_alphanumeric_symbols="\\p{IsMathematicalAlphanumericSymbols}"
+block_mathematical_operators="\\p{IsMathematicalOperators}"
+block_miscellaneous_mathematical_symbols_a="\\p{IsMiscellaneousMathematicalSymbolsA}"
+block_miscellaneous_mathematical_symbols_b="\\p{IsMiscellaneousMathematicalSymbolsB}"
+block_miscellaneous_symbols="\\p{IsMiscellaneousSymbols}"
+block_miscellaneous_symbolsand_arrows="\\p{IsMiscellaneousSymbolsandArrows}"
+block_miscellaneous_technical="\\p{IsMiscellaneousTechnical}"
+block_mongolian="\\p{IsMongolian}"
+block_musical_symbols="\\p{IsMusicalSymbols}"
+block_myanmar="\\p{IsMyanmar}"
+block_number_forms="\\p{IsNumberForms}"
+block_ogham="\\p{IsOgham}"
+block_old_italic="\\p{IsOldItalic}"
+block_optical_character_recognition="\\p{IsOpticalCharacterRecognition}"
+block_oriya="\\p{IsOriya}"
+block_osmanya="\\p{IsOsmanya}"
+block_phonetic_extensions="\\p{IsPhoneticExtensions}"
+block_private_use="\\p{IsPrivateUse}"
+block_private_use_area="\\p{IsPrivateUseArea}"
+block_runic="\\p{IsRunic}"
+block_shavian="\\p{IsShavian}"
+block_sinhala="\\p{IsSinhala}"
+block_small_form_variants="\\p{IsSmallFormVariants}"
+block_spacing_modifier_letters="\\p{IsSpacingModifierLetters}"
+block_specials="\\p{IsSpecials}"
+block_superscriptsand_subscripts="\\p{IsSuperscriptsandSubscripts}"
+block_supplemental_arrows_a="\\p{IsSupplementalArrowsA}"
+block_supplemental_arrows_b="\\p{IsSupplementalArrowsB}"
+block_supplemental_mathematical_operators="\\p{IsSupplementalMathematicalOperators}"
+block_supplementary_private_use_area_a="\\p{IsSupplementaryPrivateUseAreaA}"
+block_supplementary_private_use_area_b="\\p{IsSupplementaryPrivateUseAreaB}"
+block_syriac="\\p{IsSyriac}"
+block_tagalog="\\p{IsTagalog}"
+block_tagbanwa="\\p{IsTagbanwa}"
+block_tags="\\p{IsTags}"
+block_tai_le="\\p{IsTaiLe}"
+block_tai_xuan_jing_symbols="\\p{IsTaiXuanJingSymbols}"
+block_tamil="\\p{IsTamil}"
+block_telugu="\\p{IsTelugu}"
+block_thaana="\\p{IsThaana}"
+block_thai="\\p{IsThai}"
+block_tibetan="\\p{IsTibetan}"
+block_ugaritic="\\p{IsUgaritic}"
+block_unified_canadian_aboriginal_syllabics="\\p{IsUnifiedCanadianAboriginalSyllabics}"
+block_variation_selectors="\\p{IsVariationSelectors}"
+block_variation_selectors_supplement="\\p{IsVariationSelectorsSupplement}"
+block_yi_radicals="\\p{IsYiRadicals}"
+block_yi_syllables="\\p{IsYiSyllables}"
+block_yijing_hexagram_symbols="\\p{IsYijingHexagramSymbols}"
--- a/fuzz/regexp.options
+++ b/fuzz/regexp.options
@ -0,0 +1,3 @@
+[libfuzzer]
+max_len = 200
+timeout = 5
--- a/fuzz/schema.options
+++ b/fuzz/schema.options
@ -1,2 +1,3 @@
 [libfuzzer]
 max_len = 80000
+timeout = 20
--- a/fuzz/uri.options
+++ b/fuzz/uri.options
@ -0,0 +1,3 @@
+[libfuzzer]
+max_len = 10000
+timeout = 5
--- a/fuzz/xml.options
+++ b/fuzz/xml.options
@ -1,2 +1,3 @@
 [libfuzzer]
 max_len = 80000
+timeout = 20