1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-03-24 06:50:08 +03:00

regexp: Fix status codes and handle invalid UTF-8

Fixes #561.
This commit is contained in:
Nick Wellnhofer 2023-09-22 15:25:40 +02:00
parent b94283fbda
commit f98fa86318
2 changed files with 89 additions and 50 deletions

View File

@ -29,16 +29,13 @@ LLVMFuzzerTestOneInput(const char *data, size_t size) {
maxAlloc = xmlFuzzReadInt(4) % (size * 8 + 1);
str1 = xmlFuzzReadString(NULL);
/* CUR_SCHAR doesn't handle invalid UTF-8 and may cause infinite loops. */
if (xmlCheckUTF8(BAD_CAST str1) != 0) {
xmlFuzzMemSetLimit(maxAlloc);
regexp = xmlRegexpCompile(BAD_CAST str1);
/* xmlRegexpExec has pathological performance in too many cases. */
xmlFuzzMemSetLimit(maxAlloc);
regexp = xmlRegexpCompile(BAD_CAST str1);
/* xmlRegexpExec has pathological performance in too many cases. */
#if 0
xmlRegexpExec(regexp, BAD_CAST str2);
xmlRegexpExec(regexp, BAD_CAST str2);
#endif
xmlRegFreeRegexp(regexp);
}
xmlRegFreeRegexp(regexp);
xmlFuzzMemSetLimit(0);
xmlFuzzDataCleanup();

View File

@ -38,6 +38,16 @@
#define MAX_PUSH 10000000
/*
* Status codes stored in exec->status by the regexp execution code.
*
* The values -2 and -3 are skipped on purpose: they are used by
* xmlValidateElementType for other things.
*/
/* Initial status; the matching loops keep running while it is unchanged. */
#define XML_REGEXP_OK 0
/*
* Set when a rollback is requested but none is left, i.e. the input did
* not match. xmlFARegExec reports this status to its caller as 0.
*/
#define XML_REGEXP_NOT_FOUND (-1)
/*
* Inconsistent executor state, e.g. a missing counter array or an epsilon
* transition left at runtime.
*/
#define XML_REGEXP_INTERNAL_ERROR (-4)
/* A memory allocation failed. */
#define XML_REGEXP_OUT_OF_MEMORY (-5)
/* The number of saved states exceeded MAX_PUSH. */
#define XML_REGEXP_INTERNAL_LIMIT (-6)
/* xmlGetUTF8Char returned a negative value while decoding the input. */
#define XML_REGEXP_INVALID_UTF8 (-7)
#ifdef ERROR
#undef ERROR
#endif
@ -48,7 +58,6 @@
#define CUR (*(ctxt->cur))
#define NXT(index) (ctxt->cur[index])
#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
#define NEXTL(l) ctxt->cur += l;
#define XML_REG_STRING_SEPARATOR '|'
/*
@ -3036,6 +3045,7 @@ static void
xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
#ifdef MAX_PUSH
if (exec->nbPush > MAX_PUSH) {
exec->status = XML_REGEXP_INTERNAL_LIMIT;
return;
}
exec->nbPush++;
@ -3077,7 +3087,7 @@ xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
xmlMalloc(exec->comp->nbCounters * sizeof(int));
if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
xmlRegexpErrMemory(NULL, "saving regexp");
exec->status = -5;
exec->status = XML_REGEXP_OUT_OF_MEMORY;
return;
}
}
@ -3090,7 +3100,7 @@ xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
static void
xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
if (exec->nbRollbacks <= 0) {
exec->status = -1;
exec->status = XML_REGEXP_NOT_FOUND;
return;
}
exec->nbRollbacks--;
@ -3100,7 +3110,7 @@ xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
if (exec->comp->nbCounters > 0) {
if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
fprintf(stderr, "exec save: allocation failed");
exec->status = -6;
exec->status = XML_REGEXP_INTERNAL_ERROR;
return;
}
if (exec->counts) {
@ -3129,7 +3139,7 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
exec->maxRollbacks = 0;
exec->nbRollbacks = 0;
exec->rollbacks = NULL;
exec->status = 0;
exec->status = XML_REGEXP_OK;
exec->comp = comp;
exec->state = comp->states[0];
exec->transno = 0;
@ -3145,7 +3155,7 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
memset(exec->counts, 0, comp->nbCounters * sizeof(int));
} else
exec->counts = NULL;
while ((exec->status == 0) && (exec->state != NULL) &&
while ((exec->status == XML_REGEXP_OK) && (exec->state != NULL) &&
((exec->inputString[exec->index] != 0) ||
((exec->state != NULL) &&
(exec->state->type != XML_REGEXP_FINAL_STATE)))) {
@ -3189,7 +3199,7 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
xmlRegCounterPtr counter;
if (exec->counts == NULL) {
exec->status = -1;
exec->status = XML_REGEXP_INTERNAL_ERROR;
goto error;
}
/*
@ -3203,10 +3213,16 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
deter = 0;
} else if (atom == NULL) {
fprintf(stderr, "epsilon transition left at runtime\n");
exec->status = -2;
exec->status = XML_REGEXP_INTERNAL_ERROR;
break;
} else if (exec->inputString[exec->index] != 0) {
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
len = 4;
codepoint = xmlGetUTF8Char(&exec->inputString[exec->index],
&len);
if (codepoint < 0) {
exec->status = XML_REGEXP_INVALID_UTF8;
goto error;
}
ret = xmlRegCheckCharacter(atom, codepoint);
if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
xmlRegStatePtr to = comp->states[trans->to];
@ -3223,7 +3239,7 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
if ((exec->counts == NULL) ||
(exec->comp == NULL) ||
(exec->comp->counters == NULL)) {
exec->status = -1;
exec->status = XML_REGEXP_INTERNAL_ERROR;
goto error;
}
counter = &exec->comp->counters[trans->counter];
@ -3266,8 +3282,13 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
exec->transno = transno;
exec->state = state;
}
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
len);
len = 4;
codepoint = xmlGetUTF8Char(
&exec->inputString[exec->index], &len);
if (codepoint < 0) {
exec->status = XML_REGEXP_INVALID_UTF8;
goto error;
}
ret = xmlRegCheckCharacter(atom, codepoint);
exec->transcount++;
} while (ret == 1);
@ -3285,7 +3306,7 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
}
if (trans->counter >= 0) {
if (exec->counts == NULL) {
exec->status = -1;
exec->status = XML_REGEXP_INTERNAL_ERROR;
goto error;
}
exec->counts[trans->counter]--;
@ -3319,7 +3340,7 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
if ((exec->counts == NULL) ||
(exec->comp == NULL) ||
(exec->comp->counters == NULL)) {
exec->status = -1;
exec->status = XML_REGEXP_INTERNAL_ERROR;
goto error;
}
counter = &exec->comp->counters[trans->counter];
@ -3330,7 +3351,7 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
if ((trans->count >= 0) &&
(trans->count < REGEXP_ALL_COUNTER)) {
if (exec->counts == NULL) {
exec->status = -1;
exec->status = XML_REGEXP_INTERNAL_ERROR;
goto error;
}
exec->counts[trans->count] = 0;
@ -3342,7 +3363,7 @@ xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
}
goto progress;
} else if (ret < 0) {
exec->status = -4;
exec->status = XML_REGEXP_INTERNAL_ERROR;
break;
}
}
@ -3369,16 +3390,13 @@ error:
xmlFree(exec->rollbacks);
}
if (exec->state == NULL)
return(-1);
return(XML_REGEXP_INTERNAL_ERROR);
if (exec->counts != NULL)
xmlFree(exec->counts);
if (exec->status == 0)
if (exec->status == XML_REGEXP_OK)
return(1);
if (exec->status == -1) {
if (exec->nbPush > MAX_PUSH)
return(-1);
if (exec->status == XML_REGEXP_NOT_FOUND)
return(0);
}
return(exec->status);
}
@ -3419,7 +3437,7 @@ xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
exec->maxRollbacks = 0;
exec->nbRollbacks = 0;
exec->rollbacks = NULL;
exec->status = 0;
exec->status = XML_REGEXP_OK;
exec->comp = comp;
if (comp->compact == NULL)
exec->state = comp->states[0];
@ -3639,8 +3657,8 @@ error:
xmlFree(exec->errString);
exec->errString = xmlStrdup(value);
exec->errStateNo = state;
exec->status = -1;
return(-1);
exec->status = XML_REGEXP_NOT_FOUND;
return(XML_REGEXP_NOT_FOUND);
}
/**
@ -3668,7 +3686,7 @@ xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
return(-1);
if (exec->comp == NULL)
return(-1);
if (exec->status != 0)
if (exec->status != XML_REGEXP_OK)
return(exec->status);
if (exec->comp->compact != NULL)
@ -3690,7 +3708,7 @@ xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
data = exec->inputStack[exec->index].data;
}
while ((exec->status == 0) &&
while ((exec->status == XML_REGEXP_OK) &&
((value != NULL) ||
((final == 1) &&
(exec->state->type != XML_REGEXP_FINAL_STATE)))) {
@ -3780,7 +3798,7 @@ xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
ret = ((count >= counter->min) && (count <= counter->max));
} else if (atom == NULL) {
fprintf(stderr, "epsilon transition left at runtime\n");
exec->status = -2;
exec->status = XML_REGEXP_INTERNAL_ERROR;
break;
} else if (value != NULL) {
ret = xmlRegStrEqualWildcard(atom->valuep, value);
@ -3915,7 +3933,7 @@ xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
}
goto progress;
} else if (ret < 0) {
exec->status = -4;
exec->status = XML_REGEXP_INTERNAL_ERROR;
break;
}
}
@ -3942,7 +3960,8 @@ rollback:
*/
exec->determinist = 0;
xmlFARegExecRollBack(exec);
if ((exec->inputStack != NULL ) && (exec->status == 0)) {
if ((exec->inputStack != NULL ) &&
(exec->status == XML_REGEXP_OK)) {
value = exec->inputStack[exec->index].value;
data = exec->inputStack[exec->index].data;
}
@ -3952,7 +3971,7 @@ progress:
progress = 1;
continue;
}
if (exec->status == 0) {
if (exec->status == XML_REGEXP_OK) {
return(exec->state->type == XML_REGEXP_FINAL_STATE);
}
return(exec->status);
@ -3998,7 +4017,7 @@ xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
return(-1);
if (exec->comp == NULL)
return(-1);
if (exec->status != 0)
if (exec->status != XML_REGEXP_OK)
return(exec->status);
if (value2 == NULL)
@ -4010,7 +4029,7 @@ xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
if (150 < lenn + lenp + 2) {
str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
if (str == NULL) {
exec->status = -1;
exec->status = XML_REGEXP_OUT_OF_MEMORY;
return(-1);
}
} else {
@ -4239,7 +4258,7 @@ xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
if (exec == NULL)
return(-1);
if (string != NULL) {
if (exec->status != 0)
if (exec->status != XML_REGEXP_OK)
*string = exec->errString;
else
*string = NULL;
@ -4257,10 +4276,10 @@ xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
if (exec == NULL)
return(-1);
if (exec->status != 0)
if (exec->status != XML_REGEXP_OK)
return(exec->status);
while ((exec->status == 0) &&
while ((exec->status == XML_REGEXP_OK) &&
((exec->inputString[exec->index] != 0) ||
(exec->state->type != XML_REGEXP_FINAL_STATE))) {
@ -4292,7 +4311,7 @@ xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
ret = ((count >= counter->min) && (count <= counter->max));
} else if (atom == NULL) {
fprintf(stderr, "epsilon transition left at runtime\n");
exec->status = -2;
exec->status = XML_REGEXP_INTERNAL_ERROR;
break;
} else if (exec->inputString[exec->index] != 0) {
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
@ -4374,7 +4393,7 @@ xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
}
goto progress;
} else if (ret < 0) {
exec->status = -4;
exec->status = XML_REGEXP_INTERNAL_ERROR;
break;
}
}
@ -4409,7 +4428,12 @@ xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
int cur;
int len;
cur = CUR_SCHAR(ctxt->cur, len);
len = 4;
cur = xmlGetUTF8Char(ctxt->cur, &len);
if (cur < 0) {
ERROR("Invalid UTF-8");
return(0);
}
if ((cur == '.') || (cur == '\\') || (cur == '?') ||
(cur == '*') || (cur == '+') || (cur == '(') ||
(cur == ')') || (cur == '|') || (cur == 0x5B) ||
@ -4897,7 +4921,12 @@ xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
end = start;
len = 1;
} else if ((cur != 0x5B) && (cur != 0x5D)) {
end = start = CUR_SCHAR(ctxt->cur, len);
len = 4;
end = start = xmlGetUTF8Char(ctxt->cur, &len);
if (start < 0) {
ERROR("Invalid UTF-8");
return;
}
} else {
ERROR("Expecting a char range");
return;
@ -4936,7 +4965,12 @@ xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
}
len = 1;
} else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
end = CUR_SCHAR(ctxt->cur, len);
len = 4;
end = xmlGetUTF8Char(ctxt->cur, &len);
if (end < 0) {
ERROR("Invalid UTF-8");
return;
}
} else {
ERROR("Expecting the end of a char range");
return;
@ -5151,7 +5185,12 @@ xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
if (ctxt->atom == NULL)
return(-1);
codepoint = CUR_SCHAR(ctxt->cur, len);
len = 4;
codepoint = xmlGetUTF8Char(ctxt->cur, &len);
if (codepoint < 0) {
ERROR("Invalid UTF-8");
return(-1);
}
ctxt->atom->codepoint = codepoint;
NEXTL(len);
return(1);
@ -5365,6 +5404,9 @@ xmlRegexpCompile(const xmlChar *regexp) {
xmlRegexpPtr ret = NULL;
xmlRegParserCtxtPtr ctxt;
if (regexp == NULL)
return(NULL);
ctxt = xmlRegNewParserCtxt(regexp);
if (ctxt == NULL)
return(NULL);