1
0
mirror of https://github.com/samba-team/samba.git synced 2024-12-22 13:34:15 +03:00

util/str: helper to check for utf-8 validity

Signed-off-by: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Reviewed-by: Andrew Bartlett <abartlet@samba.org>
This commit is contained in:
Douglas Bagnall 2023-07-06 15:31:52 +12:00 committed by Andrew Bartlett
parent 65674cde60
commit e4da279b1c
2 changed files with 126 additions and 0 deletions

View File

@ -167,6 +167,11 @@ char *strrchr_m(const char *s, char c);
char *strchr_m(const char *s, char c);
char *strstr_m(const char *src, const char *findstr);
/*
 * Check a UTF-8 string for validity, examining at most maxlen bytes and
 * stopping at the first '\0'. byte_len, char_len and utf16_len receive
 * the size of the leading valid section in bytes, characters and UTF-16
 * code units respectively; they are written on both success and failure.
 * Returns true when no invalid sequence was found.
 */
bool utf8_check(const char *input, size_t maxlen,
size_t *byte_len,
size_t *char_len,
size_t *utf16_len);
bool push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src, size_t *converted_size);
bool push_ucs2_talloc(TALLOC_CTX *ctx, smb_ucs2_t **dest, const char *src, size_t *converted_size);
bool push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src, size_t *converted_size);

View File

@ -222,6 +222,127 @@ size_t utf16_len_n(const void *src, size_t n)
return len;
}
/**
 * Determine the length and validity of a utf-8 string.
 *
 * The scan ends at the first '\0' byte, after maxlen bytes, or at the
 * first malformed sequence, whichever comes first. The three output
 * values always describe the valid section that precedes that point,
 * whether or not the function succeeds.
 *
 * Overlong encodings, encoded UTF-16 surrogates (U+D800..U+DFFF),
 * codepoints above U+10FFFF, stray continuation bytes, and sequences
 * cut short by maxlen or by a '\0' are all treated as invalid.
 *
 * @param input the string pointer
 * @param maxlen maximum number of bytes to examine
 * @param byte_len receives the length in bytes of the valid section
 * @param char_len receives the number of unicode characters in the valid section
 * @param utf16_len receives the number of 16-bit UTF-16 code units (not
 *                  bytes) the valid section would need in UTF-16 encoding
 *
 * The output pointers are dereferenced unconditionally and must not be NULL.
 *
 * @return true if the input is valid up to maxlen, or a '\0' byte, otherwise false.
 */
bool utf8_check(const char *input, size_t maxlen,
		size_t *byte_len,
		size_t *char_len,
		size_t *utf16_len)
{
	const uint8_t *s = (const uint8_t *)input;
	size_t i;
	size_t chars = 0;
	size_t long_chars = 0;
	uint32_t codepoint;
	bool ok = false;

	/*
	 * Continuation bytes are tested one at a time, relying on
	 * short-circuit evaluation, so that a sequence cut short by the
	 * terminating '\0' never causes a read beyond that terminator
	 * (maxlen may legitimately be larger than the actual buffer).
	 */
	for (i = 0; i < maxlen; i++, chars++) {
		if (s[i] == 0) {
			break;
		}
		if (s[i] < 0x80) {
			/* plain ASCII */
			continue;
		}
		if ((s[i] & 0xe0) == 0xc0) {
			/* 110xxxxx 10xxxxxx */
			if (maxlen - i < 2) {
				goto done;
			}
			if ((s[i + 1] & 0xc0) != 0x80) {
				goto done;
			}
			codepoint = (uint32_t)(s[i] & 0x1f) << 6 |
				    (uint32_t)(s[i + 1] & 0x3f);
			if (codepoint < 0x80) {
				/* overlong encoding of an ASCII character */
				goto done;
			}
			i++;
			continue;
		}
		if ((s[i] & 0xf0) == 0xe0) {
			/* 1110xxxx 10xxxxxx 10xxxxxx */
			if (maxlen - i < 3) {
				goto done;
			}
			if ((s[i + 1] & 0xc0) != 0x80 ||
			    (s[i + 2] & 0xc0) != 0x80) {
				goto done;
			}
			codepoint = (uint32_t)(s[i] & 0x0f) << 12 |
				    (uint32_t)(s[i + 1] & 0x3f) << 6 |
				    (uint32_t)(s[i + 2] & 0x3f);
			if (codepoint < 0x800) {
				/* overlong encoding */
				goto done;
			}
			if (codepoint >= 0xd800 && codepoint <= 0xdfff) {
				/*
				 * This is an invalid codepoint, per
				 * RFC3629, as it encodes part of a
				 * UTF-16 surrogate pair for a
				 * character over U+10000, which ought
				 * to have been encoded as a four byte
				 * utf-8 sequence.
				 */
				goto done;
			}
			i += 2;
			continue;
		}
		if ((s[i] & 0xf8) == 0xf0) {
			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
			if (maxlen - i < 4) {
				goto done;
			}
			if ((s[i + 1] & 0xc0) != 0x80 ||
			    (s[i + 2] & 0xc0) != 0x80 ||
			    (s[i + 3] & 0xc0) != 0x80) {
				goto done;
			}
			codepoint = (uint32_t)(s[i] & 0x07) << 18 |
				    (uint32_t)(s[i + 1] & 0x3f) << 12 |
				    (uint32_t)(s[i + 2] & 0x3f) << 6 |
				    (uint32_t)(s[i + 3] & 0x3f);
			if (codepoint < 0x10000 || codepoint > 0x10ffff) {
				/* overlong, or beyond the Unicode range */
				goto done;
			}
			/* this one will need two UTF16 characters */
			long_chars++;
			i += 3;
			continue;
		}
		/*
		 * If it wasn't handled yet, it's wrong (a stray
		 * continuation byte, or a lead byte of 0xf8 and above).
		 */
		goto done;
	}
	ok = true;
done:
	/*
	 * On failure the loop increment has not run for the offending
	 * sequence, so i and chars cover only the valid section.
	 */
	*byte_len = i;
	*char_len = chars;
	*utf16_len = chars + long_chars;
	return ok;
}
/**
* Copy a string from a char* unix src to a dos codepage string destination.
*