2024-12-22 13:34:15 +03:00 · 2023-07-06 15:31:52 +12:00 · 2023-07-06 15:31:52 +12:00 · e4da279b1c
commit e4da279b1c
parent 65674cde60
2 changed files with 126 additions and 0 deletions
--- a/lib/util/charset/charset.h
+++ b/lib/util/charset/charset.h
@ -167,6 +167,11 @@ char *strrchr_m(const char *s, char c);
 char *strchr_m(const char *s, char c);
 char *strstr_m(const char *src, const char *findstr);
 /*
  * Validate a UTF-8 string, stopping at maxlen bytes or at the first
  * '\0' byte (defined in util_unistr.c). Returns true when no invalid
  * sequence was found before that point. The three outputs are written
  * on failure as well as on success: *byte_len and *char_len describe
  * the section scanned before stopping, and *utf16_len is *char_len
  * plus one extra unit for every four-byte sequence in that section.
  */
 bool utf8_check(const char *input, size_t maxlen,
 		size_t *byte_len,
 		size_t *char_len,
 		size_t *utf16_len);
 bool push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src, size_t *converted_size);
 bool push_ucs2_talloc(TALLOC_CTX *ctx, smb_ucs2_t **dest, const char *src, size_t *converted_size);
 bool push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src, size_t *converted_size);
--- a/lib/util/charset/util_unistr.c
+++ b/lib/util/charset/util_unistr.c
@ -222,6 +222,127 @@ size_t utf16_len_n(const void *src, size_t n)
 	return len;
 }
 /**
 * Determine the length and validity of a utf-8 string.
 *
 * The scan stops at the first '\0' byte, after maxlen bytes, or at the
 * first byte that does not begin a well-formed UTF-8 sequence,
 * whichever comes first. Overlong encodings, encoded UTF-16 surrogates
 * (U+D800..U+DFFF) and code points above U+10FFFF are rejected.
 * Continuation bytes are examined one at a time, so a sequence cut
 * short by a '\0' terminator is rejected without reading beyond that
 * terminator.
 *
 * The outputs are written on failure as well as on success; on failure
 * they describe the valid section that precedes the bad sequence. Any
 * of the output pointers may be NULL if that value is not wanted.
 *
 * @param input the string pointer
 * @param maxlen maximum size of the string
 * @param byte_len receives the length in bytes of the valid section
 * @param char_len receives the number of unicode characters in the valid section
 * @param utf16_len receives the number of 16-bit code units (not bytes) the valid section would need in UTF-16 encoding
 *
 * @return true if the input is valid up to maxlen, or a '\0' byte, otherwise false.
 */
 bool utf8_check(const char *input, size_t maxlen,
 		size_t *byte_len,
 		size_t *char_len,
 		size_t *utf16_len)
 {
 	const uint8_t *s = (const uint8_t *)input;
 	size_t i;
 	size_t chars = 0;
 	size_t long_chars = 0;
 	uint32_t codepoint;
 	uint8_t a, b, c, d;
 	bool valid = false;
 	for (i = 0; i < maxlen; i++, chars++) {
 		a = s[i];
 		if (a == 0) {
 			break;
 		}
 		if (a < 0x80) {
 			/* plain ASCII */
 			continue;
 		}
 		if ((a & 0xe0) == 0xc0) {
 			/* 110xxxxx 10xxxxxx */
 			if (maxlen - i < 2) {
 				goto done;
 			}
 			b = s[i + 1];
 			if ((b & 0xc0) != 0x80) {
 				goto done;
 			}
 			codepoint = (a & 31) << 6 | (b & 63);
 			if (codepoint < 0x80) {
 				/* overlong encoding of an ASCII character */
 				goto done;
 			}
 			i++;
 			continue;
 		}
 		if ((a & 0xf0) == 0xe0) {
 			/* 1110xxxx 10xxxxxx 10xxxxxx */
 			if (maxlen - i < 3) {
 				goto done;
 			}
 			/*
 			 * Check each continuation byte before loading
 			 * the next one: a '\0' is never a continuation
 			 * byte, so we never look past a terminator.
 			 */
 			b = s[i + 1];
 			if ((b & 0xc0) != 0x80) {
 				goto done;
 			}
 			c = s[i + 2];
 			if ((c & 0xc0) != 0x80) {
 				goto done;
 			}
 			codepoint = (c & 63) | (b & 63) << 6 | (a & 15) << 12;
 			if (codepoint < 0x800) {
 				/* overlong encoding */
 				goto done;
 			}
 			if (codepoint >= 0xd800 && codepoint <= 0xdfff) {
 				/*
 				 * This is an invalid codepoint, per
 				 * RFC3629, as it encodes part of a
 				 * UTF-16 surrogate pair for a
 				 * character over U+10000, which ought
 				 * to have been encoded as a four byte
 				 * utf-8 sequence.
 				 */
 				goto done;
 			}
 			i += 2;
 			continue;
 		}
 		if ((a & 0xf8) == 0xf0) {
 			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
 			if (maxlen - i < 4) {
 				goto done;
 			}
 			/* As above, stop at the first bad continuation byte. */
 			b = s[i + 1];
 			if ((b & 0xc0) != 0x80) {
 				goto done;
 			}
 			c = s[i + 2];
 			if ((c & 0xc0) != 0x80) {
 				goto done;
 			}
 			d = s[i + 3];
 			if ((d & 0xc0) != 0x80) {
 				goto done;
 			}
 			codepoint = (d & 63) | (c & 63) << 6 | (b & 63) << 12 | (a & 7) << 18;
 			if (codepoint < 0x10000 || codepoint > 0x10ffff) {
 				/* overlong, or beyond the Unicode range */
 				goto done;
 			}
 			/* this one will need two UTF16 characters */
 			long_chars++;
 			i += 3;
 			continue;
 		}
 		/*
 		 * If it wasn't handled yet, it's wrong.
 		 */
 		goto done;
 	}
 	valid = true;
 done:
 	/*
 	 * On failure i still indexes the first byte of the offending
 	 * sequence, so the lengths cover only the valid prefix.
 	 */
 	if (byte_len != NULL) {
 		*byte_len = i;
 	}
 	if (char_len != NULL) {
 		*char_len = chars;
 	}
 	if (utf16_len != NULL) {
 		*utf16_len = chars + long_chars;
 	}
 	return valid;
 }
 /**
 * Copy a string from a char* unix src to a dos codepage string destination.
 *