linux/fs/unicode/utf8n.h

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2014 SGI.
 * All rights reserved.
 */

#ifndef UTF8NORM_H
#define UTF8NORM_H

#include <linux/types.h>
#include <linux/export.h>
#include <linux/string.h>
#include <linux/module.h>

/*
 * A unicode version (major.minor.revision) is packed into one unsigned
 * int: the major number is shifted left by UNICODE_MAJ_SHIFT bits, the
 * minor number by UNICODE_MIN_SHIFT bits, and the revision is OR'ed in
 * unshifted.
 */
#define UNICODE_MAJ_SHIFT		(16)
#define UNICODE_MIN_SHIFT		(8)

#define UNICODE_AGE(MAJ, MIN, REV)				\
	(((unsigned int)(REV)) |				\
	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |		\
	 ((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT))

/*
 * Unicode version queries against the data tables.
 *
 * NOTE(review): the return conventions are not visible from this header;
 * presumably utf8version_is_supported() is non-zero when maj.min.rev is
 * covered by the tables and utf8version_latest() yields the highest
 * supported version -- confirm against the implementation.
 */
int utf8version_is_supported(u8 maj, u8 min, u8 rev);
int utf8version_latest(void);

/*
 * Find the const struct utf8data that matches a unicode version.
 * NULL is returned when the requested version is too new.
 *
 * There is one lookup function per supported normalization form:
 *
 * utf8nfdi() -- nfdi:
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *
 * utf8nfdicf() -- nfdicf:
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *  - Apply a full casefold (C + F).
 */
const struct utf8data *utf8nfdi(unsigned int maxage);
const struct utf8data *utf8nfdicf(unsigned int maxage);

/*
 * Find the greatest age among the unicode characters of a string.
 *
 * utf8agemax() works on a null-terminated string, utf8nagemax() on a
 * string limited to len bytes.
 *
 * Returns 0 if only unassigned code points are present.
 * Returns -1 if the input is not valid UTF-8.
 */
int utf8agemax(const struct utf8data *data, const char *s);
int utf8nagemax(const struct utf8data *data, const char *s, size_t len);

/*
 * Find the smallest age among the unicode characters of a string.
 *
 * utf8agemin() works on a null-terminated string, utf8nagemin() on a
 * string limited to len bytes.
 *
 * Returns 0 if any unassigned code points are present.
 * Returns -1 if the input is not valid UTF-8.
 */
int utf8agemin(const struct utf8data *data, const char *s);
int utf8nagemin(const struct utf8data *data, const char *s, size_t len);

/*
 * Compute the length of the normalized form of a string, not counting
 * any terminating NUL byte.
 *
 * utf8len() works on a null-terminated string, utf8nlen() on a string
 * limited to len bytes.
 *
 * Returns 0 if only ignorable code points are present.
 * Returns -1 if the input is not valid UTF-8.
 */
ssize_t utf8len(const struct utf8data *data, const char *s);
ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);

/* Size in bytes of the hangul[] buffer embedded in struct utf8cursor. */
#define UTF8HANGULLEAF	(12)

/*
 * State carried by the normalizer between calls.
 *
 * hangul[] provides UTF8HANGULLEAF bytes of scratch space.
 *
 * NOTE(review): the remaining fields are not described in this header;
 * presumably s/p are positions in the string being normalized, ss/sp and
 * slen are saved copies of s/p and len, and ccc/nccc are canonical
 * combining classes -- confirm against the normalizer implementation.
 */
struct utf8cursor {
	const struct utf8data	*data;
	const char		*s;
	const char		*p;
	const char		*ss;
	const char		*sp;
	unsigned int		len;
	unsigned int		slen;
	short			ccc;
	short			nccc;
	unsigned char		hangul[UTF8HANGULLEAF];
};

/*
 * Prepare a utf8cursor for normalizing the string s.
 *
 * utf8cursor() works on a null-terminated string, utf8ncursor() on a
 * string limited to len bytes.
 *
 * Returns 0 on success.
 * Returns -1 on failure.
 */
int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
	       const char *s);
int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
		const char *s, size_t len);

/*
 * Fetch the next byte of the normalization from a cursor.
 *
 * Returns a value > 0 && < 256 on success.
 * Returns 0 once the end of the normalization has been reached.
 * Returns -1 if the string being normalized is not valid UTF-8.
 */
int utf8byte(struct utf8cursor *u8c);

#endif /* UTF8NORM_H */
treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 294 Based on 2 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license 2 as published by the free software foundation this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation this program is distributed in the hope that it [would] be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 9 file(s). Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Allison Randal <allison@lohutok.net> Reviewed-by: Alexios Zavras <alexios.zavras@intel.com> Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190529141901.804956444@linutronix.de Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2019-05-29 17:18:08 +03:00			`/* SPDX-License-Identifier: GPL-2.0-only */`
unicode: introduce code for UTF-8 normalization Supporting functions for UTF-8 normalization are in utf8norm.c with the header utf8norm.h. Two normalization forms are supported: nfdi and nfdicf. nfdi: - Apply unicode normalization form NFD. - Remove any Default_Ignorable_Code_Point. nfdicf: - Apply unicode normalization form NFD. - Remove any Default_Ignorable_Code_Point. - Apply a full casefold (C + F). For the purposes of the code, a string is valid UTF-8 if: - The values encoded are 0x1..0x10FFFF. - The surrogate codepoints 0xD800..0xDFFFF are not encoded. - The shortest possible encoding is used for all values. The supporting functions work on null-terminated strings (utf8 prefix) and on length-limited strings (utf8n prefix). From the original SGI patch and for conformity with coding standards, the utf8data_t typedef was dropped, since it was just masking the struct keyword. On other occasions, namely utf8leaf_t and utf8trie_t, I decided to keep it, since they are simple pointers to memory buffers, and using uchars here wouldn't provide any more meaningful information. From the original submission, we also converted from the compatibility form to canonical. Changes made by Gabriel: Rebase to Mainline Fix up checkpatch.pl warnings Drop typedefs move out of libxfs Convert from NFKD to NFD Signed-off-by: Olaf Weber <olaf@sgi.com> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu> 2019-04-25 20:45:46 +03:00			`/*`
			`* Copyright (c) 2014 SGI.`
			`* All rights reserved.`
			`*/`

			`#ifndef UTF8NORM_H`
			`#define UTF8NORM_H`

			`#include <linux/types.h>`
			`#include <linux/export.h>`
			`#include <linux/string.h>`
			`#include <linux/module.h>`

			`/* Encoding a unicode version number as a single unsigned int. */`
			`#define UNICODE_MAJ_SHIFT (16)`
			`#define UNICODE_MIN_SHIFT (8)`

			`#define UNICODE_AGE(MAJ, MIN, REV) \`
			`(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) \| \`
			`((unsigned int)(MIN) << UNICODE_MIN_SHIFT) \| \`
			`((unsigned int)(REV)))`

			`/* Highest unicode version supported by the data tables. */`
			`extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);`
unicode: implement higher level API for string handling This patch integrates the utf8n patches with some higher level API to perform UTF-8 string comparison, normalization and casefolding operations. Implemented is a variation of NFD, and casefold is performed by doing full casefold on top of NFD. These algorithms are based on the core implemented by Olaf Weber from SGI. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu> 2019-04-25 20:51:22 +03:00			`extern int utf8version_latest(void);`
unicode: introduce code for UTF-8 normalization Supporting functions for UTF-8 normalization are in utf8norm.c with the header utf8norm.h. Two normalization forms are supported: nfdi and nfdicf. nfdi: - Apply unicode normalization form NFD. - Remove any Default_Ignorable_Code_Point. nfdicf: - Apply unicode normalization form NFD. - Remove any Default_Ignorable_Code_Point. - Apply a full casefold (C + F). For the purposes of the code, a string is valid UTF-8 if: - The values encoded are 0x1..0x10FFFF. - The surrogate codepoints 0xD800..0xDFFFF are not encoded. - The shortest possible encoding is used for all values. The supporting functions work on null-terminated strings (utf8 prefix) and on length-limited strings (utf8n prefix). From the original SGI patch and for conformity with coding standards, the utf8data_t typedef was dropped, since it was just masking the struct keyword. On other occasions, namely utf8leaf_t and utf8trie_t, I decided to keep it, since they are simple pointers to memory buffers, and using uchars here wouldn't provide any more meaningful information. From the original submission, we also converted from the compatibility form to canonical. Changes made by Gabriel: Rebase to Mainline Fix up checkpatch.pl warnings Drop typedefs move out of libxfs Convert from NFKD to NFD Signed-off-by: Olaf Weber <olaf@sgi.com> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu> 2019-04-25 20:45:46 +03:00
			`/*`
			`* Look for the correct const struct utf8data for a unicode version.`
			`* Returns NULL if the version requested is too new.`
			`*`
			`* Two normalization forms are supported: nfdi and nfdicf.`
			`*`
			`* nfdi:`
			`* - Apply unicode normalization form NFD.`
			`* - Remove any Default_Ignorable_Code_Point.`
			`*`
			`* nfdicf:`
			`* - Apply unicode normalization form NFD.`
			`* - Remove any Default_Ignorable_Code_Point.`
			`* - Apply a full casefold (C + F).`
			`*/`
			`extern const struct utf8data *utf8nfdi(unsigned int maxage);`
			`extern const struct utf8data *utf8nfdicf(unsigned int maxage);`

			`/*`
			`* Determine the maximum age of any unicode character in the string.`
			`* Returns 0 if only unassigned code points are present.`
			`* Returns -1 if the input is not valid UTF-8.`
			`*/`
			`extern int utf8agemax(const struct utf8data *data, const char *s);`
			`extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len);`

			`/*`
			`* Determine the minimum age of any unicode character in the string.`
			`* Returns 0 if any unassigned code points are present.`
			`* Returns -1 if the input is not valid UTF-8.`
			`*/`
			`extern int utf8agemin(const struct utf8data *data, const char *s);`
			`extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len);`

			`/*`
			`* Determine the length of the normalized form of the string,`
			`* excluding any terminating NULL byte.`
			`* Returns 0 if only ignorable code points are present.`
			`* Returns -1 if the input is not valid UTF-8.`
			`*/`
			`extern ssize_t utf8len(const struct utf8data *data, const char *s);`
			`extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);`

unicode: reduce the size of utf8data[] Remove the Hangul decompositions from the utf8data trie, and do algorithmic decomposition to calculate them on the fly. To store the decomposition the caller of utf8lookup()/utf8nlookup() must provide a 12-byte buffer, which is used to synthesize a leaf with the decomposition. This significantly reduces the size of the utf8data[] array. Changes made by Gabriel: Rebase to mainline Fix checkpatch errors Extract robustness fixes and merge back to original mkutf8data.c patch Regenerate utf8data.h Signed-off-by: Olaf Weber <olaf@sgi.com> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu> 2019-04-25 20:49:18 +03:00			`/* Needed in struct utf8cursor below. */`
			`#define UTF8HANGULLEAF (12)`

unicode: introduce code for UTF-8 normalization Supporting functions for UTF-8 normalization are in utf8norm.c with the header utf8norm.h. Two normalization forms are supported: nfdi and nfdicf. nfdi: - Apply unicode normalization form NFD. - Remove any Default_Ignorable_Code_Point. nfdicf: - Apply unicode normalization form NFD. - Remove any Default_Ignorable_Code_Point. - Apply a full casefold (C + F). For the purposes of the code, a string is valid UTF-8 if: - The values encoded are 0x1..0x10FFFF. - The surrogate codepoints 0xD800..0xDFFFF are not encoded. - The shortest possible encoding is used for all values. The supporting functions work on null-terminated strings (utf8 prefix) and on length-limited strings (utf8n prefix). From the original SGI patch and for conformity with coding standards, the utf8data_t typedef was dropped, since it was just masking the struct keyword. On other occasions, namely utf8leaf_t and utf8trie_t, I decided to keep it, since they are simple pointers to memory buffers, and using uchars here wouldn't provide any more meaningful information. From the original submission, we also converted from the compatibility form to canonical. Changes made by Gabriel: Rebase to Mainline Fix up checkpatch.pl warnings Drop typedefs move out of libxfs Convert from NFKD to NFD Signed-off-by: Olaf Weber <olaf@sgi.com> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu> 2019-04-25 20:45:46 +03:00			`/*`
			`* Cursor structure used by the normalizer.`
			`*/`
			`struct utf8cursor {`
			`const struct utf8data *data;`
			`const char *s;`
			`const char *p;`
			`const char *ss;`
			`const char *sp;`
			`unsigned int len;`
			`unsigned int slen;`
			`short int ccc;`
			`short int nccc;`
unicode: reduce the size of utf8data[] Remove the Hangul decompositions from the utf8data trie, and do algorithmic decomposition to calculate them on the fly. To store the decomposition the caller of utf8lookup()/utf8nlookup() must provide a 12-byte buffer, which is used to synthesize a leaf with the decomposition. This significantly reduces the size of the utf8data[] array. Changes made by Gabriel: Rebase to mainline Fix checkpatch errors Extract robustness fixes and merge back to original mkutf8data.c patch Regenerate utf8data.h Signed-off-by: Olaf Weber <olaf@sgi.com> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu> 2019-04-25 20:49:18 +03:00			`unsigned char hangul[UTF8HANGULLEAF];`
unicode: introduce code for UTF-8 normalization Supporting functions for UTF-8 normalization are in utf8norm.c with the header utf8norm.h. Two normalization forms are supported: nfdi and nfdicf. nfdi: - Apply unicode normalization form NFD. - Remove any Default_Ignorable_Code_Point. nfdicf: - Apply unicode normalization form NFD. - Remove any Default_Ignorable_Code_Point. - Apply a full casefold (C + F). For the purposes of the code, a string is valid UTF-8 if: - The values encoded are 0x1..0x10FFFF. - The surrogate codepoints 0xD800..0xDFFFF are not encoded. - The shortest possible encoding is used for all values. The supporting functions work on null-terminated strings (utf8 prefix) and on length-limited strings (utf8n prefix). From the original SGI patch and for conformity with coding standards, the utf8data_t typedef was dropped, since it was just masking the struct keyword. On other occasions, namely utf8leaf_t and utf8trie_t, I decided to keep it, since they are simple pointers to memory buffers, and using uchars here wouldn't provide any more meaningful information. From the original submission, we also converted from the compatibility form to canonical. Changes made by Gabriel: Rebase to Mainline Fix up checkpatch.pl warnings Drop typedefs move out of libxfs Convert from NFKD to NFD Signed-off-by: Olaf Weber <olaf@sgi.com> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu> 2019-04-25 20:45:46 +03:00			`};`

			`/*`
			`* Initialize a utf8cursor to normalize a string.`
			`* Returns 0 on success.`
			`* Returns -1 on failure.`
			`*/`
			`extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,`
			`const char *s);`
			`extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,`
			`const char *s, size_t len);`

			`/*`
			`* Get the next byte in the normalization.`
			`* Returns a value > 0 && < 256 on success.`
			`* Returns 0 when the end of the normalization is reached.`
			`* Returns -1 if the string being normalized is not valid UTF-8.`
			`*/`
			`extern int utf8byte(struct utf8cursor *u8c);`

			`#endif /* UTF8NORM_H */`