1
0
mirror of https://github.com/samba-team/samba.git synced 2025-01-11 05:18:09 +03:00
samba-mirror/lib/util/charset/codepoints.c
Andrew Bartlett f9acf770e9 lib/util/charset: remove charset module loading
Now that the 'table' modules are gone, there is no reason for there to
be charset modules at all.  This builds the macosxfs and weird modules
into the binary at the appropriate times, and changes the tests to
test instead the difference between the remaining internal handlers
and iconv().

Andrew Bartlett

Autobuild-User: Andrew Bartlett <abartlet@samba.org>
Autobuild-Date: Tue Sep 20 06:27:06 CEST 2011 on sn-devel-104
2011-09-20 06:27:06 +02:00

478 lines
12 KiB
C

/*
Unix SMB/CIFS implementation.
Character set conversion Extensions
Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
Copyright (C) Andrew Tridgell 2001
Copyright (C) Simo Sorce 2001
Copyright (C) Jelmer Vernooij 2007
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "includes.h"
#include "lib/util/charset/charset.h"
#include "system/locale.h"
#include "dynconfig/dynconfig.h"
#ifdef strcasecmp
#undef strcasecmp
#endif
/**
* @file
* @brief Unicode string manipulation
*/
/*
 * These 2 tables define the unicode case handling. They are loaded
 * at startup either via mmap() or read() from the lib directory
 * (see load_case_tables_library(), which fills them using map_file()).
 *
 * Each pointer is in one of three states:
 *   NULL        - loading has not been attempted yet
 *   (void *)-1  - loading failed; the case functions return non-ASCII
 *                 codepoints unchanged
 *   otherwise   - mapped table, indexed by codepoint with two bytes per
 *                 entry (read with SVAL(table, val*2))
 */
static void *upcase_table;
static void *lowcase_table;
/*******************************************************************
 Load the upper and lower case mapping tables.

 This is the entry point that library code should use. The tables
 upcase.dat and lowcase.dat are mapped from the directory returned by
 get_dyn_CODEPAGEDIR(). A table that cannot be loaded is marked with
 the sentinel (void *)-1 so that no further load is attempted and the
 case functions fall back to ASCII-only rules.
********************************************************************/
void load_case_tables_library(void)
{
	TALLOC_CTX *tmp_ctx = talloc_init("load_case_tables");
	char *path;

	if (tmp_ctx == NULL) {
		smb_panic("No memory for case_tables");
	}

	/* The path strings only need to live until map_file() returns. */
	path = talloc_asprintf(tmp_ctx, "%s/upcase.dat", get_dyn_CODEPAGEDIR());
	upcase_table = map_file(path, 0x20000);

	path = talloc_asprintf(tmp_ctx, "%s/lowcase.dat", get_dyn_CODEPAGEDIR());
	lowcase_table = map_file(path, 0x20000);

	talloc_free(tmp_ctx);

	if (upcase_table == NULL) {
		DEBUG(1, ("Failed to load upcase.dat, will use lame ASCII-only case sensitivity rules\n"));
		upcase_table = (void *)-1;
	}

	if (lowcase_table == NULL) {
		DEBUG(1, ("Failed to load lowcase.dat, will use lame ASCII-only case sensitivity rules\n"));
		lowcase_table = (void *)-1;
	}
}
/*******************************************************************
 Application entry point for loading the case handling tables.

 Only main() of an application may call this, never library code:
 it sets the locale from the environment, and a library cannot know
 whether the calling program has already selected a different locale
 with setlocale().
********************************************************************/
void load_case_tables(void)
{
#if defined(HAVE_SETLOCALE)
	/* Global hook: take the locale from the environment so that
	 * LOCALE can be used as a codepage. */
	setlocale(LC_ALL, "");
#endif

	load_case_tables_library();
}
/**
 Map a codepoint to its upper case form.

 ASCII values go through toupper(). Other values are looked up in the
 upcase table, which is loaded on first use. If the table could not be
 loaded, or the codepoint does not fit in 16 bits, the value is
 returned unchanged.
**/
_PUBLIC_ codepoint_t toupper_m(codepoint_t val)
{
	if (val < 128) {
		return toupper(val);
	}

	/* lazy load on first non-ASCII request */
	if (upcase_table == NULL) {
		load_case_tables_library();
	}

	/* no table available, or codepoint outside the table's range */
	if (upcase_table == (void *)-1 || (val & 0xFFFF0000) != 0) {
		return val;
	}

	return SVAL(upcase_table, val*2);
}
/**
 Map a codepoint to its lower case form.

 ASCII values go through tolower(). Other values are looked up in the
 lowcase table, which is loaded on first use. If the table could not be
 loaded, or the codepoint does not fit in 16 bits, the value is
 returned unchanged.
**/
_PUBLIC_ codepoint_t tolower_m(codepoint_t val)
{
	if (val < 128) {
		return tolower(val);
	}

	/* lazy load on first non-ASCII request */
	if (lowcase_table == NULL) {
		load_case_tables_library();
	}

	/* no table available, or codepoint outside the table's range */
	if (lowcase_table == (void *)-1 || (val & 0xFFFF0000) != 0) {
		return val;
	}

	return SVAL(lowcase_table, val*2);
}
/**
 Return true if upper casing this character would change it,
 i.e. the character is a lower case one.
**/
_PUBLIC_ bool islower_m(codepoint_t val)
{
	codepoint_t upper = toupper_m(val);

	return upper != val;
}
/**
 Return true if lower casing this character would change it,
 i.e. the character is an upper case one.
**/
_PUBLIC_ bool isupper_m(codepoint_t val)
{
	codepoint_t lower = tolower_m(val);

	return lower != val;
}
/**
 Compare two codepoints case insensitively.

 @param c1 first codepoint
 @param c2 second codepoint
 @return 0 if the codepoints are equal or have the same upper case
         form, otherwise -1 if c1 < c2 and 1 if c1 > c2 (comparing the
         original, unfolded values).
*/
_PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
{
	if (c1 == c2 ||
	    toupper_m(c1) == toupper_m(c2)) {
		return 0;
	}

	/*
	 * Do not return c1 - c2: codepoint_t is unsigned, so the
	 * subtraction wraps and the conversion to int yields the wrong
	 * sign for large differences (e.g. INVALID_CODEPOINT against an
	 * ASCII character).
	 */
	return (c1 < c2) ? -1 : 1;
}
/*
 * State for charset conversion: the configured charset names plus a
 * cache of conversion descriptors that are opened on demand.
 */
struct smb_iconv_handle {
/* talloc child of this structure; owns the charset name strings so
   they can be freed without freeing the structure itself */
TALLOC_CTX *child_ctx;
/* name returned by charset_name() for CH_UNIX */
const char *unix_charset;
/* name returned by charset_name() for CH_DOS; may be replaced by the
   literal "ASCII" if the configured charset cannot be opened */
const char *dos_charset;
/* NOTE(review): not referenced anywhere in this file - confirm whether
   it is still used elsewhere */
const char *display_charset;
/* passed through to smb_iconv_open_ex() when opening descriptors */
bool use_builtin_handlers;
/* descriptor cache indexed [from][to]: NULL means not opened yet,
   (smb_iconv_t)-1 means opening failed */
smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
};
struct smb_iconv_handle *global_iconv_handle = NULL;

/*
 * Return the global iconv handle, creating it on first use on the
 * talloc autofree context with dos charset "ASCII", unix charset
 * "UTF-8" and the builtin handlers enabled.
 */
struct smb_iconv_handle *get_iconv_handle(void)
{
	if (global_iconv_handle != NULL) {
		return global_iconv_handle;
	}

	global_iconv_handle = smb_iconv_handle_reinit(talloc_autofree_context(),
						      "ASCII", "UTF-8", true, NULL);
	return global_iconv_handle;
}
/*
 * Create a new iconv handle on mem_ctx with the given charsets.
 * No existing handle is reused; the global handle is not touched.
 */
struct smb_iconv_handle *get_iconv_testing_handle(TALLOC_CTX *mem_ctx,
						  const char *dos_charset,
						  const char *unix_charset,
						  bool use_builtin_handlers)
{
	struct smb_iconv_handle *handle;

	handle = smb_iconv_handle_reinit(mem_ctx, dos_charset, unix_charset,
					 use_builtin_handlers, NULL);
	return handle;
}
/**
 * Translate a charset_t into the charset name to hand to iconv().
 *
 * CH_UNIX and CH_DOS use the names configured in the handle; the
 * other known charsets map to fixed names. Anything else is treated
 * as "ASCII".
 **/
const char *charset_name(struct smb_iconv_handle *ic, charset_t ch)
{
	if (ch == CH_UTF16) {
		return "UTF-16LE";
	}
	if (ch == CH_UNIX) {
		return ic->unix_charset;
	}
	if (ch == CH_DOS) {
		return ic->dos_charset;
	}
	if (ch == CH_UTF8) {
		return "UTF8";
	}
	if (ch == CH_UTF16BE) {
		return "UTF-16BE";
	}
	if (ch == CH_UTF16MUNGED) {
		return "UTF16_MUNGED";
	}

	return "ASCII";
}
/**
re-initialize iconv conversion descriptors
**/
static int close_iconv_handle(struct smb_iconv_handle *data)
{
unsigned c1, c2;
for (c1=0;c1<NUM_CHARSETS;c1++) {
for (c2=0;c2<NUM_CHARSETS;c2++) {
if (data->conv_handles[c1][c2] != NULL) {
if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
smb_iconv_close(data->conv_handles[c1][c2]);
}
data->conv_handles[c1][c2] = NULL;
}
}
}
return 0;
}
/*
  (Re-)initialise an iconv handle with the given charset names.

  The old_ic is passed in here as the smb_iconv_handle structure
  is used as a global pointer in some places (eg. python modules). We
  don't want to invalidate those global pointers, but we do want to
  update them with the right charset information when loadparm
  runs. To do that we need to re-use the structure pointer, but
  re-fill the elements in the structure with the updated values.

  mem_ctx               talloc parent for a newly allocated handle
                        (only used when old_ic is NULL)
  dos_charset           name for CH_DOS; UTF8/UTF-8 is rejected and
                        replaced by CP850
  unix_charset          name for CH_UNIX
  use_builtin_handlers  stored and later passed to smb_iconv_open_ex()
  old_ic                existing handle to refill in place, or NULL

  Returns the handle, or NULL if memory could not be allocated. On
  failure a handle allocated by this call is freed again; a caller
  supplied old_ic is never freed, but its previous contents have
  already been discarded.
*/
_PUBLIC_ struct smb_iconv_handle *smb_iconv_handle_reinit(TALLOC_CTX *mem_ctx,
							   const char *dos_charset,
							   const char *unix_charset,
							   bool use_builtin_handlers,
							   struct smb_iconv_handle *old_ic)
{
	struct smb_iconv_handle *ret;

	if (old_ic != NULL) {
		ret = old_ic;
		close_iconv_handle(ret);
		talloc_free(ret->child_ctx);
		ZERO_STRUCTP(ret);
	} else {
		ret = talloc_zero(mem_ctx, struct smb_iconv_handle);
	}
	if (ret == NULL) {
		return NULL;
	}

	/* we use a child context to allow us to free all ptrs without
	   freeing the structure itself */
	ret->child_ctx = talloc_new(ret);
	if (ret->child_ctx == NULL) {
		goto fail;
	}

	talloc_set_destructor(ret, close_iconv_handle);

	if (strcasecmp(dos_charset, "UTF8") == 0 || strcasecmp(dos_charset, "UTF-8") == 0) {
		DEBUG(0,("ERROR: invalid DOS charset: 'dos charset' must not be UTF8, using (default value) CP850 instead\n"));
		dos_charset = "CP850";
	}

	ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset);
	ret->unix_charset = talloc_strdup(ret->child_ctx, unix_charset);
	ret->use_builtin_handlers = use_builtin_handlers;

	/* a handle without charset names is unusable, so report the
	   failure instead of handing it out as if it were valid */
	if (ret->dos_charset == NULL || ret->unix_charset == NULL) {
		goto fail;
	}

	return ret;

fail:
	/* only release what this call allocated; old_ic may still be
	   referenced through global pointers */
	if (old_ic == NULL) {
		talloc_free(ret);
	}
	return NULL;
}
/*
  Return the conversion descriptor for from -> to, opening and caching
  it on first use.

  If opening fails and the DOS charset is involved (and is not already
  ASCII), the handle's DOS charset is switched to "ASCII" and the open
  is retried once. The result, which may be (smb_iconv_t)-1, is cached
  and returned.
*/
smb_iconv_t get_conv_handle(struct smb_iconv_handle *ic,
			    charset_t from, charset_t to)
{
	smb_iconv_t *slot = &ic->conv_handles[from][to];
	const char *src_name, *dst_name;

	/* already opened (or already known to fail) */
	if (*slot) {
		return *slot;
	}

	src_name = charset_name(ic, from);
	dst_name = charset_name(ic, to);

	*slot = smb_iconv_open_ex(ic, dst_name, src_name,
				  ic->use_builtin_handlers);
	if (*slot != (smb_iconv_t)-1) {
		return *slot;
	}

	/* the ASCII fallback only applies to conversions involving the
	   DOS charset, and only if we are not using ASCII already */
	if (from != CH_DOS && to != CH_DOS) {
		return *slot;
	}
	if (strcasecmp(charset_name(ic, CH_DOS), "ASCII") == 0) {
		return *slot;
	}

	DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
		 charset_name(ic, CH_DOS)));
	ic->dos_charset = "ASCII";

	src_name = charset_name(ic, from);
	dst_name = charset_name(ic, to);

	*slot = smb_iconv_open_ex(ic, dst_name, src_name,
				  ic->use_builtin_handlers);
	return *slot;
}
/**
* Return the unicode codepoint for the next character in the input
* string in the given src_charset.
* The unicode codepoint (codepoint_t) is an unsigned 32 bit value.
*
* Also return the number of bytes consumed (which tells the caller
* how many bytes to skip to get to the next src_charset-character).
*
* This is implemented (in the non-ascii-case) by first converting the
* next character in the input string to UTF16_LE and then calculating
* the unicode codepoint from that.
*
* @param ic              handle used to obtain the conversion descriptor
* @param str             input string; only the first 5 bytes (or the
*                        bytes up to a NUL, if earlier) are examined
* @param src_charset     charset that str is encoded in
* @param bytes_consumed  receives the number of input bytes used; set to
*                        1 when no descriptor is available or nothing
*                        could be converted
*
* Return INVALID_CODEPOINT if the next character cannot be converted.
*/
_PUBLIC_ codepoint_t next_codepoint_handle_ext(
struct smb_iconv_handle *ic,
const char *str, charset_t src_charset,
size_t *bytes_consumed)
{
/* it cannot occupy more than 4 bytes in UTF16 format */
uint8_t buf[4];
smb_iconv_t descriptor;
size_t ilen_orig;
size_t ilen;
size_t olen;
char *outbuf;
/*
 * Fast path: a byte with the high bit clear is returned as-is as a
 * one byte character, without looking at src_charset.
 * NOTE(review): for a UTF-16 src_charset a single byte is not a whole
 * character - confirm callers only pass byte-oriented charsets.
 */
if ((str[0] & 0x80) == 0) {
*bytes_consumed = 1;
return (codepoint_t)str[0];
}
/*
* we assume that no multi-byte character can take more than 5 bytes.
* This is OK as we only support codepoints up to 1M (U+100000)
*/
ilen_orig = strnlen(str, 5);
ilen = ilen_orig;
descriptor = get_conv_handle(ic, src_charset, CH_UTF16);
if (descriptor == (smb_iconv_t)-1) {
*bytes_consumed = 1;
return INVALID_CODEPOINT;
}
/*
* this looks a little strange, but it is needed to cope with
* codepoints above 64k (U+10000) which are encoded as per RFC2781.
*
* First offer only 2 bytes of output so that at most one 16 bit unit
* is produced. If nothing was written (olen is still 2), retry with
* room for 4 bytes, i.e. a surrogate pair. The code relies on
* smb_iconv() updating str/ilen and outbuf/olen as it converts.
*/
olen = 2;
outbuf = (char *)buf;
smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
if (olen == 2) {
olen = 4;
outbuf = (char *)buf;
smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
if (olen == 4) {
/* we didn't convert any bytes */
*bytes_consumed = 1;
return INVALID_CODEPOINT;
}
/* turn "space left" into "bytes produced" */
olen = 4 - olen;
} else {
/* turn "space left" into "bytes produced" */
olen = 2 - olen;
}
/* whatever was taken off ilen is what the character occupied */
*bytes_consumed = ilen_orig - ilen;
if (olen == 2) {
return (codepoint_t)SVAL(buf, 0);
}
if (olen == 4) {
/*
 * decode a 4 byte UTF16 character manually: buf[0..1] hold the first
 * 16 bit unit and buf[2..3] the second, low byte first. Each unit
 * contributes its low 10 bits; the first unit supplies the upper
 * half of the 20 bit offset that is added to 0x10000.
 */
return (codepoint_t)0x10000 +
(buf[2] | ((buf[3] & 0x3)<<8) |
(buf[0]<<10) | ((buf[1] & 0x3)<<18));
}
/* no other length is valid */
return INVALID_CODEPOINT;
}
/*
  Decode the next multi-byte CH_UNIX character of a string.

  Returns its unicode codepoint and stores in *size the number of
  bytes it occupied, i.e. how far the caller has to advance to reach
  the next CH_UNIX character.

  Returns INVALID_CODEPOINT if the next character cannot be converted.
*/
_PUBLIC_ codepoint_t next_codepoint_handle(struct smb_iconv_handle *ic,
					   const char *str, size_t *size)
{
	codepoint_t cp = next_codepoint_handle_ext(ic, str, CH_UNIX, size);

	return cp;
}
/*
  Push a single codepoint into a CH_UNIX string.

  The target string must be able to hold the full character, which is
  guaranteed if it is at least 5 bytes in size. The caller may pass
  less than 5 bytes if they are sure the character will fit (for
  example, you can assume that uppercase/lowercase of a character will
  not add more than 1 byte).

  ic   handle used to obtain the UTF16 -> CH_UNIX descriptor
  str  destination buffer
  c    codepoint to encode

  Returns the number of bytes occupied by the CH_UNIX character, or
  -1 on failure. Codepoints above U+10FFFF cannot be represented in
  UTF-16 and are rejected with -1 without touching str.
*/
_PUBLIC_ ssize_t push_codepoint_handle(struct smb_iconv_handle *ic,
				       char *str, codepoint_t c)
{
	smb_iconv_t descriptor;
	uint8_t buf[4];
	size_t ilen, olen;
	const char *inbuf;

	if (c < 128) {
		*str = c;
		return 1;
	}

	/*
	 * A surrogate pair only carries 20 bits above 0x10000. Larger
	 * values used to leak extra bits into the first surrogate and
	 * could silently encode a different character.
	 */
	if (c > 0x10FFFF) {
		return -1;
	}

	descriptor = get_conv_handle(ic,
				     CH_UTF16, CH_UNIX);
	if (descriptor == (smb_iconv_t)-1) {
		return -1;
	}

	if (c < 0x10000) {
		/* fits into a single 16 bit unit */
		SSVAL(buf, 0, c);
		ilen = 2;
	} else {
		/* build the surrogate pair by hand, low byte first */
		c -= 0x10000;

		buf[0] = (c>>10) & 0xFF;
		buf[1] = ((c>>18) & 0x3) | 0xd8;
		buf[2] = c & 0xFF;
		buf[3] = ((c>>8) & 0x3) | 0xdc;
		ilen = 4;
	}

	olen = 5;
	inbuf = (const char *)buf;
	smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
	if (ilen != 0) {
		/* not all of the input was converted */
		return -1;
	}
	return 5 - olen;
}
/*
 * Same as next_codepoint_handle_ext(), using the global iconv handle.
 */
_PUBLIC_ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
					size_t *size)
{
	struct smb_iconv_handle *ic = get_iconv_handle();

	return next_codepoint_handle_ext(ic, str, src_charset, size);
}
/*
 * Same as next_codepoint_handle(), using the global iconv handle.
 */
_PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size)
{
	struct smb_iconv_handle *ic = get_iconv_handle();

	return next_codepoint_handle(ic, str, size);
}
/*
 * Same as push_codepoint_handle(), using the global iconv handle.
 */
_PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
{
	struct smb_iconv_handle *ic = get_iconv_handle();

	return push_codepoint_handle(ic, str, c);
}