2003-08-13 05:53:07 +04:00
/*
Unix SMB / CIFS implementation .
minimal iconv implementation
Copyright ( C ) Andrew Tridgell 2001
Copyright ( C ) Jelmer Vernooij 2002
This program is free software ; you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
2007-07-10 06:07:03 +04:00
the Free Software Foundation ; either version 3 of the License , or
2003-08-13 05:53:07 +04:00
( at your option ) any later version .
This program is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
You should have received a copy of the GNU General Public License
2007-07-10 06:07:03 +04:00
along with this program . If not , see < http : //www.gnu.org/licenses/>.
2003-08-13 05:53:07 +04:00
*/
# include "includes.h"
2008-10-11 23:31:42 +04:00
# include "../lib/util/dlinklist.h"
2004-11-02 02:45:40 +03:00
# include "system/iconv.h"
2005-02-10 08:09:35 +03:00
# include "system/filesys.h"
2003-08-13 05:53:07 +04:00
/**
* @ file
*
* @ brief Samba wrapper / stub for iconv character set conversion .
*
* iconv is the XPG2 interface for converting between character
* encodings . This file provides a Samba wrapper around it , and also
* a simple reimplementation that is used if the system does not
* implement iconv .
*
* Samba only works with encodings that are supersets of ASCII : ascii
* characters like whitespace can be tested for directly , multibyte
* sequences start with a byte with the high bit set , and strings are
* terminated by a nul byte .
*
* Note that the only function provided by iconv is conversion between
* characters . It doesn ' t directly support operations like
2004-09-01 08:39:06 +04:00
* uppercasing or comparison. We have to convert to UTF-16LE and
* compare there .
2003-08-13 05:53:07 +04:00
*
* @ sa Samba Developers Guide
* */
2003-12-16 12:20:34 +03:00
/*
 * Forward declarations of the converters implemented in this file.
 * They all share one iconv()-style signature so that they can be
 * stored in the pull/push slots of a struct charset_functions.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_munged_push(void *cd, const char **inbuf, size_t *inbytesleft,
			       char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
2003-08-13 05:53:07 +04:00
2004-11-02 15:43:25 +03:00
static const struct charset_functions builtin_functions [ ] = {
2004-09-01 08:39:06 +04:00
/* windows is closest to UTF-16 */
2003-08-13 05:53:07 +04:00
{ " UCS-2LE " , iconv_copy , iconv_copy } ,
2004-08-30 16:03:01 +04:00
{ " UTF-16LE " , iconv_copy , iconv_copy } ,
2003-12-16 12:20:34 +03:00
{ " UCS-2BE " , iconv_swab , iconv_swab } ,
2004-09-01 08:39:06 +04:00
{ " UTF-16BE " , iconv_swab , iconv_swab } ,
2004-08-30 16:03:01 +04:00
/* we include the UTF-8 alias to cope with differing locale settings */
2003-08-13 05:53:07 +04:00
{ " UTF8 " , utf8_pull , utf8_push } ,
2004-08-30 16:03:01 +04:00
{ " UTF-8 " , utf8_pull , utf8_push } ,
2008-10-31 05:51:37 +03:00
/* this handles the munging needed for String2Key */
{ " UTF8_MUNGED " , utf8_pull , utf8_munged_push } ,
2003-08-13 05:53:07 +04:00
{ " ASCII " , ascii_pull , ascii_push } ,
2004-12-19 03:13:24 +03:00
{ " UCS2-HEX " , ucs2hex_pull , ucs2hex_push }
2003-08-13 05:53:07 +04:00
} ;
static struct charset_functions * charsets = NULL ;
2007-08-31 03:15:12 +04:00
bool charset_register_backend ( const void * _funcs )
2003-08-13 05:53:07 +04:00
{
2007-09-08 20:46:30 +04:00
struct charset_functions * funcs = ( struct charset_functions * ) memdup ( _funcs , sizeof ( struct charset_functions ) ) ;
2007-08-31 03:15:12 +04:00
struct charset_functions * c ;
2003-08-13 05:53:07 +04:00
/* Check whether we already have this charset... */
2007-08-31 03:15:12 +04:00
for ( c = charsets ; c ! = NULL ; c = c - > next ) {
if ( ! strcasecmp ( c - > name , funcs - > name ) ) {
2003-08-13 05:53:07 +04:00
DEBUG ( 2 , ( " Duplicate charset %s, not registering \n " , funcs - > name ) ) ;
2007-08-31 03:15:12 +04:00
return false ;
2003-08-13 05:53:07 +04:00
}
}
funcs - > next = funcs - > prev = NULL ;
DLIST_ADD ( charsets , funcs ) ;
2007-08-31 03:15:12 +04:00
return true ;
2003-08-13 05:53:07 +04:00
}
#ifdef HAVE_NATIVE_ICONV
/*
 * Wrapper around the system iconv() with the converter signature used
 * in this file.
 *
 * if there was an error then reset the internal state,
 * this ensures that we don't have a shift state remaining for
 * character sets like SJIS.
 *
 * errno from the failed conversion is saved and restored around the
 * reset call, because smb_iconv() looks at errno (E2BIG) after a pull
 * has returned -1 and the reset is allowed to modify it.
 *
 * Returns whatever iconv() returned for the conversion call.
 */
static size_t sys_iconv(void *cd,
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t ret = iconv((iconv_t)cd,
			   discard_const_p(char *, inbuf), inbytesleft,
			   outbuf, outbytesleft);

	if (ret == (size_t)-1) {
		int saved_errno = errno;

		iconv((iconv_t)cd, NULL, NULL, NULL, NULL);
		errno = saved_errno;
	}
	return ret;
}
#endif
/**
* This is a simple portable iconv ( ) implementaion .
*
* It only knows about a very small number of character sets - just
* enough that Samba works on systems that don ' t have iconv .
* */
2008-04-02 06:53:27 +04:00
_PUBLIC_ size_t smb_iconv ( smb_iconv_t cd ,
2003-08-13 05:53:07 +04:00
const char * * inbuf , size_t * inbytesleft ,
char * * outbuf , size_t * outbytesleft )
{
char cvtbuf [ 2048 ] ;
size_t bufsize ;
/* in many cases we can go direct */
if ( cd - > direct ) {
return cd - > direct ( cd - > cd_direct ,
inbuf , inbytesleft , outbuf , outbytesleft ) ;
}
/* otherwise we have to do it chunks at a time */
while ( * inbytesleft > 0 ) {
2004-10-28 16:46:59 +04:00
char * bufp1 = cvtbuf ;
const char * bufp2 = cvtbuf ;
2003-08-13 05:53:07 +04:00
bufsize = sizeof ( cvtbuf ) ;
if ( cd - > pull ( cd - > cd_pull ,
2004-10-28 16:46:59 +04:00
inbuf , inbytesleft , & bufp1 , & bufsize ) = = - 1
2003-08-13 05:53:07 +04:00
& & errno ! = E2BIG ) return - 1 ;
bufsize = sizeof ( cvtbuf ) - bufsize ;
if ( cd - > push ( cd - > cd_push ,
2004-10-28 16:46:59 +04:00
& bufp2 , & bufsize ,
2003-08-13 05:53:07 +04:00
outbuf , outbytesleft ) = = - 1 ) return - 1 ;
}
return 0 ;
}
2007-08-27 22:10:19 +04:00
/*
 * Return true if name is "UCS-2LE" or "UTF-16LE", compared without
 * regard to case.
 */
static bool is_utf16(const char *name)
{
	static const char *const aliases[] = { "UCS-2LE", "UTF-16LE" };
	size_t idx;

	for (idx = 0; idx < sizeof(aliases) / sizeof(aliases[0]); idx++) {
		if (strcasecmp(name, aliases[idx]) == 0) {
			return true;
		}
	}
	return false;
}
2007-12-17 10:32:00 +03:00
2008-04-02 06:53:27 +04:00
/**
 * Open a conversion descriptor that converts from fromcode to tocode.
 *
 * Builtin charsets are looked up first, then charsets registered with
 * charset_register_backend().  When HAVE_NATIVE_ICONV is defined and
 * native_iconv is true, a charset that is not known internally is
 * handled by the system iconv, converting via UTF-16LE (falling back
 * to UCS-2LE if the system does not know that name).
 *
 * @param mem_ctx       talloc context the descriptor is allocated on
 * @param tocode        name of the destination charset
 * @param fromcode      name of the source charset
 * @param native_iconv  whether the system iconv may be used
 *
 * @return the new descriptor, or (smb_iconv_t)-1 with errno set to
 *         ENOMEM (allocation failed) or EINVAL (conversion not
 *         available).
 */
_PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
			      const char *fromcode, bool native_iconv)
{
	smb_iconv_t ret;
	const struct charset_functions *from = NULL, *to = NULL;
	size_t i;

	ret = (smb_iconv_t)talloc_named(mem_ctx,
					sizeof(*ret),
					"iconv(%s,%s)", tocode, fromcode);
	if (!ret) {
		errno = ENOMEM;
		return (smb_iconv_t)-1;
	}
	memset(ret, 0, sizeof(*ret));

	/* check for the simplest null conversion */
	if (strcmp(fromcode, tocode) == 0) {
		ret->direct = iconv_copy;
		return ret;
	}

	/* builtin charsets take precedence over registered ones */
	for (i = 0; i < ARRAY_SIZE(builtin_functions); i++) {
		if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
			from = &builtin_functions[i];
		}
		if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
			to = &builtin_functions[i];
		}
	}

	if (from == NULL) {
		for (from = charsets; from; from = from->next) {
			if (strcasecmp(from->name, fromcode) == 0) break;
		}
	}

	if (to == NULL) {
		for (to = charsets; to; to = to->next) {
			if (strcasecmp(to->name, tocode) == 0) break;
		}
	}

#ifdef HAVE_NATIVE_ICONV
	if ((!from || !to) && !native_iconv) {
		goto failed;
	}
	if (!from) {
		ret->pull = sys_iconv;
		ret->cd_pull = iconv_open("UTF-16LE", fromcode);
		if (ret->cd_pull == (iconv_t)-1)
			ret->cd_pull = iconv_open("UCS-2LE", fromcode);
		if (ret->cd_pull == (iconv_t)-1) goto failed;
	}

	if (!to) {
		ret->push = sys_iconv;
		ret->cd_push = iconv_open(tocode, "UTF-16LE");
		if (ret->cd_push == (iconv_t)-1)
			ret->cd_push = iconv_open(tocode, "UCS-2LE");
		if (ret->cd_push == (iconv_t)-1) goto failed;
	}
#else
	if (!from || !to) {
		goto failed;
	}
#endif

	/* check for conversion to/from ucs2 */
	if (is_utf16(fromcode) && to) {
		ret->direct = to->push;
		return ret;
	}
	if (is_utf16(tocode) && from) {
		ret->direct = from->pull;
		return ret;
	}

#ifdef HAVE_NATIVE_ICONV
	/* one side is UTF-16 and the other is native: use the single
	   system descriptor directly instead of going via the buffer */
	if (is_utf16(fromcode)) {
		ret->direct = sys_iconv;
		ret->cd_direct = ret->cd_push;
		ret->cd_push = NULL;
		return ret;
	}
	if (is_utf16(tocode)) {
		ret->direct = sys_iconv;
		ret->cd_direct = ret->cd_pull;
		ret->cd_pull = NULL;
		return ret;
	}
#endif

	/* the general case has to go via a buffer */
	if (!ret->pull) ret->pull = from->pull;
	if (!ret->push) ret->push = to->push;

	return ret;

failed:
#ifdef HAVE_NATIVE_ICONV
	/* don't leak a system descriptor that was opened before the
	   failure (e.g. cd_pull opened, then cd_push could not be) */
	if (ret->cd_pull != NULL && ret->cd_pull != (iconv_t)-1) {
		iconv_close((iconv_t)ret->cd_pull);
	}
	if (ret->cd_push != NULL && ret->cd_push != (iconv_t)-1) {
		iconv_close((iconv_t)ret->cd_push);
	}
#endif
	talloc_free(ret);
	errno = EINVAL;
	return (smb_iconv_t)-1;
}
2007-12-17 10:32:00 +03:00
/*
simple iconv_open ( ) wrapper
*/
2008-04-02 06:53:27 +04:00
_PUBLIC_ smb_iconv_t smb_iconv_open ( const char * tocode , const char * fromcode )
2007-12-17 10:32:00 +03:00
{
return smb_iconv_open_ex ( NULL , tocode , fromcode , true ) ;
}
2003-08-13 05:53:07 +04:00
/*
simple iconv_close ( ) wrapper
*/
2008-04-02 06:53:27 +04:00
_PUBLIC_ int smb_iconv_close ( smb_iconv_t cd )
2003-08-13 05:53:07 +04:00
{
# ifdef HAVE_NATIVE_ICONV
if ( cd - > cd_direct ) iconv_close ( ( iconv_t ) cd - > cd_direct ) ;
if ( cd - > cd_pull ) iconv_close ( ( iconv_t ) cd - > cd_pull ) ;
if ( cd - > cd_push ) iconv_close ( ( iconv_t ) cd - > cd_push ) ;
# endif
2004-09-01 13:45:00 +04:00
talloc_free ( cd ) ;
2003-08-13 05:53:07 +04:00
return 0 ;
}
/**********************************************************************
the following functions implement the builtin character sets in Samba
and also the " test " character sets that are designed to test
multi - byte character set support for english users
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*
 * Widen single bytes to two-byte units: each input byte is copied to
 * the first output byte and the second output byte is set to zero.
 *
 * Returns 0 when all input was consumed; returns -1 with errno set to
 * E2BIG when input remains because the output space ran out.  The
 * in/out pointers and counters are advanced past what was converted.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;

	for (; in_left >= 1 && out_left >= 2; in_left -= 1, out_left -= 2) {
		*dst++ = *src++;
		*dst++ = 0;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = in_left;
	*outbytesleft = out_left;

	if (in_left > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}
	return 0;
}
/*
 * Narrow two-byte units to single bytes: the first byte of each unit,
 * masked to seven bits, becomes the output byte.
 *
 * Returns the number of units whose second byte was non-zero when all
 * input was consumed.  Returns -1 with errno EINVAL if a single odd
 * byte of input is left over, or E2BIG if whole units remain because
 * the output space ran out.
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;
	int ir_count = 0;

	for (; in_left >= 2 && out_left >= 1; in_left -= 2, out_left -= 1) {
		*dst++ = src[0] & 0x7F;
		if (src[1]) {
			ir_count++;
		}
		src += 2;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = in_left;
	*outbytesleft = out_left;

	if (in_left == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}
	if (in_left > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}
	return ir_count;
}
/* Return the value of one hexadecimal digit, or -1 if ch is not one. */
static int ucs2hex_digit_value(char ch)
{
	if (ch >= '0' && ch <= '9') {
		return ch - '0';
	}
	if (ch >= 'a' && ch <= 'f') {
		return ch - 'a' + 10;
	}
	if (ch >= 'A' && ch <= 'F') {
		return ch - 'A' + 10;
	}
	return -1;
}

/*
 * Decode the "UCS2-HEX" test encoding into two-byte little-endian
 * units.  Any byte other than '@' is widened with a zero second byte;
 * '@' must be followed by exactly four hex digits giving the 16 bit
 * value.
 *
 * The digits are parsed by hand rather than with sscanf(): the input
 * is a counted buffer that need not be NUL-terminated, and "%04x" also
 * accepts leading whitespace, a sign, a "0x" prefix or fewer than four
 * digits.
 *
 * Returns 0 when all input was consumed.  Returns -1 with errno set to
 * EINVAL (fewer than five bytes left for an '@' escape), EILSEQ (a
 * non-hex character in the escape) or E2BIG (output space ran out).
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* seven bit ascii case */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}

		/* it's a hex character */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return (size_t)-1;
		}

		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit_value((*inbuf)[i]);

			if (digit < 0) {
				errno = EILSEQ;
				return (size_t)-1;
			}
			v = (v << 4) | (unsigned int)digit;
		}

		/* low byte first */
		(*outbuf)[0] = (char)(v & 0xff);
		(*outbuf)[1] = (char)(v >> 8);
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
/*
 * Encode two-byte little-endian units in the "UCS2-HEX" test encoding.
 * A unit whose second byte is zero and whose first byte is seven-bit
 * and not '@' is written as that single byte; every other unit is
 * written as '@' followed by four lower-case hex digits.
 *
 * Returns 0 when all input was consumed.  Returns -1 with errno set to
 * E2BIG (output space ran out) or EINVAL (a single odd input byte is
 * left over).
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const unsigned char lo = (unsigned char)(*inbuf)[0];
		const unsigned char hi = (unsigned char)(*inbuf)[1];
		size_t produced;

		if (hi == 0 && lo < 0x80 && lo != '@') {
			(*outbuf)[0] = (char)lo;
			produced = 1;
		} else {
			char buf[6];

			if (*outbytesleft < 5) {
				errno = E2BIG;
				return (size_t)-1;
			}
			/* 16 bit value read low byte first, the same byte
			   order ucs2hex_pull() writes */
			snprintf(buf, 6, "@%04x", (unsigned int)lo | ((unsigned int)hi << 8));
			memcpy(*outbuf, buf, 5);
			produced = 5;
		}

		(*inbytesleft)  -= 2;
		(*outbytesleft) -= produced;
		(*inbuf)  += 2;
		(*outbuf) += produced;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}
	if (*inbytesleft > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}
	return 0;
}
2003-12-16 12:20:34 +03:00
/*
 * Copy input to output, swapping the two bytes of every 16 bit unit.
 *
 * As many bytes as fit (the smaller of the input and output counts) are
 * processed.  If that count is odd, the trailing byte is consumed and
 * a zero byte is written in its place, as this function has always
 * done.
 *
 * The length is kept in a size_t (not an int) so large buffers are not
 * truncated, and each pair is read completely before it is written, so
 * the conversion is also well defined when *inbuf == *outbuf.
 *
 * Returns 0 when all input was consumed, or -1 with errno set to E2BIG
 * when input remains because the output space ran out.
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;
	const size_t even = n & ~(size_t)1;
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t i;

	for (i = 0; i < even; i += 2) {
		const char first = src[i];
		const char second = src[i + 1];

		dst[i] = second;
		dst[i + 1] = first;
	}
	if (n & 1) {
		dst[n - 1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
2003-08-13 05:53:07 +04:00
/*
 * Null conversion: copy as many bytes as fit (the smaller of the input
 * and output counts) from input to output.  memmove() is used, so the
 * buffers may overlap.
 *
 * The length is kept in a size_t (not an int) so that large buffers
 * are not truncated.
 *
 * Returns 0 when all input was consumed, or -1 with errno set to E2BIG
 * when input remains because the output space ran out.
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
2008-08-19 11:49:34 +04:00
/*
  this takes a UTF8 sequence and produces a UTF16 sequence

  The output is little-endian: the low byte of each 16 bit unit is
  written first.  Code points of 0x10000 and above are written as a
  surrogate pair.

  Returns 0 when all input was consumed.  Returns -1 with errno set to
    EILSEQ  a truncated or malformed multi-byte sequence, or a 4 byte
            sequence encoding a value above U+10FFFF (which cannot be
            represented in UTF16)
    EINVAL  a lead byte that starts none of the 1-4 byte forms
    E2BIG   the output space ran out
  In every case the in/out pointers and counters are advanced past the
  characters that were converted successfully.
*/
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t in_left = *inbytesleft, out_left = *outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;
	size_t ret = (size_t)-1;

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			/* one byte: plain ascii */
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two byte sequence */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto done;
			}
			uc[1] = (c[0] >> 2) & 0x7;
			uc[0] = (c[0] << 6) | (c[1] & 0x3f);
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three byte sequence */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto done;
			}
			uc[1] = ((c[0] & 0xF) << 4) | ((c[1] >> 2) & 0xF);
			uc[0] = (c[1] << 6) | (c[2] & 0x3f);
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			/* four byte sequence */
			unsigned int codepoint;
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto done;
			}
			codepoint =
				(c[3] & 0x3f) |
				((c[2] & 0x3f) << 6) |
				((c[1] & 0x3f) << 12) |
				((c[0] & 0x7) << 18);
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}

			if (codepoint > 0x10FFFF) {
				/* a surrogate pair only has 20 payload bits;
				   larger values would spill into the 0xdc
				   marker and produce an invalid pair */
				errno = EILSEQ;
				goto done;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto done;
			}

			/* high surrogate, then low surrogate, each low byte first */
			uc[0] = (codepoint >> 10) & 0xFF;
			uc[1] = (codepoint >> 18) | 0xd8;
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint >> 8) & 0x3) | 0xdc;
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto done;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto done;
	}

	ret = 0;

done:
	/* report progress to the caller on success and on error alike */
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return ret;
}
2008-08-19 11:49:34 +04:00
/*
  this takes a UTF16 sequence and produces a UTF8 sequence

  The input is read as little-endian 16 bit units.  Returns 0 when all
  input was consumed.  Returns -1 with errno set to
    E2BIG   the output space ran out
    EINVAL  the input ends in the middle of a unit or of a surrogate
            pair (fewer than 4 bytes left at a surrogate)
    EILSEQ  a low surrogate without a preceding high surrogate, or a
            high surrogate not followed by a low surrogate
  In every case the in/out pointers and counters are advanced past the
  characters that were converted successfully.
*/
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;
	size_t result = (size_t)-1;

	while (in_left >= 2 && out_left >= 1) {
		const unsigned int unit = src[0] | ((unsigned int)src[1] << 8);
		size_t consumed = 2;
		size_t produced;

		if (unit < 0x80) {
			/* simplest case */
			dst[0] = (uint8_t)unit;
			produced = 1;
		} else if (unit < 0x800) {
			/* next simplest case */
			if (out_left < 2) {
				errno = E2BIG;
				goto done;
			}
			dst[0] = 0xc0 | (unit >> 6);
			dst[1] = 0x80 | (unit & 0x3f);
			produced = 2;
		} else if ((unit & 0xfc00) == 0xdc00) {
			/* its the second part of a 4 byte sequence. Illegal */
			errno = (in_left < 4) ? EINVAL : EILSEQ;
			goto done;
		} else if ((unit & 0xfc00) != 0xd800) {
			/* any other 16 bit value: three bytes */
			if (out_left < 3) {
				errno = E2BIG;
				goto done;
			}
			dst[0] = 0xe0 | (unit >> 12);
			dst[1] = 0x80 | ((unit >> 6) & 0x3f);
			dst[2] = 0x80 | (unit & 0x3f);
			produced = 3;
		} else {
			/* its the first part of a 4 byte sequence */
			unsigned int codepoint;

			if (in_left < 4) {
				errno = EINVAL;
				goto done;
			}
			if ((src[3] & 0xfc) != 0xdc) {
				errno = EILSEQ;
				goto done;
			}
			codepoint = 0x10000 + (((unit & 0x3ff) << 10) |
					       src[2] | ((src[3] & 0x3) << 8));
			if (out_left < 4) {
				errno = E2BIG;
				goto done;
			}
			dst[0] = 0xf0 | (codepoint >> 18);
			dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
			dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
			dst[3] = 0x80 | (codepoint & 0x3f);
			consumed = 4;
			produced = 4;
		}

		src += consumed;
		in_left -= consumed;
		dst += produced;
		out_left -= produced;
	}

	if (in_left == 1) {
		errno = EINVAL;
	} else if (in_left > 1) {
		errno = E2BIG;
	} else {
		result = 0;
	}

done:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;
	return result;
}
2004-09-01 13:45:00 +04:00
2008-10-31 05:51:37 +03:00
/*
  this takes a UTF16 sequence, munges it according to the string2key
  rules, and produces a UTF8 sequence

  The rules are:

    1) convert any instance of 0xD800 - 0xDBFF (high surrogate)
       without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
       U+FFFD (REPLACEMENT CHARACTER).

    2) the same for any low surrogate that was not preceded by a high surrogate.

  Returns 0 when all input was consumed.  Returns -1 with errno set to
  E2BIG when the output space ran out, or EINVAL when a single odd byte
  of input is left over.
*/
static size_t utf8_munged_push(void *cd, const char **inbuf, size_t *inbytesleft,
			       char **outbuf, size_t *outbytesleft)
{
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;
	size_t result = (size_t)-1;

	while (in_left >= 2 && out_left >= 1) {
		const unsigned int unit = src[0] | ((unsigned int)src[1] << 8);
		const int is_low = (unit & 0xfc00) == 0xdc00;
		const int is_high = (unit & 0xfc00) == 0xd800;
		unsigned int codepoint;
		size_t consumed = 2;
		size_t produced;

		if (unit < 0x80) {
			/* simplest case */
			dst[0] = (uint8_t)unit;
			produced = 1;
		} else if (unit < 0x800) {
			/* next simplest case */
			if (out_left < 2) {
				errno = E2BIG;
				goto done;
			}
			dst[0] = 0xc0 | (unit >> 6);
			dst[1] = 0x80 | (unit & 0x3f);
			produced = 2;
		} else if (is_high && in_left >= 4 && (src[3] & 0xfc) == 0xdc) {
			/* a proper surrogate pair: 4 byte sequence */
			codepoint = 0x10000 + (((unit & 0x3ff) << 10) |
					       src[2] | ((src[3] & 0x3) << 8));
			if (out_left < 4) {
				errno = E2BIG;
				goto done;
			}
			dst[0] = 0xf0 | (codepoint >> 18);
			dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
			dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
			dst[3] = 0x80 | (codepoint & 0x3f);
			consumed = 4;
			produced = 4;
		} else {
			/* a lone surrogate of either kind is converted to
			   0xfffd; anything else is used as it is */
			codepoint = (is_low || is_high) ? 0xfffd : unit;
			if (out_left < 3) {
				errno = E2BIG;
				goto done;
			}
			dst[0] = 0xe0 | (codepoint >> 12);
			dst[1] = 0x80 | ((codepoint >> 6) & 0x3f);
			dst[2] = 0x80 | (codepoint & 0x3f);
			produced = 3;
		}

		src += consumed;
		in_left -= consumed;
		dst += produced;
		out_left -= produced;
	}

	if (in_left == 1) {
		errno = EINVAL;
	} else if (in_left > 1) {
		errno = E2BIG;
	} else {
		result = 0;
	}

done:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;
	return result;
}
2004-09-01 13:45:00 +04:00