2001-07-04 11:15:53 +04:00
/*
2002-01-30 09:08:46 +03:00
Unix SMB / CIFS implementation .
2001-07-04 11:15:53 +04:00
minimal iconv implementation
Copyright ( C ) Andrew Tridgell 2001
2003-04-16 17:09:00 +04:00
Copyright ( C ) Jelmer Vernooij 2002 , 2003
2001-07-04 11:15:53 +04:00
This program is free software ; you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation ; either version 2 of the License , or
( at your option ) any later version .
This program is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
You should have received a copy of the GNU General Public License
along with this program ; if not , write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
# include "includes.h"
2003-10-23 03:38:20 +04:00
/*
 * We have to use strcasecmp here as the character conversions
 * haven't been initialised yet. JRA.
 */
# undef strcasecmp
2003-04-16 17:09:00 +04:00
/**
 * @file
 *
 * @brief Samba wrapper/stub for iconv character set conversion.
 *
 * iconv is the XPG2 interface for converting between character
 * encodings.  This file provides a Samba wrapper around it, and also
 * a simple reimplementation that is used if the system does not
 * implement iconv.
 *
 * Samba only works with encodings that are supersets of ASCII: ascii
 * characters like whitespace can be tested for directly, multibyte
 * sequences start with a byte with the high bit set, and strings are
 * terminated by a nul byte.
 *
 * Note that the only function provided by iconv is conversion between
 * characters.  It doesn't directly support operations like
 * uppercasing or comparison.  We have to convert to UCS-2 and compare
 * there.
 *
 * @sa Samba Developers Guide
 **/
2004-08-31 01:35:43 +04:00
/*
 * Forward declarations of the built-in converters.  They all share the
 * iconv()-style calling convention: an opaque descriptor, an input cursor
 * with its remaining byte count, and an output cursor with its remaining
 * byte count.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
2001-07-04 11:15:53 +04:00
2003-04-16 17:09:00 +04:00
static struct charset_functions builtin_functions [ ] = {
2004-08-31 01:35:43 +04:00
/* windows is really neither UCS-2 not UTF-16 */
2001-07-22 04:27:30 +04:00
{ " UCS-2LE " , iconv_copy , iconv_copy } ,
2004-08-31 01:35:43 +04:00
{ " UTF-16LE " , iconv_copy , iconv_copy } ,
{ " UCS-2BE " , iconv_swab , iconv_swab } ,
2004-09-01 09:17:40 +04:00
{ " UTF-16BE " , iconv_swab , iconv_swab } ,
2004-08-31 01:35:43 +04:00
/* we include the UTF-8 alias to cope with differing locale settings */
2001-07-04 19:08:30 +04:00
{ " UTF8 " , utf8_pull , utf8_push } ,
2004-08-31 01:35:43 +04:00
{ " UTF-8 " , utf8_pull , utf8_push } ,
2001-07-04 11:15:53 +04:00
{ " ASCII " , ascii_pull , ascii_push } ,
2003-09-15 16:42:10 +04:00
{ " 646 " , ascii_pull , ascii_push } ,
2003-12-10 18:59:28 +03:00
{ " ISO-8859-1 " , ascii_pull , latin1_push } ,
2001-07-22 11:38:32 +04:00
{ " UCS2-HEX " , ucs2hex_pull , ucs2hex_push } ,
2001-07-04 11:15:53 +04:00
{ NULL , NULL , NULL }
} ;
2003-04-16 17:09:00 +04:00
static struct charset_functions * charsets = NULL ;
static struct charset_functions * find_charset_functions ( const char * name )
{
struct charset_functions * c = charsets ;
while ( c ) {
2003-04-25 00:27:19 +04:00
if ( strcasecmp ( name , c - > name ) = = 0 ) {
2003-04-16 17:09:00 +04:00
return c ;
}
c = c - > next ;
}
return NULL ;
}
2003-04-28 21:48:48 +04:00
NTSTATUS smb_register_charset ( struct charset_functions * funcs )
2003-04-16 17:09:00 +04:00
{
2003-04-28 21:48:48 +04:00
if ( ! funcs ) {
return NT_STATUS_INVALID_PARAMETER ;
}
2003-04-16 17:09:00 +04:00
DEBUG ( 5 , ( " Attempting to register new charset %s \n " , funcs - > name ) ) ;
/* Check whether we already have this charset... */
2003-04-25 00:27:19 +04:00
if ( find_charset_functions ( funcs - > name ) ) {
DEBUG ( 0 , ( " Duplicate charset %s, not registering \n " , funcs - > name ) ) ;
2003-04-28 21:48:48 +04:00
return NT_STATUS_OBJECT_NAME_COLLISION ;
2003-04-16 17:09:00 +04:00
}
funcs - > next = funcs - > prev = NULL ;
DEBUG ( 5 , ( " Registered charset %s \n " , funcs - > name ) ) ;
DLIST_ADD ( charsets , funcs ) ;
2003-04-28 21:48:48 +04:00
return NT_STATUS_OK ;
2003-04-16 17:09:00 +04:00
}
2004-02-08 11:38:42 +03:00
static void lazy_initialize_iconv ( void )
2003-04-16 17:09:00 +04:00
{
static BOOL initialized ;
int i ;
if ( ! initialized ) {
initialized = True ;
for ( i = 0 ; builtin_functions [ i ] . name ; i + + )
smb_register_charset ( & builtin_functions [ i ] ) ;
static_init_charset ;
}
}
2001-07-22 11:38:32 +04:00
/*
 * Thin wrapper around the system iconv().
 *
 * When the conversion fails, the descriptor is reset to its initial
 * state so that no shift state is left behind for character sets like
 * SJIS; errno from the failed conversion is preserved across the reset.
 *
 * Without native iconv support this always fails with EINVAL.
 */
static size_t sys_iconv(void *cd,
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
#ifdef HAVE_NATIVE_ICONV
	iconv_t handle = (iconv_t)cd;
	size_t converted;
	int saved_errno;

	converted = iconv(handle, (char **)inbuf, inbytesleft,
			  outbuf, outbytesleft);
	if (converted != (size_t)-1) {
		return converted;
	}

	/* failure: reset the state but report the original error */
	saved_errno = errno;
	iconv(handle, NULL, NULL, NULL, NULL);
	errno = saved_errno;
	return converted;
#else
	errno = EINVAL;
	return (size_t)-1;
#endif
}
2003-02-27 08:57:21 +03:00
/**
* This is a simple portable iconv ( ) implementaion .
*
* It only knows about a very small number of character sets - just
* enough that Samba works on systems that don ' t have iconv .
* */
2001-07-04 11:15:53 +04:00
size_t smb_iconv ( smb_iconv_t cd ,
2004-08-31 01:35:43 +04:00
const char * * inbuf , size_t * inbytesleft ,
2003-07-14 05:18:43 +04:00
char * * outbuf , size_t * outbytesleft )
2001-07-04 11:15:53 +04:00
{
char cvtbuf [ 2048 ] ;
char * bufp = cvtbuf ;
size_t bufsize ;
2001-07-22 11:38:32 +04:00
/* in many cases we can go direct */
2001-07-04 11:15:53 +04:00
if ( cd - > direct ) {
2003-07-14 05:18:43 +04:00
return cd - > direct ( cd - > cd_direct ,
2004-08-31 01:35:43 +04:00
inbuf , inbytesleft , outbuf , outbytesleft ) ;
2001-07-04 11:15:53 +04:00
}
2001-07-22 11:38:32 +04:00
2001-07-04 11:15:53 +04:00
/* otherwise we have to do it chunks at a time */
while ( * inbytesleft > 0 ) {
bufp = cvtbuf ;
bufsize = sizeof ( cvtbuf ) ;
2001-07-22 11:38:32 +04:00
2003-07-14 05:18:43 +04:00
if ( cd - > pull ( cd - > cd_pull ,
2004-08-31 01:35:43 +04:00
inbuf , inbytesleft , & bufp , & bufsize ) = = - 1
2003-07-14 05:18:43 +04:00
& & errno ! = E2BIG ) return - 1 ;
2001-07-04 11:15:53 +04:00
bufp = cvtbuf ;
bufsize = sizeof ( cvtbuf ) - bufsize ;
2001-07-22 11:38:32 +04:00
2003-07-14 05:18:43 +04:00
if ( cd - > push ( cd - > cd_push ,
2004-08-31 01:35:43 +04:00
( const char * * ) & bufp , & bufsize ,
2003-07-14 05:18:43 +04:00
outbuf , outbytesleft ) = = - 1 ) return - 1 ;
2001-07-04 11:15:53 +04:00
}
2003-07-14 05:18:43 +04:00
2001-07-04 11:15:53 +04:00
return 0 ;
}
2004-09-01 09:17:40 +04:00
/*
 * Report whether name is one of the two names ("UCS-2LE" or "UTF-16LE",
 * compared case-insensitively) used for little-endian 16-bit text.
 */
static BOOL is_utf16(const char *name)
{
	BOOL matches = (strcasecmp(name, "UCS-2LE") == 0);

	if (!matches) {
		matches = (strcasecmp(name, "UTF-16LE") == 0);
	}
	return matches;
}
2001-07-04 11:15:53 +04:00
/*
simple iconv_open ( ) wrapper
*/
smb_iconv_t smb_iconv_open ( const char * tocode , const char * fromcode )
{
smb_iconv_t ret ;
2003-04-16 17:09:00 +04:00
struct charset_functions * from , * to ;
lazy_initialize_iconv ( ) ;
from = charsets ;
to = charsets ;
2001-07-22 11:38:32 +04:00
ret = ( smb_iconv_t ) malloc ( sizeof ( * ret ) ) ;
if ( ! ret ) {
errno = ENOMEM ;
return ( smb_iconv_t ) - 1 ;
}
memset ( ret , 0 , sizeof ( * ret ) ) ;
2001-12-20 09:18:52 +03:00
ret - > from_name = strdup ( fromcode ) ;
ret - > to_name = strdup ( tocode ) ;
2001-07-22 11:38:32 +04:00
/* check for the simplest null conversion */
2003-04-16 17:09:00 +04:00
if ( strcasecmp ( fromcode , tocode ) = = 0 ) {
2001-07-22 11:38:32 +04:00
ret - > direct = iconv_copy ;
return ret ;
}
2001-07-04 11:15:53 +04:00
2003-04-16 17:09:00 +04:00
/* check if we have a builtin function for this conversion */
from = find_charset_functions ( fromcode ) ;
if ( from ) ret - > pull = from - > pull ;
to = find_charset_functions ( tocode ) ;
if ( to ) ret - > push = to - > push ;
2001-07-04 11:15:53 +04:00
2003-04-16 17:09:00 +04:00
/* check if we can use iconv for this conversion */
2001-07-04 11:15:53 +04:00
# ifdef HAVE_NATIVE_ICONV
2003-04-16 17:09:00 +04:00
if ( ! ret - > pull ) {
2004-09-01 09:17:40 +04:00
ret - > cd_pull = iconv_open ( " UTF-16LE " , fromcode ) ;
if ( ret - > cd_pull = = ( iconv_t ) - 1 )
ret - > cd_pull = iconv_open ( " UCS-2LE " , fromcode ) ;
2003-04-16 17:09:00 +04:00
if ( ret - > cd_pull ! = ( iconv_t ) - 1 )
ret - > pull = sys_iconv ;
2001-07-04 11:15:53 +04:00
}
2003-04-16 17:09:00 +04:00
if ( ! ret - > push ) {
2004-09-01 09:17:40 +04:00
ret - > cd_push = iconv_open ( tocode , " UTF-16LE " ) ;
if ( ret - > cd_push = = ( iconv_t ) - 1 )
ret - > cd_push = iconv_open ( tocode , " UCS-2LE " ) ;
2003-04-16 17:09:00 +04:00
if ( ret - > cd_push ! = ( iconv_t ) - 1 )
ret - > push = sys_iconv ;
2001-07-04 11:15:53 +04:00
}
# endif
2003-04-16 17:09:00 +04:00
/* check if there is a module available that can do this conversion */
2003-04-28 21:48:48 +04:00
if ( ! ret - > pull & & NT_STATUS_IS_OK ( smb_probe_module ( " charset " , fromcode ) ) ) {
2003-04-16 17:09:00 +04:00
if ( ! ( from = find_charset_functions ( fromcode ) ) )
DEBUG ( 0 , ( " Module %s doesn't provide charset %s! \n " , fromcode , fromcode ) ) ;
else
ret - > pull = from - > pull ;
}
2003-04-28 21:48:48 +04:00
if ( ! ret - > push & & NT_STATUS_IS_OK ( smb_probe_module ( " charset " , tocode ) ) ) {
2003-04-16 17:09:00 +04:00
if ( ! ( to = find_charset_functions ( tocode ) ) )
DEBUG ( 0 , ( " Module %s doesn't provide charset %s! \n " , tocode , tocode ) ) ;
else
ret - > push = to - > push ;
}
if ( ! ret - > push | | ! ret - > pull ) {
SAFE_FREE ( ret - > from_name ) ;
SAFE_FREE ( ret - > to_name ) ;
SAFE_FREE ( ret ) ;
errno = EINVAL ;
return ( smb_iconv_t ) - 1 ;
}
2001-07-04 11:15:53 +04:00
2001-07-22 11:38:32 +04:00
/* check for conversion to/from ucs2 */
2004-09-01 09:17:40 +04:00
if ( is_utf16 ( fromcode ) & & to ) {
2003-04-16 17:09:00 +04:00
ret - > direct = to - > push ;
ret - > push = ret - > pull = NULL ;
2001-07-22 11:38:32 +04:00
return ret ;
}
2003-04-16 17:09:00 +04:00
2004-09-01 09:17:40 +04:00
if ( is_utf16 ( tocode ) & & from ) {
2003-04-16 17:09:00 +04:00
ret - > direct = from - > pull ;
ret - > push = ret - > pull = NULL ;
2001-07-04 11:15:53 +04:00
return ret ;
}
2003-04-16 17:09:00 +04:00
/* Check if we can do the conversion direct */
2001-07-22 11:38:32 +04:00
# ifdef HAVE_NATIVE_ICONV
2004-09-01 09:17:40 +04:00
if ( is_utf16 ( fromcode ) ) {
2001-07-22 11:38:32 +04:00
ret - > direct = sys_iconv ;
ret - > cd_direct = ret - > cd_push ;
ret - > cd_push = NULL ;
2001-07-04 11:15:53 +04:00
return ret ;
}
2004-09-01 09:17:40 +04:00
if ( is_utf16 ( tocode ) ) {
2001-07-22 11:38:32 +04:00
ret - > direct = sys_iconv ;
ret - > cd_direct = ret - > cd_pull ;
ret - > cd_pull = NULL ;
2001-07-04 11:15:53 +04:00
return ret ;
}
2001-07-22 11:38:32 +04:00
# endif
2001-07-04 11:15:53 +04:00
return ret ;
}
/*
  simple iconv_close() wrapper

  Closes any system iconv descriptors held by cd, frees the stored
  charset names and the descriptor itself.  Always returns 0.
*/
int smb_iconv_close(smb_iconv_t cd)
{
#ifdef HAVE_NATIVE_ICONV
	/*
	 * A failed iconv_open() yields (iconv_t)-1, which is non-NULL;
	 * make sure such a value is never handed to iconv_close().
	 */
	if (cd->cd_direct && cd->cd_direct != (iconv_t)-1) {
		iconv_close((iconv_t)cd->cd_direct);
	}
	if (cd->cd_pull && cd->cd_pull != (iconv_t)-1) {
		iconv_close((iconv_t)cd->cd_pull);
	}
	if (cd->cd_push && cd->cd_push != (iconv_t)-1) {
		iconv_close((iconv_t)cd->cd_push);
	}
#endif

	SAFE_FREE(cd->from_name);
	SAFE_FREE(cd->to_name);

	/* scrub the descriptor before releasing it */
	memset(cd, 0, sizeof(*cd));
	SAFE_FREE(cd);
	return 0;
}
/**********************************************************************
 the following functions implement the builtin character sets in Samba
 and also the "test" character sets that are designed to test
 multi-byte character set support for english users
**********************************************************************/
2004-08-31 01:35:43 +04:00
/*
 * Widen single bytes to 16-bit little-endian units: each input byte is
 * copied to the low byte of the output unit and the high byte is zero.
 *
 * Returns 0 when all input was consumed, otherwise (size_t)-1 with
 * errno = E2BIG (output space ran out).  The cursors and counts are
 * advanced past everything that was converted.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;

	for (; src_left >= 1 && dst_left >= 2; src_left -= 1, dst_left -= 2) {
		*dst++ = *src++;
		*dst++ = 0;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = src_left;
	*outbytesleft = dst_left;

	if (src_left > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}
	return 0;
}
2004-08-31 01:35:43 +04:00
/*
 * Narrow 16-bit little-endian units to 7-bit bytes: the low byte of each
 * unit is masked with 0x7F and the high byte is discarded.
 *
 * Returns the number of units whose high byte was non-zero, or
 * (size_t)-1 with errno = EINVAL (a single dangling input byte) or
 * E2BIG (output space ran out).
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	int lossy = 0;

	for (; src_left >= 2 && dst_left >= 1; src_left -= 2, dst_left -= 1) {
		*dst++ = src[0] & 0x7F;
		if (src[1]) {
			lossy++;
		}
		src += 2;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = src_left;
	*outbytesleft = dst_left;

	if (src_left == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}
	if (src_left > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}
	return lossy;
}
2004-08-31 01:35:43 +04:00
/*
 * Narrow 16-bit little-endian units to single bytes by keeping the low
 * byte of each unit unchanged and discarding the high byte.
 *
 * Returns the number of units whose high byte was non-zero, or
 * (size_t)-1 with errno = EINVAL (a single dangling input byte) or
 * E2BIG (output space ran out).
 */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	int lossy = 0;

	while (src_left >= 2 && dst_left >= 1) {
		dst[0] = src[0];
		lossy += (src[1] != 0);

		src += 2;
		src_left -= 2;
		dst += 1;
		dst_left -= 1;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = src_left;
	*outbytesleft = dst_left;

	if (src_left == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}
	if (src_left > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}
	return lossy;
}
2001-07-04 11:15:53 +04:00
2004-08-31 01:35:43 +04:00
/* Value of one hexadecimal digit, or -1 when ch is not a hex digit. */
static int ucs2hex_digit_value(char ch)
{
	if (ch >= '0' && ch <= '9') return ch - '0';
	if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
	if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
	return -1;
}

/*
 * Decode the UCS2-HEX test encoding into 16-bit little-endian units.
 *
 * Any byte other than '@' is widened to one unit with a zero high byte.
 * '@' must be followed by exactly four hexadecimal digits giving the
 * value of the unit.
 *
 * Returns 0 when all input was consumed, otherwise (size_t)-1 with
 * errno set to:
 *   EINVAL - an '@' escape is cut short by the end of the input
 *   EILSEQ - an '@' is not followed by four hex digits
 *   E2BIG  - output space ran out
 * The cursors are left at the first unconverted byte.
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* seven bit ascii case */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}

		/* it's a hex character */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return (size_t)-1;
		}

		/*
		 * Parse the digits by hand: the input is not NUL terminated,
		 * and exactly the four bytes consumed below must be digits
		 * (no whitespace, sign, "0x" prefix or short runs).
		 */
		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit_value((*inbuf)[i]);
			if (digit < 0) {
				errno = EILSEQ;
				return (size_t)-1;
			}
			v = (v << 4) | (unsigned)digit;
		}

		(*outbuf)[0] = v & 0xff;
		(*outbuf)[1] = v >> 8;
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
2004-08-31 01:35:43 +04:00
/*
 * Encode 16-bit units into the UCS2-HEX test encoding.
 *
 * A unit with a zero high byte, a low byte below 0x80 and not equal to
 * '@' is written as that single byte.  Every other unit is written as
 * '@' followed by four hex digits of SVAL(*inbuf, 0).
 * NOTE(review): SVAL presumably reads a little-endian 16-bit value -
 * confirm against its definition.
 *
 * Returns 0 when all input was consumed, otherwise (size_t)-1 with
 * errno = EINVAL (a single dangling input byte) or E2BIG (output space
 * ran out).
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;
		char escaped[6];
		int plain = (unit[1] == 0 &&
			     (unit[0] & 0x80) == 0 &&
			     unit[0] != '@');

		if (plain) {
			(*outbuf)[0] = unit[0];
			*inbuf += 2;
			*inbytesleft -= 2;
			*outbuf += 1;
			*outbytesleft -= 1;
			continue;
		}

		/* the escape needs five output bytes */
		if (*outbytesleft < 5) {
			errno = E2BIG;
			return (size_t)-1;
		}

		snprintf(escaped, 6, "@%04x", SVAL(*inbuf, 0));
		/* copy the five visible characters, not the terminator */
		memcpy(*outbuf, escaped, 5);
		*inbuf += 2;
		*inbytesleft -= 2;
		*outbuf += 5;
		*outbytesleft -= 5;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}
	if (*inbytesleft > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}
	return 0;
}
2004-08-31 01:35:43 +04:00
/*
 * Convert between big-endian and little-endian 16-bit text by swapping
 * the two bytes of every unit.
 *
 * min(*inbytesleft, *outbytesleft) bytes are processed.  If that count
 * is odd, the final output byte is set to 0 and the odd input byte is
 * still consumed.
 *
 * Returns 0 when all input was consumed, otherwise (size_t)-1 with
 * errno = E2BIG.
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	/* keep the count in size_t: the byte counts are size_t and must not
	   be narrowed to int */
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;
	size_t paired = n & ~(size_t)1;
	size_t i;

	for (i = 0; i < paired; i += 2) {
		/* read both bytes first so identical in/out buffers work */
		char first = (*inbuf)[i];
		char second = (*inbuf)[i + 1];
		(*outbuf)[i] = second;
		(*outbuf)[i + 1] = first;
	}
	if (n & 1) {
		(*outbuf)[n - 1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
2001-07-22 11:38:32 +04:00
2004-08-31 01:35:43 +04:00
/*
 * Null conversion: copy min(*inbytesleft, *outbytesleft) bytes from the
 * input to the output unchanged and advance both cursors.
 *
 * Returns 0 when all input was consumed, otherwise (size_t)-1 with
 * errno = E2BIG.
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	/* keep the count in size_t: the byte counts are size_t and must not
	   be narrowed to int */
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

	/* memmove, not memcpy: the two buffers are allowed to overlap */
	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
2001-07-04 19:08:30 +04:00
2004-08-31 01:35:43 +04:00
/*
 * Convert UTF-8 to UTF-16 little-endian.
 *
 * One to four byte sequences are accepted.  Four byte sequences that
 * encode a value below 0x10000 (not minimally packed) are accepted and
 * written as a single unit; values from 0x10000 to 0x10FFFF are written
 * as a surrogate pair.
 *
 * Returns 0 when all input was consumed, otherwise (size_t)-1 with
 * errno set to:
 *   EILSEQ - a multi-byte sequence is truncated or has a bad
 *            continuation byte, or encodes a value above 0x10FFFF
 *   EINVAL - the lead byte does not start a 1-4 byte sequence
 *   E2BIG  - output space ran out
 * In every case the cursors and counts are updated to point at the first
 * unconverted byte.
 */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t in_left = *inbytesleft, out_left = *outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;

	while (in_left >= 1 && out_left >= 2) {
		/* one byte: plain ASCII */
		if ((c[0] & 0x80) == 0) {
			uc[0] = c[0];
			uc[1] = 0;
			c += 1;
			in_left -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		/* two bytes: 110xxxxx 10xxxxxx */
		if ((c[0] & 0xe0) == 0xc0) {
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = (c[0] >> 2) & 0x7;
			uc[0] = (c[0] << 6) | (c[1] & 0x3f);
			c += 2;
			in_left -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		/* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
		if ((c[0] & 0xf0) == 0xe0) {
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = ((c[0] & 0xF) << 4) | ((c[1] >> 2) & 0xF);
			uc[0] = (c[1] << 6) | (c[2] & 0x3f);
			c += 3;
			in_left -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		/* four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		if ((c[0] & 0xf8) == 0xf0) {
			unsigned int codepoint;
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			codepoint =
				(c[3] & 0x3f) |
				((c[2] & 0x3f) << 6) |
				((c[1] & 0x3f) << 12) |
				((c[0] & 0x7) << 18);
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}
			if (codepoint > 0x10ffff) {
				/* outside the range a surrogate pair can
				   express: the high unit's top bits would
				   overflow into the low-surrogate range */
				errno = EILSEQ;
				goto error;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto error;
			}

			/* high surrogate (0xD800 | top 10 bits), then low
			   surrogate (0xDC00 | bottom 10 bits) */
			uc[0] = (codepoint >> 10) & 0xFF;
			uc[1] = (codepoint >> 18) | 0xd8;
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint >> 8) & 0x3) | 0xdc;
			c += 4;
			in_left -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto error;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto error;
	}

	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return 0;

error:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return (size_t)-1;
}
2004-08-31 01:35:43 +04:00
/*
 * Convert UTF-16 little-endian to UTF-8.
 *
 * Units below 0x80 become one byte, units below 0x800 two bytes, other
 * non-surrogate units three bytes, and a high/low surrogate pair four
 * bytes.
 *
 * Returns 0 when all input was consumed, otherwise (size_t)-1 with
 * errno set to:
 *   EINVAL - a dangling input byte, a high surrogate with fewer than
 *            four input bytes left, or a leading low surrogate with
 *            fewer than four input bytes left
 *   EILSEQ - a leading low surrogate, or a high surrogate that is not
 *            followed by a low surrogate
 *   E2BIG  - output space ran out
 * In every case the cursors and counts are updated to point at the first
 * unconverted unit.
 */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	int failure = 0;	/* errno value to report; 0 means none */

	while (src_left >= 2 && dst_left >= 1) {
		unsigned int unit = src[0] | (src[1] << 8);
		size_t used, made;

		if (unit < 0x80) {
			/* simplest case */
			dst[0] = (uint8_t)unit;
			used = 2;
			made = 1;
		} else if (unit < 0x800) {
			/* next simplest case */
			if (dst_left < 2) {
				failure = E2BIG;
				break;
			}
			dst[0] = 0xc0 | (unit >> 6);
			dst[1] = 0x80 | (unit & 0x3f);
			used = 2;
			made = 2;
		} else if ((unit & 0xfc00) == 0xdc00) {
			/* its the second part of a 4 byte sequence. Illegal */
			failure = (src_left < 4) ? EINVAL : EILSEQ;
			break;
		} else if ((unit & 0xfc00) != 0xd800) {
			/* any other non-surrogate unit */
			if (dst_left < 3) {
				failure = E2BIG;
				break;
			}
			dst[0] = 0xe0 | (unit >> 12);
			dst[1] = 0x80 | ((unit >> 6) & 0x3f);
			dst[2] = 0x80 | (unit & 0x3f);
			used = 2;
			made = 3;
		} else {
			/* its the first part of a 4 byte sequence */
			unsigned int low, codepoint;

			if (src_left < 4) {
				failure = EINVAL;
				break;
			}
			if ((src[3] & 0xfc) != 0xdc) {
				failure = EILSEQ;
				break;
			}
			low = src[2] | (src[3] << 8);
			codepoint = 0x10000 +
				(((unit & 0x3ff) << 10) | (low & 0x3ff));

			if (dst_left < 4) {
				failure = E2BIG;
				break;
			}
			dst[0] = 0xf0 | (codepoint >> 18);
			dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
			dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
			dst[3] = 0x80 | (codepoint & 0x3f);
			used = 4;
			made = 4;
		}

		src += used;
		src_left -= used;
		dst += made;
		dst_left -= made;
	}

	/* leftovers after a clean loop exit are errors too */
	if (failure == 0) {
		if (src_left == 1) {
			failure = EINVAL;
		} else if (src_left > 1) {
			failure = E2BIG;
		}
	}

	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;

	if (failure != 0) {
		errno = failure;
		return (size_t)-1;
	}
	return 0;
}
2004-09-01 09:17:40 +04:00