2003-08-13 05:53:07 +04:00
/*
Unix SMB / CIFS implementation .
minimal iconv implementation
Copyright ( C ) Andrew Tridgell 2001
Copyright ( C ) Jelmer Vernooij 2002
This program is free software ; you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
2007-07-10 06:07:03 +04:00
the Free Software Foundation ; either version 3 of the License , or
2003-08-13 05:53:07 +04:00
( at your option ) any later version .
This program is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
You should have received a copy of the GNU General Public License
2007-07-10 06:07:03 +04:00
along with this program . If not , see < http : //www.gnu.org/licenses/>.
2003-08-13 05:53:07 +04:00
*/
# include "includes.h"
2008-10-11 23:31:42 +04:00
# include "../lib/util/dlinklist.h"
2004-11-02 02:45:40 +03:00
# include "system/iconv.h"
2005-02-10 08:09:35 +03:00
# include "system/filesys.h"
2011-09-20 06:13:21 +04:00
# include "charset_proto.h"
2003-08-13 05:53:07 +04:00
2011-02-18 02:24:58 +03:00
# ifdef strcasecmp
# undef strcasecmp
# endif
2003-08-13 05:53:07 +04:00
/**
* @ file
*
* @ brief Samba wrapper / stub for iconv character set conversion .
*
* iconv is the XPG2 interface for converting between character
* encodings . This file provides a Samba wrapper around it , and also
* a simple reimplementation that is used if the system does not
* implement iconv .
*
* Samba only works with encodings that are supersets of ASCII : ascii
* characters like whitespace can be tested for directly , multibyte
* sequences start with a byte with the high bit set , and strings are
* terminated by a nul byte .
*
* Note that the only function provided by iconv is conversion between
* characters . It doesn ' t directly support operations like
2004-09-01 08:39:06 +04:00
* uppercasing or comparison . We have to convert to UTF-16LE and
* compare there .
2003-08-13 05:53:07 +04:00
*
* @ sa Samba Developers Guide
* */
2003-12-16 12:20:34 +03:00
/*
 * Forward declarations of the builtin converters defined later in this
 * file.  They all share one signature: an opaque descriptor, an input
 * cursor with its remaining byte count, and an output cursor with its
 * remaining byte count.  Each converter advances both cursors, decrements
 * both counts, and returns (size_t)-1 with errno set on failure.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
2003-08-13 05:53:07 +04:00
2004-11-02 15:43:25 +03:00
static const struct charset_functions builtin_functions [ ] = {
2004-09-01 08:39:06 +04:00
/* windows is closest to UTF-16 */
2003-08-13 05:53:07 +04:00
{ " UCS-2LE " , iconv_copy , iconv_copy } ,
2004-08-30 16:03:01 +04:00
{ " UTF-16LE " , iconv_copy , iconv_copy } ,
2003-12-16 12:20:34 +03:00
{ " UCS-2BE " , iconv_swab , iconv_swab } ,
2004-09-01 08:39:06 +04:00
{ " UTF-16BE " , iconv_swab , iconv_swab } ,
2004-08-30 16:03:01 +04:00
/* we include the UTF-8 alias to cope with differing locale settings */
2003-08-13 05:53:07 +04:00
{ " UTF8 " , utf8_pull , utf8_push } ,
2004-08-30 16:03:01 +04:00
{ " UTF-8 " , utf8_pull , utf8_push } ,
2008-10-31 05:51:37 +03:00
/* this handles the munging needed for String2Key */
2011-09-20 06:13:21 +04:00
{ " UTF16_MUNGED " , utf16_munged_pull , iconv_copy , true } ,
2008-10-31 05:51:37 +03:00
2003-08-13 05:53:07 +04:00
{ " ASCII " , ascii_pull , ascii_push } ,
2011-02-18 02:24:58 +03:00
{ " 646 " , ascii_pull , ascii_push } ,
2011-04-12 04:35:43 +04:00
{ " ISO-8859-1 " , latin1_pull , latin1_push } ,
2011-09-20 06:13:21 +04:00
# ifdef DEVELOPER
{ " WEIRD " , weird_pull , weird_push , true } ,
2011-02-18 02:24:58 +03:00
# endif
2011-09-20 06:13:21 +04:00
# ifdef DARWINOS
{ " MACOSXFS " , macosxfs_encoding_pull , macosxfs_encoding_push , true } ,
# endif
{ " UCS2-HEX " , ucs2hex_pull , ucs2hex_push , true }
} ;
2011-02-18 02:24:58 +03:00
2003-08-13 05:53:07 +04:00
#ifdef HAVE_NATIVE_ICONV
/*
 * Adapter that exposes the system iconv() through the same signature as
 * the builtin converters, so it can be stored in the pull/push/direct
 * slots of a conversion handle.
 *
 * If the conversion fails, the descriptor's internal state is reset so
 * that no shift state is left behind for character sets like SJIS.
 * The value returned by the conversion call is passed back unchanged.
 */
static size_t sys_iconv(void *cd,
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	iconv_t handle = (iconv_t)cd;
	size_t converted;

	converted = iconv(handle,
			  discard_const_p(char *, inbuf), inbytesleft,
			  outbuf, outbytesleft);
	if (converted != (size_t)-1) {
		return converted;
	}

	/* failure: put the descriptor back into its initial state */
	iconv(handle, NULL, NULL, NULL, NULL);
	return converted;
}
#endif
/**
* This is a simple portable iconv ( ) implementaion .
*
* It only knows about a very small number of character sets - just
* enough that Samba works on systems that don ' t have iconv .
* */
2008-04-02 06:53:27 +04:00
_PUBLIC_ size_t smb_iconv ( smb_iconv_t cd ,
2003-08-13 05:53:07 +04:00
const char * * inbuf , size_t * inbytesleft ,
char * * outbuf , size_t * outbytesleft )
{
/* in many cases we can go direct */
if ( cd - > direct ) {
return cd - > direct ( cd - > cd_direct ,
inbuf , inbytesleft , outbuf , outbytesleft ) ;
}
/* otherwise we have to do it chunks at a time */
2011-03-31 05:13:05 +04:00
{
# ifndef SMB_ICONV_BUFSIZE
# define SMB_ICONV_BUFSIZE 2048
# endif
size_t bufsize ;
2013-12-06 14:31:07 +04:00
char cvtbuf [ SMB_ICONV_BUFSIZE ] ;
2011-03-31 05:13:05 +04:00
while ( * inbytesleft > 0 ) {
char * bufp1 = cvtbuf ;
const char * bufp2 = cvtbuf ;
2011-04-11 15:34:21 +04:00
int saved_errno = errno ;
bool pull_failed = false ;
2011-03-31 05:13:05 +04:00
bufsize = SMB_ICONV_BUFSIZE ;
2003-08-13 05:53:07 +04:00
2011-03-31 05:13:05 +04:00
if ( cd - > pull ( cd - > cd_pull ,
inbuf , inbytesleft , & bufp1 , & bufsize ) = = - 1
2011-03-31 06:27:29 +04:00
& & errno ! = E2BIG ) {
2011-04-11 15:34:21 +04:00
saved_errno = errno ;
pull_failed = true ;
2011-03-31 06:27:29 +04:00
}
2003-08-13 05:53:07 +04:00
2011-03-31 05:13:05 +04:00
bufsize = SMB_ICONV_BUFSIZE - bufsize ;
if ( cd - > push ( cd - > cd_push ,
& bufp2 , & bufsize ,
2011-03-31 06:27:29 +04:00
outbuf , outbytesleft ) = = - 1 ) {
return - 1 ;
2011-04-11 15:34:21 +04:00
} else if ( pull_failed ) {
/* We want the pull errno if possible */
errno = saved_errno ;
return - 1 ;
2011-03-31 06:27:29 +04:00
}
2011-03-31 05:13:05 +04:00
}
2003-08-13 05:53:07 +04:00
}
return 0 ;
}
2007-08-27 22:10:19 +04:00
/*
 * Return true when the charset name is one of the two names this file
 * treats as little-endian UTF-16 ("UCS-2LE" or "UTF-16LE"), compared
 * without regard to case.
 */
static bool is_utf16(const char *name)
{
	static const char *const utf16_names[] = { "UCS-2LE", "UTF-16LE" };
	size_t idx;

	for (idx = 0; idx < sizeof(utf16_names) / sizeof(utf16_names[0]); idx++) {
		if (strcasecmp(name, utf16_names[idx]) == 0) {
			return true;
		}
	}
	return false;
}
2011-02-18 02:24:58 +03:00
/*
 * Destructor registered on every conversion handle by
 * smb_iconv_open_ex().  With native iconv support it closes whichever
 * of the pull, push and direct descriptors are open (neither NULL nor
 * (iconv_t)-1).  Always returns 0.
 */
static int smb_iconv_t_destructor(smb_iconv_t hwd)
{
#ifdef HAVE_NATIVE_ICONV
	iconv_t descriptors[3];
	size_t idx;

	descriptors[0] = hwd->cd_pull;
	descriptors[1] = hwd->cd_push;
	descriptors[2] = hwd->cd_direct;

	for (idx = 0; idx < 3; idx++) {
		if (descriptors[idx] == NULL ||
		    descriptors[idx] == (iconv_t)-1) {
			/* never opened, or the open attempt failed */
			continue;
		}
		iconv_close(descriptors[idx]);
	}
#endif

	return 0;
}
2007-12-17 10:32:00 +03:00
2008-04-02 06:53:27 +04:00
/*
 * Create a conversion handle from charset 'fromcode' to charset 'tocode'.
 *
 * The handle is allocated on mem_ctx and gets smb_iconv_t_destructor
 * attached.  Selection order:
 *  - identical names: a plain byte copy;
 *  - a builtin_functions entry for either side (entries not flagged
 *    samba_internal_charset are only considered when
 *    use_builtin_handlers is true);
 *  - with HAVE_NATIVE_ICONV, the system iconv for any side that has no
 *    usable builtin entry.
 * If one side is UTF-16LE/UCS-2LE the other side's converter is
 * installed as a single direct step; otherwise conversion goes through
 * a pull and a push step.
 *
 * Returns the handle, or (smb_iconv_t)-1 with errno set to ENOMEM
 * (allocation failed) or EINVAL (no converter for one of the charsets).
 */
_PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
				       const char *fromcode, bool use_builtin_handlers)
{
	const struct charset_functions *from = NULL;
	const struct charset_functions *to = NULL;
	smb_iconv_t ret;
	bool from_is_utf16;
	bool to_is_utf16;
	size_t i;

	ret = (smb_iconv_t)talloc_named(mem_ctx, sizeof(*ret),
					"iconv(%s,%s)", tocode, fromcode);
	if (ret == NULL) {
		errno = ENOMEM;
		return (smb_iconv_t)-1;
	}
	memset(ret, 0, sizeof(*ret));
	talloc_set_destructor(ret, smb_iconv_t_destructor);

	/* check for the simplest null conversion */
	if (strcmp(fromcode, tocode) == 0) {
		ret->direct = iconv_copy;
		return ret;
	}

	/* check if we have a builtin function for this conversion */
	for (i = 0; i < ARRAY_SIZE(builtin_functions); i++) {
		const struct charset_functions *entry = &builtin_functions[i];

		if (!use_builtin_handlers && !entry->samba_internal_charset) {
			/* caller asked for system handlers where possible */
			continue;
		}
		if (strcasecmp(fromcode, entry->name) == 0) {
			from = entry;
		}
		if (strcasecmp(tocode, entry->name) == 0) {
			to = entry;
		}
	}

#ifdef HAVE_NATIVE_ICONV
	/* the from and to variables indicate a samba module or
	 * internal conversion, ret->pull and ret->push are
	 * initialised only in this block for iconv based
	 * conversions */
	if (from == NULL) {
		ret->cd_pull = iconv_open("UTF-16LE", fromcode);
		if (ret->cd_pull == (iconv_t)-1) {
			ret->cd_pull = iconv_open("UCS-2LE", fromcode);
		}
		if (ret->cd_pull != (iconv_t)-1) {
			ret->pull = sys_iconv;
		}
	}

	if (to == NULL) {
		ret->cd_push = iconv_open(tocode, "UTF-16LE");
		if (ret->cd_push == (iconv_t)-1) {
			ret->cd_push = iconv_open(tocode, "UCS-2LE");
		}
		if (ret->cd_push != (iconv_t)-1) {
			ret->push = sys_iconv;
		}
	}
#endif

	/* each side needs either a system converter or a builtin one */
	if ((ret->pull == NULL && from == NULL) ||
	    (ret->push == NULL && to == NULL)) {
		goto failed;
	}

	from_is_utf16 = is_utf16(fromcode);
	to_is_utf16 = is_utf16(tocode);

	/* check for conversion to/from ucs2 */
	if (from_is_utf16 && to != NULL) {
		ret->direct = to->push;
		return ret;
	}
	if (to_is_utf16 && from != NULL) {
		ret->direct = from->pull;
		return ret;
	}

#ifdef HAVE_NATIVE_ICONV
	if (from_is_utf16) {
		/* hand the push descriptor over to the direct slot */
		ret->direct = sys_iconv;
		ret->cd_direct = ret->cd_push;
		ret->cd_push = NULL;
		return ret;
	}
	if (to_is_utf16) {
		/* hand the pull descriptor over to the direct slot */
		ret->direct = sys_iconv;
		ret->cd_direct = ret->cd_pull;
		ret->cd_pull = NULL;
		return ret;
	}
#endif

	/* the general case has to go via a buffer */
	if (ret->pull == NULL) {
		ret->pull = from->pull;
	}
	if (ret->push == NULL) {
		ret->push = to->push;
	}
	return ret;

failed:
	talloc_free(ret);
	errno = EINVAL;
	return (smb_iconv_t)-1;
}
2007-12-17 10:32:00 +03:00
/*
simple iconv_open ( ) wrapper
*/
2008-04-02 06:53:27 +04:00
_PUBLIC_ smb_iconv_t smb_iconv_open ( const char * tocode , const char * fromcode )
2007-12-17 10:32:00 +03:00
{
2011-02-18 02:24:58 +03:00
return smb_iconv_open_ex ( NULL , tocode , fromcode , true ) ;
2007-12-17 10:32:00 +03:00
}
2003-08-13 05:53:07 +04:00
/*
simple iconv_close ( ) wrapper
*/
2008-04-02 06:53:27 +04:00
_PUBLIC_ int smb_iconv_close ( smb_iconv_t cd )
2003-08-13 05:53:07 +04:00
{
2004-09-01 13:45:00 +04:00
talloc_free ( cd ) ;
2003-08-13 05:53:07 +04:00
return 0 ;
}
/**********************************************************************
the following functions implement the builtin character sets in Samba
and also the " test " character sets that are designed to test
multi - byte character set support for english users
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2011-04-12 04:35:43 +04:00
/*
  this takes an ASCII sequence and produces a UTF16 sequence

  Every ASCII byte (high bit clear) has the same value as the
  corresponding unicode codepoint, so it becomes the first byte of a
  UTF16LE unit whose second byte is zero.

  Fails with EILSEQ on a byte that has the high bit set, and with E2BIG
  when input remains but fewer than 2 output bytes are left.
*/
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		const char ch = **inbuf;
		char *dst = *outbuf;

		if ((ch & 0x80) != 0) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			return -1;
		}

		dst[0] = ch;
		dst[1] = 0;

		*inbuf += 1;
		*inbytesleft -= 1;
		*outbuf += 2;
		*outbytesleft -= 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
2011-04-12 04:35:43 +04:00
/*
  this takes a UTF16 sequence and produces an ASCII sequence

  A UTF16LE unit is ASCII when its second byte is zero and its first
  byte has the high bit clear; that first byte is then copied out
  unchanged.

  Fails with EILSEQ on any other unit, EINVAL when a single odd byte of
  input is left over, and E2BIG when whole units remain but the output
  is full.
*/
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char low = (*inbuf)[0];
		const char high = (*inbuf)[1];

		if ((low & 0x80) != 0 || high != 0) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			return -1;
		}

		**outbuf = low;

		*inbuf += 2;
		*inbytesleft -= 2;
		*outbuf += 1;
		*outbytesleft -= 1;
	}

	if (*inbytesleft == 1) {
		/* half of a UTF16 unit */
		errno = EINVAL;
		return -1;
	}
	if (*inbytesleft > 1) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
2011-04-12 04:35:43 +04:00
/*
  this takes a latin1/ISO-8859-1 sequence and produces a UTF16 sequence

  The first 256 codepoints of latin1 match the first 256 codepoints of
  unicode, so each input byte becomes the first byte of a UTF16LE unit
  whose second byte is zero.  No input byte is rejected.

  Fails with E2BIG when input remains but fewer than 2 output bytes
  are left.
*/
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		char *dst = *outbuf;

		dst[0] = **inbuf;
		dst[1] = 0;

		*inbuf += 1;
		*inbytesleft -= 1;
		*outbuf += 2;
		*outbytesleft -= 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
/*
  this takes a UTF16 sequence and produces a latin1/ISO-8859-1 sequence

  The first 256 codepoints of latin1 match the first 256 codepoints of
  unicode, so a UTF16LE unit whose second byte is zero is emitted as
  its first byte.

  Fails with EILSEQ on a unit whose second byte is non-zero, EINVAL
  when a single odd byte of input is left over, and E2BIG when whole
  units remain but the output is full.
*/
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;

		/* the byte is stored before validation; on error the
		 * output cursor is simply not advanced past it */
		**outbuf = unit[0];
		if (unit[1] != 0) {
			/* If this is multi-byte, then it isn't legal latin1 */
			errno = EILSEQ;
			return -1;
		}

		*inbuf += 2;
		*inbytesleft -= 2;
		*outbuf += 1;
		*outbytesleft -= 1;
	}

	if (*inbytesleft == 1) {
		/* half of a UTF16 unit */
		errno = EINVAL;
		return -1;
	}
	if (*inbytesleft > 1) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
2003-08-13 05:53:07 +04:00
/* Map one ASCII hex digit to its value; returns -1 for any other byte. */
static int ucs2hex_digit_value(char ch)
{
	if (ch >= '0' && ch <= '9') {
		return ch - '0';
	}
	if (ch >= 'a' && ch <= 'f') {
		return ch - 'a' + 10;
	}
	if (ch >= 'A' && ch <= 'F') {
		return ch - 'A' + 10;
	}
	return -1;
}

/*
  this takes a UCS2-HEX sequence and produces a UTF16 sequence

  Any byte other than '@' is copied into the first byte of a UTF16LE
  unit with a zero second byte.  '@' introduces an escape of exactly
  four hex digits ("@20ac") giving the 16 bit value of the unit.

  The digits are parsed by hand rather than with sscanf(): the input is
  not required to be NUL terminated, and "%04x" would also accept
  leading whitespace, a sign, a "0x" prefix or fewer than four digits
  while five bytes were consumed regardless.

  Fails with EINVAL if fewer than 5 bytes remain at an '@', EILSEQ if
  any of the four bytes after '@' is not a hex digit, and E2BIG when
  input remains but fewer than 2 output bytes are left.
*/
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* seven bit ascii case */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}

		/* it's a hex character */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return -1;
		}

		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit_value((*inbuf)[i]);

			if (digit < 0) {
				errno = EILSEQ;
				return -1;
			}
			v = (v << 4) | (unsigned int)digit;
		}

		/* store little-endian */
		(*outbuf)[0] = v & 0xff;
		(*outbuf)[1] = v >> 8;
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
/*
  this takes a UTF16 sequence and produces a UCS2-HEX sequence

  A unit whose second byte is zero and whose first byte has the high
  bit clear and is not '@' is written as that single byte.  Every other
  unit is written as '@' followed by four lower-case hex digits of the
  value SVAL() reads from the unit.

  Fails with E2BIG when an escape needs 5 output bytes that are not
  available or when whole units remain but the output is full, and with
  EINVAL when a single odd byte of input is left over.
*/
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char low = (*inbuf)[0];
		const char high = (*inbuf)[1];
		const bool needs_escape =
			(high != 0) || ((low & 0x80) != 0) || (low == '@');
		char escaped[6];

		if (!needs_escape) {
			**outbuf = low;
			*inbuf += 2;
			*inbytesleft -= 2;
			*outbuf += 1;
			*outbytesleft -= 1;
			continue;
		}

		if (*outbytesleft < 5) {
			errno = E2BIG;
			return -1;
		}

		/* format into a scratch buffer so the trailing NUL written
		 * by snprintf never lands in the caller's output */
		snprintf(escaped, sizeof(escaped), "@%04x", SVAL(*inbuf, 0));
		memcpy(*outbuf, escaped, 5);

		*inbuf += 2;
		*inbytesleft -= 2;
		*outbuf += 5;
		*outbytesleft -= 5;
	}

	if (*inbytesleft == 1) {
		/* half of a UTF16 unit */
		errno = EINVAL;
		return -1;
	}
	if (*inbytesleft > 1) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
2003-12-16 12:20:34 +03:00
/*
  byte-swapping copy, used for the big-endian UCS-2/UTF-16 charsets

  Copies min(*inbytesleft, *outbytesleft) bytes from input to output,
  exchanging the two bytes of every complete pair.  If that count is
  odd, the final output byte is set to zero and the odd input byte is
  consumed.

  The length is kept in a size_t (an int would narrow large counts),
  and the pairs are swapped with an explicit loop instead of swab():
  swab() takes an ssize_t length and its buffers are restrict-qualified,
  whereas this loop reads both bytes of a pair before writing either.

  Fails with E2BIG when input remains after the output is full.
*/
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;
	size_t paired = n & ~(size_t)1;
	size_t i;

	for (i = 0; i < paired; i += 2) {
		const char first = (*inbuf)[i];
		const char second = (*inbuf)[i + 1];

		(*outbuf)[i] = second;
		(*outbuf)[i + 1] = first;
	}
	if (n & 1) {
		/* no partner byte to swap with */
		(*outbuf)[n - 1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
2003-08-13 05:53:07 +04:00
/*
  plain byte copy, used when no conversion is needed

  Copies min(*inbytesleft, *outbytesleft) bytes with memmove(), so the
  two buffers may overlap, and advances both cursors by that amount.
  The length is kept in a size_t; storing it in an int would narrow
  counts that do not fit and hand memmove() a wrong size.

  Fails with E2BIG when input remains after the output is full.
*/
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
2008-08-19 11:49:34 +04:00
/*
  this takes a UTF8 sequence and produces a UTF16 sequence

  One to four byte UTF8 sequences are decoded into little-endian UTF16.
  Four byte sequences become a surrogate pair, except that a four byte
  sequence encoding a value below 0x10000 (not minimally packed) is
  accepted and written as a single unit.

  A four byte sequence encoding a value above 0x10FFFF is rejected with
  EILSEQ: such a value cannot be expressed as a surrogate pair, and
  splitting it anyway would place a low-surrogate value in the
  high-surrogate position.

  Errors ((size_t)-1 is returned and errno set):
    EILSEQ  continuation bytes missing or malformed, or value > 0x10FFFF
    EINVAL  lead byte that starts none of the handled sequence lengths
    E2BIG   output exhausted while input remains

  On both success and failure the cursors and counts are updated to
  reflect everything converted before the stopping point.
*/
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t in_left = *inbytesleft, out_left = *outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;
	size_t ret = (size_t)-1;

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			/* single byte: plain ASCII */
			uc[0] = c[0];
			uc[1] = 0;
			c += 1;
			in_left -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two byte sequence: 11 payload bits */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto done;
			}
			uc[1] = (c[0] >> 2) & 0x7;
			uc[0] = (c[0] << 6) | (c[1] & 0x3f);
			c += 2;
			in_left -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three byte sequence: 16 payload bits */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto done;
			}
			uc[1] = ((c[0] & 0xF) << 4) | ((c[1] >> 2) & 0xF);
			uc[0] = (c[1] << 6) | (c[2] & 0x3f);
			c += 3;
			in_left -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			unsigned int codepoint;

			/* four byte sequence: 21 payload bits */
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto done;
			}
			codepoint =
				(c[3] & 0x3f) |
				((c[2] & 0x3f) << 6) |
				((c[1] & 0x3f) << 12) |
				((c[0] & 0x7) << 18);
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}

			if (codepoint > 0x10ffff) {
				/* beyond what a surrogate pair can carry */
				errno = EILSEQ;
				goto done;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto done;
			}

			/* high surrogate: 0xd800 | top 10 bits,
			   low surrogate:  0xdc00 | bottom 10 bits */
			uc[0] = (codepoint >> 10) & 0xFF;
			uc[1] = (codepoint >> 18) | 0xd8;
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint >> 8) & 0x3) | 0xdc;
			c += 4;
			in_left -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto done;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto done;
	}

	ret = 0;

done:
	/* report progress to the caller whether or not we succeeded */
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return ret;
}
2008-08-19 11:49:34 +04:00
/*
  this takes a UTF16 sequence and produces a UTF8 sequence

  Little-endian UTF16 units are encoded as one, two or three UTF8
  bytes; a high surrogate followed by a low surrogate is combined and
  encoded as four bytes.

  Errors ((size_t)-1 is returned and errno set):
    EINVAL  a high surrogate, or a stray low surrogate, with fewer than
            4 input bytes left; or a single odd input byte left over
    EILSEQ  a stray low surrogate with at least 4 input bytes left, or
            a high surrogate not followed by a low surrogate
    E2BIG   not enough output space for the next character

  On both success and failure the cursors and counts are updated to
  reflect everything converted before the stopping point.
*/
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t status = (size_t)-1;

	while (src_left >= 2 && dst_left >= 1) {
		const unsigned int unit = src[0] | (src[1] << 8);
		unsigned int low_unit;
		unsigned int codepoint;

		if (unit < 0x80) {
			/* simplest case */
			dst[0] = unit;
			src += 2;
			src_left -= 2;
			dst += 1;
			dst_left -= 1;
			continue;
		}

		if (unit < 0x800) {
			/* next simplest case */
			if (dst_left < 2) {
				errno = E2BIG;
				goto finish;
			}
			dst[0] = 0xc0 | (unit >> 6);
			dst[1] = 0x80 | (unit & 0x3f);
			src += 2;
			src_left -= 2;
			dst += 2;
			dst_left -= 2;
			continue;
		}

		if ((unit & 0xfc00) == 0xdc00) {
			/* its the second part of a 4 byte sequence. Illegal */
			errno = (src_left < 4) ? EINVAL : EILSEQ;
			goto finish;
		}

		if ((unit & 0xfc00) != 0xd800) {
			/* ordinary unit outside the surrogate range */
			if (dst_left < 3) {
				errno = E2BIG;
				goto finish;
			}
			dst[0] = 0xe0 | (unit >> 12);
			dst[1] = 0x80 | ((unit >> 6) & 0x3f);
			dst[2] = 0x80 | (unit & 0x3f);
			src += 2;
			src_left -= 2;
			dst += 3;
			dst_left -= 3;
			continue;
		}

		/* its the first part of a 4 byte sequence */
		if (src_left < 4) {
			errno = EINVAL;
			goto finish;
		}
		low_unit = src[2] | (src[3] << 8);
		if ((low_unit & 0xfc00) != 0xdc00) {
			errno = EILSEQ;
			goto finish;
		}

		/* ten payload bits from each half of the pair */
		codepoint = 0x10000 +
			(((unit & 0x3ff) << 10) | (low_unit & 0x3ff));

		if (dst_left < 4) {
			errno = E2BIG;
			goto finish;
		}
		dst[0] = 0xf0 | (codepoint >> 18);
		dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
		dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
		dst[3] = 0x80 | (codepoint & 0x3f);
		src += 4;
		src_left -= 4;
		dst += 4;
		dst_left -= 4;
	}

	if (src_left == 1) {
		/* half of a UTF16 unit */
		errno = EINVAL;
		goto finish;
	}
	if (src_left > 1) {
		errno = E2BIG;
		goto finish;
	}

	status = 0;

finish:
	/* report progress to the caller whether or not we succeeded */
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;
	return status;
}
2004-09-01 13:45:00 +04:00
2008-10-31 05:51:37 +03:00
/*
  this takes a UTF16 munged sequence, modifies it according to the
  string2key rules, and produces a UTF16 sequence

  The rules are:

    1) any 0x0000 characters are mapped to 0x0001

    2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
       without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
       U+FFFD (REPLACEMENT CHARACTER).

    3) the same for any low surrogate that was not preceded by a high surrogate.

  A well formed surrogate pair is copied through unchanged.

  Errors ((size_t)-1 is returned and errno set):
    E2BIG   output exhausted while input remains
    EINVAL  a single odd input byte is left over

  On both success and failure the cursors and counts are updated to
  reflect everything converted before the stopping point.
*/
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft)
{
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t status = (size_t)-1;

	while (src_left >= 2 && dst_left >= 2) {
		unsigned int unit = src[0] | (src[1] << 8);

		if ((unit & 0xfc00) == 0xd800 && src_left >= 4) {
			/* a high surrogate with something after it */
			const unsigned int next = src[2] | (src[3] << 8);

			if ((next & 0xfc00) == 0xdc00) {
				/* proper pair: pass both units through */
				if (dst_left < 4) {
					errno = E2BIG;
					goto finish;
				}
				memcpy(dst, src, 4);
				src += 4;
				src_left -= 4;
				dst += 4;
				dst_left -= 4;
				continue;
			}
		}

		if (unit == 0) {
			unit = 1;
		} else if ((unit & 0xf800) == 0xd800) {
			/* any surrogate that reaches this point is unpaired:
			   a high surrogate at the end of input or not
			   followed by a low one, or a low surrogate not
			   preceded by a high one: convert to 0xfffd */
			unit = 0xfffd;
		}

		dst[0] = unit & 0xFF;
		dst[1] = (unit >> 8) & 0xFF;
		src += 2;
		src_left -= 2;
		dst += 2;
		dst_left -= 2;
	}

	if (src_left == 1) {
		/* half of a UTF16 unit */
		errno = EINVAL;
		goto finish;
	}
	if (src_left > 1) {
		errno = E2BIG;
		goto finish;
	}

	status = 0;

finish:
	/* report progress to the caller whether or not we succeeded */
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;
	return status;
}
2004-09-01 13:45:00 +04:00