2014-12-07 16:13:51 +03:00
/*
2003-08-13 05:53:07 +04:00
Unix SMB / CIFS implementation .
minimal iconv implementation
Copyright ( C ) Andrew Tridgell 2001
Copyright ( C ) Jelmer Vernooij 2002
2014-12-07 16:13:51 +03:00
2003-08-13 05:53:07 +04:00
This program is free software ; you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
2007-07-10 06:07:03 +04:00
the Free Software Foundation ; either version 3 of the License , or
2003-08-13 05:53:07 +04:00
( at your option ) any later version .
2014-12-07 16:13:51 +03:00
2003-08-13 05:53:07 +04:00
This program is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
2014-12-07 16:13:51 +03:00
2003-08-13 05:53:07 +04:00
You should have received a copy of the GNU General Public License
2007-07-10 06:07:03 +04:00
along with this program . If not , see < http : //www.gnu.org/licenses/>.
2003-08-13 05:53:07 +04:00
*/
2017-01-08 22:52:47 +03:00
# include "replace.h"
2004-11-02 02:45:40 +03:00
# include "system/iconv.h"
2005-02-10 08:09:35 +03:00
# include "system/filesys.h"
2017-01-08 22:52:47 +03:00
# include "lib/util/byteorder.h"
# include "lib/util/dlinklist.h"
# include "lib/util/charset/charset.h"
# include "lib/util/charset/charset_proto.h"
2003-08-13 05:53:07 +04:00
2019-04-09 12:21:57 +03:00
# ifdef HAVE_ICU_I18N
# include <unicode/ustring.h>
# include <unicode/utrans.h>
# endif
2011-02-18 02:24:58 +03:00
# ifdef strcasecmp
# undef strcasecmp
# endif
2003-08-13 05:53:07 +04:00
/**
* @ file
*
* @ brief Samba wrapper / stub for iconv character set conversion .
*
* iconv is the XPG2 interface for converting between character
* encodings . This file provides a Samba wrapper around it , and also
* a simple reimplementation that is used if the system does not
* implement iconv .
*
* Samba only works with encodings that are supersets of ASCII : ascii
* characters like whitespace can be tested for directly , multibyte
* sequences start with a byte with the high bit set , and strings are
* terminated by a nul byte .
*
* Note that the only function provided by iconv is conversion between
* characters . It doesn ' t directly support operations like
2004-09-01 08:39:06 +04:00
* uppercasing or comparison . We have to convert to UTF - 16L E and
* compare there .
2003-08-13 05:53:07 +04:00
*
* @ sa Samba Developers Guide
* */
2003-12-16 12:20:34 +03:00
static size_t ascii_pull ( void * , const char * * , size_t * , char * * , size_t * ) ;
static size_t ascii_push ( void * , const char * * , size_t * , char * * , size_t * ) ;
2011-04-12 04:35:43 +04:00
static size_t latin1_pull ( void * , const char * * , size_t * , char * * , size_t * ) ;
2011-02-18 02:24:58 +03:00
static size_t latin1_push ( void * , const char * * , size_t * , char * * , size_t * ) ;
2003-12-16 12:20:34 +03:00
static size_t utf8_pull ( void * , const char * * , size_t * , char * * , size_t * ) ;
static size_t utf8_push ( void * , const char * * , size_t * , char * * , size_t * ) ;
2008-10-31 07:41:34 +03:00
static size_t utf16_munged_pull ( void * , const char * * , size_t * , char * * , size_t * ) ;
2003-08-13 05:53:07 +04:00
static size_t ucs2hex_pull ( void * , const char * * , size_t * , char * * , size_t * ) ;
static size_t ucs2hex_push ( void * , const char * * , size_t * , char * * , size_t * ) ;
2003-12-16 12:20:34 +03:00
static size_t iconv_copy ( void * , const char * * , size_t * , char * * , size_t * ) ;
static size_t iconv_swab ( void * , const char * * , size_t * , char * * , size_t * ) ;
2003-08-13 05:53:07 +04:00
2004-11-02 15:43:25 +03:00
static const struct charset_functions builtin_functions [ ] = {
2004-09-01 08:39:06 +04:00
/* windows is closest to UTF-16 */
2018-12-12 23:25:37 +03:00
{
. name = " UCS-2LE " ,
. pull = iconv_copy ,
. push = iconv_copy
} ,
{
. name = " UTF-16LE " ,
. pull = iconv_copy ,
. push = iconv_copy
} ,
{
. name = " UCS-2BE " ,
. pull = iconv_swab ,
. push = iconv_swab
} ,
{
. name = " UTF-16BE " ,
. pull = iconv_swab ,
. push = iconv_swab
} ,
2004-08-30 16:03:01 +04:00
/* we include the UTF-8 alias to cope with differing locale settings */
2018-12-12 23:25:37 +03:00
{
. name = " UTF8 " ,
. pull = utf8_pull ,
. push = utf8_push
} ,
{
. name = " UTF-8 " ,
. pull = utf8_pull ,
. push = utf8_push
} ,
2008-10-31 05:51:37 +03:00
/* this handles the munging needed for String2Key */
2018-12-12 23:25:37 +03:00
{
. name = " UTF16_MUNGED " ,
. pull = utf16_munged_pull ,
. push = iconv_copy ,
. samba_internal_charset = true
} ,
2008-10-31 05:51:37 +03:00
2018-12-12 23:25:37 +03:00
{
. name = " ASCII " ,
. pull = ascii_pull ,
. push = ascii_push
} ,
{
. name = " 646 " ,
. pull = ascii_pull ,
. push = ascii_push
} ,
{
. name = " ISO-8859-1 " ,
. pull = latin1_pull ,
. push = latin1_push
} ,
2014-12-07 16:13:51 +03:00
# ifdef DEVELOPER
2018-12-12 23:25:37 +03:00
{
. name = " WEIRD " ,
. pull = weird_pull ,
. push = weird_push ,
. samba_internal_charset = true
} ,
2011-02-18 02:24:58 +03:00
# endif
2011-09-20 06:13:21 +04:00
# ifdef DARWINOS
2018-12-12 23:25:37 +03:00
{
. name = " MACOSXFS " ,
. pull = macosxfs_encoding_pull ,
. push = macosxfs_encoding_push ,
. samba_internal_charset = true
} ,
2011-09-20 06:13:21 +04:00
# endif
2018-12-12 23:25:37 +03:00
{
. name = " UCS2-HEX " ,
. pull = ucs2hex_pull ,
. push = ucs2hex_push ,
. samba_internal_charset = true
}
2011-09-20 06:13:21 +04:00
} ;
2011-02-18 02:24:58 +03:00
2003-08-13 05:53:07 +04:00
# ifdef HAVE_NATIVE_ICONV
/* if there was an error then reset the internal state,
this ensures that we don ' t have a shift state remaining for
character sets like SJIS */
2014-12-07 16:13:51 +03:00
static size_t sys_iconv ( void * cd ,
2003-08-13 05:53:07 +04:00
const char * * inbuf , size_t * inbytesleft ,
char * * outbuf , size_t * outbytesleft )
{
2014-12-07 16:13:51 +03:00
size_t ret = iconv ( ( iconv_t ) cd ,
discard_const_p ( char * , inbuf ) , inbytesleft ,
2003-08-13 05:53:07 +04:00
outbuf , outbytesleft ) ;
if ( ret = = ( size_t ) - 1 ) iconv ( cd , NULL , NULL , NULL , NULL ) ;
return ret ;
}
# endif
2019-04-09 12:21:57 +03:00
# ifdef HAVE_ICU_I18N
static size_t sys_uconv ( void * cd ,
const char * * inbuf ,
size_t * inbytesleft ,
char * * outbuf ,
size_t * outbytesleft )
{
UTransliterator * t = ( UTransliterator * ) cd ;
size_t bufsize = * inbytesleft * 2 ;
UChar ustr [ bufsize ] ;
UChar * up = NULL ;
char * p = NULL ;
int32_t ustrlen ;
int32_t limit ;
int32_t converted_len ;
size_t inbuf_consumed ;
size_t outbut_consumed ;
UErrorCode ue ;
/* Convert from UTF8 to UCS2 */
ue = 0 ;
up = u_strFromUTF8 ( ustr , /* dst */
bufsize , /* dst buflen */
& converted_len , /* dst written */
* inbuf , /* src */
* inbytesleft , /* src length */
& ue ) ;
if ( up = = NULL | | U_FAILURE ( ue ) ) {
return - 1 ;
}
if ( converted_len > bufsize ) {
/*
* u_strFromUTF8 ( ) returns the required size in
* converted_len . In theory this should never overflow as the
* ustr [ ] array is allocated with a size twice as big as
* inbytesleft and converted_len should be equal to inbytesleft ,
* but you never know . . .
*/
errno = EOVERFLOW ;
return - 1 ;
}
inbuf_consumed = converted_len ;
/*
* The following transliteration function takes two parameters , the
* lenght of the text to be converted ( converted_len ) and a limit which
* may be smaller then converted_len . We just set limit to converted_len
* and also ignore the value returned in limit .
*/
limit = converted_len ;
/* Inplace transliteration */
utrans_transUChars ( t ,
ustr , /* text */
& converted_len , /* text length */
bufsize , /* text buflen */
0 , /* start */
& limit , /* limit */
& ue ) ;
if ( U_FAILURE ( ue ) ) {
return - 1 ;
}
if ( converted_len > bufsize ) {
/*
* In theory this should never happen as the ustr [ ] array is
* allocated with a size twice as big as inbytesleft and
* converted_len should be equal to inbytesleft , but you never
* know . . .
*/
errno = EOVERFLOW ;
return - 1 ;
}
ustrlen = converted_len ;
/* Convert from UCS2 back to UTF8 */
ue = 0 ;
p = u_strToUTF8 ( * outbuf , /* dst */
* outbytesleft , /* dst buflen */
& converted_len , /* dst required length */
ustr , /* src */
ustrlen , /* src length */
& ue ) ;
if ( p = = NULL | | U_FAILURE ( ue ) ) {
return - 1 ;
}
outbut_consumed = converted_len ;
if ( converted_len > * outbytesleft ) {
/*
* The caller ' s result buffer is too small . . .
*/
outbut_consumed = * outbytesleft ;
}
* inbuf + = inbuf_consumed ;
* inbytesleft - = inbuf_consumed ;
* outbuf + = outbut_consumed ;
* outbytesleft - = outbut_consumed ;
return converted_len ;
}
# endif
2003-08-13 05:53:07 +04:00
/**
* This is a simple portable iconv ( ) implementaion .
*
* It only knows about a very small number of character sets - just
* enough that Samba works on systems that don ' t have iconv .
* */
2014-12-07 16:13:51 +03:00
_PUBLIC_ size_t smb_iconv ( smb_iconv_t cd ,
2003-08-13 05:53:07 +04:00
const char * * inbuf , size_t * inbytesleft ,
char * * outbuf , size_t * outbytesleft )
{
/* in many cases we can go direct */
if ( cd - > direct ) {
2014-12-07 16:13:51 +03:00
return cd - > direct ( cd - > cd_direct ,
2003-08-13 05:53:07 +04:00
inbuf , inbytesleft , outbuf , outbytesleft ) ;
}
/* otherwise we have to do it chunks at a time */
2011-03-31 05:13:05 +04:00
{
# ifndef SMB_ICONV_BUFSIZE
# define SMB_ICONV_BUFSIZE 2048
# endif
size_t bufsize ;
2013-12-06 14:31:07 +04:00
char cvtbuf [ SMB_ICONV_BUFSIZE ] ;
2011-03-31 05:13:05 +04:00
while ( * inbytesleft > 0 ) {
char * bufp1 = cvtbuf ;
const char * bufp2 = cvtbuf ;
2011-04-11 15:34:21 +04:00
int saved_errno = errno ;
bool pull_failed = false ;
2011-03-31 05:13:05 +04:00
bufsize = SMB_ICONV_BUFSIZE ;
2003-08-13 05:53:07 +04:00
2011-03-31 05:13:05 +04:00
if ( cd - > pull ( cd - > cd_pull ,
inbuf , inbytesleft , & bufp1 , & bufsize ) = = - 1
2011-03-31 06:27:29 +04:00
& & errno ! = E2BIG ) {
2011-04-11 15:34:21 +04:00
saved_errno = errno ;
pull_failed = true ;
2011-03-31 06:27:29 +04:00
}
2003-08-13 05:53:07 +04:00
2011-03-31 05:13:05 +04:00
bufsize = SMB_ICONV_BUFSIZE - bufsize ;
if ( cd - > push ( cd - > cd_push ,
& bufp2 , & bufsize ,
2011-03-31 06:27:29 +04:00
outbuf , outbytesleft ) = = - 1 ) {
return - 1 ;
2011-04-11 15:34:21 +04:00
} else if ( pull_failed ) {
/* We want the pull errno if possible */
errno = saved_errno ;
return - 1 ;
2011-03-31 06:27:29 +04:00
}
2011-03-31 05:13:05 +04:00
}
2003-08-13 05:53:07 +04:00
}
return 0 ;
}
2007-08-27 22:10:19 +04:00
static bool is_utf16 ( const char * name )
2004-09-01 09:19:00 +04:00
{
return strcasecmp ( name , " UCS-2LE " ) = = 0 | |
strcasecmp ( name , " UTF-16LE " ) = = 0 ;
}
2011-02-18 02:24:58 +03:00
static int smb_iconv_t_destructor ( smb_iconv_t hwd )
2010-10-20 21:55:28 +04:00
{
2019-04-09 12:21:57 +03:00
# ifdef HAVE_ICU_I18N
/*
* This has to come first , as the cd_direct member won ' t be an iconv
* handle and must not be passed to iconv_close ( ) .
*/
if ( hwd - > direct = = sys_uconv ) {
utrans_close ( hwd - > cd_direct ) ;
return 0 ;
}
# endif
2009-09-07 11:40:34 +04:00
# ifdef HAVE_NATIVE_ICONV
if ( hwd - > cd_pull ! = NULL & & hwd - > cd_pull ! = ( iconv_t ) - 1 )
iconv_close ( hwd - > cd_pull ) ;
if ( hwd - > cd_push ! = NULL & & hwd - > cd_push ! = ( iconv_t ) - 1 )
iconv_close ( hwd - > cd_push ) ;
if ( hwd - > cd_direct ! = NULL & & hwd - > cd_direct ! = ( iconv_t ) - 1 )
iconv_close ( hwd - > cd_direct ) ;
# endif
2007-12-17 10:32:00 +03:00
2009-09-07 11:40:34 +04:00
return 0 ;
}
2007-12-17 10:32:00 +03:00
2008-04-02 06:53:27 +04:00
_PUBLIC_ smb_iconv_t smb_iconv_open_ex ( TALLOC_CTX * mem_ctx , const char * tocode ,
2011-09-20 06:13:21 +04:00
const char * fromcode , bool use_builtin_handlers )
2003-08-13 05:53:07 +04:00
{
smb_iconv_t ret ;
2004-12-19 03:13:24 +03:00
const struct charset_functions * from = NULL , * to = NULL ;
int i ;
2003-08-13 05:53:07 +04:00
2007-12-17 10:32:00 +03:00
ret = ( smb_iconv_t ) talloc_named ( mem_ctx ,
2014-12-07 16:13:51 +03:00
sizeof ( * ret ) ,
2004-09-01 13:45:00 +04:00
" iconv(%s,%s) " , tocode , fromcode ) ;
2003-08-13 05:53:07 +04:00
if ( ! ret ) {
errno = ENOMEM ;
return ( smb_iconv_t ) - 1 ;
}
memset ( ret , 0 , sizeof ( * ret ) ) ;
2009-09-07 11:40:34 +04:00
talloc_set_destructor ( ret , smb_iconv_t_destructor ) ;
2003-08-13 05:53:07 +04:00
/* check for the simplest null conversion */
if ( strcmp ( fromcode , tocode ) = = 0 ) {
ret - > direct = iconv_copy ;
return ret ;
}
2011-09-09 17:51:06 +04:00
/* check if we have a builtin function for this conversion */
2004-12-19 03:13:24 +03:00
for ( i = 0 ; i < ARRAY_SIZE ( builtin_functions ) ; i + + ) {
if ( strcasecmp ( fromcode , builtin_functions [ i ] . name ) = = 0 ) {
2011-09-20 06:13:21 +04:00
if ( use_builtin_handlers | | builtin_functions [ i ] . samba_internal_charset ) {
from = & builtin_functions [ i ] ;
}
2004-12-19 03:13:24 +03:00
}
2014-12-07 16:13:51 +03:00
if ( strcasecmp ( tocode , builtin_functions [ i ] . name ) = = 0 ) {
2011-09-20 06:13:21 +04:00
if ( use_builtin_handlers | | builtin_functions [ i ] . samba_internal_charset ) {
to = & builtin_functions [ i ] ;
}
2004-12-19 03:13:24 +03:00
}
}
2003-08-13 05:53:07 +04:00
# ifdef HAVE_NATIVE_ICONV
2014-12-07 16:09:29 +03:00
/* the from and to variables indicate a samba module or
2011-09-09 17:51:06 +04:00
* internal conversion , ret - > pull and ret - > push are
* initialised only in this block for iconv based
* conversions */
2011-09-20 06:13:21 +04:00
if ( from = = NULL ) {
2004-09-01 09:19:00 +04:00
ret - > cd_pull = iconv_open ( " UTF-16LE " , fromcode ) ;
if ( ret - > cd_pull = = ( iconv_t ) - 1 )
ret - > cd_pull = iconv_open ( " UCS-2LE " , fromcode ) ;
2011-09-09 17:51:06 +04:00
if ( ret - > cd_pull ! = ( iconv_t ) - 1 ) {
ret - > pull = sys_iconv ;
}
2003-08-13 05:53:07 +04:00
}
2014-12-07 16:13:51 +03:00
2011-09-20 06:13:21 +04:00
if ( to = = NULL ) {
2004-09-01 09:19:00 +04:00
ret - > cd_push = iconv_open ( tocode , " UTF-16LE " ) ;
if ( ret - > cd_push = = ( iconv_t ) - 1 )
ret - > cd_push = iconv_open ( tocode , " UCS-2LE " ) ;
2011-09-09 17:51:06 +04:00
if ( ret - > cd_push ! = ( iconv_t ) - 1 ) {
ret - > push = sys_iconv ;
}
2003-08-13 05:53:07 +04:00
}
2011-09-09 17:51:06 +04:00
# endif
2019-04-09 12:21:57 +03:00
# ifdef HAVE_ICU_I18N
if ( strcasecmp ( fromcode , " UTF8-NFD " ) = = 0 & &
strcasecmp ( tocode , " UTF8-NFC " ) = = 0 )
{
U_STRING_DECL ( t , " any-nfc " , 7 ) ;
UErrorCode ue = 0 ;
U_STRING_INIT ( t , " any-nfc " , 7 ) ;
ret - > cd_direct = utrans_openU ( t ,
strlen ( " any-nfc " ) ,
UTRANS_FORWARD ,
NULL ,
0 ,
NULL ,
& ue ) ;
if ( U_FAILURE ( ue ) ) {
return ( smb_iconv_t ) - 1 ;
}
ret - > direct = sys_uconv ;
return ret ;
}
if ( strcasecmp ( fromcode , " UTF8-NFC " ) = = 0 & &
strcasecmp ( tocode , " UTF8-NFD " ) = = 0 )
{
U_STRING_DECL ( tname , " any-nfd " , 7 ) ;
UErrorCode ue = 0 ;
U_STRING_INIT ( tname , " any-nfd " , 7 ) ;
ret - > cd_direct = utrans_openU ( tname ,
7 ,
UTRANS_FORWARD ,
NULL ,
0 ,
NULL ,
& ue ) ;
if ( U_FAILURE ( ue ) ) {
return ( smb_iconv_t ) - 1 ;
}
ret - > direct = sys_uconv ;
return ret ;
}
# endif
2011-09-09 17:51:06 +04:00
if ( ret - > pull = = NULL & & from = = NULL ) {
goto failed ;
}
2014-12-07 16:13:51 +03:00
2011-09-09 17:51:06 +04:00
if ( ret - > push = = NULL & & to = = NULL ) {
goto failed ;
}
2003-08-13 05:53:07 +04:00
/* check for conversion to/from ucs2 */
2004-09-01 09:19:00 +04:00
if ( is_utf16 ( fromcode ) & & to ) {
2003-08-13 05:53:07 +04:00
ret - > direct = to - > push ;
return ret ;
}
2004-09-01 09:19:00 +04:00
if ( is_utf16 ( tocode ) & & from ) {
2003-08-13 05:53:07 +04:00
ret - > direct = from - > pull ;
return ret ;
}
# ifdef HAVE_NATIVE_ICONV
2004-09-01 09:19:00 +04:00
if ( is_utf16 ( fromcode ) ) {
2003-08-13 05:53:07 +04:00
ret - > direct = sys_iconv ;
ret - > cd_direct = ret - > cd_push ;
ret - > cd_push = NULL ;
return ret ;
}
2004-09-01 09:19:00 +04:00
if ( is_utf16 ( tocode ) ) {
2003-08-13 05:53:07 +04:00
ret - > direct = sys_iconv ;
ret - > cd_direct = ret - > cd_pull ;
ret - > cd_pull = NULL ;
return ret ;
}
# endif
/* the general case has to go via a buffer */
if ( ! ret - > pull ) ret - > pull = from - > pull ;
if ( ! ret - > push ) ret - > push = to - > push ;
return ret ;
failed :
2004-09-01 13:45:00 +04:00
talloc_free ( ret ) ;
2003-08-13 05:53:07 +04:00
errno = EINVAL ;
return ( smb_iconv_t ) - 1 ;
}
2007-12-17 10:32:00 +03:00
/*
simple iconv_open ( ) wrapper
*/
2008-04-02 06:53:27 +04:00
_PUBLIC_ smb_iconv_t smb_iconv_open ( const char * tocode , const char * fromcode )
2007-12-17 10:32:00 +03:00
{
2011-02-18 02:24:58 +03:00
return smb_iconv_open_ex ( NULL , tocode , fromcode , true ) ;
2007-12-17 10:32:00 +03:00
}
2003-08-13 05:53:07 +04:00
/*
simple iconv_close ( ) wrapper
*/
2008-04-02 06:53:27 +04:00
_PUBLIC_ int smb_iconv_close ( smb_iconv_t cd )
2003-08-13 05:53:07 +04:00
{
2004-09-01 13:45:00 +04:00
talloc_free ( cd ) ;
2003-08-13 05:53:07 +04:00
return 0 ;
}
/**********************************************************************
the following functions implement the builtin character sets in Samba
and also the " test " character sets that are designed to test
multi - byte character set support for english users
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2011-04-12 04:35:43 +04:00
/*
this takes an ASCII sequence and produces a UTF16 sequence
The first 127 codepoints of latin1 matches the first 127 codepoints
of unicode , and so can be put into the first byte of UTF16LE
*/
2003-08-13 05:53:07 +04:00
static size_t ascii_pull ( void * cd , const char * * inbuf , size_t * inbytesleft ,
char * * outbuf , size_t * outbytesleft )
{
while ( * inbytesleft > = 1 & & * outbytesleft > = 2 ) {
2011-04-12 04:35:43 +04:00
if ( ( ( * inbuf ) [ 0 ] & 0x7F ) ! = ( * inbuf ) [ 0 ] ) {
/* If this is multi-byte, then it isn't legal ASCII */
errno = EILSEQ ;
return - 1 ;
}
2003-08-13 05:53:07 +04:00
( * outbuf ) [ 0 ] = ( * inbuf ) [ 0 ] ;
( * outbuf ) [ 1 ] = 0 ;
( * inbytesleft ) - = 1 ;
( * outbytesleft ) - = 2 ;
( * inbuf ) + = 1 ;
( * outbuf ) + = 2 ;
}
if ( * inbytesleft > 0 ) {
errno = E2BIG ;
return - 1 ;
}
2014-12-07 16:13:51 +03:00
2003-08-13 05:53:07 +04:00
return 0 ;
}
2011-04-12 04:35:43 +04:00
/*
this takes a UTF16 sequence and produces an ASCII sequence
The first 127 codepoints of ASCII matches the first 127 codepoints
of unicode , and so can be read directly from the first byte of UTF16LE
*/
2003-08-13 05:53:07 +04:00
static size_t ascii_push ( void * cd , const char * * inbuf , size_t * inbytesleft ,
char * * outbuf , size_t * outbytesleft )
{
int ir_count = 0 ;
while ( * inbytesleft > = 2 & & * outbytesleft > = 1 ) {
2011-04-12 04:35:43 +04:00
if ( ( ( * inbuf ) [ 0 ] & 0x7F ) ! = ( * inbuf ) [ 0 ] | |
( * inbuf ) [ 1 ] ! = 0 ) {
/* If this is multi-byte, then it isn't legal ASCII */
errno = EILSEQ ;
return - 1 ;
}
( * outbuf ) [ 0 ] = ( * inbuf ) [ 0 ] ;
2003-08-13 05:53:07 +04:00
( * inbytesleft ) - = 2 ;
( * outbytesleft ) - = 1 ;
( * inbuf ) + = 2 ;
( * outbuf ) + = 1 ;
}
if ( * inbytesleft = = 1 ) {
errno = EINVAL ;
return - 1 ;
}
if ( * inbytesleft > 1 ) {
errno = E2BIG ;
return - 1 ;
}
2014-12-07 16:13:51 +03:00
2003-08-13 05:53:07 +04:00
return ir_count ;
}
2011-04-12 04:35:43 +04:00
/*
this takes a latin1 / ISO - 8859 - 1 sequence and produces a UTF16 sequence
The first 256 codepoints of latin1 matches the first 256 codepoints
of unicode , and so can be put into the first byte of UTF16LE
*/
static size_t latin1_pull ( void * cd , const char * * inbuf , size_t * inbytesleft ,
char * * outbuf , size_t * outbytesleft )
{
while ( * inbytesleft > = 1 & & * outbytesleft > = 2 ) {
( * outbuf ) [ 0 ] = ( * inbuf ) [ 0 ] ;
( * outbuf ) [ 1 ] = 0 ;
( * inbytesleft ) - = 1 ;
( * outbytesleft ) - = 2 ;
( * inbuf ) + = 1 ;
( * outbuf ) + = 2 ;
}
if ( * inbytesleft > 0 ) {
errno = E2BIG ;
return - 1 ;
}
return 0 ;
}
/*
this takes a UTF16 sequence and produces a latin1 / ISO - 8859 - 1 sequence
The first 256 codepoints of latin1 matches the first 256 codepoints
of unicode , and so can be read directly from the first byte of UTF16LE
*/
2011-02-18 02:24:58 +03:00
static size_t latin1_push ( void * cd , const char * * inbuf , size_t * inbytesleft ,
char * * outbuf , size_t * outbytesleft )
{
int ir_count = 0 ;
while ( * inbytesleft > = 2 & & * outbytesleft > = 1 ) {
( * outbuf ) [ 0 ] = ( * inbuf ) [ 0 ] ;
2011-04-12 04:35:43 +04:00
if ( ( * inbuf ) [ 1 ] ! = 0 ) {
/* If this is multi-byte, then it isn't legal latin1 */
errno = EILSEQ ;
return - 1 ;
}
2011-02-18 02:24:58 +03:00
( * inbytesleft ) - = 2 ;
( * outbytesleft ) - = 1 ;
( * inbuf ) + = 2 ;
( * outbuf ) + = 1 ;
}
if ( * inbytesleft = = 1 ) {
errno = EINVAL ;
return - 1 ;
}
if ( * inbytesleft > 1 ) {
errno = E2BIG ;
return - 1 ;
}
return ir_count ;
}
2003-08-13 05:53:07 +04:00
static size_t ucs2hex_pull ( void * cd , const char * * inbuf , size_t * inbytesleft ,
char * * outbuf , size_t * outbytesleft )
{
while ( * inbytesleft > = 1 & & * outbytesleft > = 2 ) {
2021-01-04 16:03:28 +03:00
uint8_t hi = 0 , lo = 0 ;
bool ok ;
2003-08-13 05:53:07 +04:00
if ( ( * inbuf ) [ 0 ] ! = ' @ ' ) {
/* seven bit ascii case */
( * outbuf ) [ 0 ] = ( * inbuf ) [ 0 ] ;
( * outbuf ) [ 1 ] = 0 ;
( * inbytesleft ) - = 1 ;
( * outbytesleft ) - = 2 ;
( * inbuf ) + = 1 ;
( * outbuf ) + = 2 ;
continue ;
}
/* it's a hex character */
if ( * inbytesleft < 5 ) {
errno = EINVAL ;
return - 1 ;
}
2014-12-07 16:13:51 +03:00
2021-01-04 16:03:28 +03:00
ok = hex_byte ( & ( * inbuf ) [ 1 ] , & hi ) & & hex_byte ( & ( * inbuf ) [ 3 ] , & lo ) ;
if ( ! ok ) {
2003-08-13 05:53:07 +04:00
errno = EILSEQ ;
return - 1 ;
}
2021-01-04 16:03:28 +03:00
( * outbuf ) [ 0 ] = lo ;
( * outbuf ) [ 1 ] = hi ;
2003-08-13 05:53:07 +04:00
( * inbytesleft ) - = 5 ;
( * outbytesleft ) - = 2 ;
( * inbuf ) + = 5 ;
( * outbuf ) + = 2 ;
}
if ( * inbytesleft > 0 ) {
errno = E2BIG ;
return - 1 ;
}
2014-12-07 16:13:51 +03:00
2003-08-13 05:53:07 +04:00
return 0 ;
}
static size_t ucs2hex_push ( void * cd , const char * * inbuf , size_t * inbytesleft ,
char * * outbuf , size_t * outbytesleft )
{
while ( * inbytesleft > = 2 & & * outbytesleft > = 1 ) {
char buf [ 6 ] ;
2014-12-07 16:13:51 +03:00
if ( ( * inbuf ) [ 1 ] = = 0 & &
2003-08-13 05:53:07 +04:00
( ( * inbuf ) [ 0 ] & 0x80 ) = = 0 & &
( * inbuf ) [ 0 ] ! = ' @ ' ) {
( * outbuf ) [ 0 ] = ( * inbuf ) [ 0 ] ;
( * inbytesleft ) - = 2 ;
( * outbytesleft ) - = 1 ;
( * inbuf ) + = 2 ;
( * outbuf ) + = 1 ;
continue ;
}
if ( * outbytesleft < 5 ) {
errno = E2BIG ;
return - 1 ;
}
snprintf ( buf , 6 , " @%04x " , SVAL ( * inbuf , 0 ) ) ;
memcpy ( * outbuf , buf , 5 ) ;
( * inbytesleft ) - = 2 ;
( * outbytesleft ) - = 5 ;
( * inbuf ) + = 2 ;
( * outbuf ) + = 5 ;
}
if ( * inbytesleft = = 1 ) {
errno = EINVAL ;
return - 1 ;
}
if ( * inbytesleft > 1 ) {
errno = E2BIG ;
return - 1 ;
}
2014-12-07 16:13:51 +03:00
2003-08-13 05:53:07 +04:00
return 0 ;
}
2003-12-16 12:20:34 +03:00
static size_t iconv_swab ( void * cd , const char * * inbuf , size_t * inbytesleft ,
char * * outbuf , size_t * outbytesleft )
{
int n ;
n = MIN ( * inbytesleft , * outbytesleft ) ;
swab ( * inbuf , * outbuf , ( n & ~ 1 ) ) ;
if ( n & 1 ) {
( * outbuf ) [ n - 1 ] = 0 ;
}
( * inbytesleft ) - = n ;
( * outbytesleft ) - = n ;
( * inbuf ) + = n ;
( * outbuf ) + = n ;
if ( * inbytesleft > 0 ) {
errno = E2BIG ;
return - 1 ;
}
return 0 ;
}
2003-08-13 05:53:07 +04:00
static size_t iconv_copy ( void * cd , const char * * inbuf , size_t * inbytesleft ,
char * * outbuf , size_t * outbytesleft )
{
int n ;
n = MIN ( * inbytesleft , * outbytesleft ) ;
memmove ( * outbuf , * inbuf , n ) ;
( * inbytesleft ) - = n ;
( * outbytesleft ) - = n ;
( * inbuf ) + = n ;
( * outbuf ) + = n ;
if ( * inbytesleft > 0 ) {
errno = E2BIG ;
return - 1 ;
}
return 0 ;
}
2008-08-19 11:49:34 +04:00
/*
this takes a UTF8 sequence and produces a UTF16 sequence
*/
2003-08-13 05:53:07 +04:00
static size_t utf8_pull ( void * cd , const char * * inbuf , size_t * inbytesleft ,
char * * outbuf , size_t * outbytesleft )
{
2004-09-01 08:39:06 +04:00
size_t in_left = * inbytesleft , out_left = * outbytesleft ;
const uint8_t * c = ( const uint8_t * ) * inbuf ;
uint8_t * uc = ( uint8_t * ) * outbuf ;
2003-08-13 05:53:07 +04:00
2004-09-01 08:39:06 +04:00
while ( in_left > = 1 & & out_left > = 2 ) {
2003-08-13 05:53:07 +04:00
if ( ( c [ 0 ] & 0x80 ) = = 0 ) {
uc [ 0 ] = c [ 0 ] ;
uc [ 1 ] = 0 ;
2004-09-01 08:39:06 +04:00
c + = 1 ;
in_left - = 1 ;
out_left - = 2 ;
uc + = 2 ;
continue ;
}
if ( ( c [ 0 ] & 0xe0 ) = = 0xc0 ) {
if ( in_left < 2 | |
( c [ 1 ] & 0xc0 ) ! = 0x80 ) {
errno = EILSEQ ;
goto error ;
}
uc [ 1 ] = ( c [ 0 ] > > 2 ) & 0x7 ;
uc [ 0 ] = ( c [ 0 ] < < 6 ) | ( c [ 1 ] & 0x3f ) ;
2021-04-08 12:18:46 +03:00
if ( uc [ 1 ] = = 0 & & uc [ 0 ] < 0x80 ) {
/* this should have been a single byte */
errno = EILSEQ ;
goto error ;
}
2004-09-01 08:39:06 +04:00
c + = 2 ;
in_left - = 2 ;
out_left - = 2 ;
uc + = 2 ;
continue ;
}
if ( ( c [ 0 ] & 0xf0 ) = = 0xe0 ) {
2021-04-08 12:18:46 +03:00
unsigned int codepoint ;
2004-09-01 08:39:06 +04:00
if ( in_left < 3 | |
2014-12-07 16:13:51 +03:00
( c [ 1 ] & 0xc0 ) ! = 0x80 | |
2004-09-01 08:39:06 +04:00
( c [ 2 ] & 0xc0 ) ! = 0x80 ) {
errno = EILSEQ ;
goto error ;
2003-08-13 05:53:07 +04:00
}
2021-04-08 12:18:46 +03:00
codepoint = ( ( c [ 2 ] & 0x3f ) |
( ( c [ 1 ] & 0x3f ) < < 6 ) |
( ( c [ 0 ] & 0x0f ) < < 12 ) ) ;
if ( codepoint < 0x800 ) {
/* this should be a 1 or 2 byte sequence */
errno = EILSEQ ;
goto error ;
}
uc [ 0 ] = codepoint & 0xff ;
uc [ 1 ] = codepoint > > 8 ;
2004-09-01 08:39:06 +04:00
c + = 3 ;
in_left - = 3 ;
out_left - = 2 ;
uc + = 2 ;
continue ;
}
if ( ( c [ 0 ] & 0xf8 ) = = 0xf0 ) {
unsigned int codepoint ;
if ( in_left < 4 | |
2014-12-07 16:13:51 +03:00
( c [ 1 ] & 0xc0 ) ! = 0x80 | |
2004-09-01 08:39:06 +04:00
( c [ 2 ] & 0xc0 ) ! = 0x80 | |
( c [ 3 ] & 0xc0 ) ! = 0x80 ) {
errno = EILSEQ ;
goto error ;
2003-08-13 05:53:07 +04:00
}
2014-12-07 16:13:51 +03:00
codepoint =
( c [ 3 ] & 0x3f ) |
( ( c [ 2 ] & 0x3f ) < < 6 ) |
2004-09-01 08:39:06 +04:00
( ( c [ 1 ] & 0x3f ) < < 12 ) |
( ( c [ 0 ] & 0x7 ) < < 18 ) ;
if ( codepoint < 0x10000 ) {
2021-04-08 12:18:46 +03:00
/* reject UTF-8 characters that are not
minimally packed */
errno = EILSEQ ;
goto error ;
2004-09-01 08:39:06 +04:00
}
codepoint - = 0x10000 ;
if ( out_left < 4 ) {
errno = E2BIG ;
goto error ;
}
uc [ 0 ] = ( codepoint > > 10 ) & 0xFF ;
uc [ 1 ] = ( codepoint > > 18 ) | 0xd8 ;
uc [ 2 ] = codepoint & 0xFF ;
uc [ 3 ] = ( ( codepoint > > 8 ) & 0x3 ) | 0xdc ;
c + = 4 ;
in_left - = 4 ;
out_left - = 4 ;
uc + = 4 ;
continue ;
2003-08-13 05:53:07 +04:00
}
2004-09-01 08:39:06 +04:00
/* we don't handle 5 byte sequences */
errno = EINVAL ;
goto error ;
2003-08-13 05:53:07 +04:00
}
2004-09-01 08:39:06 +04:00
if ( in_left > 0 ) {
2003-08-13 05:53:07 +04:00
errno = E2BIG ;
2004-09-01 08:39:06 +04:00
goto error ;
2003-08-13 05:53:07 +04:00
}
2004-09-01 08:39:06 +04:00
* inbytesleft = in_left ;
* outbytesleft = out_left ;
2006-08-22 23:05:27 +04:00
* inbuf = ( const char * ) c ;
* outbuf = ( char * ) uc ;
2003-08-13 05:53:07 +04:00
return 0 ;
2004-09-01 08:39:06 +04:00
error :
* inbytesleft = in_left ;
* outbytesleft = out_left ;
2006-08-22 23:05:27 +04:00
* inbuf = ( const char * ) c ;
* outbuf = ( char * ) uc ;
2003-08-13 05:53:07 +04:00
return - 1 ;
}
2008-08-19 11:49:34 +04:00
/*
this takes a UTF16 sequence and produces a UTF8 sequence
*/
2003-08-13 05:53:07 +04:00
static size_t utf8_push ( void * cd , const char * * inbuf , size_t * inbytesleft ,
2004-09-01 08:39:06 +04:00
char * * outbuf , size_t * outbytesleft )
2003-08-13 05:53:07 +04:00
{
2004-09-01 08:39:06 +04:00
size_t in_left = * inbytesleft , out_left = * outbytesleft ;
uint8_t * c = ( uint8_t * ) * outbuf ;
const uint8_t * uc = ( const uint8_t * ) * inbuf ;
while ( in_left > = 2 & & out_left > = 1 ) {
unsigned int codepoint ;
if ( uc [ 1 ] = = 0 & & ! ( uc [ 0 ] & 0x80 ) ) {
/* simplest case */
c [ 0 ] = uc [ 0 ] ;
in_left - = 2 ;
out_left - = 1 ;
uc + = 2 ;
c + = 1 ;
continue ;
}
if ( ( uc [ 1 ] & 0xf8 ) = = 0 ) {
/* next simplest case */
if ( out_left < 2 ) {
errno = E2BIG ;
goto error ;
2003-08-13 05:53:07 +04:00
}
2004-09-01 08:39:06 +04:00
c [ 0 ] = 0xc0 | ( uc [ 0 ] > > 6 ) | ( uc [ 1 ] < < 2 ) ;
c [ 1 ] = 0x80 | ( uc [ 0 ] & 0x3f ) ;
in_left - = 2 ;
out_left - = 2 ;
uc + = 2 ;
c + = 2 ;
continue ;
}
if ( ( uc [ 1 ] & 0xfc ) = = 0xdc ) {
2017-01-30 19:17:38 +03:00
errno = EILSEQ ;
# ifndef HAVE_ICONV_ERRNO_ILLEGAL_MULTIBYTE
2004-09-01 08:39:06 +04:00
if ( in_left < 4 ) {
errno = EINVAL ;
2003-08-13 05:53:07 +04:00
}
2017-01-30 19:17:38 +03:00
# endif
2004-09-01 08:39:06 +04:00
goto error ;
2003-08-13 05:53:07 +04:00
}
2004-09-01 08:39:06 +04:00
if ( ( uc [ 1 ] & 0xfc ) ! = 0xd8 ) {
codepoint = uc [ 0 ] | ( uc [ 1 ] < < 8 ) ;
if ( out_left < 3 ) {
errno = E2BIG ;
goto error ;
}
c [ 0 ] = 0xe0 | ( codepoint > > 12 ) ;
c [ 1 ] = 0x80 | ( ( codepoint > > 6 ) & 0x3f ) ;
c [ 2 ] = 0x80 | ( codepoint & 0x3f ) ;
2014-12-07 16:13:51 +03:00
2004-09-01 08:39:06 +04:00
in_left - = 2 ;
out_left - = 3 ;
uc + = 2 ;
c + = 3 ;
continue ;
}
2003-08-13 05:53:07 +04:00
2004-09-01 08:39:06 +04:00
/* its the first part of a 4 byte sequence */
if ( in_left < 4 ) {
errno = EINVAL ;
goto error ;
}
if ( ( uc [ 3 ] & 0xfc ) ! = 0xdc ) {
errno = EILSEQ ;
goto error ;
}
2014-12-07 16:13:51 +03:00
codepoint = 0x10000 + ( uc [ 2 ] | ( ( uc [ 3 ] & 0x3 ) < < 8 ) |
2004-09-01 08:39:06 +04:00
( uc [ 0 ] < < 10 ) | ( ( uc [ 1 ] & 0x3 ) < < 18 ) ) ;
2014-12-07 16:13:51 +03:00
2004-09-01 08:39:06 +04:00
if ( out_left < 4 ) {
errno = E2BIG ;
goto error ;
}
c [ 0 ] = 0xf0 | ( codepoint > > 18 ) ;
c [ 1 ] = 0x80 | ( ( codepoint > > 12 ) & 0x3f ) ;
c [ 2 ] = 0x80 | ( ( codepoint > > 6 ) & 0x3f ) ;
c [ 3 ] = 0x80 | ( codepoint & 0x3f ) ;
2014-12-07 16:13:51 +03:00
2004-09-01 08:39:06 +04:00
in_left - = 4 ;
out_left - = 4 ;
uc + = 4 ;
c + = 4 ;
2003-08-13 05:53:07 +04:00
}
2004-09-01 08:39:06 +04:00
if ( in_left = = 1 ) {
2003-08-13 05:53:07 +04:00
errno = EINVAL ;
2004-09-01 08:39:06 +04:00
goto error ;
2003-08-13 05:53:07 +04:00
}
2004-09-01 08:39:06 +04:00
if ( in_left > 1 ) {
2003-08-13 05:53:07 +04:00
errno = E2BIG ;
2004-09-01 08:39:06 +04:00
goto error ;
2003-08-13 05:53:07 +04:00
}
2004-09-01 08:39:06 +04:00
* inbytesleft = in_left ;
* outbytesleft = out_left ;
2006-08-22 23:05:27 +04:00
* inbuf = ( const char * ) uc ;
* outbuf = ( char * ) c ;
2014-12-07 16:13:51 +03:00
2003-08-13 05:53:07 +04:00
return 0 ;
2004-09-01 08:39:06 +04:00
error :
* inbytesleft = in_left ;
* outbytesleft = out_left ;
2006-08-22 23:05:27 +04:00
* inbuf = ( const char * ) uc ;
* outbuf = ( char * ) c ;
2003-08-13 05:53:07 +04:00
return - 1 ;
}
2004-09-01 13:45:00 +04:00
2008-10-31 05:51:37 +03:00
/*
2008-10-31 07:41:34 +03:00
this takes a UTF16 munged sequence , modifies it according to the
string2key rules , and produces a UTF16 sequence
2008-10-31 05:51:37 +03:00
The rules are :
2008-10-31 07:41:34 +03:00
1 ) any 0x0000 characters are mapped to 0x0001
2 ) convert any instance of 0xD800 - 0xDBFF ( high surrogate )
2008-10-31 05:51:37 +03:00
without an immediately following 0xDC00 - 0x0 xDFFF ( low surrogate ) to
U + FFFD ( OBJECT REPLACEMENT CHARACTER ) .
2008-10-31 07:41:34 +03:00
3 ) the same for any low surrogate that was not preceded by a high surrogate .
2008-10-31 05:51:37 +03:00
*/
2008-10-31 07:41:34 +03:00
static size_t utf16_munged_pull ( void * cd , const char * * inbuf , size_t * inbytesleft ,
2008-10-31 05:51:37 +03:00
char * * outbuf , size_t * outbytesleft )
{
size_t in_left = * inbytesleft , out_left = * outbytesleft ;
uint8_t * c = ( uint8_t * ) * outbuf ;
const uint8_t * uc = ( const uint8_t * ) * inbuf ;
2008-10-31 07:41:34 +03:00
while ( in_left > = 2 & & out_left > = 2 ) {
unsigned int codepoint = uc [ 0 ] | ( uc [ 1 ] < < 8 ) ;
2008-10-31 05:51:37 +03:00
2008-10-31 07:41:34 +03:00
if ( codepoint = = 0 ) {
codepoint = 1 ;
2008-10-31 05:51:37 +03:00
}
2008-10-31 07:41:34 +03:00
if ( ( codepoint & 0xfc00 ) = = 0xd800 ) {
/* a high surrogate */
unsigned int codepoint2 ;
if ( in_left < 4 ) {
codepoint = 0xfffd ;
2014-12-07 16:13:51 +03:00
goto codepoint16 ;
2008-10-31 07:41:34 +03:00
}
codepoint2 = uc [ 2 ] | ( uc [ 3 ] < < 8 ) ;
if ( ( codepoint2 & 0xfc00 ) ! = 0xdc00 ) {
/* high surrogate not followed by low
surrogate : convert to 0xfffd */
codepoint = 0xfffd ;
goto codepoint16 ;
}
if ( out_left < 4 ) {
2008-10-31 05:51:37 +03:00
errno = E2BIG ;
goto error ;
}
2008-10-31 07:41:34 +03:00
memcpy ( c , uc , 4 ) ;
in_left - = 4 ;
out_left - = 4 ;
uc + = 4 ;
c + = 4 ;
2008-10-31 05:51:37 +03:00
continue ;
}
2008-10-31 07:41:34 +03:00
if ( ( codepoint & 0xfc00 ) = = 0xdc00 ) {
/* low surrogate not preceded by high
surrogate : convert to 0xfffd */
2008-10-31 05:51:37 +03:00
codepoint = 0xfffd ;
}
codepoint16 :
2008-10-31 07:41:34 +03:00
c [ 0 ] = codepoint & 0xFF ;
c [ 1 ] = ( codepoint > > 8 ) & 0xFF ;
2014-12-07 16:13:51 +03:00
2008-10-31 05:51:37 +03:00
in_left - = 2 ;
2008-10-31 07:41:34 +03:00
out_left - = 2 ;
2008-10-31 05:51:37 +03:00
uc + = 2 ;
2008-10-31 07:41:34 +03:00
c + = 2 ;
2014-12-07 16:13:51 +03:00
continue ;
2008-10-31 05:51:37 +03:00
}
if ( in_left = = 1 ) {
errno = EINVAL ;
goto error ;
}
if ( in_left > 1 ) {
errno = E2BIG ;
goto error ;
}
* inbytesleft = in_left ;
* outbytesleft = out_left ;
* inbuf = ( const char * ) uc ;
* outbuf = ( char * ) c ;
2014-12-07 16:13:51 +03:00
2008-10-31 05:51:37 +03:00
return 0 ;
error :
* inbytesleft = in_left ;
* outbytesleft = out_left ;
* inbuf = ( const char * ) uc ;
* outbuf = ( char * ) c ;
return - 1 ;
}
2004-09-01 13:45:00 +04:00