/** \file tokenizer.c

    A specialized tokenizer for tokenizing the fish language. In the
    future, the tokenizer should be extended to support marks,
    tokenizing multiple strings and disposing of unused string
    segments.
*/
# include "config.h"
# include <stdlib.h>
# include <stdio.h>
# include <wchar.h>
# include <wctype.h>
# include <string.h>
# include <unistd.h>
2006-02-28 23:17:16 +10:00
# include "fallback.h"
2005-09-20 23:26:39 +10:00
# include "util.h"
2006-02-28 23:17:16 +10:00
2005-09-20 23:26:39 +10:00
# include "wutil.h"
# include "tokenizer.h"
# include "common.h"
# include "wildcard.h"
2006-01-04 22:51:02 +10:00
# include "translate.h"
2005-09-20 23:26:39 +10:00
/**
Error string for unexpected end of string
*/
2006-01-04 22:51:02 +10:00
# define EOL_ERROR _( L"Unexpected end of token" )
2005-09-20 23:26:39 +10:00
/**
Error string for mismatched parenthesis
*/
2006-01-04 22:51:02 +10:00
# define PARAN_ERROR _( L"Parenthesis mismatch" )
2005-09-20 23:26:39 +10:00
/**
Error string for invalid redirections
*/
2006-01-04 22:51:02 +10:00
# define REDIRECT_ERROR _( L"Invalid redirection" )
2005-09-20 23:26:39 +10:00
/**
Error string for invalid input
*/
2006-01-04 22:51:02 +10:00
# define INPUT_ERROR _( L"Invalid input" )
2005-09-20 23:26:39 +10:00
2005-10-08 00:08:57 +10:00
/**
Error string for when trying to pipe from fd 0
*/
2006-01-04 22:51:02 +10:00
# define PIPE_ERROR _( L"Can not use fd 0 as pipe output" )
2005-10-08 00:08:57 +10:00
2005-09-20 23:26:39 +10:00
/**
Characters that separate tokens . They are ordered by frequency of occurrence to increase parsing speed .
*/
2005-10-26 20:51:02 +10:00
# define SEP L" \n|\t;#\r<>^&"
2005-10-08 12:00:08 +10:00
/**
Maximum length of a string containing a file descriptor number
*/
# define FD_STR_MAX_LEN 16
/**
   Descriptions of all tokenizer errors.

   NOTE(review): the order of these entries must match the token type
   enum declared in tokenizer.h, since tok_get_desc() indexes this
   array directly with the type value — confirm against tokenizer.h.
   Each string is wrapped in N_() for deferred translation; the actual
   translation happens at lookup time in tok_get_desc().
*/
static const wchar_t *tok_desc[] =
{
	N_( L"Tokenizer not yet initialized" ),
	N_( L"Tokenizer error" ),
	N_( L"Invalid token" ),
	N_( L"String" ),
	N_( L"Pipe" ),
	N_( L"End of command" ),
	N_( L"Redirect output to file" ),
	N_( L"Append output to file" ),
	N_( L"Redirect input to file" ),
	N_( L"Redirect to file descriptor" ),
	N_( L"Run job in background" ),
	N_( L"Comment" )
}
;
/**
   Tests if the tokenizer buffer is large enough to hold contents of
   the specified length, and if not, reallocates the tokenizer buffer.

   \return 0 if the system could not provide the memory needed, and 1 otherwise.
*/
static int check_size( tokenizer *tok, size_t len )
{
	if( tok->last_len <= len )
	{
		wchar_t *tmp;
		tmp = realloc( tok->last, sizeof(wchar_t)*(len+1) );
		if( tmp == 0 )
		{
			wperror( L"realloc" );
			return 0;
		}
		tok->last = tmp;
		/*
		  Record the new capacity only after the allocation has
		  succeeded. Previously last_len was updated before the
		  realloc, so a failed grow left last_len larger than the
		  real buffer and later calls could skip reallocation and
		  overflow tok->last.
		*/
		tok->last_len = len+1;
	}
	return 1;
}
/**
   Set the latest token's string to be the specified error message,
   and mark the token type as TOK_ERROR.
*/
static void tok_error( tokenizer *tok, const wchar_t *err )
{
	size_t needed = wcslen( err ) + 1;

	tok->last_type = TOK_ERROR;

	if( !check_size( tok, needed ) )
	{
		/* Could not grow the buffer; fall back to an empty string */
		if( tok->last != 0 )
			*tok->last = 0;
		return;
	}

	wcscpy( tok->last, err );
}
/**
   Initialize the tokenizer over the string b with the given option
   flags, and advance to the first token.
*/
void tok_init( tokenizer *tok, const wchar_t *b, int flags )
{
	memset( tok, 0, sizeof( tokenizer ) );

	tok->accept_unfinished = flags & TOK_ACCEPT_UNFINISHED;
	tok->show_comments = flags & TOK_SHOW_COMMENTS;
	tok->has_next = 1;

	/*
	  At this point the tokenizer is initialized far enough for errors
	  to be flagged properly, so a null buffer can be reported through
	  the regular error channel.
	*/
	if( !b )
	{
		tok_error( tok, INPUT_ERROR );
		return;
	}

	tok->has_next = ( *b != L'\0' );
	tok->orig_buff = tok->buff = (wchar_t *)( b );

	if( tok->accept_unfinished )
	{
		int len = wcslen( tok->orig_buff );

		/*
		  A trailing backslash is an unfinished escape; duplicate the
		  input and chop the backslash off the private copy so it does
		  not confuse the tokenizer.
		*/
		if( len != 0 && tok->orig_buff[len-1] == L'\\' )
		{
			tok->free_orig = 1;
			tok->orig_buff = tok->buff = wcsdup( tok->orig_buff );
			if( !tok->orig_buff )
			{
				die_mem();
			}
			tok->orig_buff[len-1] = L'\0';
		}
	}

	tok_next( tok );
}
/**
   Free all memory owned by the tokenizer: the token buffer, and the
   private copy of the input made by tok_init for unfinished input.
*/
void tok_destroy( tokenizer *tok )
{
	free( tok->last );
	if( tok->free_orig )
	{
		free( tok->orig_buff );
	}
}
/**
   Return the type of the most recently read token.
*/
int tok_last_type( tokenizer *tok )
{
	return tok->last_type;
}
/**
   Return the text of the most recently read token. The buffer is
   owned by the tokenizer and is overwritten by the next tok_next call.
*/
wchar_t *tok_last( tokenizer *tok )
{
	return tok->last;
}
/**
   Return non-zero if there are more tokens to be read from the input.
*/
int tok_has_next( tokenizer *tok )
{
	return tok->has_next;
}
/**
Tests if this character can be a part of a string
*/
static int is_string_char ( wchar_t c )
{
if ( ! c | | wcschr ( SEP , c ) )
{
return 0 ;
}
2005-10-26 20:51:02 +10:00
return 1 ;
}
2005-09-20 23:26:39 +10:00
2005-10-26 20:51:02 +10:00
/**
   Quick test to catch the most common 'non-magical' characters; makes
   read_string slightly faster by adding a fast path for plain ASCII
   letters. This is obviously not a suitable replacement for iswalpha.
*/
static int myal( wchar_t c )
{
	if( c >= L'a' && c <= L'z' )
		return 1;
	return ( c >= L'A' && c <= L'Z' );
}
/**
   Read the next token as a string.

   The scan walks tok->buff forward through a small state machine
   (see the mode comment below) until a token separator is reached in
   regular-text mode, or the input ends. The resulting text is copied
   into tok->last and last_type is set to TOK_STRING. On a dangling
   backslash or unbalanced quote/parenthesis (when unfinished input is
   not accepted), an error is recorded instead.
*/
static void read_string( tokenizer *tok )
{
	const wchar_t *start;
	int len;
	int mode = 0;
	wchar_t prev;       /* NOTE(review): written each iteration but never read */
	int do_loop = 1;
	int paran_count = 0;

	start = tok->buff;

	while( 1 )
	{
		/* Fast path: plain ASCII letters are always string characters */
		if( !myal( *tok->buff ) )
		{
			/* A backslash escapes the following character, which must exist */
			if( *tok->buff == L'\\' )
			{
				tok->buff++;
				if( *tok->buff == L'\0' )
				{
					tok_error( tok, EOL_ERROR );
					return;
				}
				tok->buff++;
				continue;
			}

			/*
			  The modes are as follows:

			  0: regular text
			  1: inside of subshell
			  2: inside of array brackets
			  3: inside of array brackets and subshell, like in '$foo[(ech'
			*/
			switch( mode )
			{
				case 0:
				{
					switch( *tok->buff )
					{
						case L'(':
						{
							/* Enter a subshell */
							paran_count = 1;
							mode = 1;
							break;
						}

						case L'[':
						{
							/* A '[' only opens array indexing if it
							   does not start the token */
							if( tok->buff != start )
								mode = 2;
							break;
						}

						case L'\'':
						case L'"':
						{
							/* Skip over the whole quoted section at once */
							const wchar_t *end = quote_end( tok->buff );
							tok->last_quote = *tok->buff;
							if( end )
							{
								tok->buff = (wchar_t *)end;
							}
							else
							{
								/* Unterminated quote: either flag an
								   error or, when unfinished input is
								   accepted, end the token here */
								tok->buff += wcslen( tok->buff );

								if( ( !tok->accept_unfinished ) )
								{
									tok_error( tok, EOL_ERROR );
									return;
								}
								do_loop = 0;
							}
							break;
						}

						default:
						{
							/* Any separator (or the terminating null)
							   ends the token */
							if( !is_string_char( *( tok->buff ) ) )
							{
								do_loop = 0;
							}
						}
					}
					break;
				}

				case 3:
				case 1:
					/* Inside a subshell: track quotes and nested parens */
					switch( *tok->buff )
					{
						case L'\'':
						case L'\"':
						{
							const wchar_t *end = quote_end( tok->buff );
							if( end )
							{
								tok->buff = (wchar_t *)end;
							}
							else
								do_loop = 0;

							break;
						}

						case L'(':
							paran_count++;
							break;
						case L')':
							paran_count--;
							if( paran_count == 0 )
							{
								/* Closing the outermost paren drops
								   back to the previous mode (1 -> 0,
								   3 -> 2) */
								mode--;
							}
							break;
						case L'\0':
							do_loop = 0;
							break;
					}
					break;

				case 2:
					/* Inside array brackets */
					switch( *tok->buff )
					{
						case L'(':
							paran_count = 1;
							mode = 3;
							break;

						case L']':
							mode = 0;
							break;

						case L'\0':
							do_loop = 0;
							break;
					}
					break;
			}
		}

		if( !do_loop )
			break;

		prev = *tok->buff;
		tok->buff++;
	}

	/* An unclosed subshell or bracket is an error unless unfinished
	   input is accepted */
	if( ( !tok->accept_unfinished ) && ( mode != 0 ) )
	{
		tok_error( tok, PARAN_ERROR );
		return;
	}

	len = tok->buff - start;

	if( !check_size( tok, len ) )
		return;

	/* Copy the scanned span into the token buffer */
	memcpy( tok->last, start, sizeof(wchar_t)*len );
	tok->last[len] = L'\0';
	tok->last_type = TOK_STRING;
}
/**
   Read the next token as a comment: everything from the current
   position up to (but not including) the next newline or the end of
   the input. The text is stored in tok->last with type TOK_COMMENT.
*/
static void read_comment( tokenizer *tok )
{
	const wchar_t *begin = tok->buff;
	int length;

	while( *tok->buff != L'\n' && *tok->buff != L'\0' )
	{
		tok->buff++;
	}

	length = tok->buff - begin;
	if( !check_size( tok, length ) )
		return;

	memcpy( tok->last, begin, sizeof(wchar_t)*length );
	tok->last[length] = L'\0';
	tok->last_type = TOK_COMMENT;
}
/**
   Read a redirection or output pipe for the given file descriptor
   number. Handles '>', '>>', '^', '^^', '<', the fd duplication
   suffix '&', and the 'N>|' pipe form. On success the fd number is
   stored as text in tok->last and last_type is set accordingly.
*/
static void read_redirect( tokenizer *tok, int fd )
{
	int mode = -1;

	if( ( *tok->buff == L'>' ) ||
	    ( *tok->buff == L'^' ) )
	{
		tok->buff++;
		if( *tok->buff == *( tok->buff - 1 ) )
		{
			/* Doubled character, e.g. '>>': append mode */
			tok->buff++;
			mode = 1;
		}
		else
		{
			mode = 0;
		}

		if( *tok->buff == L'|' )
		{
			/* 'N>|' pipes fd N into the next command */
			if( fd == 0 )
			{
				tok_error( tok, PIPE_ERROR );
				return;
			}
			/* Bug fix: the check_size result was previously ignored,
			   risking a write through an unallocated buffer */
			if( !check_size( tok, FD_STR_MAX_LEN ) )
				return;

			tok->buff++;
			swprintf( tok->last, FD_STR_MAX_LEN, L"%d", fd );
			tok->last_type = TOK_PIPE;
			return;
		}
	}
	else if( *tok->buff == L'<' )
	{
		tok->buff++;
		mode = 2;
	}
	else
	{
		tok_error( tok, REDIRECT_ERROR );
		/*
		  Bug fix: without this return the code below overwrote the
		  error message with the fd number and replaced TOK_ERROR with
		  TOK_REDIRECT_OUT + (-1).
		*/
		return;
	}

	if( !check_size( tok, 2 ) )
	{
		return;
	}

	swprintf( tok->last, tok->last_len, L"%d", fd );

	if( *tok->buff == L'&' )
	{
		/* 'N>&M' style fd duplication */
		tok->buff++;
		tok->last_type = TOK_REDIRECT_FD;
	}
	else
	{
		/* mode selects TOK_REDIRECT_OUT, _APPEND or _IN */
		tok->last_type = TOK_REDIRECT_OUT + mode;
	}
}
/**
   Return the quote character (' or ") that opened the most recently
   scanned quoted section, as recorded by read_string.
*/
wchar_t tok_last_quote( tokenizer *tok )
{
	return tok->last_quote;
}
/**
   Test if a character is whitespace. Differs from iswspace in that it
   does not consider a newline to be whitespace, since newlines
   terminate commands in fish.
*/
static int my_iswspace( wchar_t c )
{
	return ( c == L'\n' ) ? 0 : iswspace( c );
}
const wchar_t * tok_get_desc ( int type )
{
2006-01-04 22:51:02 +10:00
return _ ( tok_desc [ type ] ) ;
2005-09-20 23:26:39 +10:00
}
/**
   Advance the tokenizer to the next token, storing its type in
   last_type and (for string-valued tokens) its text in last.

   Once an error has been produced, or the input is exhausted, every
   further call yields TOK_END / keeps has_next at zero.
*/
void tok_next( tokenizer *tok )
{
	/* After an error, refuse to produce further tokens */
	if( tok_last_type( tok ) == TOK_ERROR )
	{
		tok->has_next = 0;
		return;
	}

	if( !tok->has_next )
	{
		tok->last_type = TOK_END;
		return;
	}

	/* Skip whitespace; a backslash-newline pair is a line continuation */
	while( 1 )
	{
		if( my_iswspace( *( tok->buff ) ) )
		{
			tok->buff++;
		}
		else
		{
			if( ( *( tok->buff ) == L'\\' ) && ( *( tok->buff + 1 ) == L'\n' ) )
			{
				tok->buff += 2;
			}
			break;
		}
	}

	if( *tok->buff == L'#' )
	{
		if( tok->show_comments )
		{
			tok->last_pos = tok->buff - tok->orig_buff;
			read_comment( tok );
			return;
		}
		else
		{
			/* Silently discard the comment text */
			while( *( tok->buff ) != L'\n' && *( tok->buff ) != L'\0' )
				tok->buff++;
		}

		while( my_iswspace( *( tok->buff ) ) )
			tok->buff++;
	}

	tok->last_pos = tok->buff - tok->orig_buff;

	switch( *tok->buff )
	{
		case L'\0':
			tok->last_type = TOK_END;
			tok->has_next = 0;
			break;

		case L'\r': /* was the magic constant 13; same value */
		case L'\n':
		case L';':
			tok->last_type = TOK_END;
			tok->buff++;
			break;

		case L'&':
			tok->last_type = TOK_BACKGROUND;
			tok->buff++;
			break;

		case L'|':
			/* A plain '|' pipes fd 1; store "1" as the token text.
			   Bug fix: the check_size result was previously ignored,
			   risking a write through an unallocated buffer. */
			if( !check_size( tok, 2 ) )
				return;

			tok->last[0] = L'1';
			tok->last[1] = L'\0';
			tok->last_type = TOK_PIPE;
			tok->buff++;
			break;

		case L'>':
			return read_redirect( tok, 1 );
		case L'<':
			return read_redirect( tok, 0 );
		case L'^':
			return read_redirect( tok, 2 );

		default:
		{
			/* A leading digit sequence followed by a redirection
			   operator is an fd-qualified redirection; otherwise the
			   digits are just part of an ordinary string token */
			if( iswdigit( *tok->buff ) )
			{
				wchar_t *orig = tok->buff;
				int fd = 0;
				while( iswdigit( *tok->buff ) )
					fd = ( fd * 10 ) + ( *( tok->buff++ ) - L'0' );

				switch( *( tok->buff ) )
				{
					case L'^':
					case L'>':
					case L'<':
						read_redirect( tok, fd );
						return;
				}
				tok->buff = orig;
			}
			read_string( tok );
		}
	}
}
/**
   Return the original string that is being tokenized, or 0 if tok is null.
*/
wchar_t *tok_string( tokenizer *tok )
{
	if( !tok )
		return 0;
	return tok->orig_buff;
}
wchar_t * tok_first ( const wchar_t * str )
{
tokenizer t ;
wchar_t * res = 0 ;
tok_init ( & t , str , 0 ) ;
2006-01-31 05:53:10 +10:00
2005-09-20 23:26:39 +10:00
switch ( tok_last_type ( & t ) )
{
case TOK_STRING :
// fwprintf( stderr, L"Got token %ls\n", tok_last( &t ));
res = wcsdup ( tok_last ( & t ) ) ;
break ;
default :
break ;
}
tok_destroy ( & t ) ;
return res ;
}
/**
   Return the position in the original input at which the current
   token starts. NOTE(review): when tok_init duplicated the input to
   strip a trailing backslash, one is added to the offset — presumably
   to compensate for the removed character; confirm against callers.
*/
int tok_get_pos( tokenizer *tok )
{
	int adjust = tok->free_orig ? 1 : 0;
	return tok->last_pos + adjust;
}
/**
   Rewind (or fast-forward) the tokenizer to the given offset in the
   original input and read the token found there.
*/
void tok_set_pos( tokenizer *tok, int pos )
{
	tok->buff = tok->orig_buff + pos;
	tok->has_next = 1;
	tok_next( tok );
}
# ifdef TOKENIZER_TEST
/**
   This main function is used for compiling the tokenizer_test
   command, used for testing the tokenizer. Each command line argument
   is tokenized and the resulting token types and values are printed.

   NOTE(review): the wide string returned by str2wcs is never freed —
   acceptable for a short-lived test program, but worth confirming.
*/
int main( int argc, char **argv )
{
	tokenizer tok;
	int i;

	for( i = 1; i < argc; i++ )
	{
		wprintf( L"Tokenizing string %s\n", argv[i] );

		for( tok_init( &tok, str2wcs( argv[i] ), 0 );
		     tok_has_next( &tok );
		     tok_next( &tok ) )
		{
			switch( tok_last_type( &tok ) )
			{
				case TOK_INVALID:
					wprintf( L"Type: INVALID\n" );
					break;
				case TOK_STRING:
					wprintf( L"Type: STRING\t Value: %ls\n", tok_last( &tok ) );
					break;
				case TOK_PIPE:
					wprintf( L"Type: PIPE\n" );
					break;
				case TOK_END:
					wprintf( L"Type: END\n" );
					break;
				case TOK_ERROR:
					wprintf( L"Type: ERROR\n" );
					break;
				default:
					wprintf( L"Type: Unknown\n" );
					break;
			}
		}
		tok_destroy( &tok );
	}

	/* Bug fix: main previously fell off the end without a return value */
	return 0;
}
# endif