2001-10-19 18:36:57 +04:00
/*
* Copyright ( C ) 2001 Sistina Software ( UK ) Limited .
*
2001-10-31 15:47:01 +03:00
* This file is released under the LGPL .
2001-10-19 18:36:57 +04:00
*/
# include "parse_rx.h"
# include "bitset.h"
# include "log.h"
# include <string.h>
# include <stdlib.h>
# include <stdio.h>
/*
 * Scratch pad for the parsing process: holds the allocation pool, the
 * token most recently read by _get_token() and the position in the input.
 */
struct parse_sp {
	/* pool passed to _node() when parse-tree nodes are created */
	struct pool *mem;

	/*
	 * Current token type: 0 indicates a charset, -1 means end of input
	 * or an error, otherwise the code of a special character.
	 */
	int type;

	/* the current charset (meaningful when type is 0) */
	bitset_t charset;

	/* where we are in the regex */
	const char *cursor;

	/* one past the end of the expression being parsed */
	const char *rx_end;
};
2001-10-21 14:24:10 +04:00
/*
 * Declared ahead of its definition: _term() calls _or_term() to parse a
 * parenthesised sub-expression, and _or_term() reaches _term() again
 * through _cat_term() and _closure_term().
 */
static struct rx_node * _or_term ( struct parse_sp * ps ) ;
2001-10-19 18:36:57 +04:00
/*
* Get the next token from the regular expression .
* Returns : 1 success , 0 end of input , - 1 error .
*/
static int _get_token ( struct parse_sp * ps )
{
int neg = 0 , range = 0 ;
char c , lc = 0 ;
const char * ptr = ps - > cursor ;
2002-04-24 22:20:51 +04:00
if ( ptr = = ps - > rx_end ) { /* end of input ? */
2001-10-19 18:36:57 +04:00
ps - > type = - 1 ;
return 0 ;
}
2002-04-24 22:20:51 +04:00
switch ( * ptr ) {
/* charsets and ncharsets */
2001-10-19 18:36:57 +04:00
case ' [ ' :
ptr + + ;
2002-04-24 22:20:51 +04:00
if ( * ptr = = ' ^ ' ) {
2001-10-19 18:36:57 +04:00
bit_set_all ( ps - > charset ) ;
/* never transition on zero */
bit_clear ( ps - > charset , 0 ) ;
neg = 1 ;
ptr + + ;
} else
bit_clear_all ( ps - > charset ) ;
2002-04-24 22:20:51 +04:00
while ( ( ptr < ps - > rx_end ) & & ( * ptr ! = ' ] ' ) ) {
if ( * ptr = = ' \\ ' ) {
2001-10-19 18:36:57 +04:00
/* an escaped character */
ptr + + ;
2002-04-24 22:20:51 +04:00
switch ( * ptr ) {
case ' n ' :
c = ' \n ' ;
break ;
case ' r ' :
c = ' \r ' ;
break ;
case ' t ' :
c = ' \t ' ;
break ;
2001-10-19 18:36:57 +04:00
default :
c = * ptr ;
}
2002-04-24 22:20:51 +04:00
} else if ( * ptr = = ' - ' & & lc ) {
2001-10-19 18:36:57 +04:00
/* we've got a range on our hands */
range = 1 ;
ptr + + ;
2002-04-24 22:20:51 +04:00
if ( ptr = = ps - > rx_end ) {
2002-01-28 00:30:47 +03:00
log_error ( " Incomplete range "
2002-04-24 22:20:51 +04:00
" specification " ) ;
2001-10-19 18:36:57 +04:00
return - 1 ;
}
c = * ptr ;
} else
c = * ptr ;
2002-04-24 22:20:51 +04:00
if ( range ) {
2001-10-19 18:36:57 +04:00
/* add lc - c into the bitset */
2002-04-24 22:20:51 +04:00
if ( lc > c ) {
2001-10-19 18:36:57 +04:00
char tmp = c ;
c = lc ;
lc = tmp ;
}
2002-04-24 22:20:51 +04:00
for ( ; lc < = c ; lc + + ) {
if ( neg )
2001-10-19 18:36:57 +04:00
bit_clear ( ps - > charset , lc ) ;
else
bit_set ( ps - > charset , lc ) ;
}
range = 0 ;
} else {
/* add c into the bitset */
2002-04-24 22:20:51 +04:00
if ( neg )
2001-10-19 18:36:57 +04:00
bit_clear ( ps - > charset , c ) ;
else
bit_set ( ps - > charset , c ) ;
}
ptr + + ;
lc = c ;
}
2002-04-24 22:20:51 +04:00
if ( ptr > = ps - > rx_end ) {
2001-10-19 18:36:57 +04:00
ps - > type = - 1 ;
return - 1 ;
}
ps - > type = 0 ;
ps - > cursor = ptr + 1 ;
break ;
/* These characters are special, we just return their ASCII
codes as the type . Sorted into ascending order to help the
compiler */
case ' ( ' :
case ' ) ' :
case ' * ' :
case ' + ' :
case ' ? ' :
case ' | ' :
case ' ^ ' :
case ' $ ' :
ps - > type = ( int ) * ptr ;
ps - > cursor = ptr + 1 ;
break ;
case ' . ' :
/* The 'all but newline' character set */
ps - > type = 0 ;
ps - > cursor = ptr + 1 ;
bit_set_all ( ps - > charset ) ;
bit_clear ( ps - > charset , ( int ) ' \n ' ) ;
bit_clear ( ps - > charset , ( int ) ' \r ' ) ;
bit_clear ( ps - > charset , 0 ) ;
break ;
case ' \\ ' :
/* escaped character */
ptr + + ;
2002-04-24 22:20:51 +04:00
if ( ptr > = ps - > rx_end ) {
2002-01-28 00:30:47 +03:00
log_error ( " Badly quoted character at end "
" of expression " ) ;
2001-10-19 18:36:57 +04:00
ps - > type = - 1 ;
return - 1 ;
}
ps - > type = 0 ;
ps - > cursor = ptr + 1 ;
bit_clear_all ( ps - > charset ) ;
2002-04-24 22:20:51 +04:00
switch ( * ptr ) {
case ' n ' :
bit_set ( ps - > charset , ( int ) ' \n ' ) ;
break ;
case ' r ' :
bit_set ( ps - > charset , ( int ) ' \r ' ) ;
break ;
case ' t ' :
bit_set ( ps - > charset , ( int ) ' \t ' ) ;
break ;
2001-10-19 18:36:57 +04:00
default :
bit_set ( ps - > charset , ( int ) * ptr ) ;
}
break ;
default :
/* add a single character to the bitset */
ps - > type = 0 ;
ps - > cursor = ptr + 1 ;
bit_clear_all ( ps - > charset ) ;
bit_set ( ps - > charset , ( int ) * ptr ) ;
break ;
}
return 1 ;
}
2001-10-21 14:24:10 +04:00
static struct rx_node * _node ( struct pool * mem , int type ,
struct rx_node * l , struct rx_node * r )
2001-10-19 18:36:57 +04:00
{
struct rx_node * n = pool_zalloc ( mem , sizeof ( * n ) ) ;
if ( n ) {
if ( ! ( n - > charset = bitset_create ( mem , 256 ) ) ) {
pool_free ( mem , n ) ;
return NULL ;
}
n - > type = type ;
n - > left = l ;
n - > right = r ;
}
return n ;
}
static struct rx_node * _term ( struct parse_sp * ps )
{
struct rx_node * n ;
2002-04-24 22:20:51 +04:00
switch ( ps - > type ) {
2001-10-19 18:36:57 +04:00
case 0 :
2001-10-21 14:24:10 +04:00
if ( ! ( n = _node ( ps - > mem , CHARSET , NULL , NULL ) ) ) {
2001-10-19 18:36:57 +04:00
stack ;
return NULL ;
}
bit_copy ( n - > charset , ps - > charset ) ;
2002-04-24 22:20:51 +04:00
_get_token ( ps ) ; /* match charset */
2001-10-19 18:36:57 +04:00
break ;
case ' ( ' :
2002-04-24 22:20:51 +04:00
_get_token ( ps ) ; /* match '(' */
2001-10-21 14:24:10 +04:00
n = _or_term ( ps ) ;
2002-04-24 22:20:51 +04:00
if ( ps - > type ! = ' ) ' ) {
2002-01-28 00:30:47 +03:00
log_error ( " missing ')' in regular expression " ) ;
2001-10-19 18:36:57 +04:00
return 0 ;
}
2002-04-24 22:20:51 +04:00
_get_token ( ps ) ; /* match ')' */
2001-10-19 18:36:57 +04:00
break ;
default :
n = 0 ;
}
return n ;
}
static struct rx_node * _closure_term ( struct parse_sp * ps )
{
struct rx_node * l , * n ;
2002-04-24 22:20:51 +04:00
if ( ! ( l = _term ( ps ) ) )
2001-10-19 18:36:57 +04:00
return NULL ;
2001-10-21 14:24:10 +04:00
for ( ; ; ) {
2002-04-24 22:20:51 +04:00
switch ( ps - > type ) {
2001-10-21 14:24:10 +04:00
case ' * ' :
n = _node ( ps - > mem , STAR , l , NULL ) ;
break ;
2001-10-19 18:36:57 +04:00
2001-10-21 14:24:10 +04:00
case ' + ' :
n = _node ( ps - > mem , PLUS , l , NULL ) ;
break ;
2001-10-19 18:36:57 +04:00
2001-10-21 14:24:10 +04:00
case ' ? ' :
n = _node ( ps - > mem , QUEST , l , NULL ) ;
break ;
2001-10-19 18:36:57 +04:00
2001-10-21 14:24:10 +04:00
default :
return l ;
}
2001-10-19 18:36:57 +04:00
2001-10-21 14:24:10 +04:00
if ( ! n ) {
stack ;
return NULL ;
}
_get_token ( ps ) ;
l = n ;
2001-10-19 18:36:57 +04:00
}
return n ;
}
static struct rx_node * _cat_term ( struct parse_sp * ps )
{
struct rx_node * l , * r , * n ;
if ( ! ( l = _closure_term ( ps ) ) )
return NULL ;
if ( ps - > type = = ' | ' )
return l ;
if ( ! ( r = _cat_term ( ps ) ) )
return l ;
2001-10-21 14:24:10 +04:00
if ( ! ( n = _node ( ps - > mem , CAT , l , r ) ) )
2001-10-19 18:36:57 +04:00
stack ;
return n ;
}
2001-10-21 14:24:10 +04:00
static struct rx_node * _or_term ( struct parse_sp * ps )
2001-10-19 18:36:57 +04:00
{
2001-10-21 14:24:10 +04:00
struct rx_node * l , * r , * n ;
2001-10-19 18:36:57 +04:00
2001-10-21 14:24:10 +04:00
if ( ! ( l = _cat_term ( ps ) ) )
return NULL ;
2001-10-19 18:36:57 +04:00
2001-10-21 14:24:10 +04:00
if ( ps - > type ! = ' | ' )
return l ;
2001-10-19 18:36:57 +04:00
2002-04-24 22:20:51 +04:00
_get_token ( ps ) ; /* match '|' */
2001-10-19 18:36:57 +04:00
2001-10-21 14:24:10 +04:00
if ( ! ( r = _or_term ( ps ) ) ) {
2002-01-28 00:30:47 +03:00
log_error ( " Badly formed 'or' expression " ) ;
2001-10-21 14:24:10 +04:00
return NULL ;
2001-10-19 18:36:57 +04:00
}
2001-10-21 14:24:10 +04:00
if ( ! ( n = _node ( ps - > mem , OR , l , r ) ) )
stack ;
2001-10-19 18:36:57 +04:00
return n ;
}
2001-10-21 14:24:10 +04:00
struct rx_node * rx_parse_tok ( struct pool * mem ,
2001-10-19 18:36:57 +04:00
const char * begin , const char * end )
{
struct rx_node * r ;
2001-10-21 14:24:10 +04:00
struct parse_sp * ps = pool_zalloc ( mem , sizeof ( * ps ) ) ;
2001-10-19 18:36:57 +04:00
if ( ! ps ) {
stack ;
return NULL ;
}
ps - > mem = mem ;
ps - > charset = bitset_create ( mem , 256 ) ;
ps - > cursor = begin ;
ps - > rx_end = end ;
2002-04-24 22:20:51 +04:00
_get_token ( ps ) ; /* load the first token */
2001-10-21 14:24:10 +04:00
if ( ! ( r = _or_term ( ps ) ) ) {
2002-01-28 00:30:47 +03:00
log_error ( " Parse error in regex " ) ;
2001-10-19 18:36:57 +04:00
pool_free ( mem , ps ) ;
2001-10-21 14:24:10 +04:00
}
2001-10-19 18:36:57 +04:00
return r ;
}
/*
 * Parse a NUL-terminated regular expression.  Convenience wrapper that
 * passes the string and its end (found with strlen()) to rx_parse_tok()
 * and returns whatever that returns.
 */
struct rx_node *rx_parse_str(struct pool *mem, const char *str)
{
	const char *end = str + strlen(str);

	return rx_parse_tok(mem, str, end);
}