2007-04-27 22:40:23 +04:00
/*
* Copyright ( C ) 2001 - 2004 Sistina Software , Inc . All rights reserved .
* Copyright ( C ) 2004 - 2007 Red Hat , Inc . All rights reserved .
*
* This file is part of the device - mapper userspace tools .
*
* This copyrighted material is made available to anyone wishing to use ,
* modify , copy , or redistribute it subject to the terms and conditions
2007-08-21 20:26:07 +04:00
* of the GNU Lesser General Public License v .2 .1 .
2007-04-27 22:40:23 +04:00
*
2007-08-21 20:26:07 +04:00
* You should have received a copy of the GNU Lesser General Public License
2007-04-27 22:40:23 +04:00
* along with this program ; if not , write to the Free Software Foundation ,
* Inc . , 59 Temple Place , Suite 330 , Boston , MA 02111 - 1307 USA
*/
2008-11-03 21:59:59 +03:00
# include "dmlib.h"
2007-04-27 22:40:23 +04:00
# include "parse_rx.h"
2010-04-22 17:18:27 +04:00
# include <ctype.h>
2007-04-27 22:40:23 +04:00
/*
 * Scratch pad shared by the tokeniser and the recursive-descent parser
 * while a single regular expression is being parsed.
 */
struct parse_sp {
	struct dm_pool *mem;	/* pool handed to _node() for tree nodes */
	int type;		/* current token: 0 is a charset, -1 is set at
				 * end of input or on error, otherwise the
				 * special character itself */
	dm_bitset_t charset;	/* characters matched by the current token */
	const char *cursor;	/* next unread character of the regex */
	const char *rx_end;	/* one past the end of the regex */
};
/* Declared early because _term() calls it to parse a bracketed group. */
static struct rx_node *_or_term(struct parse_sp *ps);
static void _single_char ( struct parse_sp * ps , unsigned int c , const char * ptr )
{
ps - > type = 0 ;
ps - > cursor = ptr + 1 ;
dm_bit_clear_all ( ps - > charset ) ;
dm_bit_set ( ps - > charset , c ) ;
}
/*
* Get the next token from the regular expression .
* Returns : 1 success , 0 end of input , - 1 error .
*/
static int _rx_get_token ( struct parse_sp * ps )
{
int neg = 0 , range = 0 ;
char c , lc = 0 ;
const char * ptr = ps - > cursor ;
if ( ptr = = ps - > rx_end ) { /* end of input ? */
ps - > type = - 1 ;
return 0 ;
}
switch ( * ptr ) {
/* charsets and ncharsets */
case ' [ ' :
ptr + + ;
if ( * ptr = = ' ^ ' ) {
dm_bit_set_all ( ps - > charset ) ;
/* never transition on zero */
dm_bit_clear ( ps - > charset , 0 ) ;
neg = 1 ;
ptr + + ;
} else
dm_bit_clear_all ( ps - > charset ) ;
while ( ( ptr < ps - > rx_end ) & & ( * ptr ! = ' ] ' ) ) {
if ( * ptr = = ' \\ ' ) {
/* an escaped character */
ptr + + ;
switch ( * ptr ) {
case ' n ' :
c = ' \n ' ;
break ;
case ' r ' :
c = ' \r ' ;
break ;
case ' t ' :
c = ' \t ' ;
break ;
default :
c = * ptr ;
}
} else if ( * ptr = = ' - ' & & lc ) {
/* we've got a range on our hands */
range = 1 ;
ptr + + ;
if ( ptr = = ps - > rx_end ) {
log_error ( " Incomplete range "
" specification " ) ;
return - 1 ;
}
c = * ptr ;
} else
c = * ptr ;
if ( range ) {
/* add lc - c into the bitset */
if ( lc > c ) {
char tmp = c ;
c = lc ;
lc = tmp ;
}
for ( ; lc < = c ; lc + + ) {
if ( neg )
dm_bit_clear ( ps - > charset , lc ) ;
else
dm_bit_set ( ps - > charset , lc ) ;
}
range = 0 ;
} else {
/* add c into the bitset */
if ( neg )
dm_bit_clear ( ps - > charset , c ) ;
else
dm_bit_set ( ps - > charset , c ) ;
}
ptr + + ;
lc = c ;
}
if ( ptr > = ps - > rx_end ) {
ps - > type = - 1 ;
return - 1 ;
}
ps - > type = 0 ;
ps - > cursor = ptr + 1 ;
break ;
/* These characters are special, we just return their ASCII
codes as the type . Sorted into ascending order to help the
compiler */
case ' ( ' :
case ' ) ' :
case ' * ' :
case ' + ' :
case ' ? ' :
case ' | ' :
ps - > type = ( int ) * ptr ;
ps - > cursor = ptr + 1 ;
break ;
case ' ^ ' :
_single_char ( ps , HAT_CHAR , ptr ) ;
break ;
case ' $ ' :
_single_char ( ps , DOLLAR_CHAR , ptr ) ;
break ;
case ' . ' :
/* The 'all but newline' character set */
ps - > type = 0 ;
ps - > cursor = ptr + 1 ;
dm_bit_set_all ( ps - > charset ) ;
dm_bit_clear ( ps - > charset , ( int ) ' \n ' ) ;
dm_bit_clear ( ps - > charset , ( int ) ' \r ' ) ;
dm_bit_clear ( ps - > charset , 0 ) ;
break ;
case ' \\ ' :
/* escaped character */
ptr + + ;
if ( ptr > = ps - > rx_end ) {
log_error ( " Badly quoted character at end "
" of expression " ) ;
ps - > type = - 1 ;
return - 1 ;
}
ps - > type = 0 ;
ps - > cursor = ptr + 1 ;
dm_bit_clear_all ( ps - > charset ) ;
switch ( * ptr ) {
case ' n ' :
dm_bit_set ( ps - > charset , ( int ) ' \n ' ) ;
break ;
case ' r ' :
dm_bit_set ( ps - > charset , ( int ) ' \r ' ) ;
break ;
case ' t ' :
dm_bit_set ( ps - > charset , ( int ) ' \t ' ) ;
break ;
default :
dm_bit_set ( ps - > charset , ( int ) * ptr ) ;
}
break ;
default :
/* add a single character to the bitset */
ps - > type = 0 ;
ps - > cursor = ptr + 1 ;
dm_bit_clear_all ( ps - > charset ) ;
dm_bit_set ( ps - > charset , ( int ) * ptr ) ;
break ;
}
return 1 ;
}
static struct rx_node * _node ( struct dm_pool * mem , int type ,
struct rx_node * l , struct rx_node * r )
{
struct rx_node * n = dm_pool_zalloc ( mem , sizeof ( * n ) ) ;
if ( n ) {
if ( ! ( n - > charset = dm_bitset_create ( mem , 256 ) ) ) {
dm_pool_free ( mem , n ) ;
return NULL ;
}
n - > type = type ;
n - > left = l ;
n - > right = r ;
}
return n ;
}
static struct rx_node * _term ( struct parse_sp * ps )
{
struct rx_node * n ;
switch ( ps - > type ) {
case 0 :
if ( ! ( n = _node ( ps - > mem , CHARSET , NULL , NULL ) ) ) {
stack ;
return NULL ;
}
dm_bit_copy ( n - > charset , ps - > charset ) ;
_rx_get_token ( ps ) ; /* match charset */
break ;
case ' ( ' :
_rx_get_token ( ps ) ; /* match '(' */
n = _or_term ( ps ) ;
if ( ps - > type ! = ' ) ' ) {
log_error ( " missing ')' in regular expression " ) ;
return 0 ;
}
_rx_get_token ( ps ) ; /* match ')' */
break ;
default :
n = 0 ;
}
return n ;
}
static struct rx_node * _closure_term ( struct parse_sp * ps )
{
struct rx_node * l , * n ;
if ( ! ( l = _term ( ps ) ) )
return NULL ;
for ( ; ; ) {
switch ( ps - > type ) {
case ' * ' :
n = _node ( ps - > mem , STAR , l , NULL ) ;
break ;
case ' + ' :
n = _node ( ps - > mem , PLUS , l , NULL ) ;
break ;
case ' ? ' :
n = _node ( ps - > mem , QUEST , l , NULL ) ;
break ;
default :
return l ;
}
if ( ! n ) {
stack ;
return NULL ;
}
_rx_get_token ( ps ) ;
l = n ;
}
return n ;
}
static struct rx_node * _cat_term ( struct parse_sp * ps )
{
struct rx_node * l , * r , * n ;
if ( ! ( l = _closure_term ( ps ) ) )
return NULL ;
if ( ps - > type = = ' | ' )
return l ;
if ( ! ( r = _cat_term ( ps ) ) )
return l ;
if ( ! ( n = _node ( ps - > mem , CAT , l , r ) ) )
stack ;
return n ;
}
static struct rx_node * _or_term ( struct parse_sp * ps )
{
struct rx_node * l , * r , * n ;
if ( ! ( l = _cat_term ( ps ) ) )
return NULL ;
if ( ps - > type ! = ' | ' )
return l ;
_rx_get_token ( ps ) ; /* match '|' */
if ( ! ( r = _or_term ( ps ) ) ) {
log_error ( " Badly formed 'or' expression " ) ;
return NULL ;
}
if ( ! ( n = _node ( ps - > mem , OR , l , r ) ) )
stack ;
return n ;
}
2010-04-21 02:31:22 +04:00
/*----------------------------------------------------------------*/
2010-04-22 17:42:34 +04:00
/*
 * Macros for left and right nodes.  Both read a variable named 'leftmost'
 * from the scope they are used in: when it is non-zero LEFT/RIGHT give the
 * node's left/right child, when it is zero the two are swapped.
 */
# define LEFT(a) (leftmost ? (a)->left : (a)->right)
# define RIGHT(a) (leftmost ? (a)->right : (a)->left)
2010-04-22 07:24:24 +04:00
2010-04-21 02:31:22 +04:00
/*
 * The optimiser spots common prefixes (or, when working rightmost, common
 * suffixes) on either side of an 'or' node, and lifts them outside the 'or'
 * with a 'cat'.
 */
2010-04-22 07:24:24 +04:00
static unsigned _depth ( struct rx_node * r , unsigned leftmost )
2010-04-21 02:31:22 +04:00
{
int count = 1 ;
while ( r - > type ! = CHARSET ) {
count + + ;
2010-04-22 17:42:34 +04:00
r = LEFT ( r ) ;
2010-04-21 02:31:22 +04:00
}
return count ;
}
/*
* FIXME : a unique key could be built up as part of the parse , to make the
* comparison quick . Alternatively we could use cons - hashing , and then
* this would simply be a pointer comparison .
*/
static int _nodes_equal ( struct rx_node * l , struct rx_node * r )
{
if ( l - > type ! = r - > type )
return 0 ;
switch ( l - > type ) {
case CAT :
case OR :
return _nodes_equal ( l - > left , r - > left ) & &
_nodes_equal ( l - > right , r - > right ) ;
case STAR :
case PLUS :
case QUEST :
return _nodes_equal ( l - > left , r - > left ) ;
case CHARSET :
return dm_bitset_equal ( l - > charset , r - > charset ) ;
}
/* NOTREACHED */
return_0 ;
}
static int _find_leftmost_common ( struct rx_node * or ,
struct rx_node * * l ,
2010-04-22 07:24:24 +04:00
struct rx_node * * r ,
unsigned leftmost )
2010-04-21 02:31:22 +04:00
{
struct rx_node * left = or - > left , * right = or - > right ;
2010-04-22 07:24:24 +04:00
unsigned left_depth = _depth ( left , leftmost ) ;
unsigned right_depth = _depth ( right , leftmost ) ;
2010-04-21 02:31:22 +04:00
while ( left_depth > right_depth ) {
2010-04-22 17:42:34 +04:00
left = LEFT ( left ) ;
2010-04-21 02:31:22 +04:00
left_depth - - ;
}
while ( right_depth > left_depth ) {
2010-04-22 17:42:34 +04:00
right = LEFT ( right ) ;
2010-04-21 02:31:22 +04:00
right_depth - - ;
}
while ( left_depth ) {
if ( left - > type = = CAT & & right - > type = = CAT ) {
2010-04-22 17:42:34 +04:00
if ( _nodes_equal ( LEFT ( left ) , LEFT ( right ) ) ) {
2010-04-21 02:31:22 +04:00
* l = left ;
* r = right ;
return 1 ;
}
}
2010-04-22 17:42:34 +04:00
left = LEFT ( left ) ;
right = LEFT ( right ) ;
2010-04-21 02:31:22 +04:00
left_depth - - ;
}
return 0 ;
}
2010-04-22 17:42:34 +04:00
/* If top node is OR, rotate (leftmost example) from ((ab)|((ac)|d)) to (((ab)|(ac))|d) */
static int _rotate_ors ( struct rx_node * r , unsigned leftmost )
2010-04-22 17:18:27 +04:00
{
2010-04-22 17:42:34 +04:00
struct rx_node * old_node ;
if ( r - > type ! = OR | | RIGHT ( r ) - > type ! = OR )
return 0 ;
old_node = RIGHT ( r ) ;
if ( leftmost ) {
r - > right = RIGHT ( old_node ) ;
old_node - > right = LEFT ( old_node ) ;
old_node - > left = LEFT ( r ) ;
r - > left = old_node ;
} else {
r - > left = RIGHT ( old_node ) ;
old_node - > left = LEFT ( old_node ) ;
old_node - > right = LEFT ( r ) ;
r - > right = old_node ;
2010-04-22 17:18:27 +04:00
}
2010-04-22 17:42:34 +04:00
return 1 ;
2010-04-22 17:18:27 +04:00
}
static struct rx_node * _exchange_nodes ( struct dm_pool * mem , struct rx_node * r ,
struct rx_node * left_cat , struct rx_node * right_cat ,
unsigned leftmost )
2010-04-22 07:24:24 +04:00
{
2010-04-22 17:18:27 +04:00
struct rx_node * new_r ;
2010-04-22 07:24:24 +04:00
2010-04-22 17:42:34 +04:00
if ( leftmost )
new_r = _node ( mem , CAT , LEFT ( left_cat ) , r ) ;
else
new_r = _node ( mem , CAT , r , LEFT ( right_cat ) ) ;
2010-04-22 07:24:24 +04:00
2010-04-22 17:42:34 +04:00
if ( ! new_r )
return_NULL ;
2010-04-22 07:24:24 +04:00
2010-04-22 17:42:34 +04:00
memcpy ( left_cat , RIGHT ( left_cat ) , sizeof ( * left_cat ) ) ;
memcpy ( right_cat , RIGHT ( right_cat ) , sizeof ( * right_cat ) ) ;
2010-04-22 07:24:24 +04:00
return new_r ;
}
2010-04-21 02:31:22 +04:00
static struct rx_node * _pass ( struct dm_pool * mem ,
struct rx_node * r ,
int * changed )
{
2010-04-22 17:18:27 +04:00
struct rx_node * left , * right ;
2010-04-21 02:31:22 +04:00
/*
* walk the tree , optimising every ' or ' node .
*/
switch ( r - > type ) {
case CAT :
if ( ! ( r - > left = _pass ( mem , r - > left , changed ) ) )
return_NULL ;
if ( ! ( r - > right = _pass ( mem , r - > right , changed ) ) )
return_NULL ;
break ;
case STAR :
case PLUS :
case QUEST :
if ( ! ( r - > left = _pass ( mem , r - > left , changed ) ) )
return_NULL ;
break ;
case OR :
/* It's important we optimise sub nodes first */
if ( ! ( r - > left = _pass ( mem , r - > left , changed ) ) )
return_NULL ;
if ( ! ( r - > right = _pass ( mem , r - > right , changed ) ) )
return_NULL ;
2010-04-22 17:42:34 +04:00
/*
* If rotate_ors changes the tree , left and right are stale ,
* so just set ' changed ' to repeat the search .
*
* FIXME Check we can ' t ' bounce ' between left and right rotations here .
*/
2010-04-22 17:18:27 +04:00
if ( _find_leftmost_common ( r , & left , & right , 1 ) ) {
2010-04-22 17:42:34 +04:00
if ( ! _rotate_ors ( r , 1 ) )
2010-04-22 07:24:24 +04:00
r = _exchange_nodes ( mem , r , left , right , 1 ) ;
2010-04-22 17:18:27 +04:00
* changed = 1 ;
} else if ( _find_leftmost_common ( r , & left , & right , 0 ) ) {
2010-04-22 17:42:34 +04:00
if ( ! _rotate_ors ( r , 0 ) )
r = _exchange_nodes ( mem , r , left , right , 0 ) ;
2010-04-22 17:18:27 +04:00
* changed = 1 ;
2010-04-21 02:31:22 +04:00
}
break ;
case CHARSET :
break ;
}
return r ;
}
/*
 * Repeat _pass() over the tree until a pass changes nothing or fails.
 *
 * We're looking for (or (... (cat <foo> a)) (... (cat <foo> b)))
 * and want to turn it into (cat <foo> (or (... a) (... b)))
 *
 * (fa)|(fb) becomes f(a|b)
 *
 * Initially done as an inefficient multipass algorithm.
 *
 * Returns the root of the optimised tree, or NULL if a pass failed.
 */
static struct rx_node *_optimise(struct dm_pool *mem, struct rx_node *r)
{
	int changed;

	for (;;) {
		changed = 0;
		r = _pass(mem, r, &changed);

		if (!r || !changed)
			break;
	}

	return r;
}
/*----------------------------------------------------------------*/
2007-04-27 22:40:23 +04:00
struct rx_node * rx_parse_tok ( struct dm_pool * mem ,
const char * begin , const char * end )
{
struct rx_node * r ;
struct parse_sp * ps = dm_pool_zalloc ( mem , sizeof ( * ps ) ) ;
2010-04-21 02:31:22 +04:00
if ( ! ps )
return_NULL ;
2007-04-27 22:40:23 +04:00
ps - > mem = mem ;
2010-04-21 02:31:22 +04:00
if ( ! ( ps - > charset = dm_bitset_create ( mem , 256 ) ) ) {
log_error ( " Regex charset allocation failed " ) ;
dm_pool_free ( mem , ps ) ;
return NULL ;
}
2007-04-27 22:40:23 +04:00
ps - > cursor = begin ;
ps - > rx_end = end ;
_rx_get_token ( ps ) ; /* load the first token */
if ( ! ( r = _or_term ( ps ) ) ) {
log_error ( " Parse error in regex " ) ;
dm_pool_free ( mem , ps ) ;
2010-04-21 02:31:22 +04:00
return NULL ;
}
if ( ! ( r = _optimise ( mem , r ) ) ) {
log_error ( " Regex optimisation error " ) ;
dm_pool_free ( mem , ps ) ;
return NULL ;
2007-04-27 22:40:23 +04:00
}
return r ;
}
/*
 * Convenience wrapper around rx_parse_tok() for a NUL-terminated string:
 * the expression runs from 'str' up to, but not including, the terminator.
 */
struct rx_node *rx_parse_str(struct dm_pool *mem, const char *str)
{
	const char *end = str + strlen(str);

	return rx_parse_tok(mem, str, end);
}