2007-04-27 22:40:23 +04:00
/*
2010-07-21 15:58:49 +04:00
* Copyright ( C ) 2001 - 2004 Sistina Software , Inc . All rights reserved .
2007-04-27 22:40:23 +04:00
* Copyright ( C ) 2004 - 2007 Red Hat , Inc . All rights reserved .
*
* This file is part of the device - mapper userspace tools .
*
* This copyrighted material is made available to anyone wishing to use ,
* modify , copy , or redistribute it subject to the terms and conditions
2007-08-21 20:26:07 +04:00
* of the GNU Lesser General Public License v .2 .1 .
2007-04-27 22:40:23 +04:00
*
2007-08-21 20:26:07 +04:00
* You should have received a copy of the GNU Lesser General Public License
2007-04-27 22:40:23 +04:00
* along with this program ; if not , write to the Free Software Foundation ,
* Inc . , 59 Temple Place , Suite 330 , Boston , MA 02111 - 1307 USA
*/
2008-11-03 21:59:59 +03:00
# include "dmlib.h"
2007-04-27 22:40:23 +04:00
# include "parse_rx.h"
# include "ttree.h"
# include "assert.h"
struct dfa_state {
2010-07-21 16:02:51 +04:00
struct dfa_state * next ;
2007-04-27 22:40:23 +04:00
int final ;
dm_bitset_t bits ;
2010-07-21 16:02:51 +04:00
struct dfa_state * lookup [ 256 ] ;
2007-04-27 22:40:23 +04:00
} ;
struct dm_regex { /* Instance variables for the lexer */
struct dfa_state * start ;
unsigned num_nodes ;
2010-07-21 15:58:49 +04:00
unsigned num_charsets ;
2007-04-27 22:40:23 +04:00
int nodes_entered ;
struct rx_node * * nodes ;
2010-07-21 15:58:49 +04:00
int charsets_entered ;
struct rx_node * * charsets ;
2007-04-27 22:40:23 +04:00
struct dm_pool * scratch , * mem ;
2010-07-21 16:00:53 +04:00
/* stuff for on the fly dfa calculation */
dm_bitset_t charmap [ 256 ] ;
dm_bitset_t dfa_copy ;
struct ttree * tt ;
dm_bitset_t bs ;
2010-07-21 16:02:51 +04:00
struct dfa_state * h , * t ;
2007-04-27 22:40:23 +04:00
} ;
static int _count_nodes ( struct rx_node * rx )
{
int r = 1 ;
if ( rx - > left )
r + = _count_nodes ( rx - > left ) ;
if ( rx - > right )
r + = _count_nodes ( rx - > right ) ;
return r ;
}
2010-07-21 15:58:49 +04:00
static unsigned _count_charsets ( struct rx_node * rx )
{
if ( rx - > type = = CHARSET )
return 1 ;
return ( rx - > left ? _count_charsets ( rx - > left ) : 0 ) +
( rx - > right ? _count_charsets ( rx - > right ) : 0 ) ;
}
static void _enumerate_charsets_internal ( struct rx_node * rx , unsigned * i )
{
if ( rx - > type = = CHARSET )
rx - > charset_index = ( * i ) + + ;
else {
if ( rx - > left )
_enumerate_charsets_internal ( rx - > left , i ) ;
if ( rx - > right )
_enumerate_charsets_internal ( rx - > right , i ) ;
}
}
/*
 * Assign consecutive charset_index values, starting at 0, to every
 * CHARSET node reachable from 'rx'.
 */
static void _enumerate_charsets(struct rx_node *rx)
{
	unsigned next_index = 0;

	_enumerate_charsets_internal(rx, &next_index);
}
2007-04-27 22:40:23 +04:00
static void _fill_table ( struct dm_regex * m , struct rx_node * rx )
{
assert ( ( rx - > type ! = OR ) | | ( rx - > left & & rx - > right ) ) ;
if ( rx - > left )
_fill_table ( m , rx - > left ) ;
if ( rx - > right )
_fill_table ( m , rx - > right ) ;
m - > nodes [ m - > nodes_entered + + ] = rx ;
2010-07-21 15:58:49 +04:00
if ( rx - > type = = CHARSET )
m - > charsets [ m - > charsets_entered + + ] = rx ;
2007-04-27 22:40:23 +04:00
}
static void _create_bitsets ( struct dm_regex * m )
{
int i ;
for ( i = 0 ; i < m - > num_nodes ; i + + ) {
struct rx_node * n = m - > nodes [ i ] ;
2010-07-21 15:58:49 +04:00
n - > firstpos = dm_bitset_create ( m - > scratch , m - > num_charsets ) ;
n - > lastpos = dm_bitset_create ( m - > scratch , m - > num_charsets ) ;
n - > followpos = dm_bitset_create ( m - > scratch , m - > num_charsets ) ;
2007-04-27 22:40:23 +04:00
}
}
static void _calc_functions ( struct dm_regex * m )
{
int i , j , final = 1 ;
struct rx_node * rx , * c1 , * c2 ;
for ( i = 0 ; i < m - > num_nodes ; i + + ) {
rx = m - > nodes [ i ] ;
c1 = rx - > left ;
c2 = rx - > right ;
2010-07-21 15:58:49 +04:00
if ( rx - > type = = CHARSET & & dm_bit ( rx - > charset , TARGET_TRANS ) )
2007-04-27 22:40:23 +04:00
rx - > final = final + + ;
switch ( rx - > type ) {
case CAT :
if ( c1 - > nullable )
dm_bit_union ( rx - > firstpos ,
c1 - > firstpos , c2 - > firstpos ) ;
else
dm_bit_copy ( rx - > firstpos , c1 - > firstpos ) ;
if ( c2 - > nullable )
dm_bit_union ( rx - > lastpos ,
c1 - > lastpos , c2 - > lastpos ) ;
else
dm_bit_copy ( rx - > lastpos , c2 - > lastpos ) ;
rx - > nullable = c1 - > nullable & & c2 - > nullable ;
break ;
case PLUS :
dm_bit_copy ( rx - > firstpos , c1 - > firstpos ) ;
dm_bit_copy ( rx - > lastpos , c1 - > lastpos ) ;
rx - > nullable = c1 - > nullable ;
break ;
case OR :
dm_bit_union ( rx - > firstpos , c1 - > firstpos , c2 - > firstpos ) ;
dm_bit_union ( rx - > lastpos , c1 - > lastpos , c2 - > lastpos ) ;
rx - > nullable = c1 - > nullable | | c2 - > nullable ;
break ;
case QUEST :
case STAR :
dm_bit_copy ( rx - > firstpos , c1 - > firstpos ) ;
dm_bit_copy ( rx - > lastpos , c1 - > lastpos ) ;
rx - > nullable = 1 ;
break ;
case CHARSET :
2010-07-21 15:58:49 +04:00
dm_bit_set ( rx - > firstpos , rx - > charset_index ) ;
dm_bit_set ( rx - > lastpos , rx - > charset_index ) ;
2007-04-27 22:40:23 +04:00
rx - > nullable = 0 ;
break ;
default :
2010-03-25 21:22:04 +03:00
log_error ( INTERNAL_ERROR " Unknown calc node type " ) ;
2007-04-27 22:40:23 +04:00
}
/*
* followpos has it ' s own switch
* because PLUS and STAR do the
* same thing .
*/
switch ( rx - > type ) {
case CAT :
2010-07-21 15:58:49 +04:00
for ( j = 0 ; j < m - > num_charsets ; j + + ) {
struct rx_node * n = m - > charsets [ j ] ;
if ( dm_bit ( c1 - > lastpos , j ) )
2007-04-27 22:40:23 +04:00
dm_bit_union ( n - > followpos ,
2010-07-21 15:58:49 +04:00
n - > followpos , c2 - > firstpos ) ;
2007-04-27 22:40:23 +04:00
}
break ;
case PLUS :
case STAR :
2010-07-21 15:58:49 +04:00
for ( j = 0 ; j < m - > num_charsets ; j + + ) {
struct rx_node * n = m - > charsets [ j ] ;
if ( dm_bit ( rx - > lastpos , j ) )
2007-04-27 22:40:23 +04:00
dm_bit_union ( n - > followpos ,
2010-07-21 15:58:49 +04:00
n - > followpos , rx - > firstpos ) ;
2007-04-27 22:40:23 +04:00
}
break ;
}
}
}
/*
 * Allocate a zero-filled dfa state from 'mem'.  Returns whatever
 * dm_pool_zalloc() returns.
 */
static struct dfa_state *_create_dfa_state(struct dm_pool *mem)
{
	struct dfa_state *fresh = dm_pool_zalloc(mem, sizeof(*fresh));

	return fresh;
}
2010-07-21 16:02:51 +04:00
static struct dfa_state * _create_state_queue ( struct dm_pool * mem ,
struct dfa_state * dfa ,
dm_bitset_t bits )
2007-04-27 22:40:23 +04:00
{
2010-07-21 16:02:51 +04:00
dfa - > bits = dm_bitset_create ( mem , bits [ 0 ] ) ; /* first element is the size */
dm_bit_copy ( dfa - > bits , bits ) ;
dfa - > next = 0 ;
2010-07-21 16:09:12 +04:00
dfa - > final = - 1 ;
2010-07-21 16:02:51 +04:00
return dfa ;
2007-04-27 22:40:23 +04:00
}
2010-07-21 16:02:51 +04:00
static void _calc_state ( struct dm_regex * m , struct dfa_state * dfa , int a )
2010-07-21 16:00:53 +04:00
{
int set_bits = 0 , i ;
2010-07-21 16:02:51 +04:00
dm_bitset_t dfa_bits = dfa - > bits ;
2010-07-21 16:00:53 +04:00
dm_bit_and ( m - > dfa_copy , m - > charmap [ a ] , dfa_bits ) ;
/* iterate through all the states in firstpos */
for ( i = dm_bit_get_first ( m - > dfa_copy ) ; i > = 0 ; i = dm_bit_get_next ( m - > dfa_copy , i ) ) {
if ( a = = TARGET_TRANS )
dfa - > final = m - > charsets [ i ] - > final ;
dm_bit_union ( m - > bs , m - > bs , m - > charsets [ i ] - > followpos ) ;
set_bits = 1 ;
}
2010-07-21 16:09:12 +04:00
if ( set_bits ) {
2010-07-21 16:02:51 +04:00
struct dfa_state * tmp ;
2010-07-21 16:00:53 +04:00
struct dfa_state * ldfa = ttree_lookup ( m - > tt , m - > bs + 1 ) ;
if ( ! ldfa ) {
/* push */
ldfa = _create_dfa_state ( m - > mem ) ;
ttree_insert ( m - > tt , m - > bs + 1 , ldfa ) ;
tmp = _create_state_queue ( m - > scratch , ldfa , m - > bs ) ;
if ( ! m - > h )
m - > h = m - > t = tmp ;
else {
m - > t - > next = tmp ;
m - > t = tmp ;
}
}
dfa - > lookup [ a ] = ldfa ;
dm_bit_clear_all ( m - > bs ) ;
}
}
2007-04-27 22:40:23 +04:00
static int _calc_states ( struct dm_regex * m , struct rx_node * rx )
{
2010-07-21 15:58:49 +04:00
unsigned iwidth = ( m - > num_charsets / DM_BITS_PER_INT ) + 1 ;
2010-07-21 16:00:53 +04:00
struct dfa_state * dfa ;
int i , a ;
2007-04-27 22:40:23 +04:00
2010-07-21 16:00:53 +04:00
m - > tt = ttree_create ( m - > scratch , iwidth ) ;
if ( ! m - > tt )
2007-04-27 22:40:23 +04:00
return_0 ;
2010-07-21 16:00:53 +04:00
if ( ! ( m - > bs = dm_bitset_create ( m - > scratch , m - > num_charsets ) ) )
2007-04-27 22:40:23 +04:00
return_0 ;
2010-07-21 15:58:49 +04:00
/* build some char maps */
for ( a = 0 ; a < 256 ; a + + ) {
2010-07-21 16:00:53 +04:00
m - > charmap [ a ] = dm_bitset_create ( m - > scratch , m - > num_charsets ) ;
if ( ! m - > charmap [ a ] )
2010-07-21 15:58:49 +04:00
return_0 ;
}
for ( i = 0 ; i < m - > num_nodes ; i + + ) {
struct rx_node * n = m - > nodes [ i ] ;
if ( n - > type = = CHARSET ) {
for ( a = dm_bit_get_first ( n - > charset ) ;
a > = 0 ; a = dm_bit_get_next ( n - > charset , a ) )
2010-07-21 16:00:53 +04:00
dm_bit_set ( m - > charmap [ a ] , n - > charset_index ) ;
2010-07-21 15:58:49 +04:00
}
}
2007-04-27 22:40:23 +04:00
/* create first state */
dfa = _create_dfa_state ( m - > mem ) ;
m - > start = dfa ;
2010-07-21 16:00:53 +04:00
ttree_insert ( m - > tt , rx - > firstpos + 1 , dfa ) ;
2007-04-27 22:40:23 +04:00
/* prime the queue */
2010-07-21 16:00:53 +04:00
m - > h = m - > t = _create_state_queue ( m - > scratch , dfa , rx - > firstpos ) ;
m - > dfa_copy = dm_bitset_create ( m - > scratch , m - > num_charsets ) ;
2010-07-21 16:09:12 +04:00
return 1 ;
}
/*
* Forces all the dfa states to be calculated up front , ie . what
* _calc_states ( ) used to do before we switched to calculating on demand .
*/
static void _force_states ( struct dm_regex * m )
{
int a ;
2010-04-23 00:35:24 +04:00
2010-07-21 16:00:53 +04:00
/* keep processing until there's nothing in the queue */
2010-07-21 16:02:51 +04:00
struct dfa_state * s ;
2010-07-21 16:09:12 +04:00
while ( ( s = m - > h ) ) {
/* pop state off front of the queue */
m - > h = m - > h - > next ;
2007-04-27 22:40:23 +04:00
2010-07-21 16:09:12 +04:00
/* iterate through all the inputs for this state */
dm_bit_clear_all ( m - > bs ) ;
for ( a = 0 ; a < 256 ; a + + )
2010-07-21 16:00:53 +04:00
_calc_state ( m , s , a ) ;
2010-07-21 16:09:12 +04:00
}
2007-04-27 22:40:23 +04:00
}
struct dm_regex * dm_regex_create ( struct dm_pool * mem , const char * * patterns ,
unsigned num_patterns )
{
char * all , * ptr ;
int i ;
size_t len = 0 ;
struct rx_node * rx ;
struct dm_regex * m ;
2010-07-21 16:09:12 +04:00
struct dm_pool * scratch = mem ;
2007-04-27 22:40:23 +04:00
2010-10-01 01:06:50 +04:00
if ( ! ( m = dm_pool_zalloc ( mem , sizeof ( * m ) ) ) )
2007-04-27 22:40:23 +04:00
return_NULL ;
/* join the regexps together, delimiting with zero */
for ( i = 0 ; i < num_patterns ; i + + )
len + = strlen ( patterns [ i ] ) + 8 ;
ptr = all = dm_pool_alloc ( scratch , len + 1 ) ;
if ( ! all )
goto_bad ;
for ( i = 0 ; i < num_patterns ; i + + ) {
ptr + = sprintf ( ptr , " (.*(%s)%c) " , patterns [ i ] , TARGET_TRANS ) ;
if ( i < ( num_patterns - 1 ) )
* ptr + + = ' | ' ;
}
/* parse this expression */
if ( ! ( rx = rx_parse_tok ( scratch , all , ptr ) ) ) {
log_error ( " Couldn't parse regex " ) ;
goto bad ;
}
m - > mem = mem ;
m - > scratch = scratch ;
m - > num_nodes = _count_nodes ( rx ) ;
2010-07-21 15:58:49 +04:00
m - > num_charsets = _count_charsets ( rx ) ;
_enumerate_charsets ( rx ) ;
2007-04-27 22:40:23 +04:00
m - > nodes = dm_pool_alloc ( scratch , sizeof ( * m - > nodes ) * m - > num_nodes ) ;
if ( ! m - > nodes )
goto_bad ;
2010-07-21 15:58:49 +04:00
m - > charsets = dm_pool_alloc ( scratch , sizeof ( * m - > charsets ) * m - > num_charsets ) ;
if ( ! m - > charsets )
goto_bad ;
2007-04-27 22:40:23 +04:00
_fill_table ( m , rx ) ;
_create_bitsets ( m ) ;
_calc_functions ( m ) ;
_calc_states ( m , rx ) ;
return m ;
bad :
dm_pool_free ( mem , m ) ;
return NULL ;
}
2010-07-21 16:09:12 +04:00
static struct dfa_state * _step_matcher ( struct dm_regex * m , int c , struct dfa_state * cs , int * r )
2007-04-27 22:40:23 +04:00
{
2010-07-21 16:09:12 +04:00
struct dfa_state * ns ;
2010-11-29 17:25:13 +03:00
if ( ! ( ns = cs - > lookup [ ( unsigned char ) c ] ) ) {
_calc_state ( m , cs , ( unsigned char ) c ) ;
if ( ! ( ns = cs - > lookup [ ( unsigned char ) c ] ) )
return NULL ;
}
2007-04-27 22:40:23 +04:00
2010-07-21 16:09:12 +04:00
// yuck, we have to special case the target trans
if ( ns - > final = = - 1 )
_calc_state ( m , ns , TARGET_TRANS ) ;
2007-04-27 22:40:23 +04:00
2010-07-21 16:09:12 +04:00
if ( ns - > final & & ( ns - > final > * r ) )
* r = ns - > final ;
return ns ;
2007-04-27 22:40:23 +04:00
}
int dm_regex_match ( struct dm_regex * regex , const char * s )
{
struct dfa_state * cs = regex - > start ;
int r = 0 ;
2010-07-21 16:09:12 +04:00
dm_bit_clear_all ( regex - > bs ) ;
if ( ! ( cs = _step_matcher ( regex , HAT_CHAR , cs , & r ) ) )
2007-04-27 22:40:23 +04:00
goto out ;
for ( ; * s ; s + + )
2010-07-21 16:09:12 +04:00
if ( ! ( cs = _step_matcher ( regex , * s , cs , & r ) ) )
2007-04-27 22:40:23 +04:00
goto out ;
2010-07-21 16:09:12 +04:00
_step_matcher ( regex , DOLLAR_CHAR , cs , & r ) ;
2007-04-27 22:40:23 +04:00
out :
/* subtract 1 to get back to zero index */
return r - 1 ;
}
2010-07-20 19:32:07 +04:00
/*
 * The next block of code concerns calculating a fingerprint for the dfa.
 *
 * We're not calculating a minimal dfa in _calc_state() (maybe a future
 * improvement).  As such it's possible that two non-isomorphic dfas
 * recognise the same language.  This can only really happen if you start
 * with equivalent, but different regexes (for example the simplifier in
 * parse_rx.c may have changed).
 *
 * The code is inefficient; it repeatedly searches a singly linked list
 * for previously seen nodes.  Not worried since this is test code.
 */
/*
 * Singly linked list entry pairing a dfa state with the index assigned
 * to it during fingerprinting.
 */
struct node_list {
	unsigned node_id;		/* index handed out when the state was first pushed */
	struct dfa_state *node;		/* the dfa state itself */
	struct node_list *next;		/* next list entry */
};
/*
 * Working state of a fingerprint calculation.
 */
struct printer {
	struct dm_pool *mem;		/* pool the list entries are allocated from */
	struct node_list *pending;	/* states pushed but not yet popped */
	struct node_list *processed;	/* states already popped */
	unsigned next_index;		/* index to give to the next new state */
};
/*
 * Scramble 'n' by multiplying it with 2^32 - 5; the product wraps
 * modulo 2^32.
 */
static uint32_t randomise_(uint32_t n)
{
	/* 2^32 - 5 */
	const uint32_t multiplier = 0xFFFFFFFBu;

	return n * multiplier;
}
static int seen_ ( struct node_list * n , struct dfa_state * node , uint32_t * i )
{
while ( n ) {
if ( n - > node = = node ) {
* i = n - > node_id ;
return 1 ;
}
n = n - > next ;
}
return 0 ;
}
/*
* Push node if it ' s not been seen before , returning a unique index .
*/
static uint32_t push_node_ ( struct printer * p , struct dfa_state * node )
{
uint32_t i ;
if ( seen_ ( p - > pending , node , & i ) | |
seen_ ( p - > processed , node , & i ) )
return i ;
else {
struct node_list * n = dm_pool_alloc ( p - > mem , sizeof ( * n ) ) ;
assert ( n ) ;
n - > node_id = p - > next_index + + ;
n - > node = node ;
n - > next = p - > pending ;
p - > pending = n ;
return n - > node_id ;
}
}
/*
* Pop the front node , and fill out it ' s previously assigned index .
*/
static struct dfa_state * pop_node_ ( struct printer * p )
{
struct dfa_state * node = NULL ;
if ( p - > pending ) {
struct node_list * n = p - > pending ;
p - > pending = n - > next ;
n - > next = p - > processed ;
p - > processed = n ;
node = n - > node ;
}
return node ;
}
/*
 * Fold 'n2' into the running value 'n1': rotate 'n1' left by 8 bits and
 * xor it with the scrambled 'n2'.
 */
static uint32_t combine_(uint32_t n1, uint32_t n2)
{
	uint32_t rotated = (n1 << 8) | (n1 >> 24);
	uint32_t scrambled = randomise_(n2);

	return rotated ^ scrambled;
}
static uint32_t fingerprint_ ( struct printer * p )
{
int c ;
uint32_t result = 0 ;
struct dfa_state * node ;
while ( ( node = pop_node_ ( p ) ) ) {
2010-07-21 16:09:12 +04:00
result = combine_ ( result , node - > final < 0 ? 0 : node - > final ) ;
2010-07-20 19:32:07 +04:00
for ( c = 0 ; c < 256 ; c + + )
result = combine_ ( result ,
push_node_ ( p , node - > lookup [ c ] ) ) ;
}
return result ;
}
uint32_t dm_regex_fingerprint ( struct dm_regex * regex )
{
uint32_t result ;
struct printer p ;
struct dm_pool * mem = dm_pool_create ( " regex fingerprint " , 1024 ) ;
2010-07-21 16:09:12 +04:00
_force_states ( regex ) ;
2010-07-20 19:32:07 +04:00
assert ( mem ) ;
p . mem = mem ;
p . pending = NULL ;
p . processed = NULL ;
p . next_index = 0 ;
push_node_ ( & p , regex - > start ) ;
result = fingerprint_ ( & p ) ;
dm_pool_destroy ( mem ) ;
return result ;
}