mirror of
git://sourceware.org/git/lvm2.git
synced 2025-01-06 17:18:29 +03:00
535 lines
14 KiB
C
535 lines
14 KiB
C
/*
|
|
* Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
|
|
* Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
|
|
*
|
|
* This file is part of the device-mapper userspace tools.
|
|
*
|
|
* This copyrighted material is made available to anyone wishing to use,
|
|
* modify, copy, or redistribute it subject to the terms and conditions
|
|
* of the GNU Lesser General Public License v.2.1.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* along with this program; if not, write to the Free Software Foundation,
|
|
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
*/
|
|
|
|
#include "dmlib.h"
|
|
#include "parse_rx.h"
|
|
#include "ttree.h"
|
|
#include "assert.h"
|
|
|
|
struct dfa_state {
|
|
struct dfa_state *next;
|
|
int final;
|
|
dm_bitset_t bits;
|
|
struct dfa_state *lookup[256];
|
|
};
|
|
|
|
struct dm_regex { /* Instance variables for the lexer */
|
|
struct dfa_state *start;
|
|
unsigned num_nodes;
|
|
unsigned num_charsets;
|
|
int nodes_entered;
|
|
struct rx_node **nodes;
|
|
int charsets_entered;
|
|
struct rx_node **charsets;
|
|
struct dm_pool *scratch, *mem;
|
|
|
|
/* stuff for on the fly dfa calculation */
|
|
dm_bitset_t charmap[256];
|
|
dm_bitset_t dfa_copy;
|
|
struct ttree *tt;
|
|
dm_bitset_t bs;
|
|
struct dfa_state *h, *t;
|
|
};
|
|
|
|
static int _count_nodes(struct rx_node *rx)
|
|
{
|
|
int r = 1;
|
|
|
|
if (rx->left)
|
|
r += _count_nodes(rx->left);
|
|
|
|
if (rx->right)
|
|
r += _count_nodes(rx->right);
|
|
|
|
return r;
|
|
}
|
|
|
|
static unsigned _count_charsets(struct rx_node *rx)
|
|
{
|
|
if (rx->type == CHARSET)
|
|
return 1;
|
|
|
|
return (rx->left ? _count_charsets(rx->left) : 0) +
|
|
(rx->right ? _count_charsets(rx->right) : 0);
|
|
}
|
|
|
|
static void _enumerate_charsets_internal(struct rx_node *rx, unsigned *i)
|
|
{
|
|
if (rx->type == CHARSET)
|
|
rx->charset_index = (*i)++;
|
|
else {
|
|
if (rx->left)
|
|
_enumerate_charsets_internal(rx->left, i);
|
|
if (rx->right)
|
|
_enumerate_charsets_internal(rx->right, i);
|
|
}
|
|
}
|
|
|
|
static void _enumerate_charsets(struct rx_node *rx)
|
|
{
|
|
unsigned i = 0;
|
|
_enumerate_charsets_internal(rx, &i);
|
|
}
|
|
|
|
static void _fill_table(struct dm_regex *m, struct rx_node *rx)
|
|
{
|
|
assert((rx->type != OR) || (rx->left && rx->right));
|
|
|
|
if (rx->left)
|
|
_fill_table(m, rx->left);
|
|
|
|
if (rx->right)
|
|
_fill_table(m, rx->right);
|
|
|
|
m->nodes[m->nodes_entered++] = rx;
|
|
if (rx->type == CHARSET)
|
|
m->charsets[m->charsets_entered++] = rx;
|
|
}
|
|
|
|
static void _create_bitsets(struct dm_regex *m)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < m->num_nodes; i++) {
|
|
struct rx_node *n = m->nodes[i];
|
|
n->firstpos = dm_bitset_create(m->scratch, m->num_charsets);
|
|
n->lastpos = dm_bitset_create(m->scratch, m->num_charsets);
|
|
n->followpos = dm_bitset_create(m->scratch, m->num_charsets);
|
|
}
|
|
}
|
|
|
|
static void _calc_functions(struct dm_regex *m)
|
|
{
|
|
int i, j, final = 1;
|
|
struct rx_node *rx, *c1, *c2;
|
|
|
|
for (i = 0; i < m->num_nodes; i++) {
|
|
rx = m->nodes[i];
|
|
c1 = rx->left;
|
|
c2 = rx->right;
|
|
|
|
if (rx->type == CHARSET && dm_bit(rx->charset, TARGET_TRANS))
|
|
rx->final = final++;
|
|
|
|
switch (rx->type) {
|
|
case CAT:
|
|
if (c1->nullable)
|
|
dm_bit_union(rx->firstpos,
|
|
c1->firstpos, c2->firstpos);
|
|
else
|
|
dm_bit_copy(rx->firstpos, c1->firstpos);
|
|
|
|
if (c2->nullable)
|
|
dm_bit_union(rx->lastpos,
|
|
c1->lastpos, c2->lastpos);
|
|
else
|
|
dm_bit_copy(rx->lastpos, c2->lastpos);
|
|
|
|
rx->nullable = c1->nullable && c2->nullable;
|
|
break;
|
|
|
|
case PLUS:
|
|
dm_bit_copy(rx->firstpos, c1->firstpos);
|
|
dm_bit_copy(rx->lastpos, c1->lastpos);
|
|
rx->nullable = c1->nullable;
|
|
break;
|
|
|
|
case OR:
|
|
dm_bit_union(rx->firstpos, c1->firstpos, c2->firstpos);
|
|
dm_bit_union(rx->lastpos, c1->lastpos, c2->lastpos);
|
|
rx->nullable = c1->nullable || c2->nullable;
|
|
break;
|
|
|
|
case QUEST:
|
|
case STAR:
|
|
dm_bit_copy(rx->firstpos, c1->firstpos);
|
|
dm_bit_copy(rx->lastpos, c1->lastpos);
|
|
rx->nullable = 1;
|
|
break;
|
|
|
|
case CHARSET:
|
|
dm_bit_set(rx->firstpos, rx->charset_index);
|
|
dm_bit_set(rx->lastpos, rx->charset_index);
|
|
rx->nullable = 0;
|
|
break;
|
|
|
|
default:
|
|
log_error(INTERNAL_ERROR "Unknown calc node type");
|
|
}
|
|
|
|
/*
|
|
* followpos has it's own switch
|
|
* because PLUS and STAR do the
|
|
* same thing.
|
|
*/
|
|
switch (rx->type) {
|
|
case CAT:
|
|
for (j = 0; j < m->num_charsets; j++) {
|
|
struct rx_node *n = m->charsets[j];
|
|
if (dm_bit(c1->lastpos, j))
|
|
dm_bit_union(n->followpos,
|
|
n->followpos, c2->firstpos);
|
|
}
|
|
break;
|
|
|
|
case PLUS:
|
|
case STAR:
|
|
for (j = 0; j < m->num_charsets; j++) {
|
|
struct rx_node *n = m->charsets[j];
|
|
if (dm_bit(rx->lastpos, j))
|
|
dm_bit_union(n->followpos,
|
|
n->followpos, rx->firstpos);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
static struct dfa_state *_create_dfa_state(struct dm_pool *mem)
|
|
{
|
|
return dm_pool_zalloc(mem, sizeof(struct dfa_state));
|
|
}
|
|
|
|
static struct dfa_state *_create_state_queue(struct dm_pool *mem,
|
|
struct dfa_state *dfa,
|
|
dm_bitset_t bits)
|
|
{
|
|
dfa->bits = dm_bitset_create(mem, bits[0]); /* first element is the size */
|
|
dm_bit_copy(dfa->bits, bits);
|
|
dfa->next = 0;
|
|
dfa->final = -1;
|
|
return dfa;
|
|
}
|
|
|
|
static void _calc_state(struct dm_regex *m, struct dfa_state *dfa, int a)
|
|
{
|
|
int set_bits = 0, i;
|
|
dm_bitset_t dfa_bits = dfa->bits;
|
|
dm_bit_and(m->dfa_copy, m->charmap[a], dfa_bits);
|
|
|
|
/* iterate through all the states in firstpos */
|
|
for (i = dm_bit_get_first(m->dfa_copy); i >= 0; i = dm_bit_get_next(m->dfa_copy, i)) {
|
|
if (a == TARGET_TRANS)
|
|
dfa->final = m->charsets[i]->final;
|
|
|
|
dm_bit_union(m->bs, m->bs, m->charsets[i]->followpos);
|
|
set_bits = 1;
|
|
}
|
|
|
|
if (set_bits) {
|
|
struct dfa_state *tmp;
|
|
struct dfa_state *ldfa = ttree_lookup(m->tt, m->bs + 1);
|
|
if (!ldfa) {
|
|
/* push */
|
|
ldfa = _create_dfa_state(m->mem);
|
|
ttree_insert(m->tt, m->bs + 1, ldfa);
|
|
tmp = _create_state_queue(m->scratch, ldfa, m->bs);
|
|
if (!m->h)
|
|
m->h = m->t = tmp;
|
|
else {
|
|
m->t->next = tmp;
|
|
m->t = tmp;
|
|
}
|
|
}
|
|
|
|
dfa->lookup[a] = ldfa;
|
|
dm_bit_clear_all(m->bs);
|
|
}
|
|
}
|
|
|
|
static int _calc_states(struct dm_regex *m, struct rx_node *rx)
|
|
{
|
|
unsigned iwidth = (m->num_charsets / DM_BITS_PER_INT) + 1;
|
|
struct dfa_state *dfa;
|
|
int i, a;
|
|
|
|
m->tt = ttree_create(m->scratch, iwidth);
|
|
if (!m->tt)
|
|
return_0;
|
|
|
|
if (!(m->bs = dm_bitset_create(m->scratch, m->num_charsets)))
|
|
return_0;
|
|
|
|
/* build some char maps */
|
|
for (a = 0; a < 256; a++) {
|
|
m->charmap[a] = dm_bitset_create(m->scratch, m->num_charsets);
|
|
if (!m->charmap[a])
|
|
return_0;
|
|
}
|
|
|
|
for (i = 0; i < m->num_nodes; i++) {
|
|
struct rx_node *n = m->nodes[i];
|
|
if (n->type == CHARSET) {
|
|
for (a = dm_bit_get_first(n->charset);
|
|
a >= 0; a = dm_bit_get_next(n->charset, a))
|
|
dm_bit_set(m->charmap[a], n->charset_index);
|
|
}
|
|
}
|
|
|
|
/* create first state */
|
|
dfa = _create_dfa_state(m->mem);
|
|
m->start = dfa;
|
|
ttree_insert(m->tt, rx->firstpos + 1, dfa);
|
|
|
|
/* prime the queue */
|
|
m->h = m->t = _create_state_queue(m->scratch, dfa, rx->firstpos);
|
|
m->dfa_copy = dm_bitset_create(m->scratch, m->num_charsets);
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* Forces all the dfa states to be calculated up front, ie. what
|
|
* _calc_states() used to do before we switched to calculating on demand.
|
|
*/
|
|
static void _force_states(struct dm_regex *m)
|
|
{
|
|
int a;
|
|
|
|
/* keep processing until there's nothing in the queue */
|
|
struct dfa_state *s;
|
|
while ((s = m->h)) {
|
|
/* pop state off front of the queue */
|
|
m->h = m->h->next;
|
|
|
|
/* iterate through all the inputs for this state */
|
|
dm_bit_clear_all(m->bs);
|
|
for (a = 0; a < 256; a++)
|
|
_calc_state(m, s, a);
|
|
}
|
|
}
|
|
|
|
struct dm_regex *dm_regex_create(struct dm_pool *mem, const char **patterns,
|
|
unsigned num_patterns)
|
|
{
|
|
char *all, *ptr;
|
|
int i;
|
|
size_t len = 0;
|
|
struct rx_node *rx;
|
|
struct dm_regex *m;
|
|
struct dm_pool *scratch = mem;
|
|
|
|
if (!(m = dm_pool_zalloc(mem, sizeof(*m))))
|
|
return_NULL;
|
|
|
|
/* join the regexps together, delimiting with zero */
|
|
for (i = 0; i < num_patterns; i++)
|
|
len += strlen(patterns[i]) + 8;
|
|
|
|
ptr = all = dm_pool_alloc(scratch, len + 1);
|
|
|
|
if (!all)
|
|
goto_bad;
|
|
|
|
for (i = 0; i < num_patterns; i++) {
|
|
ptr += sprintf(ptr, "(.*(%s)%c)", patterns[i], TARGET_TRANS);
|
|
if (i < (num_patterns - 1))
|
|
*ptr++ = '|';
|
|
}
|
|
|
|
/* parse this expression */
|
|
if (!(rx = rx_parse_tok(scratch, all, ptr))) {
|
|
log_error("Couldn't parse regex");
|
|
goto bad;
|
|
}
|
|
|
|
m->mem = mem;
|
|
m->scratch = scratch;
|
|
m->num_nodes = _count_nodes(rx);
|
|
m->num_charsets = _count_charsets(rx);
|
|
_enumerate_charsets(rx);
|
|
m->nodes = dm_pool_alloc(scratch, sizeof(*m->nodes) * m->num_nodes);
|
|
if (!m->nodes)
|
|
goto_bad;
|
|
|
|
m->charsets = dm_pool_alloc(scratch, sizeof(*m->charsets) * m->num_charsets);
|
|
if (!m->charsets)
|
|
goto_bad;
|
|
|
|
_fill_table(m, rx);
|
|
_create_bitsets(m);
|
|
_calc_functions(m);
|
|
_calc_states(m, rx);
|
|
return m;
|
|
|
|
bad:
|
|
dm_pool_free(mem, m);
|
|
return NULL;
|
|
}
|
|
|
|
static struct dfa_state *_step_matcher(struct dm_regex *m, int c, struct dfa_state *cs, int *r)
|
|
{
|
|
struct dfa_state *ns;
|
|
|
|
if (!(ns = cs->lookup[(unsigned char) c]))
|
|
_calc_state(m, cs, (unsigned char) c);
|
|
|
|
if (!(ns = cs->lookup[(unsigned char) c]))
|
|
return NULL;
|
|
|
|
// yuck, we have to special case the target trans
|
|
if (ns->final == -1)
|
|
_calc_state(m, ns, TARGET_TRANS);
|
|
|
|
if (ns->final && (ns->final > *r))
|
|
*r = ns->final;
|
|
|
|
return ns;
|
|
}
|
|
|
|
int dm_regex_match(struct dm_regex *regex, const char *s)
|
|
{
|
|
struct dfa_state *cs = regex->start;
|
|
int r = 0;
|
|
|
|
dm_bit_clear_all(regex->bs);
|
|
if (!(cs = _step_matcher(regex, HAT_CHAR, cs, &r)))
|
|
goto out;
|
|
|
|
for (; *s; s++)
|
|
if (!(cs = _step_matcher(regex, *s, cs, &r)))
|
|
goto out;
|
|
|
|
_step_matcher(regex, DOLLAR_CHAR, cs, &r);
|
|
|
|
out:
|
|
/* subtract 1 to get back to zero index */
|
|
return r - 1;
|
|
}
|
|
|
|
/*
|
|
* The next block of code concerns calculating a fingerprint for the dfa.
|
|
*
|
|
* We're not calculating a minimal dfa in _calculate_state (maybe a future
|
|
* improvement). As such it's possible that two non-isomorphic dfas
|
|
* recognise the same language. This can only really happen if you start
|
|
* with equivalent, but different regexes (for example the simplifier in
|
|
* parse_rx.c may have changed).
|
|
*
|
|
* The code is inefficient; repeatedly searching a singly linked list for
|
|
* previously seen nodes. Not worried since this is test code.
|
|
*/
|
|
struct node_list {
|
|
unsigned node_id;
|
|
struct dfa_state *node;
|
|
struct node_list *next;
|
|
};
|
|
|
|
struct printer {
|
|
struct dm_pool *mem;
|
|
struct node_list *pending;
|
|
struct node_list *processed;
|
|
unsigned next_index;
|
|
};
|
|
|
|
static uint32_t randomise_(uint32_t n)
|
|
{
|
|
/* 2^32 - 5 */
|
|
uint32_t const prime = (~0) - 4;
|
|
return n * prime;
|
|
}
|
|
|
|
static int seen_(struct node_list *n, struct dfa_state *node, uint32_t *i)
|
|
{
|
|
while (n) {
|
|
if (n->node == node) {
|
|
*i = n->node_id;
|
|
return 1;
|
|
}
|
|
n = n->next;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Push node if it's not been seen before, returning a unique index.
|
|
*/
|
|
static uint32_t push_node_(struct printer *p, struct dfa_state *node)
|
|
{
|
|
uint32_t i;
|
|
if (seen_(p->pending, node, &i) ||
|
|
seen_(p->processed, node, &i))
|
|
return i;
|
|
else {
|
|
struct node_list *n = dm_pool_alloc(p->mem, sizeof(*n));
|
|
assert(n);
|
|
n->node_id = p->next_index++;
|
|
n->node = node;
|
|
n->next = p->pending;
|
|
p->pending = n;
|
|
return n->node_id;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Pop the front node, and fill out it's previously assigned index.
|
|
*/
|
|
static struct dfa_state *pop_node_(struct printer *p)
|
|
{
|
|
struct dfa_state *node = NULL;
|
|
|
|
if (p->pending) {
|
|
struct node_list *n = p->pending;
|
|
p->pending = n->next;
|
|
n->next = p->processed;
|
|
p->processed = n;
|
|
|
|
node = n->node;
|
|
}
|
|
|
|
return node;
|
|
}
|
|
|
|
static uint32_t combine_(uint32_t n1, uint32_t n2)
|
|
{
|
|
return ((n1 << 8) | (n1 >> 24)) ^ randomise_(n2);
|
|
}
|
|
|
|
static uint32_t fingerprint_(struct printer *p)
|
|
{
|
|
int c;
|
|
uint32_t result = 0;
|
|
struct dfa_state *node;
|
|
|
|
while ((node = pop_node_(p))) {
|
|
result = combine_(result, node->final < 0 ? 0 : node->final);
|
|
for (c = 0; c < 256; c++)
|
|
result = combine_(result,
|
|
push_node_(p, node->lookup[c]));
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
uint32_t dm_regex_fingerprint(struct dm_regex *regex)
|
|
{
|
|
uint32_t result;
|
|
struct printer p;
|
|
struct dm_pool *mem = dm_pool_create("regex fingerprint", 1024);
|
|
|
|
_force_states(regex);
|
|
|
|
assert(mem);
|
|
p.mem = mem;
|
|
p.pending = NULL;
|
|
p.processed = NULL;
|
|
p.next_index = 0;
|
|
|
|
push_node_(&p, regex->start);
|
|
result = fingerprint_(&p);
|
|
dm_pool_destroy(mem);
|
|
return result;
|
|
}
|