1
0
mirror of git://sourceware.org/git/lvm2.git synced 2024-12-29 15:22:30 +03:00
lvm2/libdm/regex/matcher.c
Zdenek Kabelac fcbef05aae doc: change fsf address
Hmm rpmlint suggest fsf is using a different address these days,
so lets keep it up-to-date
2016-01-21 12:11:37 +01:00

576 lines
14 KiB
C

/*
* Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
* Copyright (C) 2004-2012 Red Hat, Inc. All rights reserved.
*
* This file is part of the device-mapper userspace tools.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
* of the GNU Lesser General Public License v.2.1.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "dmlib.h"
#include "parse_rx.h"
#include "ttree.h"
#include "assert.h"
struct dfa_state {
struct dfa_state *next;
int final;
dm_bitset_t bits;
struct dfa_state *lookup[256];
};
struct dm_regex { /* Instance variables for the lexer */
struct dfa_state *start;
unsigned num_nodes;
unsigned num_charsets;
int nodes_entered;
struct rx_node **nodes;
int charsets_entered;
struct rx_node **charsets;
struct dm_pool *scratch, *mem;
/* stuff for on the fly dfa calculation */
dm_bitset_t charmap[256];
dm_bitset_t dfa_copy;
struct ttree *tt;
dm_bitset_t bs;
struct dfa_state *h, *t;
};
static int _count_nodes(struct rx_node *rx)
{
int r = 1;
if (rx->left)
r += _count_nodes(rx->left);
if (rx->right)
r += _count_nodes(rx->right);
return r;
}
static unsigned _count_charsets(struct rx_node *rx)
{
if (rx->type == CHARSET)
return 1;
return (rx->left ? _count_charsets(rx->left) : 0) +
(rx->right ? _count_charsets(rx->right) : 0);
}
static void _enumerate_charsets_internal(struct rx_node *rx, unsigned *i)
{
if (rx->type == CHARSET)
rx->charset_index = (*i)++;
else {
if (rx->left)
_enumerate_charsets_internal(rx->left, i);
if (rx->right)
_enumerate_charsets_internal(rx->right, i);
}
}
static void _enumerate_charsets(struct rx_node *rx)
{
unsigned i = 0;
_enumerate_charsets_internal(rx, &i);
}
static void _fill_table(struct dm_regex *m, struct rx_node *rx)
{
assert((rx->type != OR) || (rx->left && rx->right));
if (rx->left)
_fill_table(m, rx->left);
if (rx->right)
_fill_table(m, rx->right);
m->nodes[m->nodes_entered++] = rx;
if (rx->type == CHARSET)
m->charsets[m->charsets_entered++] = rx;
}
static int _create_bitsets(struct dm_regex *m)
{
unsigned i;
struct rx_node *n;
for (i = 0; i < m->num_nodes; i++) {
n = m->nodes[i];
if (!(n->firstpos = dm_bitset_create(m->scratch, m->num_charsets)))
return_0;
if (!(n->lastpos = dm_bitset_create(m->scratch, m->num_charsets)))
return_0;
if (!(n->followpos = dm_bitset_create(m->scratch, m->num_charsets)))
return_0;
}
return 1;
}
static void _calc_functions(struct dm_regex *m)
{
unsigned i, j, final = 1;
struct rx_node *rx, *c1, *c2;
for (i = 0; i < m->num_nodes; i++) {
rx = m->nodes[i];
c1 = rx->left;
c2 = rx->right;
if (rx->type == CHARSET && dm_bit(rx->charset, TARGET_TRANS))
rx->final = final++;
switch (rx->type) {
case CAT:
if (c1->nullable)
dm_bit_union(rx->firstpos,
c1->firstpos, c2->firstpos);
else
dm_bit_copy(rx->firstpos, c1->firstpos);
if (c2->nullable)
dm_bit_union(rx->lastpos,
c1->lastpos, c2->lastpos);
else
dm_bit_copy(rx->lastpos, c2->lastpos);
rx->nullable = c1->nullable && c2->nullable;
break;
case PLUS:
dm_bit_copy(rx->firstpos, c1->firstpos);
dm_bit_copy(rx->lastpos, c1->lastpos);
rx->nullable = c1->nullable;
break;
case OR:
dm_bit_union(rx->firstpos, c1->firstpos, c2->firstpos);
dm_bit_union(rx->lastpos, c1->lastpos, c2->lastpos);
rx->nullable = c1->nullable || c2->nullable;
break;
case QUEST:
case STAR:
dm_bit_copy(rx->firstpos, c1->firstpos);
dm_bit_copy(rx->lastpos, c1->lastpos);
rx->nullable = 1;
break;
case CHARSET:
dm_bit_set(rx->firstpos, rx->charset_index);
dm_bit_set(rx->lastpos, rx->charset_index);
rx->nullable = 0;
break;
default:
log_error(INTERNAL_ERROR "Unknown calc node type");
}
/*
* followpos has it's own switch
* because PLUS and STAR do the
* same thing.
*/
switch (rx->type) {
case CAT:
for (j = 0; j < m->num_charsets; j++) {
struct rx_node *n = m->charsets[j];
if (dm_bit(c1->lastpos, j))
dm_bit_union(n->followpos,
n->followpos, c2->firstpos);
}
break;
case PLUS:
case STAR:
for (j = 0; j < m->num_charsets; j++) {
struct rx_node *n = m->charsets[j];
if (dm_bit(rx->lastpos, j))
dm_bit_union(n->followpos,
n->followpos, rx->firstpos);
}
break;
}
}
}
static struct dfa_state *_create_dfa_state(struct dm_pool *mem)
{
return dm_pool_zalloc(mem, sizeof(struct dfa_state));
}
static struct dfa_state *_create_state_queue(struct dm_pool *mem,
struct dfa_state *dfa,
dm_bitset_t bits)
{
if (!(dfa->bits = dm_bitset_create(mem, bits[0]))) /* first element is the size */
return_NULL;
dm_bit_copy(dfa->bits, bits);
dfa->next = 0;
dfa->final = -1;
return dfa;
}
static int _calc_state(struct dm_regex *m, struct dfa_state *dfa, int a)
{
int set_bits = 0, i;
dm_bitset_t dfa_bits = dfa->bits;
dm_bit_and(m->dfa_copy, m->charmap[a], dfa_bits);
/* iterate through all the states in firstpos */
for (i = dm_bit_get_first(m->dfa_copy); i >= 0; i = dm_bit_get_next(m->dfa_copy, i)) {
if (a == TARGET_TRANS)
dfa->final = m->charsets[i]->final;
dm_bit_union(m->bs, m->bs, m->charsets[i]->followpos);
set_bits = 1;
}
if (set_bits) {
struct dfa_state *tmp;
struct dfa_state *ldfa = ttree_lookup(m->tt, m->bs + 1);
if (!ldfa) {
/* push */
if (!(ldfa = _create_dfa_state(m->mem)))
return_0;
ttree_insert(m->tt, m->bs + 1, ldfa);
if (!(tmp = _create_state_queue(m->scratch, ldfa, m->bs)))
return_0;
if (!m->h)
m->h = m->t = tmp;
else {
m->t->next = tmp;
m->t = tmp;
}
}
dfa->lookup[a] = ldfa;
dm_bit_clear_all(m->bs);
}
return 1;
}
static int _calc_states(struct dm_regex *m, struct rx_node *rx)
{
unsigned iwidth = (m->num_charsets / DM_BITS_PER_INT) + 1;
struct dfa_state *dfa;
struct rx_node *n;
unsigned i;
int a;
if (!(m->tt = ttree_create(m->scratch, iwidth)))
return_0;
if (!(m->bs = dm_bitset_create(m->scratch, m->num_charsets)))
return_0;
/* build some char maps */
for (a = 0; a < 256; a++)
if (!(m->charmap[a] = dm_bitset_create(m->scratch, m->num_charsets)))
return_0;
for (i = 0; i < m->num_nodes; i++) {
n = m->nodes[i];
if (n->type == CHARSET) {
for (a = dm_bit_get_first(n->charset);
a >= 0; a = dm_bit_get_next(n->charset, a))
dm_bit_set(m->charmap[a], n->charset_index);
}
}
/* create first state */
if (!(dfa = _create_dfa_state(m->mem)))
return_0;
m->start = dfa;
ttree_insert(m->tt, rx->firstpos + 1, dfa);
/* prime the queue */
if (!(m->h = m->t = _create_state_queue(m->scratch, dfa, rx->firstpos)))
return_0;
if (!(m->dfa_copy = dm_bitset_create(m->scratch, m->num_charsets)))
return_0;
return 1;
}
/*
* Forces all the dfa states to be calculated up front, ie. what
* _calc_states() used to do before we switched to calculating on demand.
*/
static int _force_states(struct dm_regex *m)
{
int a;
/* keep processing until there's nothing in the queue */
struct dfa_state *s;
while ((s = m->h)) {
/* pop state off front of the queue */
m->h = m->h->next;
/* iterate through all the inputs for this state */
dm_bit_clear_all(m->bs);
for (a = 0; a < 256; a++)
if (!_calc_state(m, s, a))
return_0;
}
return 1;
}
struct dm_regex *dm_regex_create(struct dm_pool *mem, const char * const *patterns,
unsigned num_patterns)
{
char *all, *ptr;
unsigned i;
size_t len = 0;
struct rx_node *rx;
struct dm_regex *m;
struct dm_pool *scratch = mem;
if (!(m = dm_pool_zalloc(mem, sizeof(*m))))
return_NULL;
/* join the regexps together, delimiting with zero */
for (i = 0; i < num_patterns; i++)
len += strlen(patterns[i]) + 8;
ptr = all = dm_pool_alloc(scratch, len + 1);
if (!all)
goto_bad;
for (i = 0; i < num_patterns; i++) {
ptr += sprintf(ptr, "(.*(%s)%c)", patterns[i], TARGET_TRANS);
if (i < (num_patterns - 1))
*ptr++ = '|';
}
/* parse this expression */
if (!(rx = rx_parse_tok(scratch, all, ptr))) {
log_error("Couldn't parse regex");
goto bad;
}
m->mem = mem;
m->scratch = scratch;
m->num_nodes = _count_nodes(rx);
m->num_charsets = _count_charsets(rx);
_enumerate_charsets(rx);
if (!(m->nodes = dm_pool_alloc(scratch, sizeof(*m->nodes) * m->num_nodes)))
goto_bad;
if (!(m->charsets = dm_pool_alloc(scratch, sizeof(*m->charsets) * m->num_charsets)))
goto_bad;
_fill_table(m, rx);
if (!_create_bitsets(m))
goto_bad;
_calc_functions(m);
if (!_calc_states(m, rx))
goto_bad;
return m;
bad:
dm_pool_free(mem, m);
return NULL;
}
static struct dfa_state *_step_matcher(struct dm_regex *m, int c, struct dfa_state *cs, int *r)
{
struct dfa_state *ns;
if (!(ns = cs->lookup[(unsigned char) c])) {
if (!_calc_state(m, cs, (unsigned char) c))
return_NULL;
if (!(ns = cs->lookup[(unsigned char) c]))
return NULL;
}
// yuck, we have to special case the target trans
if ((ns->final == -1) &&
!_calc_state(m, ns, TARGET_TRANS))
return_NULL;
if (ns->final && (ns->final > *r))
*r = ns->final;
return ns;
}
int dm_regex_match(struct dm_regex *regex, const char *s)
{
struct dfa_state *cs = regex->start;
int r = 0;
dm_bit_clear_all(regex->bs);
if (!(cs = _step_matcher(regex, HAT_CHAR, cs, &r)))
goto out;
for (; *s; s++)
if (!(cs = _step_matcher(regex, *s, cs, &r)))
goto out;
_step_matcher(regex, DOLLAR_CHAR, cs, &r);
out:
/* subtract 1 to get back to zero index */
return r - 1;
}
/*
* The next block of code concerns calculating a fingerprint for the dfa.
*
* We're not calculating a minimal dfa in _calculate_state (maybe a future
* improvement). As such it's possible that two non-isomorphic dfas
* recognise the same language. This can only really happen if you start
* with equivalent, but different regexes (for example the simplifier in
* parse_rx.c may have changed).
*
* The code is inefficient; repeatedly searching a singly linked list for
* previously seen nodes. Not worried since this is test code.
*/
struct node_list {
unsigned node_id;
struct dfa_state *node;
struct node_list *next;
};
struct printer {
struct dm_pool *mem;
struct node_list *pending;
struct node_list *processed;
unsigned next_index;
};
static uint32_t _randomise(uint32_t n)
{
/* 2^32 - 5 */
uint32_t const prime = (~0) - 4;
return n * prime;
}
static int _seen(struct node_list *n, struct dfa_state *node, uint32_t *i)
{
while (n) {
if (n->node == node) {
*i = n->node_id;
return 1;
}
n = n->next;
}
return 0;
}
/*
* Push node if it's not been seen before, returning a unique index.
*/
static uint32_t _push_node(struct printer *p, struct dfa_state *node)
{
uint32_t i;
struct node_list *n;
if (_seen(p->pending, node, &i) ||
_seen(p->processed, node, &i))
return i;
if (!(n = dm_pool_alloc(p->mem, sizeof(*n))))
return_0;
n->node_id = ++p->next_index; /* start from 1, keep 0 as error code */
n->node = node;
n->next = p->pending;
p->pending = n;
return n->node_id;
}
/*
* Pop the front node, and fill out it's previously assigned index.
*/
static struct dfa_state *_pop_node(struct printer *p)
{
struct dfa_state *node = NULL;
struct node_list *n;
if (p->pending) {
n = p->pending;
p->pending = n->next;
n->next = p->processed;
p->processed = n;
node = n->node;
}
return node;
}
static uint32_t _combine(uint32_t n1, uint32_t n2)
{
return ((n1 << 8) | (n1 >> 24)) ^ _randomise(n2);
}
static uint32_t _fingerprint(struct printer *p)
{
int c;
uint32_t result = 0;
struct dfa_state *node;
while ((node = _pop_node(p))) {
result = _combine(result, (node->final < 0) ? 0 : node->final);
for (c = 0; c < 256; c++)
result = _combine(result,
_push_node(p, node->lookup[c]));
}
return result;
}
uint32_t dm_regex_fingerprint(struct dm_regex *regex)
{
struct printer p;
uint32_t result = 0;
struct dm_pool *mem = dm_pool_create("regex fingerprint", 1024);
if (!mem)
return_0;
if (!_force_states(regex))
goto_out;
p.mem = mem;
p.pending = NULL;
p.processed = NULL;
p.next_index = 0;
if (!_push_node(&p, regex->start))
goto_out;
result = _fingerprint(&p);
out:
dm_pool_destroy(mem);
return result;
}