1
0
mirror of git://sourceware.org/git/lvm2.git synced 2024-12-21 13:34:40 +03:00
lvm2/device_mapper/regex/matcher.c
Joe Thornber ccc35e2647 device-mapper: Fork libdm internally.
The device-mapper directory now holds a copy of libdm source.  At
the moment this code is identical to libdm.  Over time code will
migrate out to appropriate places (see doc/refactoring.txt).

The libdm directory still exists, and contains the source for the
libdevmapper shared library, which we will continue to ship (though
not neccessarily update).

All code using libdm should now use the version in device-mapper.
2018-05-16 13:00:50 +01:00

576 lines
14 KiB
C

/*
* Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
* Copyright (C) 2004-2012 Red Hat, Inc. All rights reserved.
*
* This file is part of the device-mapper userspace tools.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
* of the GNU Lesser General Public License v.2.1.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "misc/dmlib.h"
#include "parse_rx.h"
#include "ttree.h"
#include "assert.h"
struct dfa_state {
struct dfa_state *next;
int final;
dm_bitset_t bits;
struct dfa_state *lookup[256];
};
struct dm_regex { /* Instance variables for the lexer */
struct dfa_state *start;
unsigned num_nodes;
unsigned num_charsets;
int nodes_entered;
struct rx_node **nodes;
int charsets_entered;
struct rx_node **charsets;
struct dm_pool *scratch, *mem;
/* stuff for on the fly dfa calculation */
dm_bitset_t charmap[256];
dm_bitset_t dfa_copy;
struct ttree *tt;
dm_bitset_t bs;
struct dfa_state *h, *t;
};
static int _count_nodes(struct rx_node *rx)
{
int r = 1;
if (rx->left)
r += _count_nodes(rx->left);
if (rx->right)
r += _count_nodes(rx->right);
return r;
}
static unsigned _count_charsets(struct rx_node *rx)
{
if (rx->type == CHARSET)
return 1;
return (rx->left ? _count_charsets(rx->left) : 0) +
(rx->right ? _count_charsets(rx->right) : 0);
}
static void _enumerate_charsets_internal(struct rx_node *rx, unsigned *i)
{
if (rx->type == CHARSET)
rx->charset_index = (*i)++;
else {
if (rx->left)
_enumerate_charsets_internal(rx->left, i);
if (rx->right)
_enumerate_charsets_internal(rx->right, i);
}
}
static void _enumerate_charsets(struct rx_node *rx)
{
unsigned i = 0;
_enumerate_charsets_internal(rx, &i);
}
static void _fill_table(struct dm_regex *m, struct rx_node *rx)
{
assert((rx->type != OR) || (rx->left && rx->right));
if (rx->left)
_fill_table(m, rx->left);
if (rx->right)
_fill_table(m, rx->right);
m->nodes[m->nodes_entered++] = rx;
if (rx->type == CHARSET)
m->charsets[m->charsets_entered++] = rx;
}
static int _create_bitsets(struct dm_regex *m)
{
unsigned i;
struct rx_node *n;
for (i = 0; i < m->num_nodes; i++) {
n = m->nodes[i];
if (!(n->firstpos = dm_bitset_create(m->scratch, m->num_charsets)))
return_0;
if (!(n->lastpos = dm_bitset_create(m->scratch, m->num_charsets)))
return_0;
if (!(n->followpos = dm_bitset_create(m->scratch, m->num_charsets)))
return_0;
}
return 1;
}
static void _calc_functions(struct dm_regex *m)
{
unsigned i, j, final = 1;
struct rx_node *rx, *c1, *c2;
for (i = 0; i < m->num_nodes; i++) {
rx = m->nodes[i];
c1 = rx->left;
c2 = rx->right;
if (rx->type == CHARSET && dm_bit(rx->charset, TARGET_TRANS))
rx->final = final++;
switch (rx->type) {
case CAT:
if (c1->nullable)
dm_bit_union(rx->firstpos,
c1->firstpos, c2->firstpos);
else
dm_bit_copy(rx->firstpos, c1->firstpos);
if (c2->nullable)
dm_bit_union(rx->lastpos,
c1->lastpos, c2->lastpos);
else
dm_bit_copy(rx->lastpos, c2->lastpos);
rx->nullable = c1->nullable && c2->nullable;
break;
case PLUS:
dm_bit_copy(rx->firstpos, c1->firstpos);
dm_bit_copy(rx->lastpos, c1->lastpos);
rx->nullable = c1->nullable;
break;
case OR:
dm_bit_union(rx->firstpos, c1->firstpos, c2->firstpos);
dm_bit_union(rx->lastpos, c1->lastpos, c2->lastpos);
rx->nullable = c1->nullable || c2->nullable;
break;
case QUEST:
case STAR:
dm_bit_copy(rx->firstpos, c1->firstpos);
dm_bit_copy(rx->lastpos, c1->lastpos);
rx->nullable = 1;
break;
case CHARSET:
dm_bit_set(rx->firstpos, rx->charset_index);
dm_bit_set(rx->lastpos, rx->charset_index);
rx->nullable = 0;
break;
default:
log_error(INTERNAL_ERROR "Unknown calc node type");
}
/*
* followpos has it's own switch
* because PLUS and STAR do the
* same thing.
*/
switch (rx->type) {
case CAT:
for (j = 0; j < m->num_charsets; j++) {
struct rx_node *n = m->charsets[j];
if (dm_bit(c1->lastpos, j))
dm_bit_union(n->followpos,
n->followpos, c2->firstpos);
}
break;
case PLUS:
case STAR:
for (j = 0; j < m->num_charsets; j++) {
struct rx_node *n = m->charsets[j];
if (dm_bit(rx->lastpos, j))
dm_bit_union(n->followpos,
n->followpos, rx->firstpos);
}
break;
}
}
}
static struct dfa_state *_create_dfa_state(struct dm_pool *mem)
{
return dm_pool_zalloc(mem, sizeof(struct dfa_state));
}
static struct dfa_state *_create_state_queue(struct dm_pool *mem,
struct dfa_state *dfa,
dm_bitset_t bits)
{
if (!(dfa->bits = dm_bitset_create(mem, bits[0]))) /* first element is the size */
return_NULL;
dm_bit_copy(dfa->bits, bits);
dfa->next = 0;
dfa->final = -1;
return dfa;
}
static int _calc_state(struct dm_regex *m, struct dfa_state *dfa, int a)
{
int set_bits = 0, i;
dm_bitset_t dfa_bits = dfa->bits;
dm_bit_and(m->dfa_copy, m->charmap[a], dfa_bits);
/* iterate through all the states in firstpos */
for (i = dm_bit_get_first(m->dfa_copy); i >= 0; i = dm_bit_get_next(m->dfa_copy, i)) {
if (a == TARGET_TRANS)
dfa->final = m->charsets[i]->final;
dm_bit_union(m->bs, m->bs, m->charsets[i]->followpos);
set_bits = 1;
}
if (set_bits) {
struct dfa_state *tmp;
struct dfa_state *ldfa = ttree_lookup(m->tt, m->bs + 1);
if (!ldfa) {
/* push */
if (!(ldfa = _create_dfa_state(m->mem)))
return_0;
ttree_insert(m->tt, m->bs + 1, ldfa);
if (!(tmp = _create_state_queue(m->scratch, ldfa, m->bs)))
return_0;
if (!m->h)
m->h = m->t = tmp;
else {
m->t->next = tmp;
m->t = tmp;
}
}
dfa->lookup[a] = ldfa;
dm_bit_clear_all(m->bs);
}
return 1;
}
static int _calc_states(struct dm_regex *m, struct rx_node *rx)
{
unsigned iwidth = (m->num_charsets / DM_BITS_PER_INT) + 1;
struct dfa_state *dfa;
struct rx_node *n;
unsigned i;
int a;
if (!(m->tt = ttree_create(m->scratch, iwidth)))
return_0;
if (!(m->bs = dm_bitset_create(m->scratch, m->num_charsets)))
return_0;
/* build some char maps */
for (a = 0; a < 256; a++)
if (!(m->charmap[a] = dm_bitset_create(m->scratch, m->num_charsets)))
return_0;
for (i = 0; i < m->num_nodes; i++) {
n = m->nodes[i];
if (n->type == CHARSET) {
for (a = dm_bit_get_first(n->charset);
a >= 0; a = dm_bit_get_next(n->charset, a))
dm_bit_set(m->charmap[a], n->charset_index);
}
}
/* create first state */
if (!(dfa = _create_dfa_state(m->mem)))
return_0;
m->start = dfa;
ttree_insert(m->tt, rx->firstpos + 1, dfa);
/* prime the queue */
if (!(m->h = m->t = _create_state_queue(m->scratch, dfa, rx->firstpos)))
return_0;
if (!(m->dfa_copy = dm_bitset_create(m->scratch, m->num_charsets)))
return_0;
return 1;
}
/*
* Forces all the dfa states to be calculated up front, ie. what
* _calc_states() used to do before we switched to calculating on demand.
*/
static int _force_states(struct dm_regex *m)
{
int a;
/* keep processing until there's nothing in the queue */
struct dfa_state *s;
while ((s = m->h)) {
/* pop state off front of the queue */
m->h = m->h->next;
/* iterate through all the inputs for this state */
dm_bit_clear_all(m->bs);
for (a = 0; a < 256; a++)
if (!_calc_state(m, s, a))
return_0;
}
return 1;
}
struct dm_regex *dm_regex_create(struct dm_pool *mem, const char * const *patterns,
unsigned num_patterns)
{
char *all, *ptr;
unsigned i;
size_t len = 0;
struct rx_node *rx;
struct dm_regex *m;
struct dm_pool *scratch = mem;
if (!(m = dm_pool_zalloc(mem, sizeof(*m))))
return_NULL;
/* join the regexps together, delimiting with zero */
for (i = 0; i < num_patterns; i++)
len += strlen(patterns[i]) + 8;
ptr = all = dm_pool_alloc(scratch, len + 1);
if (!all)
goto_bad;
for (i = 0; i < num_patterns; i++) {
ptr += sprintf(ptr, "(.*(%s)%c)", patterns[i], TARGET_TRANS);
if (i < (num_patterns - 1))
*ptr++ = '|';
}
/* parse this expression */
if (!(rx = rx_parse_tok(scratch, all, ptr))) {
log_error("Couldn't parse regex");
goto bad;
}
m->mem = mem;
m->scratch = scratch;
m->num_nodes = _count_nodes(rx);
m->num_charsets = _count_charsets(rx);
_enumerate_charsets(rx);
if (!(m->nodes = dm_pool_alloc(scratch, sizeof(*m->nodes) * m->num_nodes)))
goto_bad;
if (!(m->charsets = dm_pool_alloc(scratch, sizeof(*m->charsets) * m->num_charsets)))
goto_bad;
_fill_table(m, rx);
if (!_create_bitsets(m))
goto_bad;
_calc_functions(m);
if (!_calc_states(m, rx))
goto_bad;
return m;
bad:
dm_pool_free(mem, m);
return NULL;
}
static struct dfa_state *_step_matcher(struct dm_regex *m, int c, struct dfa_state *cs, int *r)
{
struct dfa_state *ns;
if (!(ns = cs->lookup[(unsigned char) c])) {
if (!_calc_state(m, cs, (unsigned char) c))
return_NULL;
if (!(ns = cs->lookup[(unsigned char) c]))
return NULL;
}
// yuck, we have to special case the target trans
if ((ns->final == -1) &&
!_calc_state(m, ns, TARGET_TRANS))
return_NULL;
if (ns->final && (ns->final > *r))
*r = ns->final;
return ns;
}
int dm_regex_match(struct dm_regex *regex, const char *s)
{
struct dfa_state *cs = regex->start;
int r = 0;
dm_bit_clear_all(regex->bs);
if (!(cs = _step_matcher(regex, HAT_CHAR, cs, &r)))
goto out;
for (; *s; s++)
if (!(cs = _step_matcher(regex, *s, cs, &r)))
goto out;
_step_matcher(regex, DOLLAR_CHAR, cs, &r);
out:
/* subtract 1 to get back to zero index */
return r - 1;
}
/*
* The next block of code concerns calculating a fingerprint for the dfa.
*
* We're not calculating a minimal dfa in _calculate_state (maybe a future
* improvement). As such it's possible that two non-isomorphic dfas
* recognise the same language. This can only really happen if you start
* with equivalent, but different regexes (for example the simplifier in
* parse_rx.c may have changed).
*
* The code is inefficient; repeatedly searching a singly linked list for
* previously seen nodes. Not worried since this is test code.
*/
struct node_list {
unsigned node_id;
struct dfa_state *node;
struct node_list *next;
};
struct printer {
struct dm_pool *mem;
struct node_list *pending;
struct node_list *processed;
unsigned next_index;
};
static uint32_t _randomise(uint32_t n)
{
/* 2^32 - 5 */
uint32_t const prime = (~0) - 4;
return n * prime;
}
static int _seen(struct node_list *n, struct dfa_state *node, uint32_t *i)
{
while (n) {
if (n->node == node) {
*i = n->node_id;
return 1;
}
n = n->next;
}
return 0;
}
/*
* Push node if it's not been seen before, returning a unique index.
*/
static uint32_t _push_node(struct printer *p, struct dfa_state *node)
{
uint32_t i;
struct node_list *n;
if (_seen(p->pending, node, &i) ||
_seen(p->processed, node, &i))
return i;
if (!(n = dm_pool_alloc(p->mem, sizeof(*n))))
return_0;
n->node_id = ++p->next_index; /* start from 1, keep 0 as error code */
n->node = node;
n->next = p->pending;
p->pending = n;
return n->node_id;
}
/*
* Pop the front node, and fill out it's previously assigned index.
*/
static struct dfa_state *_pop_node(struct printer *p)
{
struct dfa_state *node = NULL;
struct node_list *n;
if (p->pending) {
n = p->pending;
p->pending = n->next;
n->next = p->processed;
p->processed = n;
node = n->node;
}
return node;
}
static uint32_t _combine(uint32_t n1, uint32_t n2)
{
return ((n1 << 8) | (n1 >> 24)) ^ _randomise(n2);
}
static uint32_t _fingerprint(struct printer *p)
{
int c;
uint32_t result = 0;
struct dfa_state *node;
while ((node = _pop_node(p))) {
result = _combine(result, (node->final < 0) ? 0 : node->final);
for (c = 0; c < 256; c++)
result = _combine(result,
_push_node(p, node->lookup[c]));
}
return result;
}
uint32_t dm_regex_fingerprint(struct dm_regex *regex)
{
struct printer p;
uint32_t result = 0;
struct dm_pool *mem = dm_pool_create("regex fingerprint", 1024);
if (!mem)
return_0;
if (!_force_states(regex))
goto_out;
p.mem = mem;
p.pending = NULL;
p.processed = NULL;
p.next_index = 0;
if (!_push_node(&p, regex->start))
goto_out;
result = _fingerprint(&p);
out:
dm_pool_destroy(mem);
return result;
}