1
0
mirror of git://sourceware.org/git/lvm2.git synced 2025-01-19 14:04:17 +03:00

[REGEX] reduce the number of charset nodes that are produced

This commit is contained in:
Joe Thornber 2010-07-21 11:58:49 +00:00
parent 2ef5d30611
commit 529edb1853
3 changed files with 87 additions and 33 deletions

View File

@ -32,8 +32,11 @@ struct state_queue {
struct dm_regex { /* Instance variables for the lexer */ struct dm_regex { /* Instance variables for the lexer */
struct dfa_state *start; struct dfa_state *start;
unsigned num_nodes; unsigned num_nodes;
unsigned num_charsets;
int nodes_entered; int nodes_entered;
struct rx_node **nodes; struct rx_node **nodes;
int charsets_entered;
struct rx_node **charsets;
struct dm_pool *scratch, *mem; struct dm_pool *scratch, *mem;
}; };
@ -50,6 +53,33 @@ static int _count_nodes(struct rx_node *rx)
return r; return r;
} }
/*
 * Count the CHARSET nodes in the tree rooted at rx.
 *
 * A CHARSET node contributes exactly one and its children are not
 * examined.  For any other node type the result is the sum of the
 * counts of whichever of the left/right children are present.
 *
 * rx is dereferenced unconditionally, so it must not be NULL.
 */
static unsigned _count_charsets(struct rx_node *rx)
{
	unsigned total = 0;

	if (rx->type == CHARSET)
		return 1;

	if (rx->left)
		total += _count_charsets(rx->left);

	if (rx->right)
		total += _count_charsets(rx->right);

	return total;
}
/*
 * Recursive worker that numbers the CHARSET nodes beneath rx.
 *
 * Each CHARSET node encountered stores the current value of *i in its
 * charset_index field, after which *i is advanced by one.  Nodes of
 * any other type are not numbered; their left subtree (if present) is
 * walked before their right subtree (if present).
 *
 * rx and i are dereferenced unconditionally, so neither may be NULL.
 */
static void _enumerate_charsets_internal(struct rx_node *rx, unsigned *i)
{
	if (rx->type == CHARSET) {
		/* Take the next free index, then bump the counter. */
		rx->charset_index = *i;
		*i += 1;
		return;
	}

	if (rx->left)
		_enumerate_charsets_internal(rx->left, i);

	if (rx->right)
		_enumerate_charsets_internal(rx->right, i);
}
/*
 * Assign a charset_index to every CHARSET node in the tree rooted at
 * rx, numbering them consecutively starting from zero.
 */
static void _enumerate_charsets(struct rx_node *rx)
{
	unsigned next_index = 0;

	_enumerate_charsets_internal(rx, &next_index);
}
static void _fill_table(struct dm_regex *m, struct rx_node *rx) static void _fill_table(struct dm_regex *m, struct rx_node *rx)
{ {
assert((rx->type != OR) || (rx->left && rx->right)); assert((rx->type != OR) || (rx->left && rx->right));
@ -61,6 +91,8 @@ static void _fill_table(struct dm_regex *m, struct rx_node *rx)
_fill_table(m, rx->right); _fill_table(m, rx->right);
m->nodes[m->nodes_entered++] = rx; m->nodes[m->nodes_entered++] = rx;
if (rx->type == CHARSET)
m->charsets[m->charsets_entered++] = rx;
} }
static void _create_bitsets(struct dm_regex *m) static void _create_bitsets(struct dm_regex *m)
@ -69,9 +101,9 @@ static void _create_bitsets(struct dm_regex *m)
for (i = 0; i < m->num_nodes; i++) { for (i = 0; i < m->num_nodes; i++) {
struct rx_node *n = m->nodes[i]; struct rx_node *n = m->nodes[i];
n->firstpos = dm_bitset_create(m->scratch, m->num_nodes); n->firstpos = dm_bitset_create(m->scratch, m->num_charsets);
n->lastpos = dm_bitset_create(m->scratch, m->num_nodes); n->lastpos = dm_bitset_create(m->scratch, m->num_charsets);
n->followpos = dm_bitset_create(m->scratch, m->num_nodes); n->followpos = dm_bitset_create(m->scratch, m->num_charsets);
} }
} }
@ -85,7 +117,7 @@ static void _calc_functions(struct dm_regex *m)
c1 = rx->left; c1 = rx->left;
c2 = rx->right; c2 = rx->right;
if (dm_bit(rx->charset, TARGET_TRANS)) if (rx->type == CHARSET && dm_bit(rx->charset, TARGET_TRANS))
rx->final = final++; rx->final = final++;
switch (rx->type) { switch (rx->type) {
@ -125,8 +157,8 @@ static void _calc_functions(struct dm_regex *m)
break; break;
case CHARSET: case CHARSET:
dm_bit_set(rx->firstpos, i); dm_bit_set(rx->firstpos, rx->charset_index);
dm_bit_set(rx->lastpos, i); dm_bit_set(rx->lastpos, rx->charset_index);
rx->nullable = 0; rx->nullable = 0;
break; break;
@ -141,23 +173,21 @@ static void _calc_functions(struct dm_regex *m)
*/ */
switch (rx->type) { switch (rx->type) {
case CAT: case CAT:
for (j = 0; j < m->num_nodes; j++) { for (j = 0; j < m->num_charsets; j++) {
if (dm_bit(c1->lastpos, j)) { struct rx_node *n = m->charsets[j];
struct rx_node *n = m->nodes[j]; if (dm_bit(c1->lastpos, j))
dm_bit_union(n->followpos, dm_bit_union(n->followpos,
n->followpos, c2->firstpos); n->followpos, c2->firstpos);
}
} }
break; break;
case PLUS: case PLUS:
case STAR: case STAR:
for (j = 0; j < m->num_nodes; j++) { for (j = 0; j < m->num_charsets; j++) {
if (dm_bit(rx->lastpos, j)) { struct rx_node *n = m->charsets[j];
struct rx_node *n = m->nodes[j]; if (dm_bit(rx->lastpos, j))
dm_bit_union(n->followpos, dm_bit_union(n->followpos,
n->followpos, rx->firstpos); n->followpos, rx->firstpos);
}
} }
break; break;
} }
@ -189,7 +219,7 @@ static struct state_queue *_create_state_queue(struct dm_pool *mem,
static int _calc_states(struct dm_regex *m, struct rx_node *rx) static int _calc_states(struct dm_regex *m, struct rx_node *rx)
{ {
unsigned iwidth = (m->num_nodes / DM_BITS_PER_INT) + 1; unsigned iwidth = (m->num_charsets / DM_BITS_PER_INT) + 1;
struct ttree *tt = ttree_create(m->scratch, iwidth); struct ttree *tt = ttree_create(m->scratch, iwidth);
struct state_queue *h, *t, *tmp; struct state_queue *h, *t, *tmp;
struct dfa_state *dfa, *ldfa; struct dfa_state *dfa, *ldfa;
@ -199,9 +229,26 @@ static int _calc_states(struct dm_regex *m, struct rx_node *rx)
if (!tt) if (!tt)
return_0; return_0;
if (!(bs = dm_bitset_create(m->scratch, m->num_nodes))) if (!(bs = dm_bitset_create(m->scratch, m->num_charsets)))
return_0; return_0;
/* build some char maps */
dm_bitset_t charmap[256];
for (a = 0; a < 256; a++) {
charmap[a] = dm_bitset_create(m->scratch, m->num_charsets);
if (!charmap[a])
return_0;
}
for (i = 0; i < m->num_nodes; i++) {
struct rx_node *n = m->nodes[i];
if (n->type == CHARSET) {
for (a = dm_bit_get_first(n->charset);
a >= 0; a = dm_bit_get_next(n->charset, a))
dm_bit_set(charmap[a], n->charset_index);
}
}
/* create first state */ /* create first state */
dfa = _create_dfa_state(m->mem); dfa = _create_dfa_state(m->mem);
m->start = dfa; m->start = dfa;
@ -209,8 +256,9 @@ static int _calc_states(struct dm_regex *m, struct rx_node *rx)
/* prime the queue */ /* prime the queue */
h = t = _create_state_queue(m->scratch, dfa, rx->firstpos); h = t = _create_state_queue(m->scratch, dfa, rx->firstpos);
dm_bitset_t dfa_copy = dm_bitset_create(m->scratch, m->num_charsets);
while (h) { while (h) {
int elems, j, idx[h->bits[0]]; int elems, idx[h->bits[0]];
/* pop state off front of the queue */ /* pop state off front of the queue */
dfa = h->s; dfa = h->s;
@ -227,18 +275,18 @@ static int _calc_states(struct dm_regex *m, struct rx_node *rx)
idx[elems++] = i; idx[elems++] = i;
for (a = 0; a < 256; a++) { for (a = 0; a < 256; a++) {
/* iterate through all the states in firstpos */ dm_bit_and(dfa_copy, charmap[a], dfa_bits);
for (j = 0; j < elems; j++) {
i = idx[j];
if (dm_bit(m->nodes[i]->charset, a)) {
if (a == TARGET_TRANS)
dfa->final = m->nodes[i]->final;
dm_bit_union(bs, bs, /* iterate through all the states in firstpos */
m->nodes[i]->followpos); for (i = dm_bit_get_first(dfa_copy);
set_bits = 1; i >= 0; i = dm_bit_get_next(dfa_copy, i)) {
} if (a == TARGET_TRANS)
} dfa->final = m->charsets[i]->final;
dm_bit_union(bs, bs,
m->charsets[i]->followpos);
set_bits = 1;
}
if (set_bits) { if (set_bits) {
ldfa = ttree_lookup(tt, bs + 1); ldfa = ttree_lookup(tt, bs + 1);
@ -314,11 +362,16 @@ struct dm_regex *dm_regex_create(struct dm_pool *mem, const char **patterns,
m->mem = mem; m->mem = mem;
m->scratch = scratch; m->scratch = scratch;
m->num_nodes = _count_nodes(rx); m->num_nodes = _count_nodes(rx);
m->num_charsets = _count_charsets(rx);
_enumerate_charsets(rx);
m->nodes = dm_pool_alloc(scratch, sizeof(*m->nodes) * m->num_nodes); m->nodes = dm_pool_alloc(scratch, sizeof(*m->nodes) * m->num_nodes);
if (!m->nodes) if (!m->nodes)
goto_bad; goto_bad;
m->charsets = dm_pool_alloc(scratch, sizeof(*m->charsets) * m->num_charsets);
if (!m->charsets)
goto_bad;
_fill_table(m, rx); _fill_table(m, rx);
_create_bitsets(m); _create_bitsets(m);
_calc_functions(m); _calc_functions(m);

View File

@ -284,7 +284,7 @@ static struct rx_node *_node(struct dm_pool *mem, int type,
struct rx_node *n = dm_pool_zalloc(mem, sizeof(*n)); struct rx_node *n = dm_pool_zalloc(mem, sizeof(*n));
if (n) { if (n) {
if (!(n->charset = dm_bitset_create(mem, 256))) { if (type == CHARSET && !(n->charset = dm_bitset_create(mem, 256))) {
dm_pool_free(mem, n); dm_pool_free(mem, n);
return NULL; return NULL;
} }

View File

@ -41,6 +41,7 @@ struct rx_node {
struct rx_node *left, *right; struct rx_node *left, *right;
/* used to build the dfa for the toker */ /* used to build the dfa for the toker */
unsigned charset_index;
int nullable, final; int nullable, final;
dm_bitset_t firstpos; dm_bitset_t firstpos;
dm_bitset_t lastpos; dm_bitset_t lastpos;