From 529edb1853f9b1a3199eacbed8dfabc14903ee11 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 21 Jul 2010 11:58:49 +0000 Subject: [PATCH] [REGEX] reduce the number of charset nodes that are produced --- libdm/regex/matcher.c | 117 ++++++++++++++++++++++++++++++----------- libdm/regex/parse_rx.c | 2 +- libdm/regex/parse_rx.h | 1 + 3 files changed, 87 insertions(+), 33 deletions(-) diff --git a/libdm/regex/matcher.c b/libdm/regex/matcher.c index d8d856c84..07ba9e4a3 100644 --- a/libdm/regex/matcher.c +++ b/libdm/regex/matcher.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved. * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. * * This file is part of the device-mapper userspace tools. @@ -32,8 +32,11 @@ struct state_queue { struct dm_regex { /* Instance variables for the lexer */ struct dfa_state *start; unsigned num_nodes; + unsigned num_charsets; int nodes_entered; struct rx_node **nodes; + int charsets_entered; + struct rx_node **charsets; struct dm_pool *scratch, *mem; }; @@ -50,6 +53,33 @@ static int _count_nodes(struct rx_node *rx) return r; } +static unsigned _count_charsets(struct rx_node *rx) +{ + if (rx->type == CHARSET) + return 1; + + return (rx->left ? _count_charsets(rx->left) : 0) + + (rx->right ? _count_charsets(rx->right) : 0); +} + +static void _enumerate_charsets_internal(struct rx_node *rx, unsigned *i) +{ + if (rx->type == CHARSET) + rx->charset_index = (*i)++; + else { + if (rx->left) + _enumerate_charsets_internal(rx->left, i); + if (rx->right) + _enumerate_charsets_internal(rx->right, i); + } +} + +static void _enumerate_charsets(struct rx_node *rx) +{ + unsigned i = 0; + _enumerate_charsets_internal(rx, &i); +} + static void _fill_table(struct dm_regex *m, struct rx_node *rx) { assert((rx->type != OR) || (rx->left && rx->right)); @@ -61,6 +91,8 @@ static void _fill_table(struct dm_regex *m, struct rx_node *rx) _fill_table(m, rx->right); m->nodes[m->nodes_entered++] = rx; + if (rx->type == CHARSET) + m->charsets[m->charsets_entered++] = rx; } static void _create_bitsets(struct dm_regex *m) @@ -69,9 +101,9 @@ static void _create_bitsets(struct dm_regex *m) for (i = 0; i < m->num_nodes; i++) { struct rx_node *n = m->nodes[i]; - n->firstpos = dm_bitset_create(m->scratch, m->num_nodes); - n->lastpos = dm_bitset_create(m->scratch, m->num_nodes); - n->followpos = dm_bitset_create(m->scratch, m->num_nodes); + n->firstpos = dm_bitset_create(m->scratch, m->num_charsets); + n->lastpos = dm_bitset_create(m->scratch, m->num_charsets); + n->followpos = dm_bitset_create(m->scratch, m->num_charsets); } } @@ -85,7 +117,7 @@ static void _calc_functions(struct dm_regex *m) c1 = rx->left; c2 = rx->right; - if (dm_bit(rx->charset, TARGET_TRANS)) + if (rx->type == CHARSET && dm_bit(rx->charset, TARGET_TRANS)) rx->final = final++; switch (rx->type) { @@ -125,8 +157,8 @@ static void _calc_functions(struct dm_regex *m) break; case CHARSET: - dm_bit_set(rx->firstpos, i); - dm_bit_set(rx->lastpos, i); + dm_bit_set(rx->firstpos, rx->charset_index); + dm_bit_set(rx->lastpos, rx->charset_index); rx->nullable = 0; break; @@ -141,23 +173,21 @@ static void _calc_functions(struct dm_regex *m) */ switch (rx->type) { case CAT: - for (j = 0; j < m->num_nodes; j++) { - if (dm_bit(c1->lastpos, j)) { - struct rx_node *n = m->nodes[j]; + for (j = 0; j < m->num_charsets; j++) { + struct rx_node *n = m->charsets[j]; + if (dm_bit(c1->lastpos, j)) dm_bit_union(n->followpos, - n->followpos, c2->firstpos); - } + n->followpos, c2->firstpos); } break; case PLUS: case STAR: - for (j = 0; j < m->num_nodes; j++) { - if (dm_bit(rx->lastpos, j)) { - struct rx_node *n = m->nodes[j]; + for (j = 0; j < m->num_charsets; j++) { + struct rx_node *n = m->charsets[j]; + if (dm_bit(rx->lastpos, j)) dm_bit_union(n->followpos, - n->followpos, rx->firstpos); - } + n->followpos, rx->firstpos); } break; } @@ -189,7 +219,7 @@ static struct state_queue *_create_state_queue(struct dm_pool *mem, static int _calc_states(struct dm_regex *m, struct rx_node *rx) { - unsigned iwidth = (m->num_nodes / DM_BITS_PER_INT) + 1; + unsigned iwidth = (m->num_charsets / DM_BITS_PER_INT) + 1; struct ttree *tt = ttree_create(m->scratch, iwidth); struct state_queue *h, *t, *tmp; struct dfa_state *dfa, *ldfa; @@ -199,9 +229,26 @@ static int _calc_states(struct dm_regex *m, struct rx_node *rx) if (!tt) return_0; - if (!(bs = dm_bitset_create(m->scratch, m->num_nodes))) + if (!(bs = dm_bitset_create(m->scratch, m->num_charsets))) return_0; + /* build some char maps */ + dm_bitset_t charmap[256]; + for (a = 0; a < 256; a++) { + charmap[a] = dm_bitset_create(m->scratch, m->num_charsets); + if (!charmap[a]) + return_0; + } + + for (i = 0; i < m->num_nodes; i++) { + struct rx_node *n = m->nodes[i]; + if (n->type == CHARSET) { + for (a = dm_bit_get_first(n->charset); + a >= 0; a = dm_bit_get_next(n->charset, a)) + dm_bit_set(charmap[a], n->charset_index); + } + } + /* create first state */ dfa = _create_dfa_state(m->mem); m->start = dfa; @@ -209,8 +256,9 @@ static int _calc_states(struct dm_regex *m, struct rx_node *rx) /* prime the queue */ h = t = _create_state_queue(m->scratch, dfa, rx->firstpos); + dm_bitset_t dfa_copy = dm_bitset_create(m->scratch, m->num_charsets); while (h) { - int elems, j, idx[h->bits[0]]; + int elems, idx[h->bits[0]]; /* pop state off front of the queue */ dfa = h->s; @@ -227,18 +275,18 @@ static int _calc_states(struct dm_regex *m, struct rx_node *rx) idx[elems++] = i; for (a = 0; a < 256; a++) { - /* iterate through all the states in firstpos */ - for (j = 0; j < elems; j++) { - i = idx[j]; - if (dm_bit(m->nodes[i]->charset, a)) { - if (a == TARGET_TRANS) - dfa->final = m->nodes[i]->final; + dm_bit_and(dfa_copy, charmap[a], dfa_bits); - dm_bit_union(bs, bs, - m->nodes[i]->followpos); - set_bits = 1; - } - } + /* iterate through all the states in firstpos */ + for (i = dm_bit_get_first(dfa_copy); + i >= 0; i = dm_bit_get_next(dfa_copy, i)) { + if (a == TARGET_TRANS) + dfa->final = m->charsets[i]->final; + + dm_bit_union(bs, bs, + m->charsets[i]->followpos); + set_bits = 1; + } if (set_bits) { ldfa = ttree_lookup(tt, bs + 1); @@ -314,11 +362,16 @@ struct dm_regex *dm_regex_create(struct dm_pool *mem, const char **patterns, m->mem = mem; m->scratch = scratch; m->num_nodes = _count_nodes(rx); + m->num_charsets = _count_charsets(rx); + _enumerate_charsets(rx); m->nodes = dm_pool_alloc(scratch, sizeof(*m->nodes) * m->num_nodes); - if (!m->nodes) goto_bad; + m->charsets = dm_pool_alloc(scratch, sizeof(*m->charsets) * m->num_charsets); + if (!m->charsets) + goto_bad; + _fill_table(m, rx); _create_bitsets(m); _calc_functions(m); diff --git a/libdm/regex/parse_rx.c b/libdm/regex/parse_rx.c index 20f8c30e2..56260f54e 100644 --- a/libdm/regex/parse_rx.c +++ b/libdm/regex/parse_rx.c @@ -284,7 +284,7 @@ static struct rx_node *_node(struct dm_pool *mem, int type, struct rx_node *n = dm_pool_zalloc(mem, sizeof(*n)); if (n) { - if (!(n->charset = dm_bitset_create(mem, 256))) { + if (type == CHARSET && !(n->charset = dm_bitset_create(mem, 256))) { dm_pool_free(mem, n); return NULL; } diff --git a/libdm/regex/parse_rx.h b/libdm/regex/parse_rx.h index 4a5d8b94d..37d7ced8a 100644 --- a/libdm/regex/parse_rx.h +++ b/libdm/regex/parse_rx.h @@ -41,6 +41,7 @@ struct rx_node { struct rx_node *left, *right; /* used to build the dfa for the toker */ + unsigned charset_index; int nullable, final; dm_bitset_t firstpos; dm_bitset_t lastpos;