/* * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved. * Copyright (C) 2004-2012 Red Hat, Inc. All rights reserved. * * This file is part of the device-mapper userspace tools. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU Lesser General Public License v.2.1. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software Foundation, * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "dmlib.h" #include "parse_rx.h" #include "ttree.h" #include "assert.h" struct dfa_state { struct dfa_state *next; int final; dm_bitset_t bits; struct dfa_state *lookup[256]; }; struct dm_regex { /* Instance variables for the lexer */ struct dfa_state *start; unsigned num_nodes; unsigned num_charsets; int nodes_entered; struct rx_node **nodes; int charsets_entered; struct rx_node **charsets; struct dm_pool *scratch, *mem; /* stuff for on the fly dfa calculation */ dm_bitset_t charmap[256]; dm_bitset_t dfa_copy; struct ttree *tt; dm_bitset_t bs; struct dfa_state *h, *t; }; static int _count_nodes(struct rx_node *rx) { int r = 1; if (rx->left) r += _count_nodes(rx->left); if (rx->right) r += _count_nodes(rx->right); return r; } static unsigned _count_charsets(struct rx_node *rx) { if (rx->type == CHARSET) return 1; return (rx->left ? _count_charsets(rx->left) : 0) + (rx->right ? _count_charsets(rx->right) : 0); } static void _enumerate_charsets_internal(struct rx_node *rx, unsigned *i) { if (rx->type == CHARSET) rx->charset_index = (*i)++; else { if (rx->left) _enumerate_charsets_internal(rx->left, i); if (rx->right) _enumerate_charsets_internal(rx->right, i); } } static void _enumerate_charsets(struct rx_node *rx) { unsigned i = 0; _enumerate_charsets_internal(rx, &i); } static void _fill_table(struct dm_regex *m, struct rx_node *rx) { assert((rx->type != OR) || (rx->left && rx->right)); if (rx->left) _fill_table(m, rx->left); if (rx->right) _fill_table(m, rx->right); m->nodes[m->nodes_entered++] = rx; if (rx->type == CHARSET) m->charsets[m->charsets_entered++] = rx; } static int _create_bitsets(struct dm_regex *m) { unsigned i; struct rx_node *n; for (i = 0; i < m->num_nodes; i++) { n = m->nodes[i]; if (!(n->firstpos = dm_bitset_create(m->scratch, m->num_charsets))) return_0; if (!(n->lastpos = dm_bitset_create(m->scratch, m->num_charsets))) return_0; if (!(n->followpos = dm_bitset_create(m->scratch, m->num_charsets))) return_0; } return 1; } static void _calc_functions(struct dm_regex *m) { unsigned i, j, final = 1; struct rx_node *rx, *c1, *c2; for (i = 0; i < m->num_nodes; i++) { rx = m->nodes[i]; c1 = rx->left; c2 = rx->right; if (rx->type == CHARSET && dm_bit(rx->charset, TARGET_TRANS)) rx->final = final++; switch (rx->type) { case CAT: if (c1->nullable) dm_bit_union(rx->firstpos, c1->firstpos, c2->firstpos); else dm_bit_copy(rx->firstpos, c1->firstpos); if (c2->nullable) dm_bit_union(rx->lastpos, c1->lastpos, c2->lastpos); else dm_bit_copy(rx->lastpos, c2->lastpos); rx->nullable = c1->nullable && c2->nullable; break; case PLUS: dm_bit_copy(rx->firstpos, c1->firstpos); dm_bit_copy(rx->lastpos, c1->lastpos); rx->nullable = c1->nullable; break; case OR: dm_bit_union(rx->firstpos, c1->firstpos, c2->firstpos); dm_bit_union(rx->lastpos, c1->lastpos, c2->lastpos); rx->nullable = c1->nullable || c2->nullable; break; case QUEST: case STAR: dm_bit_copy(rx->firstpos, c1->firstpos); dm_bit_copy(rx->lastpos, c1->lastpos); rx->nullable = 1; break; case CHARSET: dm_bit_set(rx->firstpos, rx->charset_index); dm_bit_set(rx->lastpos, rx->charset_index); rx->nullable = 0; break; default: log_error(INTERNAL_ERROR "Unknown calc node type"); } /* * followpos has it's own switch * because PLUS and STAR do the * same thing. */ switch (rx->type) { case CAT: for (j = 0; j < m->num_charsets; j++) { struct rx_node *n = m->charsets[j]; if (dm_bit(c1->lastpos, j)) dm_bit_union(n->followpos, n->followpos, c2->firstpos); } break; case PLUS: case STAR: for (j = 0; j < m->num_charsets; j++) { struct rx_node *n = m->charsets[j]; if (dm_bit(rx->lastpos, j)) dm_bit_union(n->followpos, n->followpos, rx->firstpos); } break; } } } static struct dfa_state *_create_dfa_state(struct dm_pool *mem) { return dm_pool_zalloc(mem, sizeof(struct dfa_state)); } static struct dfa_state *_create_state_queue(struct dm_pool *mem, struct dfa_state *dfa, dm_bitset_t bits) { if (!(dfa->bits = dm_bitset_create(mem, bits[0]))) /* first element is the size */ return_NULL; dm_bit_copy(dfa->bits, bits); dfa->next = 0; dfa->final = -1; return dfa; } static int _calc_state(struct dm_regex *m, struct dfa_state *dfa, int a) { int set_bits = 0, i; dm_bitset_t dfa_bits = dfa->bits; dm_bit_and(m->dfa_copy, m->charmap[a], dfa_bits); /* iterate through all the states in firstpos */ for (i = dm_bit_get_first(m->dfa_copy); i >= 0; i = dm_bit_get_next(m->dfa_copy, i)) { if (a == TARGET_TRANS) dfa->final = m->charsets[i]->final; dm_bit_union(m->bs, m->bs, m->charsets[i]->followpos); set_bits = 1; } if (set_bits) { struct dfa_state *tmp; struct dfa_state *ldfa = ttree_lookup(m->tt, m->bs + 1); if (!ldfa) { /* push */ if (!(ldfa = _create_dfa_state(m->mem))) return_0; ttree_insert(m->tt, m->bs + 1, ldfa); if (!(tmp = _create_state_queue(m->scratch, ldfa, m->bs))) return_0; if (!m->h) m->h = m->t = tmp; else { m->t->next = tmp; m->t = tmp; } } dfa->lookup[a] = ldfa; dm_bit_clear_all(m->bs); } return 1; } static int _calc_states(struct dm_regex *m, struct rx_node *rx) { unsigned iwidth = (m->num_charsets / DM_BITS_PER_INT) + 1; struct dfa_state *dfa; struct rx_node *n; unsigned i; int a; if (!(m->tt = ttree_create(m->scratch, iwidth))) return_0; if (!(m->bs = dm_bitset_create(m->scratch, m->num_charsets))) return_0; /* build some char maps */ for (a = 0; a < 256; a++) if (!(m->charmap[a] = dm_bitset_create(m->scratch, m->num_charsets))) return_0; for (i = 0; i < m->num_nodes; i++) { n = m->nodes[i]; if (n->type == CHARSET) { for (a = dm_bit_get_first(n->charset); a >= 0; a = dm_bit_get_next(n->charset, a)) dm_bit_set(m->charmap[a], n->charset_index); } } /* create first state */ if (!(dfa = _create_dfa_state(m->mem))) return_0; m->start = dfa; ttree_insert(m->tt, rx->firstpos + 1, dfa); /* prime the queue */ if (!(m->h = m->t = _create_state_queue(m->scratch, dfa, rx->firstpos))) return_0; if (!(m->dfa_copy = dm_bitset_create(m->scratch, m->num_charsets))) return_0; return 1; } /* * Forces all the dfa states to be calculated up front, ie. what * _calc_states() used to do before we switched to calculating on demand. */ static int _force_states(struct dm_regex *m) { int a; /* keep processing until there's nothing in the queue */ struct dfa_state *s; while ((s = m->h)) { /* pop state off front of the queue */ m->h = m->h->next; /* iterate through all the inputs for this state */ dm_bit_clear_all(m->bs); for (a = 0; a < 256; a++) if (!_calc_state(m, s, a)) return_0; } return 1; } struct dm_regex *dm_regex_create(struct dm_pool *mem, const char * const *patterns, unsigned num_patterns) { char *all, *ptr; unsigned i; size_t len = 0; struct rx_node *rx; struct dm_regex *m; struct dm_pool *scratch = mem; if (!(m = dm_pool_zalloc(mem, sizeof(*m)))) return_NULL; /* join the regexps together, delimiting with zero */ for (i = 0; i < num_patterns; i++) len += strlen(patterns[i]) + 8; ptr = all = dm_pool_alloc(scratch, len + 1); if (!all) goto_bad; for (i = 0; i < num_patterns; i++) { ptr += sprintf(ptr, "(.*(%s)%c)", patterns[i], TARGET_TRANS); if (i < (num_patterns - 1)) *ptr++ = '|'; } /* parse this expression */ if (!(rx = rx_parse_tok(scratch, all, ptr))) { log_error("Couldn't parse regex"); goto bad; } m->mem = mem; m->scratch = scratch; m->num_nodes = _count_nodes(rx); m->num_charsets = _count_charsets(rx); _enumerate_charsets(rx); if (!(m->nodes = dm_pool_alloc(scratch, sizeof(*m->nodes) * m->num_nodes))) goto_bad; if (!(m->charsets = dm_pool_alloc(scratch, sizeof(*m->charsets) * m->num_charsets))) goto_bad; _fill_table(m, rx); if (!_create_bitsets(m)) goto_bad; _calc_functions(m); if (!_calc_states(m, rx)) goto_bad; return m; bad: dm_pool_free(mem, m); return NULL; } static struct dfa_state *_step_matcher(struct dm_regex *m, int c, struct dfa_state *cs, int *r) { struct dfa_state *ns; if (!(ns = cs->lookup[(unsigned char) c])) { if (!_calc_state(m, cs, (unsigned char) c)) return_NULL; if (!(ns = cs->lookup[(unsigned char) c])) return NULL; } // yuck, we have to special case the target trans if ((ns->final == -1) && !_calc_state(m, ns, TARGET_TRANS)) return_NULL; if (ns->final && (ns->final > *r)) *r = ns->final; return ns; } int dm_regex_match(struct dm_regex *regex, const char *s) { struct dfa_state *cs = regex->start; int r = 0; dm_bit_clear_all(regex->bs); if (!(cs = _step_matcher(regex, HAT_CHAR, cs, &r))) goto out; for (; *s; s++) if (!(cs = _step_matcher(regex, *s, cs, &r))) goto out; _step_matcher(regex, DOLLAR_CHAR, cs, &r); out: /* subtract 1 to get back to zero index */ return r - 1; } /* * The next block of code concerns calculating a fingerprint for the dfa. * * We're not calculating a minimal dfa in _calculate_state (maybe a future * improvement). As such it's possible that two non-isomorphic dfas * recognise the same language. This can only really happen if you start * with equivalent, but different regexes (for example the simplifier in * parse_rx.c may have changed). * * The code is inefficient; repeatedly searching a singly linked list for * previously seen nodes. Not worried since this is test code. */ struct node_list { unsigned node_id; struct dfa_state *node; struct node_list *next; }; struct printer { struct dm_pool *mem; struct node_list *pending; struct node_list *processed; unsigned next_index; }; static uint32_t _randomise(uint32_t n) { /* 2^32 - 5 */ uint32_t const prime = (~0) - 4; return n * prime; } static int _seen(struct node_list *n, struct dfa_state *node, uint32_t *i) { while (n) { if (n->node == node) { *i = n->node_id; return 1; } n = n->next; } return 0; } /* * Push node if it's not been seen before, returning a unique index. */ static uint32_t _push_node(struct printer *p, struct dfa_state *node) { uint32_t i; struct node_list *n; if (_seen(p->pending, node, &i) || _seen(p->processed, node, &i)) return i; if (!(n = dm_pool_alloc(p->mem, sizeof(*n)))) return_0; n->node_id = ++p->next_index; /* start from 1, keep 0 as error code */ n->node = node; n->next = p->pending; p->pending = n; return n->node_id; } /* * Pop the front node, and fill out it's previously assigned index. */ static struct dfa_state *_pop_node(struct printer *p) { struct dfa_state *node = NULL; struct node_list *n; if (p->pending) { n = p->pending; p->pending = n->next; n->next = p->processed; p->processed = n; node = n->node; } return node; } static uint32_t _combine(uint32_t n1, uint32_t n2) { return ((n1 << 8) | (n1 >> 24)) ^ _randomise(n2); } static uint32_t _fingerprint(struct printer *p) { int c; uint32_t result = 0; struct dfa_state *node; while ((node = _pop_node(p))) { result = _combine(result, (node->final < 0) ? 0 : node->final); for (c = 0; c < 256; c++) result = _combine(result, _push_node(p, node->lookup[c])); } return result; } uint32_t dm_regex_fingerprint(struct dm_regex *regex) { struct printer p; uint32_t result = 0; struct dm_pool *mem = dm_pool_create("regex fingerprint", 1024); if (!mem) return_0; if (!_force_states(regex)) goto_out; p.mem = mem; p.pending = NULL; p.processed = NULL; p.next_index = 0; if (!_push_node(&p, regex->start)) goto_out; result = _fingerprint(&p); out: dm_pool_destroy(mem); return result; }