1
0
mirror of git://sourceware.org/git/lvm2.git synced 2024-12-22 17:35:59 +03:00

[REGEX] add a fingerprinting facility to allow test code to compare dfas

This commit is contained in:
Joe Thornber 2010-07-20 15:32:07 +00:00
parent 3113200309
commit 78f321b32c
4 changed files with 133 additions and 0 deletions

View File

@ -1007,6 +1007,14 @@ struct dm_regex *dm_regex_create(struct dm_pool *mem, const char **patterns,
*/
int dm_regex_match(struct dm_regex *regex, const char *s);
/*
* This is useful for regression testing only. The idea is if two
* fingerprints are different, then the two dfas are certainly not
* isomorphic. If two fingerprints _are_ the same then it's very likely
* that the dfas are isomorphic.
*/
uint32_t dm_regex_fingerprint(struct dm_regex *regex);
/*********************
* reporting functions
*********************/

View File

@ -363,3 +363,126 @@ int dm_regex_match(struct dm_regex *regex, const char *s)
/* subtract 1 to get back to zero index */
return r - 1;
}
/*
* The next block of code concerns calculating a fingerprint for the dfa.
*
* We're not calculating a minimal dfa in _calculate_state (maybe a future
* improvement). As such it's possible that two non-isomorphic dfas
* recognise the same language. This can only really happen if you start
* with equivalent, but different regexes (for example the simplifier in
* parse_rx.c may have changed).
*
* The code is inefficient; repeatedly searching a singly linked list for
* previously seen nodes. Not worried since this is test code.
*/
struct node_list {
unsigned node_id;
struct dfa_state *node;
struct node_list *next;
};
struct printer {
struct dm_pool *mem;
struct node_list *pending;
struct node_list *processed;
unsigned next_index;
};
static uint32_t randomise_(uint32_t n)
{
/* 2^32 - 5 */
uint32_t const prime = (~0) - 4;
return n * prime;
}
static int seen_(struct node_list *n, struct dfa_state *node, uint32_t *i)
{
while (n) {
if (n->node == node) {
*i = n->node_id;
return 1;
}
n = n->next;
}
return 0;
}
/*
* Push node if it's not been seen before, returning a unique index.
*/
static uint32_t push_node_(struct printer *p, struct dfa_state *node)
{
uint32_t i;
if (seen_(p->pending, node, &i) ||
seen_(p->processed, node, &i))
return i;
else {
struct node_list *n = dm_pool_alloc(p->mem, sizeof(*n));
assert(n);
n->node_id = p->next_index++;
n->node = node;
n->next = p->pending;
p->pending = n;
return n->node_id;
}
}
/*
* Pop the front node, and fill out it's previously assigned index.
*/
static struct dfa_state *pop_node_(struct printer *p)
{
struct dfa_state *node = NULL;
if (p->pending) {
struct node_list *n = p->pending;
p->pending = n->next;
n->next = p->processed;
p->processed = n;
node = n->node;
}
return node;
}
static uint32_t combine_(uint32_t n1, uint32_t n2)
{
return ((n1 << 8) | (n1 >> 24)) ^ randomise_(n2);
}
static uint32_t fingerprint_(struct printer *p)
{
int c;
uint32_t result = 0;
struct dfa_state *node;
while ((node = pop_node_(p))) {
result = combine_(result, node->final);
for (c = 0; c < 256; c++)
result = combine_(result,
push_node_(p, node->lookup[c]));
}
return result;
}
uint32_t dm_regex_fingerprint(struct dm_regex *regex)
{
uint32_t result;
struct printer p;
struct dm_pool *mem = dm_pool_create("regex fingerprint", 1024);
assert(mem);
p.mem = mem;
p.pending = NULL;
p.processed = NULL;
p.next_index = 0;
push_node_(&p, regex->start);
result = fingerprint_(&p);
dm_pool_destroy(mem);
return result;
}

View File

@ -135,6 +135,7 @@ int main(int argc, char **argv)
goto err;
}
printf("fingerprint: %x\n", dm_regex_fingerprint(scanner));
_scan_input(scanner, regex);
_free_regex(regex, nregex);

View File

@ -1,4 +1,5 @@
Matcher built with 23 dfa states
fingerprint: 352b6c4f
/dev/loop/0 : loop/[0-9]+
/dev/loop/1 : loop/[0-9]+
/dev/loop/2 : loop/[0-9]+