Add a regex optimisation pass for shared character prefixes.

2024-12-21 13:34:40 +03:00 · 2010-04-20 22:31:22 +00:00 · 2010-04-20 22:31:22 +00:00 · b2bd4e2d8d
commit b2bd4e2d8d
parent 4bd1ac728c
2 changed files with 176 additions and 5 deletions
--- a/1
+++ b/1
@ -1,5 +1,6 @@
 Version 1.02.47 -
 =================================
  Add a regex optimisation pass for shared character prefixes.
  Add dm_bit_and and dm_bitset_equal to libdevmapper.
  Simplify dm_bitset_create.
  Speed up dm_bit_get_next with ffs().
--- a/libdm/regex/parse_rx.c
+++ b/libdm/regex/parse_rx.c
@ -329,19 +329,182 @@ static struct rx_node *_or_term(struct parse_sp *ps)
 	return n;
 }
 /*----------------------------------------------------------------*/
 /*
 * The optimiser spots common prefixes on either side of an 'or' node, and
 * lifts them outside the 'or' with a 'cat'.
 */
 static unsigned _leftmost_depth(struct rx_node *r)
 {
 	int count = 1;
 	while (r->type != CHARSET) {
 		count++;
 		r = r->left;
 	}
 	return count;
 }
 /*
 * FIXME: a unique key could be built up as part of the parse, to make the
 * comparison quick.  Alternatively we could use cons-hashing, and then
 * this would simply be a pointer comparison.
 */
 static int _nodes_equal(struct rx_node *l, struct rx_node *r)
 {
 	if (l->type != r->type)
 		return 0;
 	switch (l->type) {
 	case CAT:
 	case OR:
 		return _nodes_equal(l->left, r->left) &&
 			_nodes_equal(l->right, r->right);
 	case STAR:
 	case PLUS:
 	case QUEST:
 		return _nodes_equal(l->left, r->left);
 	case CHARSET:
 		return dm_bitset_equal(l->charset, r->charset);
 	}
 	/* NOTREACHED */
 	return_0;
 }
 static int _find_leftmost_common(struct rx_node *or,
                                 struct rx_node **l,
                                 struct rx_node **r)
 {
 	struct rx_node *left = or->left, *right = or->right;
 	unsigned left_depth = _leftmost_depth(left);
 	unsigned right_depth = _leftmost_depth(right);
 	while (left_depth > right_depth) {
 		left = left->left;
 		left_depth--;
 	}
 	while (right_depth > left_depth) {
 		right = right->left;
 		right_depth--;
 	}
 	while (left_depth) {
 		if (left->type == CAT && right->type == CAT) {
 			if (_nodes_equal(left->left, right->left)) {
 				*l = left;
 				*r = right;
 				return 1;
 			}
 		}
 		left = left->left;
 		right = right->left;
 		left_depth--;
 	}
 	return 0;
 }
 static struct rx_node *_pass(struct dm_pool *mem,
                             struct rx_node *r,
                             int *changed)
 {
 	/*
 	 * walk the tree, optimising every 'or' node.
 	 */
 	switch (r->type) {
 	case CAT:
 		if (!(r->left = _pass(mem, r->left, changed)))
 			return_NULL;
 		if (!(r->right = _pass(mem, r->right, changed)))
 			return_NULL;
 		break;
 	case STAR:
 	case PLUS:
 	case QUEST:
 		if (!(r->left = _pass(mem, r->left, changed)))
 			return_NULL;
 		break;
 	case OR:
 		/* It's important we optimise sub nodes first */
 		if (!(r->left = _pass(mem, r->left, changed)))
 			return_NULL;
 		if (!(r->right = _pass(mem, r->right, changed)))
 			return_NULL;
 		{
 			struct rx_node *left, *right;
 			if (_find_leftmost_common(r, &left, &right)) {
 				struct rx_node *new_r = _node(mem, CAT, left->left, r);
 				if (!new_r)
 					return_NULL;
 				memcpy(left, left->right, sizeof(*left));
 				memcpy(right, right->right, sizeof(*right));
 				r = new_r;
 				*changed = 1;
 			}
 		}
 		break;
 	case CHARSET:
 		break;
 	}
 	return r;
 }
 static struct rx_node *_optimise(struct dm_pool *mem, struct rx_node *r)
 {
 	/*
 	 * We're looking for (or (... (cat <foo> a)) (... (cat <foo> b)))
 	 * and want to turn it into (cat <foo> (or (... a) (... b)))
 	 */
 	/*
 	 * Initially done as an inefficient multipass algorithm.
 	 */
 	int changed;
 	do {
 		changed = 0;
 		r = _pass(mem, r, &changed);
 	} while (r && changed);
 	return r;
 }
 /*----------------------------------------------------------------*/
 struct rx_node *rx_parse_tok(struct dm_pool *mem,
 			     const char *begin, const char *end)
 {
 	struct rx_node *r;
 	struct parse_sp *ps = dm_pool_zalloc(mem, sizeof(*ps));
-	if (!ps) {
+	if (!ps)
-		stack;
+		return_NULL;
 		return NULL;
 	}
 	ps->mem = mem;
-	ps->charset = dm_bitset_create(mem, 256);
+	if (!(ps->charset = dm_bitset_create(mem, 256))) {
 		log_error("Regex charset allocation failed");
 		dm_pool_free(mem, ps);
 		return NULL;
 	}
 	ps->cursor = begin;
 	ps->rx_end = end;
 	_rx_get_token(ps);		/* load the first token */
@ -349,6 +512,13 @@ struct rx_node *rx_parse_tok(struct dm_pool *mem,
 	if (!(r = _or_term(ps))) {
 		log_error("Parse error in regex");
 		dm_pool_free(mem, ps);
 		return NULL;
 	}
 	if (!(r = _optimise(mem, r))) {
 		log_error("Regex optimisation error");
 		dm_pool_free(mem, ps);
 		return NULL;
 	}
 	return r;