1
0
mirror of git://sourceware.org/git/lvm2.git synced 2024-12-22 17:35:59 +03:00
lvm2/libdm/regex/parse_rx.c
Joe Thornber 7def56f1e4 [REGEX] Parse regexes that contain chars with value > 0x80
This is a long standing issue.  Fixed by casting a char value to
unsigned char before using it as an index into a bitset.
2010-08-09 10:29:42 +00:00

667 lines
13 KiB
C

/*
* Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
* Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
*
* This file is part of the device-mapper userspace tools.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
* of the GNU Lesser General Public License v.2.1.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "dmlib.h"
#include "parse_rx.h"
#ifdef DEBUG
#include <ctype.h>
static void _regex_print(struct rx_node *rx, int depth, unsigned show_nodes)
{
int i, numchars;
if (rx->left) {
if (rx->left->type != CHARSET && (show_nodes || (!((rx->type == CAT || rx->type == OR) && rx->left->type == CAT))))
printf("(");
_regex_print(rx->left, depth + 1, show_nodes);
if (rx->left->type != CHARSET && (show_nodes || (!((rx->type == CAT || rx->type == OR) && rx->left->type == CAT))))
printf(")");
}
/* display info about the node */
switch (rx->type) {
case CAT:
break;
case OR:
printf("|");
break;
case STAR:
printf("*");
break;
case PLUS:
printf("+");
break;
case QUEST:
printf("?");
break;
case CHARSET:
numchars = 0;
for (i = 0; i < 256; i++)
if (dm_bit(rx->charset, i) && (isprint(i) || i == HAT_CHAR || i == DOLLAR_CHAR))
numchars++;
if (numchars == 97) {
printf(".");
break;
}
if (numchars > 1)
printf("[");
for (i = 0; i < 256; i++)
if (dm_bit(rx->charset, i)) {
if isprint(i)
printf("%c", (char) i);
else if (i == HAT_CHAR)
printf("^");
else if (i == DOLLAR_CHAR)
printf("$");
}
if (numchars > 1)
printf("]");
break;
default:
fprintf(stderr, "Unknown type");
}
if (rx->right) {
if (rx->right->type != CHARSET && (show_nodes || (!(rx->type == CAT && rx->right->type == CAT) && rx->right->right)))
printf("(");
_regex_print(rx->right, depth + 1, show_nodes);
if (rx->right->type != CHARSET && (show_nodes || (!(rx->type == CAT && rx->right->type == CAT) && rx->right->right)))
printf(")");
}
if (!depth)
printf("\n");
}
#endif /* DEBUG */
struct parse_sp { /* scratch pad for the parsing process */
struct dm_pool *mem;
int type; /* token type, 0 indicates a charset */
dm_bitset_t charset; /* The current charset */
const char *cursor; /* where we are in the regex */
const char *rx_end; /* 1pte for the expression being parsed */
};
static struct rx_node *_or_term(struct parse_sp *ps);
static void _single_char(struct parse_sp *ps, unsigned int c, const char *ptr)
{
ps->type = 0;
ps->cursor = ptr + 1;
dm_bit_clear_all(ps->charset);
dm_bit_set(ps->charset, c);
}
/*
* Get the next token from the regular expression.
* Returns: 1 success, 0 end of input, -1 error.
*/
static int _rx_get_token(struct parse_sp *ps)
{
int neg = 0, range = 0;
char c, lc = 0;
const char *ptr = ps->cursor;
if (ptr == ps->rx_end) { /* end of input ? */
ps->type = -1;
return 0;
}
switch (*ptr) {
/* charsets and ncharsets */
case '[':
ptr++;
if (*ptr == '^') {
dm_bit_set_all(ps->charset);
/* never transition on zero */
dm_bit_clear(ps->charset, 0);
neg = 1;
ptr++;
} else
dm_bit_clear_all(ps->charset);
while ((ptr < ps->rx_end) && (*ptr != ']')) {
if (*ptr == '\\') {
/* an escaped character */
ptr++;
switch (*ptr) {
case 'n':
c = '\n';
break;
case 'r':
c = '\r';
break;
case 't':
c = '\t';
break;
default:
c = *ptr;
}
} else if (*ptr == '-' && lc) {
/* we've got a range on our hands */
range = 1;
ptr++;
if (ptr == ps->rx_end) {
log_error("Incomplete range"
"specification");
return -1;
}
c = *ptr;
} else
c = *ptr;
if (range) {
/* add lc - c into the bitset */
if (lc > c) {
char tmp = c;
c = lc;
lc = tmp;
}
for (; lc <= c; lc++) {
if (neg)
dm_bit_clear(ps->charset, lc);
else
dm_bit_set(ps->charset, lc);
}
range = 0;
} else {
/* add c into the bitset */
if (neg)
dm_bit_clear(ps->charset, c);
else
dm_bit_set(ps->charset, c);
}
ptr++;
lc = c;
}
if (ptr >= ps->rx_end) {
ps->type = -1;
return -1;
}
ps->type = 0;
ps->cursor = ptr + 1;
break;
/* These characters are special, we just return their ASCII
codes as the type. Sorted into ascending order to help the
compiler */
case '(':
case ')':
case '*':
case '+':
case '?':
case '|':
ps->type = (int) *ptr;
ps->cursor = ptr + 1;
break;
case '^':
_single_char(ps, HAT_CHAR, ptr);
break;
case '$':
_single_char(ps, DOLLAR_CHAR, ptr);
break;
case '.':
/* The 'all but newline' character set */
ps->type = 0;
ps->cursor = ptr + 1;
dm_bit_set_all(ps->charset);
dm_bit_clear(ps->charset, (int) '\n');
dm_bit_clear(ps->charset, (int) '\r');
dm_bit_clear(ps->charset, 0);
break;
case '\\':
/* escaped character */
ptr++;
if (ptr >= ps->rx_end) {
log_error("Badly quoted character at end "
"of expression");
ps->type = -1;
return -1;
}
ps->type = 0;
ps->cursor = ptr + 1;
dm_bit_clear_all(ps->charset);
switch (*ptr) {
case 'n':
dm_bit_set(ps->charset, (int) '\n');
break;
case 'r':
dm_bit_set(ps->charset, (int) '\r');
break;
case 't':
dm_bit_set(ps->charset, (int) '\t');
break;
default:
dm_bit_set(ps->charset, (int) *ptr);
}
break;
default:
/* add a single character to the bitset */
ps->type = 0;
ps->cursor = ptr + 1;
dm_bit_clear_all(ps->charset);
dm_bit_set(ps->charset, (int) (unsigned char) *ptr);
break;
}
return 1;
}
static struct rx_node *_node(struct dm_pool *mem, int type,
struct rx_node *l, struct rx_node *r)
{
struct rx_node *n = dm_pool_zalloc(mem, sizeof(*n));
if (n) {
if (type == CHARSET && !(n->charset = dm_bitset_create(mem, 256))) {
dm_pool_free(mem, n);
return NULL;
}
n->type = type;
n->left = l;
n->right = r;
}
return n;
}
static struct rx_node *_term(struct parse_sp *ps)
{
struct rx_node *n;
switch (ps->type) {
case 0:
if (!(n = _node(ps->mem, CHARSET, NULL, NULL))) {
stack;
return NULL;
}
dm_bit_copy(n->charset, ps->charset);
_rx_get_token(ps); /* match charset */
break;
case '(':
_rx_get_token(ps); /* match '(' */
n = _or_term(ps);
if (ps->type != ')') {
log_error("missing ')' in regular expression");
return 0;
}
_rx_get_token(ps); /* match ')' */
break;
default:
n = 0;
}
return n;
}
static struct rx_node *_closure_term(struct parse_sp *ps)
{
struct rx_node *l, *n;
if (!(l = _term(ps)))
return NULL;
for (;;) {
switch (ps->type) {
case '*':
n = _node(ps->mem, STAR, l, NULL);
break;
case '+':
n = _node(ps->mem, PLUS, l, NULL);
break;
case '?':
n = _node(ps->mem, QUEST, l, NULL);
break;
default:
return l;
}
if (!n) {
stack;
return NULL;
}
_rx_get_token(ps);
l = n;
}
return n;
}
static struct rx_node *_cat_term(struct parse_sp *ps)
{
struct rx_node *l, *r, *n;
if (!(l = _closure_term(ps)))
return NULL;
if (ps->type == '|')
return l;
if (!(r = _cat_term(ps)))
return l;
if (!(n = _node(ps->mem, CAT, l, r)))
stack;
return n;
}
static struct rx_node *_or_term(struct parse_sp *ps)
{
struct rx_node *l, *r, *n;
if (!(l = _cat_term(ps)))
return NULL;
if (ps->type != '|')
return l;
_rx_get_token(ps); /* match '|' */
if (!(r = _or_term(ps))) {
log_error("Badly formed 'or' expression");
return NULL;
}
if (!(n = _node(ps->mem, OR, l, r)))
stack;
return n;
}
/*----------------------------------------------------------------*/
/* Macros for left and right nodes. Inverted if 'leftmost' is set. */
#define LEFT(a) (leftmost ? (a)->left : (a)->right)
#define RIGHT(a) (leftmost ? (a)->right : (a)->left)
/*
* The optimiser spots common prefixes on either side of an 'or' node, and
* lifts them outside the 'or' with a 'cat'.
*/
static unsigned _depth(struct rx_node *r, unsigned leftmost)
{
int count = 1;
while (r->type != CHARSET && LEFT(r) && (leftmost || r->type != OR)) {
count++;
r = LEFT(r);
}
return count;
}
/*
* FIXME: a unique key could be built up as part of the parse, to make the
* comparison quick. Alternatively we could use cons-hashing, and then
* this would simply be a pointer comparison.
*/
static int _nodes_equal(struct rx_node *l, struct rx_node *r)
{
if (l->type != r->type)
return 0;
switch (l->type) {
case CAT:
case OR:
return _nodes_equal(l->left, r->left) &&
_nodes_equal(l->right, r->right);
case STAR:
case PLUS:
case QUEST:
return _nodes_equal(l->left, r->left);
case CHARSET:
/*
* Never change anything containing TARGET_TRANS
* used by matcher as boundary marker between concatenated
* expressions.
*/
return (!dm_bit(l->charset, TARGET_TRANS) && dm_bitset_equal(l->charset, r->charset));
}
/* NOTREACHED */
return_0;
}
static int _find_leftmost_common(struct rx_node *or,
struct rx_node **l,
struct rx_node **r,
unsigned leftmost)
{
struct rx_node *left = or->left, *right = or->right;
unsigned left_depth = _depth(left, leftmost);
unsigned right_depth = _depth(right, leftmost);
while (left_depth > right_depth) {
left = LEFT(left);
left_depth--;
}
while (right_depth > left_depth) {
right = LEFT(right);
right_depth--;
}
while (left_depth) {
if (left->type == CAT && right->type == CAT) {
if (_nodes_equal(LEFT(left), LEFT(right))) {
*l = left;
*r = right;
return 1;
}
}
left = LEFT(left);
right = LEFT(right);
left_depth--;
}
return 0;
}
/* If top node is OR, rotate (leftmost example) from ((ab)|((ac)|d)) to (((ab)|(ac))|d) */
static int _rotate_ors(struct rx_node *r, unsigned leftmost)
{
struct rx_node *old_node;
if (r->type != OR || RIGHT(r)->type != OR)
return 0;
old_node = RIGHT(r);
if (leftmost) {
r->right = RIGHT(old_node);
old_node->right = LEFT(old_node);
old_node->left = LEFT(r);
r->left = old_node;
} else {
r->left = RIGHT(old_node);
old_node->left = LEFT(old_node);
old_node->right = LEFT(r);
r->right = old_node;
}
return 1;
}
static struct rx_node *_exchange_nodes(struct dm_pool *mem, struct rx_node *r,
struct rx_node *left_cat, struct rx_node *right_cat,
unsigned leftmost)
{
struct rx_node *new_r;
if (leftmost)
new_r = _node(mem, CAT, LEFT(left_cat), r);
else
new_r = _node(mem, CAT, r, LEFT(right_cat));
if (!new_r)
return_NULL;
memcpy(left_cat, RIGHT(left_cat), sizeof(*left_cat));
memcpy(right_cat, RIGHT(right_cat), sizeof(*right_cat));
return new_r;
}
static struct rx_node *_pass(struct dm_pool *mem,
struct rx_node *r,
int *changed)
{
struct rx_node *left, *right;
/*
* walk the tree, optimising every 'or' node.
*/
switch (r->type) {
case CAT:
if (!(r->left = _pass(mem, r->left, changed)))
return_NULL;
if (!(r->right = _pass(mem, r->right, changed)))
return_NULL;
break;
case STAR:
case PLUS:
case QUEST:
if (!(r->left = _pass(mem, r->left, changed)))
return_NULL;
break;
case OR:
/* It's important we optimise sub nodes first */
if (!(r->left = _pass(mem, r->left, changed)))
return_NULL;
if (!(r->right = _pass(mem, r->right, changed)))
return_NULL;
/*
* If rotate_ors changes the tree, left and right are stale,
* so just set 'changed' to repeat the search.
*
* FIXME Check we can't 'bounce' between left and right rotations here.
*/
if (_find_leftmost_common(r, &left, &right, 1)) {
if (!_rotate_ors(r, 1))
r = _exchange_nodes(mem, r, left, right, 1);
*changed = 1;
} else if (_find_leftmost_common(r, &left, &right, 0)) {
if (!_rotate_ors(r, 0))
r = _exchange_nodes(mem, r, left, right, 0);
*changed = 1;
}
break;
case CHARSET:
break;
}
return r;
}
static struct rx_node *_optimise(struct dm_pool *mem, struct rx_node *r)
{
/*
* We're looking for (or (... (cat <foo> a)) (... (cat <foo> b)))
* and want to turn it into (cat <foo> (or (... a) (... b)))
*
* (fa)|(fb) becomes f(a|b)
*/
/*
* Initially done as an inefficient multipass algorithm.
*/
int changed;
do {
changed = 0;
r = _pass(mem, r, &changed);
} while (r && changed);
return r;
}
/*----------------------------------------------------------------*/
struct rx_node *rx_parse_tok(struct dm_pool *mem,
const char *begin, const char *end)
{
struct rx_node *r;
struct parse_sp *ps = dm_pool_zalloc(mem, sizeof(*ps));
if (!ps)
return_NULL;
ps->mem = mem;
if (!(ps->charset = dm_bitset_create(mem, 256))) {
log_error("Regex charset allocation failed");
dm_pool_free(mem, ps);
return NULL;
}
ps->cursor = begin;
ps->rx_end = end;
_rx_get_token(ps); /* load the first token */
if (!(r = _or_term(ps))) {
log_error("Parse error in regex");
dm_pool_free(mem, ps);
return NULL;
}
if (!(r = _optimise(mem, r))) {
log_error("Regex optimisation error");
dm_pool_free(mem, ps);
return NULL;
}
return r;
}
struct rx_node *rx_parse_str(struct dm_pool *mem, const char *str)
{
return rx_parse_tok(mem, str, str + strlen(str));
}