quickjs/libregexp.c

2611 lines
82 KiB
C
Raw Permalink Normal View History

2020-09-06 16:53:08 +00:00
/*
* Regular Expression Engine
*
* Copyright (c) 2017-2018 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <inttypes.h>
#include <string.h>
#include <assert.h>
#include "cutils.h"
#include "libregexp.h"
/*
TODO:
- Add full unicode canonicalize rules for character ranges (not
really useful but needed for exact "ignorecase" compatibility).
- Add a lock step execution mode (=linear time execution guaranteed)
when the regular expression is "simple" i.e. no backreference nor
complicated lookahead. The opcodes are designed for this execution
model.
*/
#if defined(TEST)
#define DUMP_REOP
#endif
typedef enum {
#define DEF(id, size) REOP_ ## id,
#include "libregexp-opcode.h"
#undef DEF
REOP_COUNT,
} REOPCodeEnum;
#define CAPTURE_COUNT_MAX 255
#define STACK_SIZE_MAX 255
/* unicode code points */
#define CP_LS 0x2028
#define CP_PS 0x2029
#define TMP_BUF_SIZE 128
typedef struct {
DynBuf byte_code;
const uint8_t *buf_ptr;
const uint8_t *buf_end;
const uint8_t *buf_start;
int re_flags;
BOOL is_utf16;
BOOL ignore_case;
BOOL dotall;
int capture_count;
int total_capture_count; /* -1 = not computed yet */
int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
2020-11-08 13:30:56 +00:00
void *opaque;
2020-09-06 16:53:08 +00:00
DynBuf group_names;
union {
char error_msg[TMP_BUF_SIZE];
char tmp_buf[TMP_BUF_SIZE];
} u;
} REParseState;
typedef struct {
#ifdef DUMP_REOP
const char *name;
#endif
uint8_t size;
} REOpCode;
static const REOpCode reopcode_info[REOP_COUNT] = {
#ifdef DUMP_REOP
#define DEF(id, size) { #id, size },
#else
#define DEF(id, size) { size },
#endif
#include "libregexp-opcode.h"
#undef DEF
};
#define RE_HEADER_FLAGS 0
#define RE_HEADER_CAPTURE_COUNT 1
#define RE_HEADER_STACK_SIZE 2
#define RE_HEADER_LEN 7
static inline int is_digit(int c) {
return c >= '0' && c <= '9';
}
2020-09-06 17:07:30 +00:00
/* insert 'len' bytes at position 'pos'. Return < 0 if error. */
static int dbuf_insert(DynBuf *s, int pos, int len)
2020-09-06 16:53:08 +00:00
{
2020-09-06 17:07:30 +00:00
if (dbuf_realloc(s, s->size + len))
return -1;
2020-09-06 16:53:08 +00:00
memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
s->size += len;
2020-09-06 17:07:30 +00:00
return 0;
2020-09-06 16:53:08 +00:00
}
/* canonicalize with the specific JS regexp rules */
static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16)
{
uint32_t res[LRE_CC_RES_LEN_MAX];
int len;
if (is_utf16) {
if (likely(c < 128)) {
if (c >= 'A' && c <= 'Z')
c = c - 'A' + 'a';
} else {
lre_case_conv(res, c, 2);
c = res[0];
}
} else {
if (likely(c < 128)) {
if (c >= 'a' && c <= 'z')
c = c - 'a' + 'A';
} else {
/* legacy regexp: to upper case if single char >= 128 */
len = lre_case_conv(res, c, FALSE);
if (len == 1 && res[0] >= 128)
c = res[0];
}
}
return c;
}
static const uint16_t char_range_d[] = {
1,
0x0030, 0x0039 + 1,
};
/* code point ranges for Zs,Zl or Zp property */
static const uint16_t char_range_s[] = {
10,
0x0009, 0x000D + 1,
0x0020, 0x0020 + 1,
0x00A0, 0x00A0 + 1,
0x1680, 0x1680 + 1,
0x2000, 0x200A + 1,
/* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
/* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
0x2028, 0x2029 + 1,
0x202F, 0x202F + 1,
0x205F, 0x205F + 1,
0x3000, 0x3000 + 1,
/* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
0xFEFF, 0xFEFF + 1,
};
BOOL lre_is_space(int c)
{
int i, n, low, high;
n = (countof(char_range_s) - 1) / 2;
for(i = 0; i < n; i++) {
low = char_range_s[2 * i + 1];
if (c < low)
return FALSE;
high = char_range_s[2 * i + 2];
if (c < high)
return TRUE;
}
return FALSE;
}
uint32_t const lre_id_start_table_ascii[4] = {
/* $ A-Z _ a-z */
0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
};
uint32_t const lre_id_continue_table_ascii[4] = {
/* $ 0-9 A-Z _ a-z */
0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE
};
static const uint16_t char_range_w[] = {
4,
0x0030, 0x0039 + 1,
0x0041, 0x005A + 1,
0x005F, 0x005F + 1,
0x0061, 0x007A + 1,
};
#define CLASS_RANGE_BASE 0x40000000
typedef enum {
CHAR_RANGE_d,
CHAR_RANGE_D,
CHAR_RANGE_s,
CHAR_RANGE_S,
CHAR_RANGE_w,
CHAR_RANGE_W,
} CharRangeEnum;
static const uint16_t *char_range_table[] = {
char_range_d,
char_range_s,
char_range_w,
};
static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
{
BOOL invert;
const uint16_t *c_pt;
int len, i;
invert = c & 1;
c_pt = char_range_table[c >> 1];
len = *c_pt++;
2020-11-08 13:30:56 +00:00
cr_init(cr, s->opaque, lre_realloc);
2020-09-06 16:53:08 +00:00
for(i = 0; i < len * 2; i++) {
if (cr_add_point(cr, c_pt[i]))
goto fail;
}
if (invert) {
if (cr_invert(cr))
goto fail;
}
return 0;
fail:
cr_free(cr);
return -1;
}
static int cr_canonicalize(CharRange *cr)
{
CharRange a;
uint32_t pt[2];
int i, ret;
cr_init(&a, cr->mem_opaque, lre_realloc);
pt[0] = 'a';
pt[1] = 'z' + 1;
ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER);
if (ret)
goto fail;
/* convert to upper case */
/* XXX: the generic unicode case would be much more complicated
and not really useful */
for(i = 0; i < a.len; i++) {
a.points[i] += 'A' - 'a';
}
/* Note: for simplicity we keep the lower case ranges */
ret = cr_union1(cr, a.points, a.len);
fail:
cr_free(&a);
return ret;
}
#ifdef DUMP_REOP
static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
int buf_len)
{
int pos, len, opcode, bc_len, re_flags, i;
uint32_t val;
assert(buf_len >= RE_HEADER_LEN);
re_flags= buf[0];
bc_len = get_u32(buf + 3);
assert(bc_len + RE_HEADER_LEN <= buf_len);
printf("flags: 0x%x capture_count=%d stack_size=%d\n",
re_flags, buf[1], buf[2]);
if (re_flags & LRE_FLAG_NAMED_GROUPS) {
const char *p;
p = (char *)buf + RE_HEADER_LEN + bc_len;
printf("named groups: ");
for(i = 1; i < buf[1]; i++) {
if (i != 1)
printf(",");
printf("<%s>", p);
p += strlen(p) + 1;
}
printf("\n");
assert(p == (char *)(buf + buf_len));
}
printf("bytecode_len=%d\n", bc_len);
buf += RE_HEADER_LEN;
pos = 0;
while (pos < bc_len) {
printf("%5u: ", pos);
opcode = buf[pos];
len = reopcode_info[opcode].size;
if (opcode >= REOP_COUNT) {
printf(" invalid opcode=0x%02x\n", opcode);
break;
}
if ((pos + len) > bc_len) {
printf(" buffer overflow (opcode=0x%02x)\n", opcode);
break;
}
printf("%s", reopcode_info[opcode].name);
switch(opcode) {
case REOP_char:
val = get_u16(buf + pos + 1);
if (val >= ' ' && val <= 126)
printf(" '%c'", val);
else
printf(" 0x%04x", val);
break;
case REOP_char32:
val = get_u32(buf + pos + 1);
if (val >= ' ' && val <= 126)
printf(" '%c'", val);
else
printf(" 0x%08x", val);
break;
case REOP_goto:
case REOP_split_goto_first:
case REOP_split_next_first:
case REOP_loop:
case REOP_lookahead:
case REOP_negative_lookahead:
case REOP_bne_char_pos:
val = get_u32(buf + pos + 1);
val += (pos + 5);
printf(" %u", val);
break;
case REOP_simple_greedy_quant:
printf(" %u %u %u %u",
get_u32(buf + pos + 1) + (pos + 17),
get_u32(buf + pos + 1 + 4),
get_u32(buf + pos + 1 + 8),
get_u32(buf + pos + 1 + 12));
break;
case REOP_save_start:
case REOP_save_end:
case REOP_back_reference:
case REOP_backward_back_reference:
printf(" %u", buf[pos + 1]);
break;
case REOP_save_reset:
printf(" %u %u", buf[pos + 1], buf[pos + 2]);
break;
case REOP_push_i32:
val = get_u32(buf + pos + 1);
printf(" %d", val);
break;
case REOP_range:
{
int n, i;
n = get_u16(buf + pos + 1);
len += n * 4;
for(i = 0; i < n * 2; i++) {
val = get_u16(buf + pos + 3 + i * 2);
printf(" 0x%04x", val);
}
}
break;
case REOP_range32:
{
int n, i;
n = get_u16(buf + pos + 1);
len += n * 8;
for(i = 0; i < n * 2; i++) {
val = get_u32(buf + pos + 3 + i * 4);
printf(" 0x%08x", val);
}
}
break;
default:
break;
}
printf("\n");
pos += len;
}
}
#endif
static void re_emit_op(REParseState *s, int op)
{
dbuf_putc(&s->byte_code, op);
}
/* return the offset of the u32 value */
static int re_emit_op_u32(REParseState *s, int op, uint32_t val)
{
int pos;
dbuf_putc(&s->byte_code, op);
pos = s->byte_code.size;
dbuf_put_u32(&s->byte_code, val);
return pos;
}
static int re_emit_goto(REParseState *s, int op, uint32_t val)
{
int pos;
dbuf_putc(&s->byte_code, op);
pos = s->byte_code.size;
dbuf_put_u32(&s->byte_code, val - (pos + 4));
return pos;
}
static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
{
dbuf_putc(&s->byte_code, op);
dbuf_putc(&s->byte_code, val);
}
static void re_emit_op_u16(REParseState *s, int op, uint32_t val)
{
dbuf_putc(&s->byte_code, op);
dbuf_put_u16(&s->byte_code, val);
}
static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
va_end(ap);
return -1;
}
2020-09-06 17:07:30 +00:00
static int re_parse_out_of_memory(REParseState *s)
{
return re_parse_error(s, "out of memory");
}
2020-09-06 17:02:03 +00:00
/* If allow_overflow is false, return -1 in case of
overflow. Otherwise return INT32_MAX. */
static int parse_digits(const uint8_t **pp, BOOL allow_overflow)
2020-09-06 16:53:08 +00:00
{
const uint8_t *p;
uint64_t v;
int c;
p = *pp;
v = 0;
for(;;) {
c = *p;
if (c < '0' || c > '9')
break;
v = v * 10 + c - '0';
2020-09-06 17:02:03 +00:00
if (v >= INT32_MAX) {
if (allow_overflow)
v = INT32_MAX;
else
return -1;
}
2020-09-06 16:53:08 +00:00
p++;
}
*pp = p;
return v;
}
static int re_parse_expect(REParseState *s, const uint8_t **pp, int c)
{
const uint8_t *p;
p = *pp;
if (*p != c)
return re_parse_error(s, "expecting '%c'", c);
p++;
*pp = p;
return 0;
}
/* Parse an escape sequence, *pp points after the '\':
allow_utf16 value:
0 : no UTF-16 escapes allowed
1 : UTF-16 escapes allowed
2 : UTF-16 escapes allowed and escapes of surrogate pairs are
converted to a unicode character (unicode regexp case).
Return the unicode char and update *pp if recognized,
return -1 if malformed escape,
return -2 otherwise. */
int lre_parse_escape(const uint8_t **pp, int allow_utf16)
{
const uint8_t *p;
uint32_t c;
p = *pp;
c = *p++;
switch(c) {
case 'b':
c = '\b';
break;
case 'f':
c = '\f';
break;
case 'n':
c = '\n';
break;
case 'r':
c = '\r';
break;
case 't':
c = '\t';
break;
case 'v':
c = '\v';
break;
case 'x':
case 'u':
{
int h, n, i;
uint32_t c1;
if (*p == '{' && allow_utf16) {
p++;
c = 0;
for(;;) {
h = from_hex(*p++);
if (h < 0)
return -1;
c = (c << 4) | h;
if (c > 0x10FFFF)
return -1;
if (*p == '}')
break;
}
p++;
} else {
if (c == 'x') {
n = 2;
} else {
n = 4;
}
c = 0;
for(i = 0; i < n; i++) {
h = from_hex(*p++);
if (h < 0) {
return -1;
}
c = (c << 4) | h;
}
if (c >= 0xd800 && c < 0xdc00 &&
allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
/* convert an escaped surrogate pair into a
unicode char */
c1 = 0;
for(i = 0; i < 4; i++) {
h = from_hex(p[2 + i]);
if (h < 0)
break;
c1 = (c1 << 4) | h;
}
if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) {
p += 6;
c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
}
}
}
}
break;
2020-09-06 17:10:15 +00:00
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
2020-09-06 16:53:08 +00:00
c -= '0';
if (allow_utf16 == 2) {
/* only accept \0 not followed by digit */
if (c != 0 || is_digit(*p))
return -1;
} else {
/* parse a legacy octal sequence */
uint32_t v;
v = *p - '0';
if (v > 7)
break;
c = (c << 3) | v;
p++;
if (c >= 32)
break;
v = *p - '0';
if (v > 7)
break;
c = (c << 3) | v;
p++;
}
break;
default:
return -2;
}
*pp = p;
return c;
}
#ifdef CONFIG_ALL_UNICODE
/* XXX: we use the same chars for name and value */
static BOOL is_unicode_char(int c)
{
return ((c >= '0' && c <= '9') ||
(c >= 'A' && c <= 'Z') ||
(c >= 'a' && c <= 'z') ||
(c == '_'));
}
static int parse_unicode_property(REParseState *s, CharRange *cr,
const uint8_t **pp, BOOL is_inv)
{
const uint8_t *p;
char name[64], value[64];
char *q;
BOOL script_ext;
int ret;
p = *pp;
if (*p != '{')
return re_parse_error(s, "expecting '{' after \\p");
p++;
q = name;
while (is_unicode_char(*p)) {
2020-11-08 13:30:56 +00:00
if ((q - name) >= sizeof(name) - 1)
2020-09-06 16:53:08 +00:00
goto unknown_property_name;
*q++ = *p++;
}
*q = '\0';
q = value;
if (*p == '=') {
p++;
while (is_unicode_char(*p)) {
2020-11-08 13:30:56 +00:00
if ((q - value) >= sizeof(value) - 1)
2020-09-06 16:53:08 +00:00
return re_parse_error(s, "unknown unicode property value");
*q++ = *p++;
}
}
*q = '\0';
if (*p != '}')
return re_parse_error(s, "expecting '}'");
p++;
// printf("name=%s value=%s\n", name, value);
if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
script_ext = FALSE;
goto do_script;
} else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
script_ext = TRUE;
do_script:
2020-11-08 13:30:56 +00:00
cr_init(cr, s->opaque, lre_realloc);
2020-09-06 16:53:08 +00:00
ret = unicode_script(cr, value, script_ext);
if (ret) {
cr_free(cr);
if (ret == -2)
return re_parse_error(s, "unknown unicode script");
else
goto out_of_memory;
}
} else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
2020-11-08 13:30:56 +00:00
cr_init(cr, s->opaque, lre_realloc);
2020-09-06 16:53:08 +00:00
ret = unicode_general_category(cr, value);
if (ret) {
cr_free(cr);
if (ret == -2)
return re_parse_error(s, "unknown unicode general category");
else
goto out_of_memory;
}
} else if (value[0] == '\0') {
2020-11-08 13:30:56 +00:00
cr_init(cr, s->opaque, lre_realloc);
2020-09-06 16:53:08 +00:00
ret = unicode_general_category(cr, name);
if (ret == -1) {
cr_free(cr);
goto out_of_memory;
}
if (ret < 0) {
ret = unicode_prop(cr, name);
if (ret) {
cr_free(cr);
if (ret == -2)
goto unknown_property_name;
else
goto out_of_memory;
}
}
} else {
unknown_property_name:
return re_parse_error(s, "unknown unicode property name");
}
if (is_inv) {
if (cr_invert(cr)) {
cr_free(cr);
return -1;
}
}
*pp = p;
return 0;
out_of_memory:
2020-09-06 17:07:30 +00:00
return re_parse_out_of_memory(s);
2020-09-06 16:53:08 +00:00
}
#endif /* CONFIG_ALL_UNICODE */
/* return -1 if error otherwise the character or a class range
(CLASS_RANGE_BASE). In case of class range, 'cr' is
initialized. Otherwise, it is ignored. */
static int get_class_atom(REParseState *s, CharRange *cr,
const uint8_t **pp, BOOL inclass)
{
const uint8_t *p;
uint32_t c;
int ret;
p = *pp;
c = *p;
switch(c) {
case '\\':
p++;
if (p >= s->buf_end)
goto unexpected_end;
c = *p++;
switch(c) {
case 'd':
c = CHAR_RANGE_d;
goto class_range;
case 'D':
c = CHAR_RANGE_D;
goto class_range;
case 's':
c = CHAR_RANGE_s;
goto class_range;
case 'S':
c = CHAR_RANGE_S;
goto class_range;
case 'w':
c = CHAR_RANGE_w;
goto class_range;
case 'W':
c = CHAR_RANGE_W;
class_range:
if (cr_init_char_range(s, cr, c))
return -1;
c = CLASS_RANGE_BASE;
break;
case 'c':
c = *p;
if ((c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
(((c >= '0' && c <= '9') || c == '_') &&
inclass && !s->is_utf16)) { /* Annex B.1.4 */
c &= 0x1f;
p++;
} else if (s->is_utf16) {
goto invalid_escape;
} else {
/* otherwise return '\' and 'c' */
p--;
c = '\\';
}
break;
#ifdef CONFIG_ALL_UNICODE
case 'p':
case 'P':
if (s->is_utf16) {
if (parse_unicode_property(s, cr, &p, (c == 'P')))
return -1;
c = CLASS_RANGE_BASE;
break;
}
/* fall thru */
#endif
default:
p--;
ret = lre_parse_escape(&p, s->is_utf16 * 2);
if (ret >= 0) {
c = ret;
} else {
if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
/* always valid to escape these characters */
goto normal_char;
} else if (s->is_utf16) {
invalid_escape:
return re_parse_error(s, "invalid escape sequence in regular expression");
} else {
/* just ignore the '\' */
goto normal_char;
}
}
break;
}
break;
case '\0':
if (p >= s->buf_end) {
unexpected_end:
return re_parse_error(s, "unexpected end");
}
/* fall thru */
default:
normal_char:
/* normal char */
if (c >= 128) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
if ((unsigned)c > 0xffff && !s->is_utf16) {
/* XXX: should handle non BMP-1 code points */
return re_parse_error(s, "malformed unicode char");
}
} else {
p++;
}
break;
}
*pp = p;
return c;
}
static int re_emit_range(REParseState *s, const CharRange *cr)
{
int len, i;
uint32_t high;
len = (unsigned)cr->len / 2;
if (len >= 65535)
return re_parse_error(s, "too many ranges");
if (len == 0) {
/* not sure it can really happen. Emit a match that is always
false */
re_emit_op_u32(s, REOP_char32, -1);
} else {
high = cr->points[cr->len - 1];
if (high == UINT32_MAX)
high = cr->points[cr->len - 2];
if (high <= 0xffff) {
/* can use 16 bit ranges with the conversion that 0xffff =
infinity */
re_emit_op_u16(s, REOP_range, len);
for(i = 0; i < cr->len; i += 2) {
dbuf_put_u16(&s->byte_code, cr->points[i]);
high = cr->points[i + 1] - 1;
if (high == UINT32_MAX - 1)
high = 0xffff;
dbuf_put_u16(&s->byte_code, high);
}
} else {
re_emit_op_u16(s, REOP_range32, len);
for(i = 0; i < cr->len; i += 2) {
dbuf_put_u32(&s->byte_code, cr->points[i]);
dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
}
}
}
return 0;
}
static int re_parse_char_class(REParseState *s, const uint8_t **pp)
{
const uint8_t *p;
uint32_t c1, c2;
CharRange cr_s, *cr = &cr_s;
CharRange cr1_s, *cr1 = &cr1_s;
BOOL invert;
2020-11-08 13:30:56 +00:00
cr_init(cr, s->opaque, lre_realloc);
2020-09-06 16:53:08 +00:00
p = *pp;
p++; /* skip '[' */
invert = FALSE;
if (*p == '^') {
p++;
invert = TRUE;
}
for(;;) {
if (*p == ']')
break;
c1 = get_class_atom(s, cr1, &p, TRUE);
if ((int)c1 < 0)
goto fail;
if (*p == '-' && p[1] != ']') {
const uint8_t *p0 = p + 1;
if (c1 >= CLASS_RANGE_BASE) {
if (s->is_utf16) {
cr_free(cr1);
goto invalid_class_range;
}
/* Annex B: match '-' character */
goto class_atom;
}
c2 = get_class_atom(s, cr1, &p0, TRUE);
if ((int)c2 < 0)
goto fail;
if (c2 >= CLASS_RANGE_BASE) {
cr_free(cr1);
if (s->is_utf16) {
goto invalid_class_range;
}
/* Annex B: match '-' character */
goto class_atom;
}
p = p0;
if (c2 < c1) {
invalid_class_range:
re_parse_error(s, "invalid class range");
goto fail;
}
if (cr_union_interval(cr, c1, c2))
goto memory_error;
} else {
class_atom:
if (c1 >= CLASS_RANGE_BASE) {
int ret;
ret = cr_union1(cr, cr1->points, cr1->len);
cr_free(cr1);
if (ret)
goto memory_error;
} else {
if (cr_union_interval(cr, c1, c1))
goto memory_error;
}
}
}
if (s->ignore_case) {
if (cr_canonicalize(cr))
goto memory_error;
}
if (invert) {
if (cr_invert(cr))
goto memory_error;
}
if (re_emit_range(s, cr))
goto fail;
cr_free(cr);
p++; /* skip ']' */
*pp = p;
return 0;
memory_error:
2020-09-06 17:07:30 +00:00
re_parse_out_of_memory(s);
2020-09-06 16:53:08 +00:00
fail:
cr_free(cr);
return -1;
}
/* Return:
1 if the opcodes in bc_buf[] always advance the character pointer.
0 if the character pointer may not be advanced.
-1 if the code may depend on side effects of its previous execution (backreference)
*/
static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
{
int pos, opcode, ret, len, i;
uint32_t val, last;
BOOL has_back_reference;
uint8_t capture_bitmap[CAPTURE_COUNT_MAX];
ret = -2; /* not known yet */
pos = 0;
has_back_reference = FALSE;
memset(capture_bitmap, 0, sizeof(capture_bitmap));
while (pos < bc_buf_len) {
opcode = bc_buf[pos];
len = reopcode_info[opcode].size;
switch(opcode) {
case REOP_range:
val = get_u16(bc_buf + pos + 1);
len += val * 4;
goto simple_char;
case REOP_range32:
val = get_u16(bc_buf + pos + 1);
len += val * 8;
goto simple_char;
case REOP_char:
case REOP_char32:
case REOP_dot:
case REOP_any:
simple_char:
if (ret == -2)
ret = 1;
break;
case REOP_line_start:
case REOP_line_end:
case REOP_push_i32:
case REOP_push_char_pos:
case REOP_drop:
case REOP_word_boundary:
case REOP_not_word_boundary:
case REOP_prev:
/* no effect */
break;
case REOP_save_start:
case REOP_save_end:
val = bc_buf[pos + 1];
capture_bitmap[val] |= 1;
break;
case REOP_save_reset:
{
val = bc_buf[pos + 1];
last = bc_buf[pos + 2];
while (val < last)
capture_bitmap[val++] |= 1;
}
break;
case REOP_back_reference:
case REOP_backward_back_reference:
val = bc_buf[pos + 1];
capture_bitmap[val] |= 2;
has_back_reference = TRUE;
break;
default:
/* safe behvior: we cannot predict the outcome */
if (ret == -2)
ret = 0;
break;
}
pos += len;
}
if (has_back_reference) {
/* check if there is back reference which references a capture
made in the some code */
for(i = 0; i < CAPTURE_COUNT_MAX; i++) {
if (capture_bitmap[i] == 3)
return -1;
}
}
if (ret == -2)
ret = 0;
return ret;
}
/* return -1 if a simple quantifier cannot be used. Otherwise return
the number of characters in the atom. */
static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
{
int pos, opcode, len, count;
uint32_t val;
count = 0;
pos = 0;
while (pos < bc_buf_len) {
opcode = bc_buf[pos];
len = reopcode_info[opcode].size;
switch(opcode) {
case REOP_range:
val = get_u16(bc_buf + pos + 1);
len += val * 4;
goto simple_char;
case REOP_range32:
val = get_u16(bc_buf + pos + 1);
len += val * 8;
goto simple_char;
case REOP_char:
case REOP_char32:
case REOP_dot:
case REOP_any:
simple_char:
count++;
break;
case REOP_line_start:
case REOP_line_end:
case REOP_word_boundary:
case REOP_not_word_boundary:
break;
default:
return -1;
}
pos += len;
}
return count;
}
/* '*pp' is the first char after '<' */
static int re_parse_group_name(char *buf, int buf_size,
const uint8_t **pp, BOOL is_utf16)
{
const uint8_t *p;
uint32_t c;
char *q;
p = *pp;
q = buf;
for(;;) {
c = *p;
if (c == '\\') {
p++;
if (*p != 'u')
return -1;
c = lre_parse_escape(&p, is_utf16 * 2);
} else if (c == '>') {
break;
} else if (c >= 128) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
} else {
p++;
}
if (c > 0x10FFFF)
return -1;
if (q == buf) {
if (!lre_js_is_ident_first(c))
return -1;
} else {
if (!lre_js_is_ident_next(c))
return -1;
}
if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
return -1;
if (c < 128) {
*q++ = c;
} else {
q += unicode_to_utf8((uint8_t*)q, c);
}
}
if (q == buf)
return -1;
*q = '\0';
p++;
*pp = p;
return 0;
}
/* if capture_name = NULL: return the number of captures + 1.
Otherwise, return the capture index corresponding to capture_name
or -1 if none */
static int re_parse_captures(REParseState *s, int *phas_named_captures,
const char *capture_name)
{
const uint8_t *p;
int capture_index;
char name[TMP_BUF_SIZE];
capture_index = 1;
*phas_named_captures = 0;