quickjs/unicode_gen.c

3058 lines
81 KiB
C
Raw Permalink Normal View History

2020-09-06 16:53:08 +00:00
/*
* Generation of Unicode tables
*
* Copyright (c) 2017-2018 Fabrice Bellard
* Copyright (c) 2017-2018 Charlie Gordon
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <inttypes.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>
#include <time.h>
#include "cutils.h"
/* define it to be able to test libunicode.c (included below when USE_TEST is set) */
//#define USE_TEST
/* profile tests */
//#define PROFILE
//#define DUMP_CASE_CONV_TABLE
//#define DUMP_TABLE_SIZE
//#define DUMP_CC_TABLE
//#define DUMP_DECOMP_TABLE
/* Ideas:
- Generalize run length encoding + index for all tables
- remove redundant tables for ID_start, ID_continue, Case_Ignorable, Cased
Case conversion:
- use a single entry for consecutive U/LF runs
- allow EXT runs of length > 1
Decomposition:
- Greek lower case (+1f10/1f10) ?
- allow holes in B runs
- suppress more upper / lower case redundancy
*/
#ifdef USE_TEST
#include "libunicode.c"
#endif
#define CHARCODE_MAX 0x10ffff
#define CC_LEN_MAX 3
/* Allocate 'size' bytes of zero-filled memory.
   Returns NULL when the allocation fails. The previous version passed
   the unchecked malloc() result to memset(), which is undefined
   behavior on failure; calloc() zero-fills and reports failure in one
   step. */
void *mallocz(size_t size)
{
    return calloc(1, size);
}
/* Return a pointer to the first character of the n-th (0-based)
   ';'-separated field of 'p'. Field 0 is 'p' itself. NULL is returned
   when the string contains fewer than 'n' separators. */
const char *get_field(const char *p, int n)
{
    int remaining;

    for (remaining = n; remaining > 0; remaining--) {
        const char *sep = strchr(p, ';');

        if (sep == NULL)
            return NULL;
        p = sep + 1; /* the next field starts right after the ';' */
    }
    return p;
}
/* Copy the n-th (0-based) ';'-separated field of 'p' into 'buf' as a
   NUL-terminated string, truncating it to buf_size - 1 characters.
   Returns 'buf'.
   If the field does not exist (get_field() returns NULL) 'buf' is set
   to the empty string; the previous version dereferenced the NULL
   pointer. With buf_size == 0 nothing is written, where the previous
   version underflowed 'buf_size - 1' and wrote past the buffer. */
const char *get_field_buf(char *buf, size_t buf_size, const char *p, int n)
{
    char *q;

    if (buf_size == 0)
        return buf;
    q = buf;
    p = get_field(p, n);
    if (p) {
        while (*p != ';' && *p != '\0') {
            /* keep one byte for the terminating NUL */
            if ((size_t)(q - buf) < buf_size - 1)
                *q++ = *p;
            p++;
        }
    }
    *q = '\0';
    return buf;
}
/* Append 'c' to the growable int array described by *pbuf (storage),
   *psize (allocated element count) and *plen (used element count).
   When the array is full it grows to max(len + 1, size * 1.5)
   elements, the same growth policy as before.
   The realloc() result is now checked: on failure the program exits
   with a message instead of storing NULL into *pbuf and writing
   through it. */
void add_char(int **pbuf, int *psize, int *plen, int c)
{
    int len, size, *buf;

    buf = *pbuf;
    size = *psize;
    len = *plen;
    if (len >= size) {
        int new_size;
        int *new_buf;

        /* size + size / 2 == size * 3 / 2 for size >= 0, without the
           intermediate multiplication */
        new_size = size + size / 2;
        if (new_size < len + 1)
            new_size = len + 1;
        new_buf = realloc(buf, sizeof(buf[0]) * new_size);
        if (!new_buf) {
            fprintf(stderr, "add_char: out of memory\n");
            exit(1);
        }
        buf = new_buf;
        size = new_size;
        *pbuf = buf;
        *psize = size;
    }
    buf[len++] = c;
    *plen = len;
}
/* Parse the n-th (0-based) ';'-separated field of 'str' as a sequence
   of whitespace-separated hexadecimal numbers.
   Returns an array built with add_char() holding the values and stores
   its length in *plen. Returns NULL with *plen == 0 when the field
   does not exist; the result is also NULL when the field holds no
   number. Parsing stops at the first character that is neither
   whitespace nor a hex digit.
   Characters are cast to unsigned char before being passed to the
   <ctype.h> functions (a negative plain char is undefined behavior
   there), and strtoul() reports its end position through a real
   'char *' instead of a cast 'const char *'. */
int *get_field_str(int *plen, const char *str, int n)
{
    const char *p;
    char *end;
    int *buf, len, size;

    p = get_field(str, n);
    if (!p) {
        *plen = 0;
        return NULL;
    }
    len = 0;
    size = 0;
    buf = NULL;
    for(;;) {
        while (isspace((unsigned char)*p))
            p++;
        if (!isxdigit((unsigned char)*p))
            break;
        add_char(&buf, &size, &len, strtoul(p, &end, 16));
        p = end;
    }
    *plen = len;
    return buf;
}
/* Read one line from 'f' into 'buf' (at most buf_size - 1 characters)
   and strip the trailing newline, if one was read.
   Returns 'buf', or NULL when fgets() reads nothing (end of file or
   error). */
char *get_line(char *buf, int buf_size, FILE *f)
{
    if (fgets(buf, buf_size, f) == NULL)
        return NULL;
    /* fgets() stops right after a newline, so the first '\n' in the
       string, if any, is its last character */
    buf[strcspn(buf, "\n")] = '\0';
    return buf;
}
/* General category tables, generated by including "unicode_gen_def.h"
   three times with a different DEF() expansion each time.
   UNICODE_GENERAL_CATEGORY presumably selects the general-category
   section of that header -- confirm against unicode_gen_def.h. */
#define UNICODE_GENERAL_CATEGORY
/* one GCAT_<id> constant per DEF() entry; GCAT_COUNT is the number of
   entries */
typedef enum {
#define DEF(id, str) GCAT_ ## id,
#include "unicode_gen_def.h"
#undef DEF
GCAT_COUNT,
} UnicodeGCEnum1;
/* the 'id' argument of each DEF() entry, stringified, in enum order */
static const char *unicode_gc_name[] = {
#define DEF(id, str) #id,
#include "unicode_gen_def.h"
#undef DEF
};
/* the 'str' argument of each DEF() entry, in enum order */
static const char *unicode_gc_short_name[] = {
#define DEF(id, str) str,
#include "unicode_gen_def.h"
#undef DEF
};
#undef UNICODE_GENERAL_CATEGORY
/* Script tables, generated by including "unicode_gen_def.h" three
   times with a different DEF() expansion each time.
   UNICODE_SCRIPT presumably selects the script section of that
   header -- confirm against unicode_gen_def.h. */
#define UNICODE_SCRIPT
/* one SCRIPT_<id> constant per DEF() entry; SCRIPT_COUNT is the number
   of entries */
typedef enum {
#define DEF(id, str) SCRIPT_ ## id,
#include "unicode_gen_def.h"
#undef DEF
SCRIPT_COUNT,
} UnicodeScriptEnum1;
/* the 'id' argument of each DEF() entry, stringified, in enum order;
   parse_scripts() looks script names up in this table */
static const char *unicode_script_name[] = {
#define DEF(id, str) #id,
#include "unicode_gen_def.h"
#undef DEF
};
/* the 'str' argument of each DEF() entry, in enum order;
   parse_script_extensions() looks names up in this table.
   NOTE(review): unlike the sibling tables this one is not 'static' --
   confirm whether external linkage is intended. */
const char *unicode_script_short_name[] = {
#define DEF(id, str) str,
#include "unicode_gen_def.h"
#undef DEF
};
#undef UNICODE_SCRIPT
/* Property tables, generated by including "unicode_gen_def.h" three
   times with a different DEF() expansion each time.
   UNICODE_PROP_LIST presumably selects the property section of that
   header -- confirm against unicode_gen_def.h. */
#define UNICODE_PROP_LIST
/* one PROP_<id> constant per DEF() entry; PROP_COUNT is the number of
   entries */
typedef enum {
#define DEF(id, str) PROP_ ## id,
#include "unicode_gen_def.h"
#undef DEF
    PROP_COUNT,
} UnicodePropEnum1;
/* the 'id' argument of each DEF() entry, stringified, in enum order */
static const char *unicode_prop_name[] = {
#define DEF(id, str) #id,
#include "unicode_gen_def.h"
#undef DEF
};
/* the 'str' argument of each DEF() entry, in enum order */
static const char *unicode_prop_short_name[] = {
#define DEF(id, str) str,
#include "unicode_gen_def.h"
#undef DEF
};
/* was '#undef UNICODE_SPROP_LIST', which left UNICODE_PROP_LIST
   defined for the rest of the translation unit */
#undef UNICODE_PROP_LIST
/* Per code point record filled in by the parse_*() functions. */
typedef struct {
/* case conv */
/* number of valid entries in u_data[] / l_data[] (0 = no mapping) */
uint8_t u_len;
uint8_t l_len;
/* upper case and lower case mappings, at most CC_LEN_MAX code points */
int u_data[CC_LEN_MAX];
int l_data[CC_LEN_MAX];
/* case folding target set by parse_case_folding(), 0 if none */
int f_code;
/* set by parse_unicode_data() from field 3 when non zero */
uint8_t combining_class;
/* 1 when the decomposition field starts with a '<...>' tag */
uint8_t is_compat:1;
/* set by parse_composition_exclusions() */
uint8_t is_excluded:1;
/* index into unicode_gc_name[] */
uint8_t general_category;
/* index into unicode_script_name[], set by parse_scripts() */
uint8_t script;
/* script_ext[] holds script_ext_len indexes into
   unicode_script_short_name[], set by parse_script_extensions() */
uint8_t script_ext_len;
uint8_t *script_ext;
/* one bit per property: bit (prop_idx & 0x1f) of word (prop_idx >> 5),
   see get_prop() and set_prop() */
uint32_t prop_bitmap_tab[3];
/* decomposition */
/* decomp_data[] holds decomp_len code points, built with add_char() */
int decomp_len;
int *decomp_data;
} CCInfo;
/* Global table indexed by code point. Its allocation is not visible in
   this part of the file; the parsers index it up to CHARCODE_MAX. */
CCInfo *unicode_db;
/* Search 'name' in a table of 'tab_len' strings, where each string is
   a ','-separated list of alternative names.
   Returns the index of the first entry with an alternative exactly
   equal to 'name', or -1 if there is none. */
int find_name(const char **tab, int tab_len, const char *name)
{
    size_t name_len = strlen(name);
    int idx;

    for (idx = 0; idx < tab_len; idx++) {
        const char *alt, *comma;

        for (alt = tab[idx]; ; alt = comma + 1) {
            size_t alt_len;

            comma = strchr(alt, ',');
            alt_len = comma ? (size_t)(comma - alt) : strlen(alt);
            if (alt_len == name_len && memcmp(alt, name, alt_len) == 0)
                return idx;
            if (comma == NULL)
                break; /* that was the last alternative of this entry */
        }
    }
    return -1;
}
/* Return the bit (0 or 1) of property 'prop_idx' for code point 'c'
   in unicode_db: word prop_idx >> 5 of the bitmap, bit
   prop_idx & 0x1f. */
static int get_prop(uint32_t c, int prop_idx)
{
    const uint32_t *bitmap = unicode_db[c].prop_bitmap_tab;
    uint32_t word = bitmap[prop_idx >> 5];

    return (word >> (prop_idx & 0x1f)) & 1;
}
/* Set (val != 0) or clear (val == 0) the bit of property 'prop_idx'
   for code point 'c' in unicode_db. */
static void set_prop(uint32_t c, int prop_idx, int val)
{
    uint32_t *word = &unicode_db[c].prop_bitmap_tab[prop_idx >> 5];
    uint32_t bit = 1U << (prop_idx & 0x1f);

    *word = val ? (*word | bit) : (*word & ~bit);
}
/* Parse the ';'-separated records of 'filename' into the global
   unicode_db table (the fields read match the UnicodeData.txt layout).
   For each record (field numbers are 0-based):
   - field 0: code point (hexadecimal)
   - field 1: name, only tested for the " Last>" range marker
   - field 2: general category, looked up in unicode_gc_name[]
   - field 3: combining class (decimal)
   - field 5: decomposition, optionally prefixed by a '<tag>'
   - field 9: 'Y' sets PROP_Bidi_Mirrored
   - fields 12 and 13: single upper case / lower case mapping
   Exits with an error message if the file cannot be opened, a record
   has fewer than three fields or the general category is unknown.

   Changes from the previous version:
   - the code point is range checked before the table is indexed
     (general_category was written before any check took place);
   - blank lines are skipped, and short lines are reported instead of
     reaching a NULL dereference in get_field_buf();
   - the combining class is parsed as decimal (base 0 would read a
     leading '0' as octal);
   - characters are cast to unsigned char for the <ctype.h> calls. */
void parse_unicode_data(const char *filename)
{
    FILE *f;
    char line[1024];
    char buf1[256];
    const char *p;
    char *end;
    int code, lc, uc, last_code;
    CCInfo *ci, *tab = unicode_db;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }
    last_code = 0;
    for(;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#' || *p == '\0')
            continue;
        /* fields 1 and 2 are read unconditionally below */
        if (!get_field(line, 2)) {
            fprintf(stderr, "%s: malformed line '%s'\n", filename, line);
            exit(1);
        }
        code = strtoul(line, NULL, 16);
        /* validate before any access to tab[code] */
        assert(code >= 0 && code <= CHARCODE_MAX);
        ci = &tab[code];

        lc = 0;
        uc = 0;
        p = get_field(line, 12);
        if (p && *p != ';') {
            uc = strtoul(p, NULL, 16);
        }
        p = get_field(line, 13);
        if (p && *p != ';') {
            lc = strtoul(p, NULL, 16);
        }
        if (uc > 0) {
            assert(ci->u_len == 0);
            ci->u_len = 1;
            ci->u_data[0] = uc;
        }
        if (lc > 0) {
            assert(ci->l_len == 0);
            ci->l_len = 1;
            ci->l_data[0] = lc;
        }

        {
            int i;
            get_field_buf(buf1, sizeof(buf1), line, 2);
            i = find_name(unicode_gc_name, countof(unicode_gc_name), buf1);
            if (i < 0) {
                fprintf(stderr, "General category '%s' not found\n",
                        buf1);
                exit(1);
            }
            ci->general_category = i;
        }

        p = get_field(line, 3);
        if (p && *p != ';' && *p != '\0') {
            int cc;
            cc = strtoul(p, NULL, 10);
            if (cc != 0) {
                ci->combining_class = cc;
                //                printf("%05x: %d\n", code, ci->combining_class);
            }
        }

        p = get_field(line, 5);
        if (p && *p != ';' && *p != '\0') {
            int size;
            ci->is_compat = 0;
            if (*p == '<') {
                /* skip the '<tag>' that marks a compatibility mapping */
                while (*p != '\0' && *p != '>')
                    p++;
                if (*p == '>')
                    p++;
                ci->is_compat = 1;
            }
            size = 0;
            for(;;) {
                while (isspace((unsigned char)*p))
                    p++;
                if (!isxdigit((unsigned char)*p))
                    break;
                add_char(&ci->decomp_data, &size, &ci->decomp_len,
                         strtoul(p, &end, 16));
                p = end;
            }
#if 0
            {
                int i;
                static int count, d_count;

                printf("%05x: %c", code, ci->is_compat ? 'C': ' ');
                for(i = 0; i < ci->decomp_len; i++)
                    printf(" %05x", ci->decomp_data[i]);
                printf("\n");
                count++;
                d_count += ci->decomp_len;
                //                printf("%d %d\n", count, d_count);
            }
#endif
        }

        p = get_field(line, 9);
        if (p && *p == 'Y') {
            set_prop(code, PROP_Bidi_Mirrored, 1);
        }

        /* handle ranges: a record whose name contains " Last>" is
           copied to every code point after the previous record */
        get_field_buf(buf1, sizeof(buf1), line, 1);
        if (strstr(buf1, " Last>")) {
            int i;
            //            printf("range: 0x%x-%0x\n", last_code, code);
            /* the struct copy below would share heap pointers */
            assert(ci->decomp_len == 0);
            assert(ci->script_ext_len == 0);
            for(i = last_code + 1; i < code; i++) {
                tab[i] = *ci;
            }
        }
        last_code = code;
    }
    fclose(f);
}
void parse_special_casing(CCInfo *tab, const char *filename)
{
FILE *f;
char line[1024];
const char *p;
int code;
CCInfo *ci;
f = fopen(filename, "rb");
if (!f) {
perror(filename);
exit(1);
}
for(;;) {
if (!get_line(line, sizeof(line), f))
break;
p = line;
while (isspace(*p))
p++;
if (*p == '#')
continue;
p = get_field(line, 0);
if (!p)
continue;
code = strtoul(p, NULL, 16);
assert(code <= CHARCODE_MAX);
ci = &tab[code];
p = get_field(line, 4);
if (p) {
/* locale dependent casing */
while (isspace(*p))
p++;
if (*p != '#' && *p != '\0')
continue;
}
p = get_field(line, 1);
if (p && *p != ';') {
ci->l_len = 0;
for(;;) {
while (isspace(*p))
p++;
if (*p == ';')
break;
assert(ci->l_len < CC_LEN_MAX);
ci->l_data[ci->l_len++] = strtoul(p, (char **)&p, 16);
}
if (ci->l_len == 1 && ci->l_data[0] == code)
ci->l_len = 0;
}
p = get_field(line, 3);
if (p && *p != ';') {
ci->u_len = 0;
for(;;) {
while (isspace(*p))
p++;
if (*p == ';')
break;
assert(ci->u_len < CC_LEN_MAX);
ci->u_data[ci->u_len++] = strtoul(p, (char **)&p, 16);
}
if (ci->u_len == 1 && ci->u_data[0] == code)
ci->u_len = 0;
}
}
fclose(f);
}
/* Parse the ';'-separated records of 'filename' (the fields read match
   the CaseFolding.txt layout) and store the folding target (field 2)
   in tab[code].f_code, code being field 0.
   Only records whose status (field 1) starts with 'C' or 'S' are
   used; all others are skipped. Asserts that a code point gets at most
   one folding and that the target is neither 0 nor the code point
   itself. Exits with an error message if the file cannot be opened.

   Changes from the previous version: characters are cast to unsigned
   char for the <ctype.h> calls (a negative plain char is undefined
   behavior there) and a negative code is rejected before indexing. */
void parse_case_folding(CCInfo *tab, const char *filename)
{
    FILE *f;
    char line[1024];
    const char *p;
    int code;
    CCInfo *ci;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }
    for(;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#')
            continue;
        p = get_field(line, 0);
        if (!p)
            continue;
        code = strtoul(p, NULL, 16);
        assert(code >= 0 && code <= CHARCODE_MAX);
        ci = &tab[code];

        p = get_field(line, 1);
        if (!p)
            continue;
        /* keep only the 'C' and 'S' statuses */
        while (isspace((unsigned char)*p))
            p++;
        if (*p != 'C' && *p != 'S')
            continue;

        p = get_field(line, 2);
        assert(p != NULL);
        assert(ci->f_code == 0);
        ci->f_code = strtoul(p, NULL, 16);
        assert(ci->f_code != 0 && ci->f_code != code);
    }
    fclose(f);
}
/* Parse 'filename', which holds one hexadecimal code point at the
   start of each data line, and set is_excluded for that code point in
   unicode_db. Lines that are empty or start with '#' or '@' (after
   leading whitespace) are ignored. Asserts that each code point is in
   1..CHARCODE_MAX. Exits with an error message if the file cannot be
   opened.

   Changes from the previous version: the character is cast to unsigned
   char for isspace() (a negative plain char is undefined behavior
   there) and the redundant '(char **)' cast is removed. */
void parse_composition_exclusions(const char *filename)
{
    FILE *f;
    char line[4096], *p;
    uint32_t c0;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }
    for(;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#' || *p == '@' || *p == '\0')
            continue;
        c0 = strtoul(p, &p, 16);
        assert(c0 > 0 && c0 <= CHARCODE_MAX);
        unicode_db[c0].is_excluded = TRUE;
    }
    fclose(f);
}
/* Parse 'filename', made of "code[..code] ; Name # comment" lines, and
   set the property 'Name' (looked up in unicode_prop_name[]) for every
   code point of the range in unicode_db.
   Lines that are empty or start with '#' or '@' are ignored, as are
   lines without a ';' after the range. The unknown property
   "Grapheme_Link" is silently skipped; any other unknown property
   prints an error and exits. Exits as well if the file cannot be
   opened.

   Changes from the previous version: the character is cast to unsigned
   char for isspace(), the buffer bound is compared as size_t, and the
   'goto next' is replaced by 'continue'. */
void parse_derived_core_properties(const char *filename)
{
    FILE *f;
    char line[4096], *p, buf[256], *q;
    uint32_t c0, c1, c;
    int i;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }
    for(;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#' || *p == '@' || *p == '\0')
            continue;
        c0 = strtoul(p, &p, 16);
        if (*p == '.' && p[1] == '.') {
            p += 2;
            c1 = strtoul(p, &p, 16);
        } else {
            c1 = c0;
        }
        assert(c1 <= CHARCODE_MAX);
        p += strspn(p, " \t");
        if (*p != ';')
            continue;
        p++;
        p += strspn(p, " \t");
        /* copy the property name, truncated to the buffer size */
        q = buf;
        while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
            if ((size_t)(q - buf) < sizeof(buf) - 1)
                *q++ = *p;
            p++;
        }
        *q = '\0';
        i = find_name(unicode_prop_name,
                      countof(unicode_prop_name), buf);
        if (i < 0) {
            if (!strcmp(buf, "Grapheme_Link"))
                continue;
            fprintf(stderr, "Property not found: %s\n", buf);
            exit(1);
        }
        for(c = c0; c <= c1; c++) {
            set_prop(c, i, 1);
        }
    }
    fclose(f);
}
/* Parse 'filename', made of "code[..code] ; Name # comment" lines, and
   set PROP_Changes_When_NFKC_Casefolded in unicode_db for every code
   point of the ranges whose name is "Changes_When_NFKC_Casefolded".
   All other names are ignored, as are lines that are empty, start with
   '#' or '@', or have no ';' after the range. Exits with an error
   message if the file cannot be opened.

   Changes from the previous version: the character is cast to unsigned
   char for isspace() (a negative plain char is undefined behavior
   there) and the buffer bound is compared as size_t. */
void parse_derived_norm_properties(const char *filename)
{
    FILE *f;
    char line[4096], *p, buf[256], *q;
    uint32_t c0, c1, c;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }
    for(;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#' || *p == '@' || *p == '\0')
            continue;
        c0 = strtoul(p, &p, 16);
        if (*p == '.' && p[1] == '.') {
            p += 2;
            c1 = strtoul(p, &p, 16);
        } else {
            c1 = c0;
        }
        assert(c1 <= CHARCODE_MAX);
        p += strspn(p, " \t");
        if (*p != ';')
            continue;
        p++;
        p += strspn(p, " \t");
        /* copy the property name, truncated to the buffer size */
        q = buf;
        while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
            if ((size_t)(q - buf) < sizeof(buf) - 1)
                *q++ = *p;
            p++;
        }
        *q = '\0';
        if (!strcmp(buf, "Changes_When_NFKC_Casefolded")) {
            for(c = c0; c <= c1; c++) {
                set_prop(c, PROP_Changes_When_NFKC_Casefolded, 1);
            }
        }
    }
    fclose(f);
}
/* Parse 'filename', made of "code[..code] ; Name # comment" lines, and
   set the property 'Name' (looked up in unicode_prop_name[]) for every
   code point of the range in unicode_db.
   Lines that are empty or start with '#' or '@' are ignored, as are
   lines without a ';' after the range. An unknown property name
   prints an error and exits, as does a file that cannot be opened.

   Changes from the previous version: the character is cast to unsigned
   char for isspace() (a negative plain char is undefined behavior
   there) and the buffer bound is compared as size_t. */
void parse_prop_list(const char *filename)
{
    FILE *f;
    char line[4096], *p, buf[256], *q;
    uint32_t c0, c1, c;
    int i;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }
    for(;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#' || *p == '@' || *p == '\0')
            continue;
        c0 = strtoul(p, &p, 16);
        if (*p == '.' && p[1] == '.') {
            p += 2;
            c1 = strtoul(p, &p, 16);
        } else {
            c1 = c0;
        }
        assert(c1 <= CHARCODE_MAX);
        p += strspn(p, " \t");
        if (*p != ';')
            continue;
        p++;
        p += strspn(p, " \t");
        /* copy the property name, truncated to the buffer size */
        q = buf;
        while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
            if ((size_t)(q - buf) < sizeof(buf) - 1)
                *q++ = *p;
            p++;
        }
        *q = '\0';
        i = find_name(unicode_prop_name,
                      countof(unicode_prop_name), buf);
        if (i < 0) {
            fprintf(stderr, "Property not found: %s\n", buf);
            exit(1);
        }
        for(c = c0; c <= c1; c++) {
            set_prop(c, i, 1);
        }
    }
    fclose(f);
}
/* Parse 'filename', made of "code[..code] ; Script # comment" lines,
   and store the index of 'Script' in unicode_script_name[] into the
   'script' field of every code point of the range in unicode_db.
   Lines that are empty or start with '#' or '@' are ignored, as are
   lines without a ';' after the range. An unknown script name prints
   an error and exits, as does a file that cannot be opened.

   Changes from the previous version: the character is cast to unsigned
   char for isspace() (a negative plain char is undefined behavior
   there) and the buffer bound is compared as size_t. */
void parse_scripts(const char *filename)
{
    FILE *f;
    char line[4096], *p, buf[256], *q;
    uint32_t c0, c1, c;
    int i;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }
    for(;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#' || *p == '@' || *p == '\0')
            continue;
        c0 = strtoul(p, &p, 16);
        if (*p == '.' && p[1] == '.') {
            p += 2;
            c1 = strtoul(p, &p, 16);
        } else {
            c1 = c0;
        }
        assert(c1 <= CHARCODE_MAX);
        p += strspn(p, " \t");
        if (*p != ';')
            continue;
        p++;
        p += strspn(p, " \t");
        /* copy the script name, truncated to the buffer size */
        q = buf;
        while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
            if ((size_t)(q - buf) < sizeof(buf) - 1)
                *q++ = *p;
            p++;
        }
        *q = '\0';
        i = find_name(unicode_script_name,
                      countof(unicode_script_name), buf);
        if (i < 0) {
            fprintf(stderr, "Unknown script: '%s'\n", buf);
            exit(1);
        }
        for(c = c0; c <= c1; c++)
            unicode_db[c].script = i;
    }
    fclose(f);
}
/* Parse 'filename', made of "code[..code] ; Name Name ... # comment"
   lines. Each name is looked up in unicode_script_short_name[]; the
   resulting index list is stored, in a freshly allocated array, in the
   script_ext / script_ext_len fields of every code point of the range
   in unicode_db.
   Lines that are empty or start with '#' or '@' are ignored, as are
   lines without a ';' after the range. An unknown script name prints
   an error and exits, as does a file that cannot be opened.

   Changes from the previous version: the malloc() result is checked
   (it was written through unconditionally); the character is cast to
   unsigned char for isspace(); the size comparisons are made in
   size_t. */
void parse_script_extensions(const char *filename)
{
    FILE *f;
    char line[4096], *p, buf[256], *q;
    uint32_t c0, c1, c;
    int i;
    uint8_t script_ext[255];
    int script_ext_len;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }
    for(;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#' || *p == '@' || *p == '\0')
            continue;
        c0 = strtoul(p, &p, 16);
        if (*p == '.' && p[1] == '.') {
            p += 2;
            c1 = strtoul(p, &p, 16);
        } else {
            c1 = c0;
        }
        assert(c1 <= CHARCODE_MAX);
        p += strspn(p, " \t");
        script_ext_len = 0;
        if (*p != ';')
            continue;
        p++;
        /* collect the script names up to the comment or end of line */
        for(;;) {
            p += strspn(p, " \t");
            q = buf;
            while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
                if ((size_t)(q - buf) < sizeof(buf) - 1)
                    *q++ = *p;
                p++;
            }
            *q = '\0';
            if (buf[0] == '\0')
                break;
            i = find_name(unicode_script_short_name,
                          countof(unicode_script_short_name), buf);
            if (i < 0) {
                fprintf(stderr, "Script not found: %s\n", buf);
                exit(1);
            }
            assert((size_t)script_ext_len < sizeof(script_ext));
            script_ext[script_ext_len++] = i;
        }
        /* every code point of the range gets its own copy */
        for(c = c0; c <= c1; c++) {
            CCInfo *ci = &unicode_db[c];
            ci->script_ext_len = script_ext_len;
            ci->script_ext = malloc(sizeof(ci->script_ext[0]) * script_ext_len);
            /* malloc(0) may legitimately return NULL */
            if (!ci->script_ext && script_ext_len > 0) {
                fprintf(stderr, "parse_script_extensions: out of memory\n");
                exit(1);
            }
            for(i = 0; i < script_ext_len; i++)
                ci->script_ext[i] = script_ext[i];
        }
    }
    fclose(f);
}
/* Print " <label>:" followed by 'len' code points in %05x format;
   prints nothing when 'len' is 0. */
static void dump_cc_seq(const char *label, const int *data, int len)
{
    int k;

    if (len == 0)
        return;
    printf(" %s:", label);
    for (k = 0; k < len; k++)
        printf(" %05x", data[k]);
}

/* Print on one line of stdout the case conversion data of 'ci', 'i'
   being the code point shown as prefix: the upper case mapping ("U:"),
   the lower case mapping ("L:") and the case folding ("F:"), each only
   when present. */
void dump_cc_info(CCInfo *ci, int i)
{
    printf("%05x:", i);
    dump_cc_seq("U", ci->u_data, ci->u_len);
    dump_cc_seq("L", ci->l_data, ci->l_len);
    if (ci->f_code != 0)
        printf(" F: %05x", ci->f_code);
    printf("\n");
}
/* Call dump_cc_info() for every code point in 0..CHARCODE_MAX whose
   entry in 'tab' has an upper case mapping, a lower case mapping or a
   case folding. */
void dump_data(CCInfo *tab)
{
    int code;

    for (code = 0; code <= CHARCODE_MAX; code++) {
        CCInfo *entry = tab + code;

        /* nothing to show for entries without any case data */
        if (entry->u_len == 0 && entry->l_len == 0 && entry->f_code == 0)
            continue;
        dump_cc_info(entry, code);
    }
}
BOOL is_complicated_case(const CCInfo *ci)
{
return (ci->u_len > 1 || ci->l_len > 1 ||
(ci->u_len > 0 && ci->l_len > 0) ||
(ci->f_code != 0) != ci->l_len ||
(ci->f_code != 0 && ci->l_data[0] != ci->f_code));
}
/* Run types stored in TableEntry.type by find_run_type(); the order
   matches run_type_str[]. Only defined here when USE_TEST is not set
   (presumably libunicode.c, included under USE_TEST, provides them --
   confirm). */
#ifndef USE_TEST
enum {
/* run of code points with a single upper case mapping, consecutive targets */
RUN_TYPE_U,
/* run with a single lower case mapping and no folding, consecutive targets */
RUN_TYPE_L,
/* run with a single upper case mapping that is also the folding */
RUN_TYPE_UF,
/* run with a single lower case mapping (simple case, so folding == lower) */
RUN_TYPE_LF,
/* alternating pairs: code -> lower code + 1, code + 1 -> upper code */
RUN_TYPE_UL,
/* three code points sharing code as upper and code + 2 as lower */
RUN_TYPE_LSU,
/* two code point upper mapping ending in 0x399 plus a lower/folding mapping */
RUN_TYPE_U2L_399_EXT2,
/* single upper case mapping below 0x1000, folding == upper + 0x20 */
RUN_TYPE_UF_D20,
/* single upper case mapping, folding == upper + 1 */
RUN_TYPE_UF_D1_EXT,
/* not assigned in the visible part of find_run_type() */
RUN_TYPE_U_EXT,
/* not assigned in the visible part of find_run_type() */
RUN_TYPE_LF_EXT,
/* two code point upper case mapping */
RUN_TYPE_U_EXT2,
/* two code point lower case mapping */
RUN_TYPE_L_EXT2,
/* three code point upper case mapping */
RUN_TYPE_U_EXT3,
};
#endif
/* Printable names of the RUN_TYPE_* constants, listed in the same
   order as the enum so that a run type can be used as index. */
const char *run_type_str[] = {
"U",
"L",
"UF",
"LF",
"UL",
"LSU",
"U2L_399_EXT2",
"UF_D20",
"UF_D1_EXT",
"U_EXT",
"LF_EXT",
"U_EXT2",
"L_EXT2",
"U_EXT3",
};
/* One run of the case conversion table, as filled by find_run_type(). */
typedef struct {
/* first code point of the run */
int code;
/* number of consecutive code points covered by the run */
int len;
/* one of the RUN_TYPE_* constants */
int type;
/* type dependent value (mapping of the first code point, or 0) */
int data;
/* number of valid entries in ext_data[], set for the *_EXT* types */
int ext_len;
int ext_data[3];
int data_index; /* 'data' coming from the table */
} TableEntry;
/* code (17), len (7), type (4) */
/* NOTE(review): the line above presumably gives the bit widths used
   when an entry is packed -- confirm against the table writer. */
/* Classify the case conversion data starting at code point 'code' of
   'tab' and describe the run that begins there in 'te':
   te->code is set to 'code', te->len to the number of consecutive code
   points covered and te->type to a RUN_TYPE_* constant. Depending on
   the type, either te->data or te->ext_data[] / te->ext_len is set;
   the other fields are left untouched.
   Aborts after printing the entry when a complicated case matches none
   of the supported encodings.
   NOTE(review): tab[code + 1] and tab[code + 2] may be read by the
   first test; whether the table has entries past CHARCODE_MAX is not
   visible here -- confirm for code near CHARCODE_MAX. */
void find_run_type(TableEntry *te, CCInfo *tab, int code)
{
int is_lower, len;
CCInfo *ci, *ci1, *ci2;
ci = &tab[code];
ci1 = &tab[code + 1];
ci2 = &tab[code + 2];
te->code = code;
/* LSU triple: code and code + 1 both lower (and fold) to code + 2,
   code + 1 and code + 2 both upper to code */
if (ci->l_len == 1 && ci->l_data[0] == code + 2 &&
ci->f_code == ci->l_data[0] &&
ci->u_len == 0 &&
ci1->l_len == 1 && ci1->l_data[0] == code + 2 &&
ci1->f_code == ci1->l_data[0] &&
ci1->u_len == 1 && ci1->u_data[0] == code &&
ci2->l_len == 0 &&
ci2->f_code == 0 &&
ci2->u_len == 1 && ci2->u_data[0] == code) {
te->len = 3;
te->data = 0;
te->type = RUN_TYPE_LSU;
return;
}
if (is_complicated_case(ci)) {
/* UF run: extend while the following entries have a single upper
   case mapping continuing from ci->u_data[0], no lower case mapping
   and a folding equal to their own upper case mapping.
   NOTE(review): only the successors are tested against this pattern,
   the entry at 'code' itself is not -- confirm this is intended. */
len = 1;
while (code + len <= CHARCODE_MAX) {
ci1 = &tab[code + len];
if (ci1->u_len != 1 ||
ci1->u_data[0] != ci->u_data[0] + len ||
ci1->l_len != 0 ||
ci1->f_code != ci1->u_data[0])
break;
len++;
}
if (len > 1) {
te->len = len;
te->type = RUN_TYPE_UF;
te->data = ci->u_data[0];
return;
}
/* run of two code point upper case mappings ending in 0x399, with
   consecutive first code points and no lower case or folding */
if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
ci->f_code == 0 && ci->l_len == 0) {
len = 1;
while (code + len <= CHARCODE_MAX) {
ci1 = &tab[code + len];
if (!(ci1->u_len == 2 &&
ci1->u_data[1] == 0x399 &&
ci1->u_data[0] == ci->u_data[0] + len &&
ci1->f_code == 0 &&
ci1->l_len == 0))
break;
len++;
}
te->len = len;
te->type = RUN_TYPE_U_EXT2;
te->ext_data[0] = ci->u_data[0];
te->ext_data[1] = ci->u_data[1];
te->ext_len = 2;
return;
}
/* same upper case shape, but with a single lower case mapping that
   is also the folding; ext_data holds the first upper case code point
   and the lower case code point */
if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
ci->l_len == 1 && ci->f_code == ci->l_data[0]) {
len = 1;
while (code + len <= CHARCODE_MAX) {
ci1 = &tab[code + len];
if (!(ci1->u_len == 2 &&
ci1->u_data[1] == 0x399 &&
ci1->u_data[0] == ci->u_data[0] + len &&
ci1->l_len == 1 &&
ci1->l_data[0] == ci->l_data[0] + len &&
ci1->f_code == ci1->l_data[0]))
break;
len++;
}
te->len = len;
te->type = RUN_TYPE_U2L_399_EXT2;
te->ext_data[0] = ci->u_data[0];
te->ext_data[1] = ci->l_data[0];
te->ext_len = 2;
return;
}
/* run of single lower case mappings without folding */
if (ci->l_len == 1 && ci->u_len == 0 && ci->f_code == 0) {
len = 1;
while (code + len <= CHARCODE_MAX) {
ci1 = &tab[code + len];
if (!(ci1->l_len == 1 &&
ci1->l_data[0] == ci->l_data[0] + len &&
ci1->u_len == 0 && ci1->f_code == 0))
break;
len++;
}
te->len = len;
te->type = RUN_TYPE_L;
te->data = ci->l_data[0];
return;
}
/* remaining shapes are encoded as runs of length 1 */
if (ci->l_len == 0 &&
ci->u_len == 1 &&
ci->u_data[0] < 0x1000 &&
ci->f_code == ci->u_data[0] + 0x20) {
te->len = 1;
te->type = RUN_TYPE_UF_D20;
te->data = ci->u_data[0];
} else if (ci->l_len == 0 &&
ci->u_len == 1 &&
ci->f_code == ci->u_data[0] + 1) {
te->len = 1;
te->type = RUN_TYPE_UF_D1_EXT;
te->ext_data[0] = ci->u_data[0];
te->ext_len = 1;
} else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_code == 0) {
te->len = 1;
te->type = RUN_TYPE_L_EXT2;
te->ext_data[0] = ci->l_data[0];
te->ext_data[1] = ci->l_data[1];
te->ext_len = 2;
} else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_code == 0) {
te->len = 1;
te->type = RUN_TYPE_U_EXT2;
te->ext_data[0] = ci->u_data[0];
te->ext_data[1] = ci->u_data[1];
te->ext_len = 2;
} else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_code == 0) {
te->len = 1;
te->type = RUN_TYPE_U_EXT3;
te->ext_data[0] = ci->u_data[0];
te->ext_data[1] = ci->u_data[1];
te->ext_data[2] = ci->u_data[2];
te->ext_len = 3;
} else {
printf("unsupported encoding case:\n");
dump_cc_info(ci, code);
abort();
}
} else {
/* look for a run of identical conversions */
/* UL run: pairs where the first code point lowers to the second and
   the second uppers to the first; the run stops once len reaches 126.
   NOTE(review): the bound test uses 'code', not 'code + len', while
   tab[code + len + 1] is read below -- confirm the table is large
   enough near CHARCODE_MAX. */
len = 0;
for(;;) {
if (code >= CHARCODE_MAX || len >= 126)
break;
ci = &tab[code + len];
ci1 = &tab[code + len + 1];
if (is_complicated_case(ci) || is_complicated_case(ci1)) {
break;
}
if (ci->l_len != 1 || ci->l_data[0] != code + len + 1)
break;
if (ci1->u_len != 1 || ci1->u_data[0] != code + len)
break;
len += 2;
}
if (len > 0) {
te->len = len;
te->type = RUN_TYPE_UL;
te->data = 0;
return;
}
/* otherwise a run of simple entries of the same kind (all lower case
   or all upper case mappings) with consecutive targets */
ci = &tab[code];
is_lower = ci->l_len > 0;
len = 1;
while (code + len <= CHARCODE_MAX) {
ci1 = &tab[code + len];
if (is_complicated_case(ci1))
break;
if (is_lower) {
if (ci1->l_len != 1 ||
ci1->l_data[0] != ci->l_data[0] + len)
break;
} else {
if (ci1->u_len != 1 ||
ci1->u_data[0] != ci->u_data[0] + len)
break;
}
len++;
}
te->len = len;
if (is_lower) {
te->type = RUN_TYPE_LF;
te->data = ci->l_data[0];
} else {
te->type = RUN_TYPE_U;
te->data = ci->u_data[0];
}
}
}