3058 lines
81 KiB
C
3058 lines
81 KiB
C
|
/*
|
||
|
* Generation of Unicode tables
|
||
|
*
|
||
|
* Copyright (c) 2017-2018 Fabrice Bellard
|
||
|
* Copyright (c) 2017-2018 Charlie Gordon
|
||
|
*
|
||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||
|
* of this software and associated documentation files (the "Software"), to deal
|
||
|
* in the Software without restriction, including without limitation the rights
|
||
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||
|
* copies of the Software, and to permit persons to whom the Software is
|
||
|
* furnished to do so, subject to the following conditions:
|
||
|
*
|
||
|
* The above copyright notice and this permission notice shall be included in
|
||
|
* all copies or substantial portions of the Software.
|
||
|
*
|
||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||
|
* THE SOFTWARE.
|
||
|
*/
|
||
|
#include <stdlib.h>
|
||
|
#include <stdio.h>
|
||
|
#include <stdarg.h>
|
||
|
#include <inttypes.h>
|
||
|
#include <string.h>
|
||
|
#include <assert.h>
|
||
|
#include <ctype.h>
|
||
|
#include <time.h>
|
||
|
|
||
|
#include "cutils.h"
|
||
|
|
||
|
/* define it to be able to test unicode.c */
|
||
|
//#define USE_TEST
|
||
|
/* profile tests */
|
||
|
//#define PROFILE
|
||
|
|
||
|
//#define DUMP_CASE_CONV_TABLE
|
||
|
//#define DUMP_TABLE_SIZE
|
||
|
//#define DUMP_CC_TABLE
|
||
|
//#define DUMP_DECOMP_TABLE
|
||
|
|
||
|
/* Ideas:
|
||
|
- Generalize run length encoding + index for all tables
|
||
|
- remove redundant tables for ID_start, ID_continue, Case_Ignorable, Cased
|
||
|
|
||
|
Case conversion:
|
||
|
- use a single entry for consecutive U/LF runs
|
||
|
- allow EXT runs of length > 1
|
||
|
|
||
|
Decomposition:
|
||
|
- Greek lower case (+1f10/1f10) ?
|
||
|
- allow holes in B runs
|
||
|
- suppress more upper / lower case redundancy
|
||
|
*/
|
||
|
|
||
|
#ifdef USE_TEST
|
||
|
#include "libunicode.c"
|
||
|
#endif
|
||
|
|
||
|
/* largest code point handled by the tables built in this file */
#define CHARCODE_MAX 0x10FFFF
/* capacity of the u_data[] / l_data[] case conversion arrays of CCInfo */
#define CC_LEN_MAX   3
|
||
|
|
||
|
/*
 * Allocate 'size' bytes of zero-filled memory.
 *
 * Returns the new buffer, or NULL when the allocation fails. The caller
 * owns the result and releases it with free().
 */
void *mallocz(size_t size)
{
    /* calloc() zero-fills the memory itself; the previous malloc() +
       memset() pair wrote through a NULL pointer on allocation failure */
    return calloc(1, size);
}
|
||
|
|
||
|
/*
 * Return a pointer to the first character of field number 'n' (0 based)
 * of the ';' separated record 'p'.
 *
 * Returns NULL when the end of the string is reached before 'n'
 * separators were skipped. With n <= 0, 'p' itself is returned.
 */
const char *get_field(const char *p, int n)
{
    int remaining;

    for (remaining = n; remaining > 0; remaining--) {
        for (; *p != ';'; p++) {
            if (*p == '\0')
                return NULL;
        }
        p++; /* step over the separator */
    }
    return p;
}
|
||
|
|
||
|
/*
 * Copy field number 'n' (0 based) of the ';' separated record 'p' into
 * 'buf' as a null terminated string.
 *
 * At most buf_size - 1 characters are stored; a longer field is silently
 * truncated. When the record has no field 'n', 'buf' receives the empty
 * string instead of dereferencing the NULL returned by get_field().
 * Nothing is written when buf_size is 0.
 *
 * Returns 'buf'.
 */
const char *get_field_buf(char *buf, size_t buf_size, const char *p, int n)
{
    char *q = buf;

    if (buf_size == 0)
        return buf; /* no room even for the terminator */
    p = get_field(p, n);
    if (p) {
        for (; *p != ';' && *p != '\0'; p++) {
            if ((size_t)(q - buf) < buf_size - 1)
                *q++ = *p;
        }
    }
    *q = '\0';
    return buf;
}
|
||
|
|
||
|
/*
 * Append 'c' to the growable int array described by (*pbuf, *psize, *plen).
 *
 * *pbuf is the heap buffer (may be NULL when *psize is 0), *psize its
 * capacity in elements and *plen the number of elements in use. When the
 * array is full, the capacity grows to max(len + 1, size * 3 / 2) and
 * *pbuf / *psize are updated. The process exits with status 1 if the
 * buffer cannot be enlarged.
 */
void add_char(int **pbuf, int *psize, int *plen, int c)
{
    int len = *plen;

    if (len >= *psize) {
        int new_size = *psize * 3 / 2;
        int *new_buf;

        if (new_size < len + 1)
            new_size = len + 1;
        /* keep the old pointer until realloc() is known to have succeeded */
        new_buf = realloc(*pbuf, sizeof(new_buf[0]) * new_size);
        if (!new_buf) {
            fprintf(stderr, "add_char: out of memory\n");
            exit(1);
        }
        *pbuf = new_buf;
        *psize = new_size;
    }
    (*pbuf)[len] = c;
    *plen = len + 1;
}
|
||
|
|
||
|
/*
 * Parse field number 'n' of the ';' separated record 'str' as a list of
 * whitespace separated hexadecimal numbers.
 *
 * Returns a heap array holding the numbers (NULL when the field is missing
 * or holds no number) and stores the number of elements in *plen. The
 * array is built with add_char(); the caller owns it.
 */
int *get_field_str(int *plen, const char *str, int n)
{
    const char *p;
    char *end;
    int *buf = NULL;
    int len = 0, size = 0;

    p = get_field(str, n);
    if (!p) {
        *plen = 0;
        return NULL;
    }
    for(;;) {
        /* <ctype.h> functions require an unsigned char value */
        while (isspace((unsigned char)*p))
            p++;
        if (!isxdigit((unsigned char)*p))
            break;
        add_char(&buf, &size, &len, strtoul(p, &end, 16));
        p = end;
    }
    *plen = len;
    return buf;
}
|
||
|
|
||
|
/*
 * Read the next line of 'f' into 'buf' (at most buf_size - 1 characters)
 * and remove the trailing '\n' when there is one.
 *
 * Returns 'buf', or NULL when fgets() could not read anything.
 */
char *get_line(char *buf, int buf_size, FILE *f)
{
    size_t n;

    if (fgets(buf, buf_size, f) == NULL)
        return NULL;
    n = strlen(buf);
    if (n != 0 && buf[n - 1] == '\n')
        buf[n - 1] = '\0';
    return buf;
}
|
||
|
|
||
|
/* General category tables. "unicode_gen_def.h" is included three times
   with UNICODE_GENERAL_CATEGORY defined, each time with a different
   expansion of its DEF(id, str) entries. */
#define UNICODE_GENERAL_CATEGORY

/* one GCAT_<id> constant per DEF() entry, followed by GCAT_COUNT */
typedef enum {
#define DEF(ident, names) GCAT_ ## ident,
#include "unicode_gen_def.h"
#undef DEF
    GCAT_COUNT,
} UnicodeGCEnum1;

/* the stringified first DEF() argument of every entry */
static const char *unicode_gc_name[] = {
#define DEF(ident, names) #ident,
#include "unicode_gen_def.h"
#undef DEF
};

/* the second DEF() argument of every entry */
static const char *unicode_gc_short_name[] = {
#define DEF(ident, names) names,
#include "unicode_gen_def.h"
#undef DEF
};

#undef UNICODE_GENERAL_CATEGORY
|
||
|
|
||
|
/* Script tables. "unicode_gen_def.h" is included three times with
   UNICODE_SCRIPT defined, each time with a different expansion of its
   DEF(id, str) entries. */
#define UNICODE_SCRIPT

/* one SCRIPT_<id> constant per DEF() entry, followed by SCRIPT_COUNT */
typedef enum {
#define DEF(ident, names) SCRIPT_ ## ident,
#include "unicode_gen_def.h"
#undef DEF
    SCRIPT_COUNT,
} UnicodeScriptEnum1;

/* the stringified first DEF() argument of every entry */
static const char *unicode_script_name[] = {
#define DEF(ident, names) #ident,
#include "unicode_gen_def.h"
#undef DEF
};

/* the second DEF() argument of every entry (not static, unlike the
   other name tables) */
const char *unicode_script_short_name[] = {
#define DEF(ident, names) names,
#include "unicode_gen_def.h"
#undef DEF
};

#undef UNICODE_SCRIPT
|
||
|
|
||
|
/* Binary property tables. "unicode_gen_def.h" is included three times
   with UNICODE_PROP_LIST defined, each time with a different expansion of
   its DEF(id, str) entries. */
#define UNICODE_PROP_LIST

/* one PROP_<id> constant per DEF() entry, followed by PROP_COUNT */
typedef enum {
#define DEF(id, str) PROP_ ## id,
#include "unicode_gen_def.h"
#undef DEF
    PROP_COUNT,
} UnicodePropEnum1;

/* the stringified first DEF() argument of every entry */
static const char *unicode_prop_name[] = {
#define DEF(id, str) #id,
#include "unicode_gen_def.h"
#undef DEF
};

/* the second DEF() argument of every entry */
static const char *unicode_prop_short_name[] = {
#define DEF(id, str) str,
#include "unicode_gen_def.h"
#undef DEF
};

/* undefine the selector that was defined above (the name used here was
   previously misspelled, which left UNICODE_PROP_LIST defined) */
#undef UNICODE_PROP_LIST
|
||
|
|
||
|
typedef struct {
|
||
|
/* case conv */
|
||
|
uint8_t u_len;
|
||
|
uint8_t l_len;
|
||
|
int u_data[CC_LEN_MAX];
|
||
|
int l_data[CC_LEN_MAX];
|
||
|
int f_code;
|
||
|
|
||
|
uint8_t combining_class;
|
||
|
uint8_t is_compat:1;
|
||
|
uint8_t is_excluded:1;
|
||
|
uint8_t general_category;
|
||
|
uint8_t script;
|
||
|
uint8_t script_ext_len;
|
||
|
uint8_t *script_ext;
|
||
|
uint32_t prop_bitmap_tab[3];
|
||
|
/* decomposition */
|
||
|
int decomp_len;
|
||
|
int *decomp_data;
|
||
|
} CCInfo;
|
||
|
|
||
|
CCInfo *unicode_db;
|
||
|
|
||
|
/*
 * Look up 'name' in the table 'tab' of 'tab_len' strings. Each string is
 * a ',' separated list of alternative names; 'name' must match one
 * alternative exactly.
 *
 * Returns the index of the first matching string, or -1 if none matches.
 */
int find_name(const char **tab, int tab_len, const char *name)
{
    int idx, alias_len, name_len;
    const char *alias, *comma;

    name_len = strlen(name);
    for (idx = 0; idx < tab_len; idx++) {
        for (alias = tab[idx]; ; alias = comma + 1) {
            comma = strchr(alias, ',');
            if (comma)
                alias_len = comma - alias;
            else
                alias_len = strlen(alias);
            if (alias_len == name_len && !memcmp(alias, name, alias_len))
                return idx;
            if (!comma)
                break; /* that was the last alternative */
        }
    }
    return -1;
}
|
||
|
|
||
|
static int get_prop(uint32_t c, int prop_idx)
|
||
|
{
|
||
|
return (unicode_db[c].prop_bitmap_tab[prop_idx >> 5] >> (prop_idx & 0x1f)) & 1;
|
||
|
}
|
||
|
|
||
|
static void set_prop(uint32_t c, int prop_idx, int val)
|
||
|
{
|
||
|
uint32_t mask;
|
||
|
mask = 1U << (prop_idx & 0x1f);
|
||
|
if (val)
|
||
|
unicode_db[c].prop_bitmap_tab[prop_idx >> 5] |= mask;
|
||
|
else
|
||
|
unicode_db[c].prop_bitmap_tab[prop_idx >> 5] &= ~mask;
|
||
|
}
|
||
|
|
||
|
void parse_unicode_data(const char *filename)
|
||
|
{
|
||
|
FILE *f;
|
||
|
char line[1024];
|
||
|
char buf1[256];
|
||
|
const char *p;
|
||
|
int code, lc, uc, last_code;
|
||
|
CCInfo *ci, *tab = unicode_db;
|
||
|
|
||
|
f = fopen(filename, "rb");
|
||
|
if (!f) {
|
||
|
perror(filename);
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
last_code = 0;
|
||
|
for(;;) {
|
||
|
if (!get_line(line, sizeof(line), f))
|
||
|
break;
|
||
|
p = line;
|
||
|
while (isspace(*p))
|
||
|
p++;
|
||
|
if (*p == '#')
|
||
|
continue;
|
||
|
|
||
|
p = get_field(line, 0);
|
||
|
if (!p)
|
||
|
continue;
|
||
|
code = strtoul(p, NULL, 16);
|
||
|
lc = 0;
|
||
|
uc = 0;
|
||
|
|
||
|
p = get_field(line, 12);
|
||
|
if (p && *p != ';') {
|
||
|
uc = strtoul(p, NULL, 16);
|
||
|
}
|
||
|
|
||
|
p = get_field(line, 13);
|
||
|
if (p && *p != ';') {
|
||
|
lc = strtoul(p, NULL, 16);
|
||
|
}
|
||
|
ci = &tab[code];
|
||
|
if (uc > 0 || lc > 0) {
|
||
|
assert(code <= CHARCODE_MAX);
|
||
|
if (uc > 0) {
|
||
|
assert(ci->u_len == 0);
|
||
|
ci->u_len = 1;
|
||
|
ci->u_data[0] = uc;
|
||
|
}
|
||
|
if (lc > 0) {
|
||
|
assert(ci->l_len == 0);
|
||
|
ci->l_len = 1;
|
||
|
ci->l_data[0] = lc;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
{
|
||
|
int i;
|
||
|
get_field_buf(buf1, sizeof(buf1), line, 2);
|
||
|
i = find_name(unicode_gc_name, countof(unicode_gc_name), buf1);
|
||
|
if (i < 0) {
|
||
|
fprintf(stderr, "General category '%s' not found\n",
|
||
|
buf1);
|
||
|
exit(1);
|
||
|
}
|
||
|
ci->general_category = i;
|
||
|
}
|
||
|
|
||
|
p = get_field(line, 3);
|
||
|
if (p && *p != ';' && *p != '\0') {
|
||
|
int cc;
|
||
|
cc = strtoul(p, NULL, 0);
|
||
|
if (cc != 0) {
|
||
|
assert(code <= CHARCODE_MAX);
|
||
|
ci->combining_class = cc;
|
||
|
// printf("%05x: %d\n", code, ci->combining_class);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
p = get_field(line, 5);
|
||
|
if (p && *p != ';' && *p != '\0') {
|
||
|
int size;
|
||
|
assert(code <= CHARCODE_MAX);
|
||
|
ci->is_compat = 0;
|
||
|
if (*p == '<') {
|
||
|
while (*p != '\0' && *p != '>')
|
||
|
p++;
|
||
|
if (*p == '>')
|
||
|
p++;
|
||
|
ci->is_compat = 1;
|
||
|
}
|
||
|
size = 0;
|
||
|
for(;;) {
|
||
|
while (isspace(*p))
|
||
|
p++;
|
||
|
if (!isxdigit(*p))
|
||
|
break;
|
||
|
add_char(&ci->decomp_data, &size, &ci->decomp_len, strtoul(p, (char **)&p, 16));
|
||
|
}
|
||
|
#if 0
|
||
|
{
|
||
|
int i;
|
||
|
static int count, d_count;
|
||
|
|
||
|
printf("%05x: %c", code, ci->is_compat ? 'C': ' ');
|
||
|
for(i = 0; i < ci->decomp_len; i++)
|
||
|
printf(" %05x", ci->decomp_data[i]);
|
||
|
printf("\n");
|
||
|
count++;
|
||
|
d_count += ci->decomp_len;
|
||
|
// printf("%d %d\n", count, d_count);
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
p = get_field(line, 9);
|
||
|
if (p && *p == 'Y') {
|
||
|
set_prop(code, PROP_Bidi_Mirrored, 1);
|
||
|
}
|
||
|
|
||
|
/* handle ranges */
|
||
|
get_field_buf(buf1, sizeof(buf1), line, 1);
|
||
|
if (strstr(buf1, " Last>")) {
|
||
|
int i;
|
||
|
// printf("range: 0x%x-%0x\n", last_code, code);
|
||
|
assert(ci->decomp_len == 0);
|
||
|
assert(ci->script_ext_len == 0);
|
||
|
for(i = last_code + 1; i < code; i++) {
|
||
|
unicode_db[i] = *ci;
|
||
|
}
|
||
|
}
|
||
|
last_code = code;
|
||
|
}
|
||
|
|
||
|
fclose(f);
|
||
|
}
|
||
|
|
||
|
void parse_special_casing(CCInfo *tab, const char *filename)
|
||
|
{
|
||
|
FILE *f;
|
||
|
char line[1024];
|
||
|
const char *p;
|
||
|
int code;
|
||
|
CCInfo *ci;
|
||
|
|
||
|
f = fopen(filename, "rb");
|
||
|
if (!f) {
|
||
|
perror(filename);
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
for(;;) {
|
||
|
if (!get_line(line, sizeof(line), f))
|
||
|
break;
|
||
|
p = line;
|
||
|
while (isspace(*p))
|
||
|
p++;
|
||
|
if (*p == '#')
|
||
|
continue;
|
||
|
|
||
|
p = get_field(line, 0);
|
||
|
if (!p)
|
||
|
continue;
|
||
|
code = strtoul(p, NULL, 16);
|
||
|
assert(code <= CHARCODE_MAX);
|
||
|
ci = &tab[code];
|
||
|
|
||
|
p = get_field(line, 4);
|
||
|
if (p) {
|
||
|
/* locale dependent casing */
|
||
|
while (isspace(*p))
|
||
|
p++;
|
||
|
if (*p != '#' && *p != '\0')
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
|
||
|
p = get_field(line, 1);
|
||
|
if (p && *p != ';') {
|
||
|
ci->l_len = 0;
|
||
|
for(;;) {
|
||
|
while (isspace(*p))
|
||
|
p++;
|
||
|
if (*p == ';')
|
||
|
break;
|
||
|
assert(ci->l_len < CC_LEN_MAX);
|
||
|
ci->l_data[ci->l_len++] = strtoul(p, (char **)&p, 16);
|
||
|
}
|
||
|
|
||
|
if (ci->l_len == 1 && ci->l_data[0] == code)
|
||
|
ci->l_len = 0;
|
||
|
}
|
||
|
|
||
|
p = get_field(line, 3);
|
||
|
if (p && *p != ';') {
|
||
|
ci->u_len = 0;
|
||
|
for(;;) {
|
||
|
while (isspace(*p))
|
||
|
p++;
|
||
|
if (*p == ';')
|
||
|
break;
|
||
|
assert(ci->u_len < CC_LEN_MAX);
|
||
|
ci->u_data[ci->u_len++] = strtoul(p, (char **)&p, 16);
|
||
|
}
|
||
|
|
||
|
if (ci->u_len == 1 && ci->u_data[0] == code)
|
||
|
ci->u_len = 0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
fclose(f);
|
||
|
}
|
||
|
|
||
|
/*
 * Read the ';' separated records of 'filename' and store the case folding
 * code point (field 2, hexadecimal) in the f_code member of the matching
 * entry of 'tab', which is indexed by code point.
 *
 * Only records whose field 1 starts with 'C' or 'S' are used; all others
 * are skipped. Lines starting with '#' are ignored. The process exits
 * with status 1 if the file cannot be opened; a code point with two
 * usable records, or one that folds to 0 or to itself, trips an assert.
 */
void parse_case_folding(CCInfo *tab, const char *filename)
{
    FILE *f;
    char line[1024];
    const char *p;
    int code;
    CCInfo *ci;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }

    for(;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        /* <ctype.h> functions require an unsigned char value */
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#')
            continue;

        p = get_field(line, 0);
        if (!p)
            continue;
        code = strtoul(p, NULL, 16);
        assert(code <= CHARCODE_MAX);
        ci = &tab[code];

        p = get_field(line, 1);
        if (!p)
            continue;
        /* keep the 'C' and 'S' records only */
        while (isspace((unsigned char)*p))
            p++;
        if (*p != 'C' && *p != 'S')
            continue;

        p = get_field(line, 2);
        assert(p != NULL);
        assert(ci->f_code == 0);
        ci->f_code = strtoul(p, NULL, 16);
        assert(ci->f_code != 0 && ci->f_code != code);
    }

    fclose(f);
}
|
||
|
|
||
|
void parse_composition_exclusions(const char *filename)
|
||
|
{
|
||
|
FILE *f;
|
||
|
char line[4096], *p;
|
||
|
uint32_t c0;
|
||
|
|
||
|
f = fopen(filename, "rb");
|
||
|
if (!f) {
|
||
|
perror(filename);
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
for(;;) {
|
||
|
if (!get_line(line, sizeof(line), f))
|
||
|
break;
|
||
|
p = line;
|
||
|
while (isspace(*p))
|
||
|
p++;
|
||
|
if (*p == '#' || *p == '@' || *p == '\0')
|
||
|
continue;
|
||
|
c0 = strtoul(p, (char **)&p, 16);
|
||
|
assert(c0 > 0 && c0 <= CHARCODE_MAX);
|
||
|
unicode_db[c0].is_excluded = TRUE;
|
||
|
}
|
||
|
fclose(f);
|
||
|
}
|
||
|
|
||
|
void parse_derived_core_properties(const char *filename)
|
||
|
{
|
||
|
FILE *f;
|
||
|
char line[4096], *p, buf[256], *q;
|
||
|
uint32_t c0, c1, c;
|
||
|
int i;
|
||
|
|
||
|
f = fopen(filename, "rb");
|
||
|
if (!f) {
|
||
|
perror(filename);
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
for(;;) {
|
||
|
if (!get_line(line, sizeof(line), f))
|
||
|
break;
|
||
|
p = line;
|
||
|
while (isspace(*p))
|
||
|
p++;
|
||
|
if (*p == '#' || *p == '@' || *p == '\0')
|
||
|
continue;
|
||
|
c0 = strtoul(p, (char **)&p, 16);
|
||
|
if (*p == '.' && p[1] == '.') {
|
||
|
p += 2;
|
||
|
c1 = strtoul(p, (char **)&p, 16);
|
||
|
} else {
|
||
|
c1 = c0;
|
||
|
}
|
||
|
assert(c1 <= CHARCODE_MAX);
|
||
|
p += strspn(p, " \t");
|
||
|
if (*p == ';') {
|
||
|
p++;
|
||
|
p += strspn(p, " \t");
|
||
|
q = buf;
|
||
|
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
|
||
|
if ((q - buf) < sizeof(buf) - 1)
|
||
|
*q++ = *p;
|
||
|
p++;
|
||
|
}
|
||
|
*q = '\0';
|
||
|
i = find_name(unicode_prop_name,
|
||
|
countof(unicode_prop_name), buf);
|
||
|
if (i < 0) {
|
||
|
if (!strcmp(buf, "Grapheme_Link"))
|
||
|
goto next;
|
||
|
fprintf(stderr, "Property not found: %s\n", buf);
|
||
|
exit(1);
|
||
|
}
|
||
|
for(c = c0; c <= c1; c++) {
|
||
|
set_prop(c, i, 1);
|
||
|
}
|
||
|
next: ;
|
||
|
}
|
||
|
}
|
||
|
fclose(f);
|
||
|
}
|
||
|
|
||
|
void parse_derived_norm_properties(const char *filename)
|
||
|
{
|
||
|
FILE *f;
|
||
|
char line[4096], *p, buf[256], *q;
|
||
|
uint32_t c0, c1, c;
|
||
|
|
||
|
f = fopen(filename, "rb");
|
||
|
if (!f) {
|
||
|
perror(filename);
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
for(;;) {
|
||
|
if (!get_line(line, sizeof(line), f))
|
||
|
break;
|
||
|
p = line;
|
||
|
while (isspace(*p))
|
||
|
p++;
|
||
|
if (*p == '#' || *p == '@' || *p == '\0')
|
||
|
continue;
|
||
|
c0 = strtoul(p, (char **)&p, 16);
|
||
|
if (*p == '.' && p[1] == '.') {
|
||
|
p += 2;
|
||
|
c1 = strtoul(p, (char **)&p, 16);
|
||
|
} else {
|
||
|
c1 = c0;
|
||
|
}
|
||
|
assert(c1 <= CHARCODE_MAX);
|
||
|
p += strspn(p, " \t");
|
||
|
if (*p == ';') {
|
||
|
p++;
|
||
|
p += strspn(p, " \t");
|
||
|
q = buf;
|
||
|
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
|
||
|
if ((q - buf) < sizeof(buf) - 1)
|
||
|
*q++ = *p;
|
||
|
p++;
|
||
|
}
|
||
|
*q = '\0';
|
||
|
if (!strcmp(buf, "Changes_When_NFKC_Casefolded")) {
|
||
|
for(c = c0; c <= c1; c++) {
|
||
|
set_prop(c, PROP_Changes_When_NFKC_Casefolded, 1);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
fclose(f);
|
||
|
}
|
||
|
|
||
|
void parse_prop_list(const char *filename)
|
||
|
{
|
||
|
FILE *f;
|
||
|
char line[4096], *p, buf[256], *q;
|
||
|
uint32_t c0, c1, c;
|
||
|
int i;
|
||
|
|
||
|
f = fopen(filename, "rb");
|
||
|
if (!f) {
|
||
|
perror(filename);
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
for(;;) {
|
||
|
if (!get_line(line, sizeof(line), f))
|
||
|
break;
|
||
|
p = line;
|
||
|
while (isspace(*p))
|
||
|
p++;
|
||
|
if (*p == '#' || *p == '@' || *p == '\0')
|
||
|
continue;
|
||
|
c0 = strtoul(p, (char **)&p, 16);
|
||
|
if (*p == '.' && p[1] == '.') {
|
||
|
p += 2;
|
||
|
c1 = strtoul(p, (char **)&p, 16);
|
||
|
} else {
|
||
|
c1 = c0;
|
||
|
}
|
||
|
assert(c1 <= CHARCODE_MAX);
|
||
|
p += strspn(p, " \t");
|
||
|
if (*p == ';') {
|
||
|
p++;
|
||
|
p += strspn(p, " \t");
|
||
|
q = buf;
|
||
|
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
|
||
|
if ((q - buf) < sizeof(buf) - 1)
|
||
|
*q++ = *p;
|
||
|
p++;
|
||
|
}
|
||
|
*q = '\0';
|
||
|
i = find_name(unicode_prop_name,
|
||
|
countof(unicode_prop_name), buf);
|
||
|
if (i < 0) {
|
||
|
fprintf(stderr, "Property not found: %s\n", buf);
|
||
|
exit(1);
|
||
|
}
|
||
|
for(c = c0; c <= c1; c++) {
|
||
|
set_prop(c, i, 1);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
fclose(f);
|
||
|
}
|
||
|
|
||
|
void parse_scripts(const char *filename)
|
||
|
{
|
||
|
FILE *f;
|
||
|
char line[4096], *p, buf[256], *q;
|
||
|
uint32_t c0, c1, c;
|
||
|
int i;
|
||
|
|
||
|
f = fopen(filename, "rb");
|
||
|
if (!f) {
|
||
|
perror(filename);
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
for(;;) {
|
||
|
if (!get_line(line, sizeof(line), f))
|
||
|
break;
|
||
|
p = line;
|
||
|
while (isspace(*p))
|
||
|
p++;
|
||
|
if (*p == '#' || *p == '@' || *p == '\0')
|
||
|
continue;
|
||
|
c0 = strtoul(p, (char **)&p, 16);
|
||
|
if (*p == '.' && p[1] == '.') {
|
||
|
p += 2;
|
||
|
c1 = strtoul(p, (char **)&p, 16);
|
||
|
} else {
|
||
|
c1 = c0;
|
||
|
}
|
||
|
assert(c1 <= CHARCODE_MAX);
|
||
|
p += strspn(p, " \t");
|
||
|
if (*p == ';') {
|
||
|
p++;
|
||
|
p += strspn(p, " \t");
|
||
|
q = buf;
|
||
|
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
|
||
|
if ((q - buf) < sizeof(buf) - 1)
|
||
|
*q++ = *p;
|
||
|
p++;
|
||
|
}
|
||
|
*q = '\0';
|
||
|
i = find_name(unicode_script_name,
|
||
|
countof(unicode_script_name), buf);
|
||
|
if (i < 0) {
|
||
|
fprintf(stderr, "Unknown script: '%s'\n", buf);
|
||
|
exit(1);
|
||
|
}
|
||
|
for(c = c0; c <= c1; c++)
|
||
|
unicode_db[c].script = i;
|
||
|
}
|
||
|
}
|
||
|
fclose(f);
|
||
|
}
|
||
|
|
||
|
void parse_script_extensions(const char *filename)
|
||
|
{
|
||
|
FILE *f;
|
||
|
char line[4096], *p, buf[256], *q;
|
||
|
uint32_t c0, c1, c;
|
||
|
int i;
|
||
|
uint8_t script_ext[255];
|
||
|
int script_ext_len;
|
||
|
|
||
|
f = fopen(filename, "rb");
|
||
|
if (!f) {
|
||
|
perror(filename);
|
||
|
exit(1);
|
||
|
}
|
||
|
|
||
|
for(;;) {
|
||
|
if (!get_line(line, sizeof(line), f))
|
||
|
break;
|
||
|
p = line;
|
||
|
while (isspace(*p))
|
||
|
p++;
|
||
|
if (*p == '#' || *p == '@' || *p == '\0')
|
||
|
continue;
|
||
|
c0 = strtoul(p, (char **)&p, 16);
|
||
|
if (*p == '.' && p[1] == '.') {
|
||
|
p += 2;
|
||
|
c1 = strtoul(p, (char **)&p, 16);
|
||
|
} else {
|
||
|
c1 = c0;
|
||
|
}
|
||
|
assert(c1 <= CHARCODE_MAX);
|
||
|
p += strspn(p, " \t");
|
||
|
script_ext_len = 0;
|
||
|
if (*p == ';') {
|
||
|
p++;
|
||
|
for(;;) {
|
||
|
p += strspn(p, " \t");
|
||
|
q = buf;
|
||
|
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
|
||
|
if ((q - buf) < sizeof(buf) - 1)
|
||
|
*q++ = *p;
|
||
|
p++;
|
||
|
}
|
||
|
*q = '\0';
|
||
|
if (buf[0] == '\0')
|
||
|
break;
|
||
|
i = find_name(unicode_script_short_name,
|
||
|
countof(unicode_script_short_name), buf);
|
||
|
if (i < 0) {
|
||
|
fprintf(stderr, "Script not found: %s\n", buf);
|
||
|
exit(1);
|
||
|
}
|
||
|
assert(script_ext_len < sizeof(script_ext));
|
||
|
script_ext[script_ext_len++] = i;
|
||
|
}
|
||
|
for(c = c0; c <= c1; c++) {
|
||
|
CCInfo *ci = &unicode_db[c];
|
||
|
ci->script_ext_len = script_ext_len;
|
||
|
ci->script_ext = malloc(sizeof(ci->script_ext[0]) * script_ext_len);
|
||
|
for(i = 0; i < script_ext_len; i++)
|
||
|
ci->script_ext[i] = script_ext[i];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
fclose(f);
|
||
|
}
|
||
|
|
||
|
/*
 * Print the case conversion data of 'ci' on one line of stdout:
 * the code point 'i', then the upper case mapping ("U:"), the lower case
 * mapping ("L:") and the case folding ("F:") when they are present.
 */
void dump_cc_info(CCInfo *ci, int i)
{
    int k;

    printf("%05x:", i);
    if (ci->u_len != 0) {
        printf(" U:");
        k = 0;
        while (k < ci->u_len) {
            printf(" %05x", ci->u_data[k]);
            k++;
        }
    }
    if (ci->l_len != 0) {
        printf(" L:");
        k = 0;
        while (k < ci->l_len) {
            printf(" %05x", ci->l_data[k]);
            k++;
        }
    }
    if (ci->f_code != 0)
        printf(" F: %05x", ci->f_code);
    printf("\n");
}
|
||
|
|
||
|
/* Print, with dump_cc_info(), every entry of 'tab' (code points 0 to
   CHARCODE_MAX) that has an upper case, lower case or folding mapping. */
void dump_data(CCInfo *tab)
{
    int code;

    for (code = 0; code <= CHARCODE_MAX; code++) {
        CCInfo *entry = &tab[code];

        if (ci_has_no_case_data:
            0) { }
        if (entry->u_len == 0 && entry->l_len == 0 && entry->f_code == 0)
            continue;
        dump_cc_info(entry, code);
    }
}
|
||
|
|
||
|
BOOL is_complicated_case(const CCInfo *ci)
|
||
|
{
|
||
|
return (ci->u_len > 1 || ci->l_len > 1 ||
|
||
|
(ci->u_len > 0 && ci->l_len > 0) ||
|
||
|
(ci->f_code != 0) != ci->l_len ||
|
||
|
(ci->f_code != 0 && ci->l_data[0] != ci->f_code));
|
||
|
}
|
||
|
|
||
|
/* Run types produced by find_run_type(); the order matches
   run_type_str[]. Not defined here when USE_TEST is set. */
#ifndef USE_TEST
enum {
    RUN_TYPE_U = 0,
    RUN_TYPE_L,
    RUN_TYPE_UF,
    RUN_TYPE_LF,
    RUN_TYPE_UL,
    RUN_TYPE_LSU,
    RUN_TYPE_U2L_399_EXT2,
    RUN_TYPE_UF_D20,
    RUN_TYPE_UF_D1_EXT,
    RUN_TYPE_U_EXT,
    RUN_TYPE_LF_EXT,
    RUN_TYPE_U_EXT2,
    RUN_TYPE_L_EXT2,
    RUN_TYPE_U_EXT3,
};
#endif
|
||
|
|
||
|
/* printable name of each RUN_TYPE_xxx value, in enum order */
const char *run_type_str[] = {
    "U",            "L",
    "UF",           "LF",
    "UL",           "LSU",
    "U2L_399_EXT2", "UF_D20",
    "UF_D1_EXT",    "U_EXT",
    "LF_EXT",       "U_EXT2",
    "L_EXT2",       "U_EXT3",
};
|
||
|
|
||
|
/* One run of the case conversion table, as filled by find_run_type(). */
typedef struct {
    int code;        /* first code point of the run */
    int len;         /* number of code points in the run */
    int type;        /* RUN_TYPE_xxx */
    int data;        /* mapping of the first code point (plain run types) */
    int ext_len;     /* number of valid entries in ext_data[] */
    int ext_data[3]; /* code points of the xxx_EXT run types */
    int data_index;  /* 'data' coming from the table */
} TableEntry;
|
||
|
|
||
|
/* code (17), len (7), type (4) */
|
||
|
|
||
|
/*
 * Classify the case conversion data starting at code point 'code' of
 * 'tab' (indexed by code point, CHARCODE_MAX + 1 entries) and describe
 * the longest run of the recognized form in 'te'.
 *
 * te->code, te->len and te->type are always set. te->data is set for the
 * U, L, UF, LF, UL, LSU and UF_D20 types; te->ext_data[] / te->ext_len
 * are set for the xxx_EXT types. The other members of 'te' are left
 * untouched. An entry matching none of the supported forms is printed
 * with dump_cc_info() and the process is aborted.
 *
 * The table is never read past CHARCODE_MAX: the LSU test and the UL run
 * are both bounded by the end of the table.
 */
void find_run_type(TableEntry *te, CCInfo *tab, int code)
{
    int is_lower, len;
    CCInfo *ci, *ci1, *ci2;

    ci = &tab[code];
    te->code = code;

    /* LSU: three consecutive code points where the first two lower case
       (and fold) to code + 2 and the last two upper case to 'code' */
    if (code + 2 <= CHARCODE_MAX) {
        ci1 = &tab[code + 1];
        ci2 = &tab[code + 2];
        if (ci->l_len == 1 && ci->l_data[0] == code + 2 &&
            ci->f_code == ci->l_data[0] &&
            ci->u_len == 0 &&

            ci1->l_len == 1 && ci1->l_data[0] == code + 2 &&
            ci1->f_code == ci1->l_data[0] &&
            ci1->u_len == 1 && ci1->u_data[0] == code &&

            ci2->l_len == 0 &&
            ci2->f_code == 0 &&
            ci2->u_len == 1 && ci2->u_data[0] == code) {
            te->len = 3;
            te->data = 0;
            te->type = RUN_TYPE_LSU;
            return;
        }
    }

    if (is_complicated_case(ci)) {
        /* UF: following entries have a single upper case mapping that
           continues the sequence and is also their case folding */
        len = 1;
        while (code + len <= CHARCODE_MAX) {
            ci1 = &tab[code + len];
            if (ci1->u_len != 1 ||
                ci1->u_data[0] != ci->u_data[0] + len ||
                ci1->l_len != 0 ||
                ci1->f_code != ci1->u_data[0])
                break;
            len++;
        }
        if (len > 1) {
            te->len = len;
            te->type = RUN_TYPE_UF;
            te->data = ci->u_data[0];
            return;
        }

        /* two code point upper case mapping ending with U+0399, no lower
           case and no folding */
        if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
            ci->f_code == 0 && ci->l_len == 0) {
            len = 1;
            while (code + len <= CHARCODE_MAX) {
                ci1 = &tab[code + len];
                if (!(ci1->u_len == 2 &&
                      ci1->u_data[1] == 0x399 &&
                      ci1->u_data[0] == ci->u_data[0] + len &&
                      ci1->f_code == 0 &&
                      ci1->l_len == 0))
                    break;
                len++;
            }
            te->len = len;
            te->type = RUN_TYPE_U_EXT2;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->u_data[1];
            te->ext_len = 2;
            return;
        }

        /* same upper case form, with a single lower case mapping that is
           also the case folding */
        if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
            ci->l_len == 1 && ci->f_code == ci->l_data[0]) {
            len = 1;
            while (code + len <= CHARCODE_MAX) {
                ci1 = &tab[code + len];
                if (!(ci1->u_len == 2 &&
                      ci1->u_data[1] == 0x399 &&
                      ci1->u_data[0] == ci->u_data[0] + len &&
                      ci1->l_len == 1 &&
                      ci1->l_data[0] == ci->l_data[0] + len &&
                      ci1->f_code == ci1->l_data[0]))
                    break;
                len++;
            }
            te->len = len;
            te->type = RUN_TYPE_U2L_399_EXT2;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->l_data[0];
            te->ext_len = 2;
            return;
        }

        /* L: single lower case mapping without case folding */
        if (ci->l_len == 1 && ci->u_len == 0 && ci->f_code == 0) {
            len = 1;
            while (code + len <= CHARCODE_MAX) {
                ci1 = &tab[code + len];
                if (!(ci1->l_len == 1 &&
                      ci1->l_data[0] == ci->l_data[0] + len &&
                      ci1->u_len == 0 && ci1->f_code == 0))
                    break;
                len++;
            }
            te->len = len;
            te->type = RUN_TYPE_L;
            te->data = ci->l_data[0];
            return;
        }

        /* remaining forms are encoded as runs of length 1 */
        if (ci->l_len == 0 &&
            ci->u_len == 1 &&
            ci->u_data[0] < 0x1000 &&
            ci->f_code == ci->u_data[0] + 0x20) {
            te->len = 1;
            te->type = RUN_TYPE_UF_D20;
            te->data = ci->u_data[0];
        } else if (ci->l_len == 0 &&
                   ci->u_len == 1 &&
                   ci->f_code == ci->u_data[0] + 1) {
            te->len = 1;
            te->type = RUN_TYPE_UF_D1_EXT;
            te->ext_data[0] = ci->u_data[0];
            te->ext_len = 1;
        } else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_code == 0) {
            te->len = 1;
            te->type = RUN_TYPE_L_EXT2;
            te->ext_data[0] = ci->l_data[0];
            te->ext_data[1] = ci->l_data[1];
            te->ext_len = 2;
        } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_code == 0) {
            te->len = 1;
            te->type = RUN_TYPE_U_EXT2;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->u_data[1];
            te->ext_len = 2;
        } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_code == 0) {
            te->len = 1;
            te->type = RUN_TYPE_U_EXT3;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->u_data[1];
            te->ext_data[2] = ci->u_data[2];
            te->ext_len = 3;
        } else {
            printf("unsupported encoding case:\n");
            dump_cc_info(ci, code);
            abort();
        }
    } else {
        /* look for a run of identical conversions */
        len = 0;
        for(;;) {
            /* the pair (code + len, code + len + 1) must lie inside the
               table; the run length is capped at 126 */
            if (code + len >= CHARCODE_MAX || len >= 126)
                break;
            ci = &tab[code + len];
            ci1 = &tab[code + len + 1];
            if (is_complicated_case(ci) || is_complicated_case(ci1)) {
                break;
            }
            if (ci->l_len != 1 || ci->l_data[0] != code + len + 1)
                break;
            if (ci1->u_len != 1 || ci1->u_data[0] != code + len)
                break;
            len += 2;
        }
        if (len > 0) {
            te->len = len;
            te->type = RUN_TYPE_UL;
            te->data = 0;
            return;
        }

        /* run of single lower case (LF) or single upper case (U)
           mappings forming a contiguous sequence */
        ci = &tab[code];
        is_lower = ci->l_len > 0;
        len = 1;
        while (code + len <= CHARCODE_MAX) {
            ci1 = &tab[code + len];
            if (is_complicated_case(ci1))
                break;
            if (is_lower) {
                if (ci1->l_len != 1 ||
                    ci1->l_data[0] != ci->l_data[0] + len)
                    break;
            } else {
                if (ci1->u_len != 1 ||
                    ci1->u_data[0] != ci->u_data[0] + len)
                    break;
            }
            len++;
        }
        te->len = len;
        if (is_lower) {
            te->type = RUN_TYPE_LF;
            te->data = ci->l_data[0];
        } else {
            te->type = RUN_TYPE_U;
            te->data = ci->u_data[0];
        }
    }
}
|
||
|