IM: unicode _string_ output support.

This commit is contained in:
Mark K. Kim 2007-04-27 03:43:27 +00:00
parent e2c829b9b1
commit cf0a8a7bee
3 changed files with 96 additions and 50 deletions

View file

@ -90,6 +90,9 @@ section
3092 wo - 3092 wo -
3093 n - 3093 n -
304D:3083 kya -
3063:305F tta -
# Katakana # Katakana
section section
@ -186,3 +189,5 @@ section
30FA vo - 30FA vo -
30FB . - 30FB . -
30FC - - 30FC - -
# vim:ts=12

139
src/im.c
View file

@ -80,8 +80,9 @@ static const char* const im_tip_text[NUM_IM_TIPS] =
/* #define IM_DEBUG 1 */ /* #define IM_DEBUG 1 */
#define MAX_SECTIONS 8 /* Maximum number of sections in a *.im file */ #define MAX_SECTIONS 8 /* Maximum number of sections in a *.im file */
#define INITIAL_SMSIZE 8 /* Initial num of transitions in STATE_MACHINE */ #define MAX_UNICODE_SEQ 16 /* Output of state machine, including NUL */
#define INITIAL_SMSIZE 8 /* Initial num of transitions in STATE_MACHINE */
#ifndef LANG_DEFAULT #ifndef LANG_DEFAULT
#define LANG_DEFAULT (LANG_EN) #define LANG_DEFAULT (LANG_EN)
@ -138,7 +139,7 @@ typedef struct SM_WITH_KEY {
* @see SM_WITH_KEY * @see SM_WITH_KEY
*/ */
typedef struct STATE_MACHINE { typedef struct STATE_MACHINE {
wchar_t output; wchar_t output[MAX_UNICODE_SEQ];
char flag; char flag;
SM_WITH_KEY* next; /* Possible transitions */ SM_WITH_KEY* next; /* Possible transitions */
@ -158,9 +159,9 @@ typedef struct {
int section; int section;
/* These variables get populated when a search is performed */ /* These variables get populated when a search is performed */
int match_count; int match_count; /* How many chars of the input sequence were used for output */
int match_is_final; int match_is_final; /* T/F - tells if match is final */
int match_stats; /* Statistics gathering */ int match_stats; /* Statistics gathering */
STATE_MACHINE* match_state; STATE_MACHINE* match_state;
STATE_MACHINE* match_state_prev; STATE_MACHINE* match_state_prev;
} CHARMAP; } CHARMAP;
@ -322,12 +323,12 @@ static STATE_MACHINE* sm_search_shallow(STATE_MACHINE* sm, char key)
* @param end The last state found. Return on output. * @param end The last state found. Return on output.
* @param penult The penultimate state found. * @param penult The penultimate state found.
* *
* @return Found unicode output of the last state. * @return Found unicode character sequence output of the last state.
*/ */
static wchar_t sm_search(STATE_MACHINE* start, wchar_t* key, int* matched, STATE_MACHINE** penult, STATE_MACHINE** end) static const wchar_t* sm_search(STATE_MACHINE* start, wchar_t* key, int* matched, STATE_MACHINE** penult, STATE_MACHINE** end)
{ {
STATE_MACHINE* sm = sm_search_shallow(start, (char)*key); STATE_MACHINE* sm = sm_search_shallow(start, (char)*key);
wchar_t unicode; const wchar_t* unicode;
/* No match - stop recursion */ /* No match - stop recursion */
if(!sm) { if(!sm) {
@ -359,17 +360,22 @@ static void sm_sort_shallow(STATE_MACHINE* sm)
/** /**
* Add a single sequence-to-unicode path to the state machine. * Add a single sequence-to-unicode path to the state machine.
*/ */
static int sm_add(STATE_MACHINE* sm, char* seq, wchar_t unicode, char flag) static int sm_add(STATE_MACHINE* sm, char* seq, const wchar_t* unicode, char flag)
{ {
STATE_MACHINE* sm_found = sm_search_shallow(sm, seq[0]); STATE_MACHINE* sm_found = sm_search_shallow(sm, seq[0]);
/* Empty sequence */ /* Empty sequence */
if(seq[0] == '\0') { if(seq[0] == '\0') {
if(sm->output) { if(wcslen(sm->output)) {
fprintf(stderr, "Unicode %04X already defined, overriding with %04X\n", size_t i;
(int)sm->output, (int)unicode);
fprintf(stderr, "Unicode sequence ");
for(i = 0; i < wcslen(sm->output); i++) fprintf(stderr, "%04X ", (int)sm->output[i]);
fprintf(stderr, " already defined, overriding with ");
for(i = 0; i < wcslen(unicode); i++) fprintf(stderr, "%04X ", (int)unicode[i]);
fprintf(stderr, "\n");
} }
sm->output = unicode; wcscpy(sm->output, unicode);
sm->flag = flag; sm->flag = flag;
return 0; return 0;
} }
@ -443,7 +449,7 @@ static int charmap_init(CHARMAP* cm)
* *
* @return 0 if no error, 1 if error. * @return 0 if no error, 1 if error.
*/ */
static int charmap_add(CHARMAP* cm, int section, char* seq, wchar_t unicode, char* flag) static int charmap_add(CHARMAP* cm, int section, char* seq, const wchar_t* unicode, char* flag)
{ {
if(section >= MAX_SECTIONS) { if(section >= MAX_SECTIONS) {
fprintf(stderr, "Section count exceeded\n"); fprintf(stderr, "Section count exceeded\n");
@ -481,35 +487,67 @@ static int charmap_load(CHARMAP* cm, const char* path)
/* Load */ /* Load */
while(!feof(is)) { while(!feof(is)) {
wchar_t unicode; wchar_t unicode[MAX_UNICODE_SEQ];
int ulen = 0;
char buf[256]; char buf[256];
char flag[256]; char flag[256];
int scanned = 0; int scanned = 0;
int u;
scanned = fscanf(is, "%x\t%255s\t%255s", &u, buf, flag); /* Scan a single token first */
scanned = fscanf(is, "%255s", buf);
if(scanned < 0) break; if(scanned < 0) break;
unicode = u; if(scanned == 0) {
fprintf(stderr, "%s: Character map syntax error\n", path);
return 1;
}
switch(scanned) { /* Handle the first argument */
case 0: if(strcmp(buf, "section") == 0) { /* Section division */
fscanf(is, "%255s", buf); section++;
continue;
}
else if(buf[0] == '#') { /* Comment */
fscanf(is, "%*[^\n]");
continue;
}
else {
char* bp = buf;
int u;
if(strcmp(buf, "section") == 0) section++; /* Section division */ do {
else if(buf[0] == '#') fscanf(is, "%*[^\n]"); /* Comment */ if(sscanf(bp, "%x", &u) == 1) { /* Unicode */
unicode[ulen++] = u;
}
else { else {
fprintf(stderr, "%s: Syntax error at '%s'\n", path, buf); fprintf(stderr, "%s: Syntax error at '%s'\n", path, buf);
return 1; return 1;
} }
break;
case 1: case 2: bp = strchr(bp, ':');
if(bp) bp++;
} while(bp && ulen < MAX_UNICODE_SEQ-1);
unicode[ulen] = L'\0';
}
/* Scan some more */
scanned = fscanf(is, "%255s\t%255s", buf, flag);
if(scanned < 0) break;
/* Input count checking */
switch(scanned) {
case 0: case 1:
fprintf(stderr, "%s: Character map syntax error\n", path); fprintf(stderr, "%s: Character map syntax error\n", path);
return 1; return 1;
default: default:
if(charmap_add(cm, section, buf, unicode, flag)) { if(charmap_add(cm, section, buf, unicode, flag)) {
fwprintf(stderr, L"Unable to add sequence '%ls', unicode '%04X' in section %d\n", buf, unicode, section); size_t i = 0;
fwprintf(stderr, L"Unable to add sequence '%ls', unicode ", buf);
for(i = 0; i < wcslen(unicode); i++) fwprintf(stderr, L"%04X ", (int)unicode[i]);
fwprintf(stderr, L"in section %d\n", section);
error_code = 1; error_code = 1;
} }
} }
@ -540,10 +578,10 @@ static void charmap_free(CHARMAP* cm)
/** /**
* Search for a matching character string in the character map. * Search for a matching character string in the character map.
*/ */
static wchar_t charmap_search(CHARMAP* cm, wchar_t* s) static const wchar_t* charmap_search(CHARMAP* cm, wchar_t* s)
{ {
STATE_MACHINE* start; STATE_MACHINE* start;
wchar_t unicode; const wchar_t* unicode;
int section; int section;
/* Determine the starting state based on the charmap's active section */ /* Determine the starting state based on the charmap's active section */
@ -567,10 +605,12 @@ static wchar_t charmap_search(CHARMAP* cm, wchar_t* s)
* final state we possibly can. * final state we possibly can.
*/ */
cm->match_is_final = 0; cm->match_is_final = 0;
cm->match_stats = MATCH_STAT_NONE;
if(cm->match_count < (int)wcslen(s)) { if(cm->match_count < (int)wcslen(s)) {
cm->match_is_final = 1; cm->match_is_final = 1;
} }
/* Statistics */
cm->match_stats = MATCH_STAT_NONE;
if(cm->match_state->next_size == 0) { if(cm->match_state->next_size == 0) {
cm->match_is_final = 1; cm->match_is_final = 1;
cm->match_stats |= MATCH_STAT_NOMOSTATES; cm->match_stats |= MATCH_STAT_NOMOSTATES;
@ -757,18 +797,18 @@ static int im_event_ja(IM_DATA* im, SDL_keysym ks)
/* Translate the characters */ /* Translate the characters */
im->discard = 0; im->discard = 0;
while(1) { while(1) {
u = charmap_search(&cm, im->buf); const wchar_t* us = charmap_search(&cm, im->buf);
#ifdef IM_DEBUG #ifdef IM_DEBUG
wprintf(L" [%8ls] [%8ls] %2d %2d\n", im->s, im->buf, wcslen(im->s), wcslen(im->buf)); wprintf(L" [%8ls] [%8ls] %2d %2d\n", im->s, im->buf, wcslen(im->s), wcslen(im->buf));
#endif #endif
/* Match was found? */ /* Match was found? */
if(u) { if(us && wcslen(us)) {
#ifdef IM_DEBUG #ifdef IM_DEBUG
wprintf(L" 1\n"); wprintf(L" 1\n");
#endif #endif
wcsncat(im->s, &u, 1); wcscat(im->s, us);
/* Final match */ /* Final match */
if(cm.match_is_final) { if(cm.match_is_final) {
@ -778,7 +818,7 @@ static int im_event_ja(IM_DATA* im, SDL_keysym ks)
} }
/* May need to be overwritten next time */ /* May need to be overwritten next time */
else { else {
im->discard++; im->discard += wcslen(us);
break; break;
} }
} }
@ -840,7 +880,7 @@ static int im_event_ja(IM_DATA* im, SDL_keysym ks)
static int im_event_ko_isvowel(CHARMAP* cm, wchar_t c) static int im_event_ko_isvowel(CHARMAP* cm, wchar_t c)
{ {
STATE_MACHINE *start, *next; STATE_MACHINE *start, *next;
wchar_t unicode; const wchar_t* unicode;
int section; int section;
/* Determine the starting state based on the charmap's active section */ /* Determine the starting state based on the charmap's active section */
@ -849,9 +889,9 @@ static int im_event_ko_isvowel(CHARMAP* cm, wchar_t c)
start = &cm->sections[section]; start = &cm->sections[section];
next = sm_search_shallow(start, (char)c); next = sm_search_shallow(start, (char)c);
unicode = next ? next->output : 0; unicode = next ? next->output : NULL;
return (0x314F <= unicode && unicode <= 0x3163); return (wcslen(unicode) == 1 && 0x314F <= unicode[0] && unicode[0] <= 0x3163);
} }
@ -968,13 +1008,13 @@ static int im_event_ko(IM_DATA* im, SDL_keysym ks)
/* Translate the characters */ /* Translate the characters */
im->discard = 0; im->discard = 0;
while(1) { while(1) {
u = charmap_search(&cm, bp); const wchar_t* us = charmap_search(&cm, bp);
#ifdef IM_DEBUG #ifdef IM_DEBUG
wprintf(L" [%8ls] [%8ls] %2d %2d\n", im->s, im->buf, wcslen(im->s), wcslen(im->buf)); wprintf(L" [%8ls] [%8ls] %2d %2d\n", im->s, im->buf, wcslen(im->s), wcslen(im->buf));
#endif #endif
/* Match was found? */ /* Match was found? */
if(u) { if(us && wcslen(us)) {
/* Final match */ /* Final match */
if(cm.match_is_final) { if(cm.match_is_final) {
/* Batchim may carry over to the next character */ /* Batchim may carry over to the next character */
@ -987,9 +1027,9 @@ static int im_event_ko(IM_DATA* im, SDL_keysym ks)
wprintf(L" 1a\n"); wprintf(L" 1a\n");
#endif #endif
wcsncat(im->s, &u, 1); /* Output */ wcscat(im->s, us); /* Output */
im->discard++; /* May need to re-eval next time */ im->discard += wcslen(us); /* May need to re-eval next time */
bp += cm.match_count; /* Keep buffer data for re-eval*/ bp += cm.match_count; /* Keep buffer data for re-eval*/
cm.match_count = 0; cm.match_count = 0;
cm.match_is_final = 0; cm.match_is_final = 0;
} }
@ -999,7 +1039,7 @@ static int im_event_ko(IM_DATA* im, SDL_keysym ks)
wprintf(L" 1b\n"); wprintf(L" 1b\n");
#endif #endif
wcsncat(im->s, &u, 1); /* Output */ wcscat(im->s, us); /* Output */
wcs_lshift(bp, cm.match_count); wcs_lshift(bp, cm.match_count);
cm.match_count = 0; cm.match_count = 0;
cm.match_is_final = 0; cm.match_is_final = 0;
@ -1010,19 +1050,20 @@ static int im_event_ko(IM_DATA* im, SDL_keysym ks)
wprintf(L" 1c\n"); wprintf(L" 1c\n");
#endif #endif
u = cm.match_state_prev->output; us = cm.match_state_prev->output;
wcsncat(im->s, &u, 1); /* Output */ wcscat(im->s, us); /* Output */
cm.match_count--; /* Matched all but one */ cm.match_count--; /* Matched all but one */
cm.match_is_final = 0; cm.match_is_final = 0;
wcs_lshift(bp, cm.match_count); wcs_lshift(bp, cm.match_count);
} }
} }
/* No batchim - this is final */
else { else {
#ifdef IM_DEBUG #ifdef IM_DEBUG
wprintf(L" 1d\n"); wprintf(L" 1d\n");
#endif #endif
wcsncat(im->s, &u, 1); wcscat(im->s, us);
wcs_lshift(bp, cm.match_count); wcs_lshift(bp, cm.match_count);
cm.match_count = 0; cm.match_count = 0;
cm.match_is_final = 0; cm.match_is_final = 0;
@ -1034,8 +1075,8 @@ static int im_event_ko(IM_DATA* im, SDL_keysym ks)
wprintf(L" 1e\n"); wprintf(L" 1e\n");
#endif #endif
wcsncat(im->s, &u, 1); wcscat(im->s, us);
im->discard++; im->discard += wcslen(us);
break; break;
} }
} }

View file

@ -35,7 +35,7 @@
typedef struct IM_DATA { typedef struct IM_DATA {
int lang; /* Language used in sequence translation */ int lang; /* Language used in sequence translation */
wchar_t s[8]; /* Characters that should be displayed */ wchar_t s[16]; /* Characters that should be displayed */
const char* tip_text; /* Tip text, read-only please */ const char* tip_text; /* Tip text, read-only please */
/* For use by language-specific im_event_<lang> calls. PRIVATE! */ /* For use by language-specific im_event_<lang> calls. PRIVATE! */