tuxpaint-pencil-sharpener/src/im.c

/*
  im.c

  Input method handling
  Copyright (c)2007 by Mark K. Kim and others

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  (See COPYING.txt)

  $Id$
*/

/*
* See the LANGUAGE-SPECIFIC IM FUNCTIONS section for instructions on adding
* support for new languages.
*
* This file is called IM (Input Method), but it's actually an Input Translator.
* This implementation was sort of necessary in order to work without having to
* modify SDL.
*
* Basically, to read in text in a foreign language, read the keysym off of SDL
* and pass it to im_read().  im_read() will translate the text and pass the
* unicode string back to you.  But before all this is done, be sure to create
* the IM_DATA structure and initialize it with im_init(), selecting the
* language translator you want to use.
*/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include "im.h"


/* ***************************************************************************
* I18N GETTEXT
*/

/* Fallback used when nothing else has defined gettext_noop(): the macro
 * expands to its argument unchanged, so marked strings stay plain literals. */
#ifndef gettext_noop
#define gettext_noop(s) (s)
#endif


/**
* Indices into im_tip_text[].  NUM_IM_TIPS is the number of entries and is
* not itself a tip.
*/
enum {
  IM_TIP_NONE,
  IM_TIP_ENGLISH,
  IM_TIP_HIRAGANA,
  IM_TIP_KATAKANA,
  IM_TIP_HANGUL,
  NUM_IM_TIPS
};


/**
* Tip strings, indexed by the IM_TIP_* constants; the IM_TIP_NONE slot is
* NULL.  The language handlers store one of these pointers in im->tip_text
* whenever the active input section changes.
*/
static const char* const im_tip_text[NUM_IM_TIPS] =
{
  NULL,
  gettext_noop("English"),
  gettext_noop("Hiragana"),
  gettext_noop("Katakana"),
  gettext_noop("Hangul")
};


/* ***************************************************************************
* CONSTANTS
*/

/* #define IM_DEBUG       1 */

#define MAX_SECTIONS     8    /* Maximum number of sections in a *.im file */
#define MAX_UNICODE_SEQ 16    /* Length of a state's output, including NUL */
#define INITIAL_SMSIZE   8    /* Initial transition capacity of a STATE_MACHINE */

/* Language used when im->lang is out of range or a charmap fails to load */
#ifndef LANG_DEFAULT
#define LANG_DEFAULT   (LANG_EN)
#endif


/**
* Event types that im_event_*() functions need to handle.  The current
* request is carried in im->request.  IM_REQ_TRANSLATE is the first
* enumerator and therefore 0, which is also the value im->request has after
* im_init() zeroes the IM_DATA structure.
*/
enum {
  IM_REQ_TRANSLATE,    /* Translate a key stroke (the normal case) */
  IM_REQ_INIT,         /* Initialization request */
  IM_REQ_RESET_SOFT,   /* Soft reset request */
  IM_REQ_RESET_FULL,   /* Full reset request */
  IM_REQ_FREE,         /* Free resources */
  NUM_IM_REQUESTS
};


/**
* Match statuses.  These are bit flags OR-ed into CHARMAP.match_stats by
* charmap_search().
*/
enum {
  MATCH_STAT_NONE       = 0x00,   /* Neither condition below holds */
  MATCH_STAT_NOMOSTATES = 0x01,   /* Matched state has no further transitions */
  MATCH_STAT_NOMOBUF    = 0x02,   /* Match consumed the whole search string */
};


/* ***************************************************************************
* TYPES
*/

/**
* All im_event_*() functions have this type.  The handler receives the
* IM_DATA being processed (im->request tells it what to do) and the key
* stroke; im_read() passes the handler's return value back to its caller.
*/
typedef int (*IM_EVENT_FN)(IM_DATA*, SDL_keysym);   /* IM_EVENT_FN type */


/**
* State Machine key-value pair for transition control.  When the "key"
* is pressed, transition is made to "state".
*
* @see STATE_MACHINE
*/
typedef struct SM_WITH_KEY {
  char key;                        /* Key stroke that selects this transition */
  struct STATE_MACHINE* state;     /* Destination state, allocated by sm_add() */
} SM_WITH_KEY;


/**
* A State Machine is used to map key strokes to the unicode output.
* A single State Machine has a possible output (the unicode) and pointers
* to next states.  The "next state" is determined by the key stroke
* pressed by the user - this key is looked up in SM_WITH_KEY and
* its next state determined by the STATE_MACHINE pointer in SM_WITH_KEY.
*
* The number of possible transitions to the next state is dynamically
* adjustable using the parameter next_maxsize.  The actual storage in
* use can be determined via next_size.
*
* @see SM_WITH_KEY
*/
typedef struct STATE_MACHINE {
  wchar_t output[MAX_UNICODE_SEQ];   /* Output of this state; empty if none */
  char flag;                         /* One-character flag from the *.im file */

  SM_WITH_KEY* next;      /* Possible transitions, kept sorted by key */
  size_t next_maxsize;    /* Potential size of the next pointer */
  size_t next_size;       /* Used size of the next pointer */
} STATE_MACHINE;


/**
* A Character Map loads the *.im file, which may have several "sections".
* Each section has its own state machine, and the C code determines which
* section is used in determining which STATE_MACHINE to use for the
* key mapping.
*/
typedef struct {
  STATE_MACHINE sections[MAX_SECTIONS];   /* Root state of each section */
  int section;                            /* Index of the active section */

  /* These variables get populated when a search is performed */
  int match_count;              /* How many chars of the input were matched */
  int match_is_final;           /* T/F - tells if match is final */
  int match_stats;              /* MATCH_STAT_* bit flags */
  STATE_MACHINE* match_state;        /* Last state reached by the search */
  STATE_MACHINE* match_state_prev;   /* State before the last; NULL if no match */
} CHARMAP;


/* ***************************************************************************
* STATIC GLOBALS
*/

/**
* Global initialization flag.  Set to 1 by the first im_init() call, once the
* im_event_fns[] table has been filled in.
*/
static int im_initialized = 0;


/**
* Language-specific IM event-handler function pointers.  This lookup table
* is initialized in im_init().  Every supported language should have a
* pointer mapped here; languages whose entry is left NULL are handled by
* im_event_c().
*
* @see im_init()
* @see im_read()
*/
static IM_EVENT_FN im_event_fns[NUM_LANGS];


/* ***************************************************************************
* UTILITY FUNCTIONS
*/

/* NOTE: these are plain macros - MIN and IN_RANGE evaluate an argument more
 * than once, so do not pass expressions with side effects. */
/* Smaller of a and b */
#define MIN(a,b)              ((a)<=(b) ? (a) : (b))
/* True when a <= v < b (half-open range) */
#define IN_RANGE(a,v,b)       ( (a)<=(v) && (v)<(b) )
/* Element count of a true array (not of a pointer) */
#define ARRAYLEN(a)           ( sizeof(a)/sizeof(*(a)) )


/**
* Shift a wide string left in place, discarding its leading characters.
*
* @param s      NUL-terminated wide string to modify.
* @param count  Number of leading characters to drop.  A count larger than
*               the string length is clamped, leaving an empty string.
*/
static void wcs_lshift(wchar_t* s, size_t count)
{
  size_t len = wcslen(s);

  /* Never step past the terminator: s+count would lie outside the string */
  if(count > len) count = len;
  if(count == 0) return;

  /* Source and destination overlap, hence memmove; +1 carries the NUL */
  memmove(s, s + count, (len - count + 1) * sizeof(wchar_t));
}


/**
* Truncate a wide string in place by removing "count" characters from its
* end.  Removing more characters than the string holds leaves it empty.
*/
static void wcs_pull(wchar_t* s, size_t count)
{
  const int length = (int)wcslen(s);
  const int keep = length - (int)count;

  /* Terminate at the new length, never before the start of the string */
  s[keep > 0 ? keep : 0] = L'\0';
}


/* ***************************************************************************
* STATE_MACHINE FUNCTIONS
*/

/**
* qsort()/bsearch() comparator for SM_WITH_KEY entries: orders them by their
* "key" member.  Returns negative, zero or positive accordingly.
*/
static int swk_compare(const void* swk1, const void* swk2)
{
  const char lhs = ((SM_WITH_KEY*)swk1)->key;
  const char rhs = ((SM_WITH_KEY*)swk2)->key;

  /* Both operands are promoted to int, so the difference cannot overflow */
  return lhs - rhs;
}


/**
* Initialize the State Machine: clear it and allocate a zeroed transition
* table with room for INITIAL_SMSIZE entries.
*
* @return  0 on success, 1 if the table could not be allocated (in which
*          case sm->next is NULL and the capacity stays 0).
*/
static int sm_init(STATE_MACHINE* sm)
{
  SM_WITH_KEY* table;

  memset(sm, 0, sizeof(STATE_MACHINE));

  table = calloc(INITIAL_SMSIZE, sizeof(SM_WITH_KEY));
  sm->next = table;
  if(table == NULL) {
    perror("sm_init");
    return 1;
  }

  sm->next_maxsize = INITIAL_SMSIZE;
  return 0;
}


/**
* Free the State Machine resources.
*
* Releases every state reachable from "sm" together with the transition
* tables, then zeroes "sm".  The structure "sm" points to is NOT freed
* itself: the roots live inside CHARMAP, while child states (which sm_add()
* obtains from malloc) are freed here by their parent.
*/
static void sm_free(STATE_MACHINE* sm)
{
  if(sm->next) {
    size_t i;

    /* Only the first next_size slots hold states; the rest of the table
     * may never have been initialized after it was grown with realloc */
    for(i = 0; i < sm->next_size; i++) {
      STATE_MACHINE* next_state = sm->next[i].state;

      if(next_state) {
        sm_free(next_state);    /* Release the child's own subtree... */
        free(next_state);       /* ...and the child node itself */
      }
      sm->next[i].state = NULL;
    }

    free(sm->next);
    sm->next = NULL;
  }

  memset(sm, 0, sizeof(STATE_MACHINE));
}


/**
* Double the storage space of the possible transition states.
*
* The added entries are zeroed so that their "state" pointers are never left
* indeterminate.  On failure the existing table is left untouched.
*
* @return  0 on success, 1 on failure.
*/
static int sm_dblspace(STATE_MACHINE* sm)
{
  size_t oldsize = sm->next_maxsize;
  size_t newsize = oldsize * 2;
  SM_WITH_KEY* next;

  /* A table that cannot grow, or whose byte size would overflow, is an error */
  if(newsize <= oldsize || newsize > (size_t)-1 / sizeof(SM_WITH_KEY)) {
    fprintf(stderr, "sm_dblspace: invalid table size\n");
    return 1;
  }

  /* Keep the old pointer until realloc is known to have succeeded */
  next = realloc(sm->next, sizeof(SM_WITH_KEY) * newsize);
  if(next == NULL) {
    perror("sm_dblspace");
    return 1;
  }

  /* realloc does not initialize the added space */
  memset(next + oldsize, 0, (newsize - oldsize) * sizeof(SM_WITH_KEY));

  sm->next = next;
  sm->next_maxsize = newsize;
  return 0;
}


/**
* Look up a single transition key in one state.  Only the transitions of
* "sm" itself are examined; the search does not descend into child states.
* The lookup is a binary search, so the transition table must be sorted
* (see sm_sort_shallow()).
*
* @return  The state reached through "key", or NULL if there is none.
*/
static STATE_MACHINE* sm_search_shallow(STATE_MACHINE* sm, char key)
{
  SM_WITH_KEY probe;
  SM_WITH_KEY* hit;

  probe.key = key;
  probe.state = NULL;

  hit = bsearch(&probe, sm->next, sm->next_size, sizeof(SM_WITH_KEY), swk_compare);

  return hit ? hit->state : NULL;
}


/**
* Search the state machine's transition keys, return the unicode output of the
* last state found.  The search is done deep, following one transition per
* input character until no more match can be found.
*
* Transition keys are single "char"s, so an input character that does not
* fit in one byte can never match and ends the search.  (Casting it to char
* would truncate it and could match an unrelated key.)
*
* @param start    Starting point of the state transition.  Constant.
* @param key      The key string to look for.  Constant.
* @param matched  The number of characters matched.  Return on output.
* @param penult   The penultimate state found.  Return on output; left
*                 untouched when nothing matched.
* @param end      The last state found.  Return on output.
*
* @return         Found unicode character sequence output of the last state.
*/
static const wchar_t* sm_search(STATE_MACHINE* start, wchar_t* key, int* matched, STATE_MACHINE** penult, STATE_MACHINE** end)
{
  STATE_MACHINE* current = start;
  int count = 0;

  while(*key != L'\0') {
    STATE_MACHINE* following;

    /* Code points wider than a byte have no transition key */
    if((unsigned long)*key > 0xFFUL) break;

    following = sm_search_shallow(current, (char)*key);
    if(!following) break;

    /* Match - advance one state and one input character */
    *penult = current;
    current = following;
    key++;
    count++;
  }

  *matched = count;
  *end = current;

  return current->output;
}


/**
* Put the transitions of one state into key order so sm_search_shallow() can
* binary-search them.  Child states are not touched.
*/
static void sm_sort_shallow(STATE_MACHINE* sm)
{
  SM_WITH_KEY* table = sm->next;
  const size_t used = sm->next_size;

  qsort(table, used, sizeof(SM_WITH_KEY), swk_compare);
}


/**
* Add a single sequence-to-unicode path to the state machine.
*
* One state is created (if needed) per character of "seq"; the state reached
* by the last character receives "unicode" and "flag".  If that state
* already has an output, a warning is printed and the output is replaced.
*
* @param sm       State to start from.
* @param seq      Remaining key sequence, NUL-terminated.
* @param unicode  Output to store; must fit in STATE_MACHINE.output.
* @param flag     Flag to store with the output.
*
* @return         0 on success, 1 on failure (out of memory, or output too
*                 long).  States created for a prefix of "seq" before a
*                 failure remain in the machine.
*/
static int sm_add(STATE_MACHINE* sm, char* seq, const wchar_t* unicode, char flag)
{
  STATE_MACHINE* sm_found;

  /* Empty sequence: this is the final state of the path */
  if(seq[0] == '\0') {
    size_t i;
    size_t oldlen = wcslen(sm->output);
    size_t newlen = wcslen(unicode);

    /* output[] is a fixed-size array; refuse what would overflow it */
    if(newlen >= sizeof(sm->output) / sizeof(sm->output[0])) {
      fprintf(stderr, "Unicode sequence too long\n");
      return 1;
    }

    if(oldlen) {
      fprintf(stderr, "Unicode sequence ");
      for(i = 0; i < oldlen; i++) fprintf(stderr, "%04X ", (int)sm->output[i]);
      fprintf(stderr, " already defined, overriding with ");
      for(i = 0; i < newlen; i++) fprintf(stderr, "%04X ", (int)unicode[i]);
      fprintf(stderr, "\n");
    }
    wcscpy(sm->output, unicode);
    sm->flag = flag;
    return 0;
  }

  sm_found = sm_search_shallow(sm, seq[0]);

  /* The key doesn't exist yet */
  if(!sm_found) {
    STATE_MACHINE* child;

    /* A free slot is always kept ready; without one nothing can be added */
    if(!sm->next || sm->next_size >= sm->next_maxsize) {
      fprintf(stderr, "sm_add: no room for a new transition\n");
      return 1;
    }

    child = malloc(sizeof(STATE_MACHINE));
    if(!child) {
      perror("sm_add");
      return 1;
    }
    if(sm_init(child)) {
      free(child);
      return 1;
    }

    /* Add the key */
    sm->next[sm->next_size].key = seq[0];
    sm->next[sm->next_size].state = child;
    sm->next_size++;

    /* Increase store for next time, if necessary.  The table may move, so
     * no pointer into it is kept across this call. */
    if(sm->next_size >= sm->next_maxsize && sm_dblspace(sm)) {
      fprintf(stderr, "Memory expansion failure\n");

      /* Undo the insertion so the table keeps its free slot */
      sm->next_size--;
      sm->next[sm->next_size].state = NULL;
      free(child->next);
      free(child);
      return 1;
    }

    /* Restore key order now; the recursion below only touches the child */
    sm_sort_shallow(sm);

    sm_found = child;
  }

  /* Recurse, reporting any failure from further down the path */
  return sm_add(sm_found, seq+1, unicode, flag);
}


/* ***************************************************************************
* CHARMAP FUNCTIONS
*/

/**
* Initialize the character map table: zero the whole structure, then set up
* the root state machine of every section.
*
* @return  0 if every section was initialized, otherwise the number of
*          sections whose sm_init() call failed.
*/
static int charmap_init(CHARMAP* cm)
{
  STATE_MACHINE* root;
  int failures = 0;

  memset(cm, 0, sizeof(CHARMAP));

  for(root = cm->sections; root < cm->sections + MAX_SECTIONS; root++) {
    failures += sm_init(root);
  }

  return failures;
}


/**
* Add a character-sequence-to-unicode mapping to the character map.
*
* @param cm      Character map to which to add the mapping.
* @param section The section of the character map to add the mapping.
*                Must be in the range [0, MAX_SECTIONS).
* @param seq     The character sequence to which to add the mapping.
* @param unicode The unicode of the character sequence.
* @param flag    The flag associated with this state, if any.  Only the
*                first character is used; a warning is printed if there
*                are more.
*
* @return        0 if no error, 1 if error.
*/
static int charmap_add(CHARMAP* cm, int section, char* seq, const wchar_t* unicode, char* flag)
{
  if(section < 0 || section >= MAX_SECTIONS) {
    fprintf(stderr, "Section count exceeded\n");
    return 1;
  }

  /* For now, we only utilize one-character flags */
  if(strlen(flag) > 1) {
    /* Identify the entry by its first code point (not by the pointer) */
    fprintf(stderr, "%04X: Multi-character flag, truncated.\n", (int)unicode[0]);
  }

  return sm_add(&cm->sections[section], seq, unicode, flag[0]);
}


/**
* Load the character map table from a file.
*
* The file is read as whitespace-separated tokens:
*   - the token "section" starts the next section;
*   - a token starting with '#' begins a comment that runs to end of line;
*   - anything else starts a record of three tokens: the output as
*     colon-separated hexadecimal code points, the key sequence, and a flag.
*
* Loading stops at the first syntax error.  A record that cannot be added is
* reported and skipped, and loading continues.
*
* @param cm     Character Map to load the table into.
* @param path   The path of the file to load.
* @return       Zero if the file is loaded fine, nonzero otherwise.
*/
static int charmap_load(CHARMAP* cm, const char* path)
{
  FILE* is = NULL;
  int section = 0;
  int error_code = 0;

  /* Open */
  is = fopen(path, "rt");
  if(!is) {
    perror(path);
    return 1;
  }

  /* Load */
  while(!feof(is)) {
    wchar_t unicode[MAX_UNICODE_SEQ];
    int ulen = 0;

    char buf[256];
    char flag[256];

    int scanned = 0;

    /* Scan a single token first */
    scanned = fscanf(is, "%255s", buf);
    if(scanned < 0) break;
    if(scanned == 0) {
      fprintf(stderr, "%s: Character map syntax error\n", path);
      error_code = 1;
      goto done;
    }

    /* Handle the first argument */
    if(strcmp(buf, "section") == 0) {    /* Section division */
      section++;
      continue;
    }
    else if(buf[0] == '#') {             /* Comment */
      fscanf(is, "%*[^\n]");             /* Discard the rest of the line */
      continue;
    }
    else {
      char* bp = buf;
      unsigned int u;                    /* %x stores into an unsigned int */

      do {
        if(sscanf(bp, "%x", &u) == 1) {   /* Unicode */
          unicode[ulen++] = (wchar_t)u;
        }
        else {
          fprintf(stderr, "%s: Syntax error at '%s'\n", path, buf);
          error_code = 1;
          goto done;
        }

        bp = strchr(bp, ':');
        if(bp) bp++;
      } while(bp && ulen < MAX_UNICODE_SEQ-1);
      unicode[ulen] = L'\0';
    }

    /* Scan some more: the key sequence and the flag */
    scanned = fscanf(is, "%255s\t%255s", buf, flag);
    if(scanned < 0) break;

    /* Input count checking */
    if(scanned < 2) {
      fprintf(stderr, "%s: Character map syntax error\n", path);
      error_code = 1;
      goto done;
    }

    if(charmap_add(cm, section, buf, unicode, flag)) {
      size_t i = 0;
      size_t len = wcslen(unicode);

      /* Byte-oriented output, like every other diagnostic on stderr */
      fprintf(stderr, "Unable to add sequence '%s', unicode ", buf);
      for(i = 0; i < len; i++) fprintf(stderr, "%04X ", (int)unicode[i]);
      fprintf(stderr, "in section %d\n", section);
      error_code = 1;
    }
  }

done:
  /* Close - reached on every path once the file has been opened */
  fclose(is);

  return error_code;
}


/**
* Release everything a character map owns: each section's state machine is
* freed with sm_free(), then the whole structure is zeroed.
*/
static void charmap_free(CHARMAP* cm)
{
  STATE_MACHINE* root = cm->sections;
  STATE_MACHINE* const stop = cm->sections + MAX_SECTIONS;

  while(root < stop) {
    sm_free(root);
    root++;
  }

  memset(cm, 0, sizeof(CHARMAP));
}


/**
* Search for a matching character string in the character map.
*
* The search starts at the root of the active section (section 0 if
* cm->section is out of range) and fills in the match_* members of "cm".
*
* @param cm  Character map to search; its match_* members are overwritten.
* @param s   The input string to match.
*
* @return    The output of the last state reached.
*/
static const wchar_t* charmap_search(CHARMAP* cm, wchar_t* s)
{
  const wchar_t* unicode;
  STATE_MACHINE* start;
  int section = cm->section;
  int slen;

  /* Pick the starting state from the charmap's active section */
  if(section < 0 || section >= (int)ARRAYLEN(cm->sections)) section = 0;
  start = &cm->sections[section];

  cm->match_state = NULL;
  cm->match_state_prev = NULL;
  unicode = sm_search(start, s, &cm->match_count, &cm->match_state_prev, &cm->match_state);

  slen = (int)wcslen(s);

  /**
  * Decide whether the match is final.  A match is final in two cases:
  * (1) the last state matched has no exit paths, so no further input can
  * lead anywhere else; or (2) the search did not consume the whole input,
  * which means the state reached is the only possible interpretation of the
  * characters that were consumed.  If neither holds, further input from the
  * user may still lead to a different state, so the match is not final.
  */
  cm->match_stats = MATCH_STAT_NONE;
  cm->match_is_final = (cm->match_count < slen) ? 1 : 0;

  /* Statistics */
  if(cm->match_state->next_size == 0) {
    cm->match_is_final = 1;
    cm->match_stats |= MATCH_STAT_NOMOSTATES;
  }
  if(cm->match_count == slen) {
    cm->match_stats |= MATCH_STAT_NOMOBUF;
  }

  return unicode;
}


/* ***************************************************************************
* LANGUAGE-SPECIFIC IM FUNCTIONS
*
* If you want to add a new language support, add the main code to this
* section.  More specifically, do the following:
*
*   1) Add im_event_<lang>() function to this section.  Use the existing
*      im_event_* functions as models, and feel free to use the state-machine
*      character map engine (CHARMAP struct) but do not feel obligated to
*      do so.  The CHARMAP engine exists for the programmer's benefit, to
*      make it easier to support complex languages.
*
*   2) Update the im_init() function so that it initializes im_event_fns[]
*      with a pointer to your im_event_<lang>() function.
*
*   3) Create <lang>.im in the "im" directory, if you use the CHARMAP engine.
*      Your code is what loads this file so you should already know to do this
*      step if you have already written a working im_event_<lang>() function
*      that uses CHARMAP, but I explicitly write out this instruction for
*      those trying to figure out the relationship of <lang>.im to this IM
*      framework.
*
*   4) Increase MAX_SECTIONS if your language needs more sections in <lang>.im
*
*   5) Increase INITIAL_SMSIZE if your <lang>.im is huginormous and takes too
*      long to load.  I can't think of any reason why this would happen unless
*      you're writing a Chinese IM with a significant characters of the
*      language represented, but the code as-is is somewhat lacking when it
*      comes to writing a Chinese IM (need some way to show a dropdown box
*      from the main app - same problem with Korean Hanja and Japanese Kanji
*      inputs, but this isn't meant to be a complex IM framework so I think
*      we're safe for Hanja and Kanji.)  Do this with caution because
*      changing INITIAL_SMSIZE will affect the memory consumption of all IM
*      functions.
*/

/**
* Default C IM event handler.  It performs no translation: every key stroke
* yields exactly one output character.  Requests other than
* IM_REQ_TRANSLATE only clear im->s.
*
* @return  Always 0 - nothing in im->s ever needs to be redrawn.
*
* @see im_read
*/
static int im_event_c(IM_DATA* im, SDL_keysym ks)
{
  wchar_t out;

  /* Handle event requests */
  im->s[0] = L'\0';
  if(im->request != IM_REQ_TRANSLATE) return 0;

  /* Handle key stroke: a few keys map to control characters */
  if(ks.sym == SDLK_BACKSPACE)   out = L'\b';
  else if(ks.sym == SDLK_TAB)    out = L'\t';
  else if(ks.sym == SDLK_RETURN) out = L'\r';
  else                           out = ks.unicode;

  im->s[0] = out;
  im->s[1] = L'\0';
  im->buf[0] = L'\0';

  return 0;
}


/**
* Japanese IM.
*
* Handles the request in im->request.  For IM_REQ_TRANSLATE, the key stroke
* is appended to im->buf and the buffer is run through the character map
* loaded from ja.im; the result is placed in im->s.  Right-Alt cycles
* through the English, Hiragana and Katakana sections.
*
* The character map is a function-local static, so it is shared by every
* IM_DATA that uses this handler.
*
* @param im  IM data to process; im->s, im->buf, im->redraw and im->tip_text
*            are updated.
* @param ks  Key stroke to translate (ignored for other requests).
*
* @return    im->redraw for translation requests, i.e. the number of
*            characters at the end of im->s that may be replaced by the next
*            call; 0 for all other requests.
*
* @see im_read
*/
static int im_event_ja(IM_DATA* im, SDL_keysym ks)
{
  static const char* lang_file = IMDIR "ja.im";
  enum { SEC_ENGLISH, SEC_HIRAGANA, SEC_KATAKANA, SEC_TOTAL };

  static CHARMAP cm;


  /* Handle event requests */
  switch(im->request) {
    case IM_REQ_TRANSLATE: break;

    case IM_REQ_FREE:        /* Free allocated resources */
      charmap_free(&cm);
      /* fall through: go onto full reset */

    case IM_REQ_RESET_FULL:  /* Full reset */
      cm.section = SEC_ENGLISH;
      im->tip_text = im_tip_text[IM_TIP_ENGLISH];
      /* fall through: go onto soft reset */

    case IM_REQ_RESET_SOFT:  /* Soft reset */
      im->s[0] = L'\0';
      im->buf[0] = L'\0';
      im->redraw = 0;
      cm.match_count = 0;
      cm.match_is_final = 0;
      cm.match_state = &cm.sections[cm.section];
      cm.match_state_prev = &cm.sections[cm.section];
      break;

    case IM_REQ_INIT:        /* Initialization */
      /* Do not load into a map whose sections could not be set up */
      if(charmap_init(&cm) || charmap_load(&cm, lang_file)) {
        fprintf(stderr, "Unable to load %s, defaulting to im_event_c\n", lang_file);
        im->lang = LANG_DEFAULT;
        return im_event_c(im, ks);
      }

      im_fullreset(im);

      #ifdef DEBUG
      printf("IM: Loaded '%s'\n", lang_file);
      #endif
      break;
  }
  if(im->request != IM_REQ_TRANSLATE) return 0;


  /* Discard redraw characters, so they can be redrawn */
  if((int)wcslen(im->s) < im->redraw) im->redraw = wcslen(im->s);
  wcs_lshift(im->s, (wcslen(im->s) - im->redraw) );


  /* Handle keys */
  switch(ks.sym) {
    /* Keys to ignore */
    case SDLK_NUMLOCK: case SDLK_CAPSLOCK: case SDLK_SCROLLOCK:
    case SDLK_LSHIFT:  case SDLK_RSHIFT:
    case SDLK_LCTRL:   case SDLK_RCTRL:
    case SDLK_LALT:
    case SDLK_LMETA:   case SDLK_RMETA:
    case SDLK_LSUPER:  case SDLK_RSUPER:
    case SDLK_MODE:    case SDLK_COMPOSE:
      break;

    /* Right-Alt mapped to mode-switch */
    case SDLK_RALT:
      /* Advance to the next section, wrapping around.  (Written without ++
       * so cm.section is modified only once in the expression.) */
      cm.section = (cm.section + 1) % SEC_TOTAL;
      im_softreset(im);                          /* Soft reset */

      /* Set tip text */
      switch(cm.section) {
        case SEC_ENGLISH:  im->tip_text = im_tip_text[IM_TIP_ENGLISH]; break;
        case SEC_HIRAGANA: im->tip_text = im_tip_text[IM_TIP_HIRAGANA]; break;
        case SEC_KATAKANA: im->tip_text = im_tip_text[IM_TIP_KATAKANA]; break;
      }
      break;

    /* Enter finalizes previous redraw */
    case SDLK_RETURN:
      if(im->redraw <= 0) {
        im->s[0] = L'\r';
        im->s[1] = L'\0';
      }
      im->buf[0] = L'\0';
      im->redraw = 0;
      break;

    /* Actual character processing */
    default:
      /* English mode */
      if(cm.section == SEC_ENGLISH) {
        im->s[0] = ks.unicode;
        im->s[1] = L'\0';
        im->buf[0] = L'\0';
      }
      /* Hiragana and Katakana modes */
      else {
        wchar_t u = ks.unicode;

        im->s[0] = L'\0';                     /* Zero-out output string */
        wcsncat(im->buf, &u, 1);              /* Copy new character */

        /* Translate the characters */
        im->redraw = 0;
        while(1) {
          const wchar_t* us = charmap_search(&cm, im->buf);
          #ifdef IM_DEBUG
          wprintf(L"  [%8ls] [%8ls] %2d %2d\n", im->s, im->buf, (int)wcslen(im->s), (int)wcslen(im->buf));
          #endif

          /* Match was found? */
          if(us && wcslen(us)) {
            #ifdef IM_DEBUG
            wprintf(L"    1\n");
            #endif

            wcscat(im->s, us);

            /* Final match */
            if(cm.match_is_final) {
              wcs_lshift(im->buf, cm.match_count);
              cm.match_count = 0;
              cm.match_is_final = 0;
            }
            /* May need to be overwritten next time */
            else {
              im->redraw += wcslen(us);
              break;
            }
          }
          /* No match, but more data is in the buffer */
          else if(wcslen(im->buf) > 0) {
            /* If the input character has no state, it's its own state */
            if(cm.match_count == 0) {
              #ifdef IM_DEBUG
              wprintf(L"    2a\n");
              #endif
              wcsncat(im->s, im->buf, 1);
              wcs_lshift(im->buf, 1);
              cm.match_is_final = 0;
            }
            /* If the matched characters didn't consume all, it's own state */
            else if((size_t)cm.match_count != wcslen(im->buf)) {
              #ifdef IM_DEBUG
              wprintf(L"    2b (%2d)\n", cm.match_count);
              #endif
              wcsncat(im->s, im->buf, 1);
              wcs_lshift(im->buf, 1);
              cm.match_is_final = 0;
            }
            /* Otherwise it's just a part of a future input */
            else {
              #ifdef IM_DEBUG
              wprintf(L"    2c (%2d)\n", cm.match_count);
              #endif
              wcscat(im->s, im->buf);
              cm.match_is_final = 0;
              im->redraw += wcslen(im->buf);
              break;
            }
          }
          /* No match and no more data in the buffer */
          else {
            #ifdef IM_DEBUG
            wprintf(L"    3\n");
            #endif
            break;
          }

          /* Is this the end? */
          if(cm.match_is_final) break;
        }
      }
  }

  return im->redraw;
}


/**
* Korean IM helper function to tell whether a character typed will produce
* a vowel.
*
* The character is looked up as a single key from the root of the active
* section.  It counts as a vowel when that state's output is exactly one
* code point in the range U+314F..U+3163.
*
* @param cm  Character map to consult.
* @param c   The typed character.
*
* @return    Nonzero if the character produces a vowel, 0 otherwise.
*
* @see im_event_ko
*/
static int im_event_ko_isvowel(CHARMAP* cm, wchar_t c)
{
  STATE_MACHINE *start, *next;
  const wchar_t* unicode;
  int section;

  /* Transition keys are single chars; a wider code point can never be one
   * and must not be truncated into an unrelated key */
  if((unsigned long)c > 0xFFUL) return 0;

  /* Determine the starting state based on the charmap's active section */
  section = cm->section;
  if(!IN_RANGE(0, section, (int)ARRAYLEN(cm->sections))) section = 0;
  start = &cm->sections[section];

  next = sm_search_shallow(start, (char)c);
  unicode = next ? next->output : NULL;

  return (unicode && wcslen(unicode) == 1 && 0x314F <= unicode[0] && unicode[0] <= 0x3163);
}


/**
* Korean IM.
*
* Handles the request in im->request.  For IM_REQ_TRANSLATE, the key stroke
* is appended to im->buf and the buffer is run through the character map
* loaded from ko.im; the result is placed in im->s.  Either Alt key toggles
* between the English and Hangul sections.  Backspace removes one buffered
* key stroke, if there is one, and the buffer is then translated again.
*
* A state flagged 'b' in ko.im gets special treatment: when the next
* buffered key produces a vowel, the match is cut back by one key so that
* key can start the following character.
*
* The character map is a function-local static, so it is shared by every
* IM_DATA that uses this handler.
*
* @param im  IM data to process; im->s, im->buf, im->redraw and im->tip_text
*            are updated.
* @param ks  Key stroke to translate (ignored for other requests).
*
* @return    im->redraw for translation requests, i.e. the number of
*            characters at the end of im->s that may be replaced by the next
*            call; 0 for all other requests.
*
* @see im_read
*/
static int im_event_ko(IM_DATA* im, SDL_keysym ks)
{
  static const char* lang_file = IMDIR "ko.im";
  enum { SEC_ENGLISH, SEC_HANGUL, SEC_TOTAL };

  static CHARMAP cm;


  /* Handle event requests */
  switch(im->request) {
    case IM_REQ_TRANSLATE: break;

    case IM_REQ_FREE:        /* Free allocated resources */
      charmap_free(&cm);
      /* fall through: go onto full reset */

    case IM_REQ_RESET_FULL:  /* Full reset */
      cm.section = SEC_ENGLISH;
      im->tip_text = im_tip_text[IM_TIP_ENGLISH];
      /* fall through: go onto soft reset */

    case IM_REQ_RESET_SOFT:  /* Soft reset */
      im->s[0] = L'\0';
      im->buf[0] = L'\0';
      im->redraw = 0;
      cm.match_count = 0;
      cm.match_is_final = 0;
      cm.match_state = &cm.sections[cm.section];
      cm.match_state_prev = &cm.sections[cm.section];
      break;

    case IM_REQ_INIT:        /* Initialization */
      /* Do not load into a map whose sections could not be set up */
      if(charmap_init(&cm) || charmap_load(&cm, lang_file)) {
        fprintf(stderr, "Unable to load %s, defaulting to im_event_c\n", lang_file);
        im->lang = LANG_DEFAULT;
        return im_event_c(im, ks);
      }

      im_fullreset(im);

      #ifdef DEBUG
      printf("IM: Loaded '%s'\n", lang_file);
      #endif
      break;
  }
  if(im->request != IM_REQ_TRANSLATE) return 0;


  /* Discard redraw characters, so they can be redrawn */
  if((int)wcslen(im->s) < im->redraw) im->redraw = wcslen(im->s);
  wcs_lshift(im->s, (wcslen(im->s) - im->redraw) );


  /* Handle keys */
  switch(ks.sym) {
    /* Keys to ignore */
    case SDLK_NUMLOCK: case SDLK_CAPSLOCK: case SDLK_SCROLLOCK:
    case SDLK_LSHIFT:  case SDLK_RSHIFT:
    case SDLK_LCTRL:   case SDLK_RCTRL:
    case SDLK_LMETA:   case SDLK_RMETA:
    case SDLK_LSUPER:  case SDLK_RSUPER:
    case SDLK_MODE:    case SDLK_COMPOSE:
      break;

    /* Either Alt key is mapped to mode-switch */
    case SDLK_LALT: case SDLK_RALT:
      /* Advance to the next section, wrapping around.  (Written without ++
       * so cm.section is modified only once in the expression.) */
      cm.section = (cm.section + 1) % SEC_TOTAL;
      im_softreset(im);                          /* Soft reset */

      /* Set tip text */
      switch(cm.section) {
        case SEC_ENGLISH: im->tip_text = im_tip_text[IM_TIP_ENGLISH]; break;
        case SEC_HANGUL:  im->tip_text = im_tip_text[IM_TIP_HANGUL]; break;
      }
      break;

    /* Backspace removes only a single buffered character */
    case SDLK_BACKSPACE:
      /* Delete one buffered character */
      if(wcslen(im->buf) > 0) {
        wcs_pull(im->buf, 1);
        if(im->redraw > 0) im->redraw--;
        ks.unicode = L'\0';     /* Nothing new to append below */
      }
      /* fall through: continue processing */

    /* Actual character processing */
    default:
      /* English mode */
      if(cm.section == SEC_ENGLISH) {
        im->s[0] = ks.unicode;
        im->s[1] = L'\0';
        im->buf[0] = L'\0';
      }
      /* Hangul mode */
      else {
        wchar_t u = ks.unicode;
        wchar_t* bp = im->buf;

        im->s[0] = L'\0';                     /* Zero-out output string */
        wcsncat(bp, &u, 1);                   /* Copy new character */

        /* Translate the characters */
        im->redraw = 0;
        while(1) {
          const wchar_t* us = charmap_search(&cm, bp);
          #ifdef IM_DEBUG
          wprintf(L"  [%8ls] [%8ls] %2d %2d\n", im->s, im->buf, (int)wcslen(im->s), (int)wcslen(im->buf));
          #endif

          /* Match was found? */
          if(us && wcslen(us)) {
            /* Final match */
            if(cm.match_is_final) {
              /* Batchim may carry over to the next character */
              if(cm.match_state->flag == 'b') {
                wchar_t next_char = bp[cm.match_count];

                /* If there is no more buffer, output it */
                if(cm.match_stats & MATCH_STAT_NOMOBUF) {
                  #ifdef IM_DEBUG
                  wprintf(L"    1a\n");
                  #endif

                  wcscat(im->s, us);          /* Output */
                  im->redraw += wcslen(us);  /* May need to re-eval next time */
                  bp += cm.match_count;       /* Keep buffer data for re-eval*/
                  cm.match_count = 0;
                  cm.match_is_final = 0;
                }
                /* If there is buffer data but it's not vowel, finalize it */
                else if(!im_event_ko_isvowel(&cm, next_char)) {
                  #ifdef IM_DEBUG
                  wprintf(L"    1b\n");
                  #endif

                  wcscat(im->s, us);     /* Output */
                  wcs_lshift(bp, cm.match_count);
                  cm.match_count = 0;
                  cm.match_is_final = 0;
                }
                /* If there is buffer and it's vowel, re-eval */
                else {
                  #ifdef IM_DEBUG
                  wprintf(L"    1c\n");
                  #endif

                  /* Output the state before the last one and leave the last
                   * matched key in the buffer */
                  us = cm.match_state_prev->output;
                  wcscat(im->s, us);      /* Output */
                  cm.match_count--;       /* Matched all but one */
                  cm.match_is_final = 0;
                  wcs_lshift(bp, cm.match_count);
                }
              }
              /* No batchim - this is final */
              else {
                #ifdef IM_DEBUG
                wprintf(L"    1d\n");
                #endif

                wcscat(im->s, us);
                wcs_lshift(bp, cm.match_count);
                cm.match_count = 0;
                cm.match_is_final = 0;
              }
            }
            /* May need to be overwritten next time */
            else {
              #ifdef IM_DEBUG
              wprintf(L"    1e\n");
              #endif

              wcscat(im->s, us);
              im->redraw += wcslen(us);
              break;
            }
          }
          /* No match, but more data is in the buffer */
          else if(wcslen(bp) > 0) {
            /* If the input character has no state, it's its own state */
            if(cm.match_count == 0) {
              #ifdef IM_DEBUG
              wprintf(L"    2a\n");
              #endif
              wcsncat(im->s, bp, 1);
              wcs_lshift(bp, 1);
              cm.match_is_final = 0;
            }
            /* If the matched characters didn't consume all, it's own state */
            else if((size_t)cm.match_count != wcslen(bp)) {
              #ifdef IM_DEBUG
              wprintf(L"    2b (%2d)\n", cm.match_count);
              #endif
              wcsncat(im->s, bp, 1);
              wcs_lshift(bp, 1);
              cm.match_is_final = 0;
            }
            /* Otherwise it's just a part of a future input */
            else {
              #ifdef IM_DEBUG
              wprintf(L"    2c (%2d)\n", cm.match_count);
              #endif
              wcscat(im->s, bp);
              cm.match_is_final = 0;
              im->redraw += wcslen(bp);
              break;
            }
          }
          /* No match and no more data in the buffer */
          else {
            #ifdef IM_DEBUG
            wprintf(L"    3\n");
            #endif
            break;
          }

          /* Is this the end? */
          if(cm.match_is_final) break;
        }
      }
  }

  return im->redraw;
}


/* ***************************************************************************
* OTHER STATIC IM FUNCTIONS
*/

/**
* Generic event handler that calls the appropriate language handler.
* im->request should have the event ID.
*
* The handler is reached through im_read() with an all-zero key stroke,
* since non-translation requests carry no key.
*/
static void im_event(IM_DATA* im)
{
  SDL_keysym ks;

  /* Zero every member: the structure is passed by value, so none of it
   * should be left indeterminate */
  memset(&ks, 0, sizeof(ks));

  im_read(im, ks);
}


/**
* Make an event request and call the event handler.
*
* The request that was in effect before the call is restored afterwards.
* Handlers issue nested requests (for example, IM_REQ_INIT triggers a full
* reset), and the outer handler must still see its own request when the
* nested one returns.  At the top level the previous request is
* IM_REQ_TRANSLATE, so "im" is left ready for translation as before.
*
* @param im       IM data; must have been initialized by im_init().
* @param request  One of the IM_REQ_* constants.
*/
static void im_request(IM_DATA* im, int request)
{
  int previous = im->request;

  im->request = request;
  im_event(im);
  im->request = previous;
}


/* ***************************************************************************
* PUBLIC IM FUNCTIONS
*/

/**
* Initialize the IM_DATA structure.
*
* On the first call this also fills in the im_event_fns[] handler table.
* On later calls the resources of the handler selected by im->lang are
* freed first, so "im" is read before it is cleared.
* NOTE(review): this looks like it expects the same, previously initialized
* IM_DATA to be passed on re-initialization - confirm against callers.
*
* @param im    IM_DATA structure to initialize.
* @param lang  LANG_* defined constant to initialize the structure with.
*/
void im_init(IM_DATA* im, int lang)
{
  /* Free already allocated resources if initialized before */
  if(im_initialized) {
    im_free(im);
  }

  /* Initialize */
  memset(im, 0, sizeof(IM_DATA));
  im->lang = lang;

  /* Setup static globals */
  if(!im_initialized) {
    /* ADD NEW LANGUAGE SUPPORT HERE */
    im_event_fns[LANG_JA] = &im_event_ja;
    im_event_fns[LANG_KO] = &im_event_ko;

    im_initialized = 1;
  }

  #ifdef DEBUG
  assert(0 <= im->lang && im->lang < NUM_LANGS);
  /* Only announce languages that have a dedicated handler */
  if(im_event_fns[im->lang]) printf("Initializing IM for %s...\n", lang_prefixes[im->lang]);
  #endif

  /* Initialize the individual IM */
  im_request(im, IM_REQ_INIT);
}


/**
* Soft-reset the IM: empty the output string and the input buffer, then send
* IM_REQ_RESET_SOFT to the language handler.
*/
void im_softreset(IM_DATA* im)
{
  /* Empty the pending input and the output before notifying the handler */
  im->buf[0] = L'\0';
  im->s[0] = L'\0';

  im_request(im, IM_REQ_RESET_SOFT);
}


/**
* Fully reset the IM: empty the output string and the input buffer, then
* send IM_REQ_RESET_FULL to the language handler.
*/
void im_fullreset(IM_DATA* im)
{
  /* Empty the pending input and the output before notifying the handler */
  im->buf[0] = L'\0';
  im->s[0] = L'\0';

  im_request(im, IM_REQ_RESET_FULL);
}


/**
* Free any allocated resources.  This sends IM_REQ_FREE to the handler
* selected by im->lang; what gets released is up to that handler.
*
* @param im  IM data whose language handler should free its resources.
*/
void im_free(IM_DATA* im)
{
  im_request(im, IM_REQ_FREE);
}


/**
* IM-process a character.  This function simply looks up the language from
* IM and calls the appropriate im_event_<lang>() language-specific IM event
* handler.  im_event_c() is called by default if no language-specific
* function is specified for the specified language.
*
* If im->lang is out of range, a message is printed and im->lang is changed
* to LANG_DEFAULT before the handler is selected.
*
* @param im  IM-processed data to return to the caller function.
* @param ks  SDL_keysym typed on the keyboard.
*
* @return    The number of characters in im->s that should not be committed.
*            In other words, the returned number of characters at the end of
*            im->s should be overwritten the next time im_read is called.
*
* @see im_event_c()
* @see im_event_fns
*/
int im_read(IM_DATA* im, SDL_keysym ks)
{
  IM_EVENT_FN im_event_fp = NULL;
  int redraw = 0;

  /* Sanity check */
  if(im->lang < 0 || im->lang >= NUM_LANGS) {
    fprintf(stderr, "im->lang out of range (%d), using default\n", im->lang);
    im->lang = LANG_DEFAULT;
  }

  /* Function pointer to the language-specific im_event_* function */
  im_event_fp = im_event_fns[im->lang];

  /* Run the language-specific IM or run the default C IM */
  if(im_event_fp) redraw = (*im_event_fp)(im, ks);
  else redraw = im_event_c(im, ks);

  #ifdef IM_DEBUG
  /* wcslen() returns size_t; convert to match the %d conversions */
  wprintf(L"* [%8ls] [%8ls] %2d %2d (%2d)\n", im->s, im->buf, (int)wcslen(im->s), (int)wcslen(im->buf), im->redraw);
  #endif

  return redraw;
}


/* vim:ts=2:et
*/