ucsutf.c

/*
  ucsutf.c  avp 2012

  Функции для UTF-8 и Unicode (ASCII&Cyrillic), независимые от setlocale()
 */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>

#include "ucsutf.h"

/*
  utf-8 схема                                               error: 0xFF, 0xFE
  1111 110x , 1111 10xx , 1111 0xxx , 1110 xxxx , 110x xxxx , 10xx xxxx
  10xx xxxx , 10xx xxxx , 10xx xxxx , 10xx xxxx , 10xx xxxx
  10xx xxxx , 10xx xxxx , 10xx xxxx , 10xx xxxx
  10xx xxxx , 10xx xxxx , 10xx xxxx
  10xx xxxx , 10xx xxxx
  10xx xxxx
*/
// Payload bits carried by the first byte of a multi-byte UTF-8 sequence.
// Indexed by (n - 1), where n is the bit position of the first zero bit found
// when scanning the lead byte downwards from bit 5; mask[n-1] == (1 << n) - 1.
// The table is only ever read, so it is const.
static const int mask[5] = { 0x01, 0x03, 0x07, 0x0F, 0x1F };

/*
  Number of bytes needed to encode the code point ucs in UTF-8
  (the historical scheme with sequences of up to 6 bytes).
  Returns 0 for a negative ucs.
 */
int
ucs_len (int ucs)
{
  // limit[i] is the first code point that no longer fits into i+1 bytes
  static const int limit[5] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
  int len;

  if (ucs < 0)
    return 0;

  for (len = 1; len <= 5; len++) {
    if (ucs < limit[len - 1])
      return len;
  }
  return 6;
}


/*
  A stand-in for fgetwc().
  It does not depend on setlocale() and can be freely mixed with fgetc() etc.

  Reads one UTF-8 encoded character from stream f and returns its UCS value,
  or EOF at end of input.
  If the bytes read do not form a UTF-8 sequence, returns EOF and sets errno
  to EILSEQ.

  The getucs structure is filled as a side effect:
    uc->bytes / uc->nc  the bytes consumed from the stream and their count
    uc->err             0 if OK, 1 for a bad first byte, otherwise the
                        1-based position of the bad continuation byte
    uc->ucs             the value returned
  Note that a byte which breaks a sequence has already been consumed
  (it is the last one stored in uc->bytes).
 */
int
utf8_fgetc (struct getucs *uc, FILE *f)
{
  int ch = EOF;

  uc->nc = uc->err = 0;

  if (!feof(f))
    ch = fgetc(f);

  if (ch != EOF) {
    uc->bytes[uc->nc++] = ch;

    if (ch > 127) {
      if ((ch & 0xc0) == 0x80 || ch == 0xfe || ch == 0xff) {
        // a continuation byte or FE/FF can never start a character
        uc->err = 1;
        ch = EOF;
      } else {
        // find the first zero bit below the two leading ones
        int bit = 5;
        while (bit && (ch & (1 << bit)))
          bit--;

        unsigned int value = ch & mask[bit - 1];
        int pending;

        for (pending = 6 - bit; pending > 0; pending--) {
          ch = fgetc(f);
          if (ch == EOF)
            break;              // input ended inside the sequence
          uc->bytes[uc->nc++] = ch;
          if ((ch & 0xc0) != 0x80) {
            uc->err = uc->nc;
            ch = EOF;
            break;
          }
          value = (value << 6) | (ch & 0x3f);
        }
        if (ch != EOF)
          ch = value;
      }
    }
  }

  if (uc->err)
    errno = EILSEQ;
  return uc->ucs = ch;
}

/*
  Decodes one UTF-8 encoded character located at utf.

  Returns the UCS value of the character.
  *step (if step is not NULL) receives the length of the sequence in bytes.
  *err  (if err is not NULL) is the error indicator, 0 when there is no error.

  On error (invalid UTF-8 sequence) the first byte is returned, *step is 1,
  *err is the offset of the byte that follows the offending one and errno
  is set to EILSEQ.  The following are treated as errors:
    - FF, FE or a continuation byte (10xx xxxx) in the first position;
    - a missing continuation byte (this also stops at the terminating NUL);
    - an overlong encoding, i.e. a value stored in more bytes than it needs
      (e.g. C1 81 for 'A' or C0 80 for U+0000); *err is then the length of
      the whole sequence.
 */
int
utf8_to_ucs (const char *utf, int *step, int *err)
{
  // smallest value that legitimately needs a sequence of the given length
  static const unsigned int min_ucs[7] =
    { 0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };

  unsigned int first = (unsigned char)*utf;
  unsigned int ucs = first;
  int estep = 0;                // non-zero: error, offset past the bad byte

  if (step)
    *step = 1;
  if (err)
    *err = 0;

  if (first > 127) {
    if ((first & 0xC0) == 0x80 || first == 0xFF || first == 0xFE)
      estep = 1;
    else {
      // position of the first zero bit below the two leading ones;
      // FE/FF are excluded above, so n stays in 1..5
      int n = 5;
      while (n && (first & (1u << n)))
        n--;

      int len = 7 - n;
      unsigned int uc = first & ((1u << n) - 1);
      int i;

      for (i = 1; i < len; i++) {
        unsigned int b = (unsigned char)utf[i];
        if ((b & 0xC0) != 0x80) {
          estep = i + 1;
          break;
        }
        uc = (uc << 6) | (b & 0x3F);
      }
      if (!estep && uc < min_ucs[len])
        estep = len;            // overlong form
      if (!estep) {
        if (step)
          *step = len;
        ucs = uc;
      }
    }
  }

  if (estep) {
    if (err)
      *err = estep;
    errno = EILSEQ;
  }
  return ucs;
}

/*
  Stores the UTF-8 encoding of the code point uc at b (no terminating NUL).
  Returns the number of bytes written (1..6), or 0 on error (uc < 0),
  in which case nothing is written.
 */
int
ucs_to_utf8 (int uc, char *b)
{
  if (uc < 0)
    return 0;

  unsigned int rest = uc;
  if (rest < 0x80) {
    *b = rest;
    return 1;
  }

  int len;
  if (rest >= 0x4000000)
    len = 6;
  else if (rest >= 0x200000)
    len = 5;
  else if (rest >= 0x10000)
    len = 4;
  else if (rest >= 0x800)
    len = 3;
  else
    len = 2;

  // continuation bytes, last one first, 6 payload bits each
  int pos;
  for (pos = len - 1; pos > 0; pos--) {
    b[pos] = 0x80 | (rest & 0x3f);
    rest >>= 6;
  }
  // lead byte: len one-bits, a zero bit, then the remaining payload
  b[0] = (rest & (0x7f >> len)) | ((0xff00 >> len) & 0xff);

  return len;
}


/*
  Checks whether the byte at utf starts a UTF-8 character.
  Only the first byte is examined.
  Returns
  -1    - the byte cannot start a character (continuation byte, FE or FF);
          the caller should step back
  n > 0 - the length in bytes of the UTF-8 character this byte starts
 */
int
utf8_chrlen (const char *utf)
{
  unsigned int lead = (unsigned char)*utf;

  if (lead < 0x80)
    return 1;
  if (lead >= 0xFE || (lead & 0xC0) == 0x80)
    return -1;

  // every further leading one-bit (bits 5..1) adds a byte to the sequence
  int len = 2;
  unsigned int probe;
  for (probe = 0x20; probe > 1 && (lead & probe); probe >>= 1)
    len++;
  return len;
}


// Letter test: ASCII goes through isalpha(), otherwise the Cyrillic range
// U+0400..U+045F yields 1. Everything else (including negatives) yields 0.
int
isAlpha (int ucs)
{
  if (ucs >= 0 && ucs <= 127)
    return isalpha(ucs);
  return ucs >= 0x400 && ucs <= 0x45F;
}

// Lower-case test: ASCII goes through islower(), otherwise the Cyrillic
// range U+0430..U+045F yields 1. Everything else yields 0.
int
isLower (int ucs)
{
  if (ucs >= 0 && ucs <= 127)
    return islower(ucs);
  return ucs >= 0x430 && ucs <= 0x45F;
}

// Upper-case test: ASCII goes through isupper(), otherwise the Cyrillic
// range U+0400..U+042F yields 1. Everything else yields 0.
int
isUpper (int ucs)
{
  if (ucs >= 0 && ucs <= 127)
    return isupper(ucs);
  return ucs >= 0x400 && ucs <= 0x42F;
}

// Lower-case conversion: ASCII goes through tolower(); Cyrillic capitals
// U+0400..U+040F move up by 80 and U+0410..U+042F by 32.
// Any other value is returned unchanged.
int
toLower (int ucs)
{
  if (ucs >= 0 && ucs <= 127)
    return tolower(ucs);
  if (ucs >= 0x400 && ucs <= 0x42F)
    return ucs + (ucs < 0x410 ? 80 : 32);
  return ucs;
}

// Upper-case conversion: ASCII goes through toupper(); Cyrillic small
// letters U+0430..U+044F move down by 32 and U+0450..U+045F by 80.
// Any other value is returned unchanged.
int
toUpper (int ucs)
{
  if (ucs >= 0 && ucs <= 127)
    return toupper(ucs);
  if (ucs >= 0x430 && ucs <= 0x45F)
    return ucs - (ucs < 0x450 ? 32 : 80);
  return ucs;
}

/*
  utf8_toUpper(), utf8_toLower() change the case of one UTF-8 character
  "in place".

  They return the length in bytes of the converted character.
  On error (invalid UTF-8 at utf) they return 0, leave the bytes untouched
  and set *err to the offset of the byte following the offending one.
  err may be NULL when the caller is not interested in the offset
  (utf8_to_ucs() accepts NULL as well).
 */
int
utf8_toUpper (char *utf, int *err)
{
  int local_err;

  if (!err)
    err = &local_err;

  int ucs = utf8_to_ucs(utf, 0, err);
  if (*err)
    return 0;
  return ucs_to_utf8(toUpper(ucs), utf);
}

/*
  Converts the UTF-8 character at utf to lower case "in place".

  Returns the length in bytes of the converted character.
  On error (invalid UTF-8 at utf) returns 0, leaves the bytes untouched and
  sets *err to the offset of the byte following the offending one.
  err may be NULL when the caller is not interested in the offset.
 */
int
utf8_toLower (char *utf, int *err)
{
  int local_err;

  if (!err)
    err = &local_err;

  int ucs = utf8_to_ucs(utf, 0, err);
  if (*err)
    return 0;
  return ucs_to_utf8(toLower(ucs), utf);
}

/*
  Length of a UTF-8 text in characters.

  utf is the start address and n the size in bytes.  If n is zero the
  characters are counted up to U+0000.  A character cut in the middle by
  the n-byte limit is not counted, and no byte beyond utf[n-1] is read.

  On error (invalid UTF-8) counting stops and *err receives the byte offset
  of the offending byte + 1; otherwise *err is 0.  err may be NULL.
  Returns the number of complete characters counted.
 */
int
utf8_len (const char *utf, int n, int *err)
{
  int step, nc = 0, local_err;
  const char *s = utf;

  if (!err)
    err = &local_err;
  *err = 0;                     // also covers n < 0, where nothing is decoded

  if (n) {
    while (n > 0) {
      int need = utf8_chrlen(utf);

      if (need > n) {
        // the character is cut by the limit: check only the bytes we own
        int i;
        for (i = 1; i < n; i++)
          if ((utf[i] & 0xC0) != 0x80) {
            *err = (int)(utf - s) + i + 1;
            errno = EILSEQ;
            break;
          }
        break;
      }

      utf8_to_ucs(utf, &step, err);
      if (*err) {
        *err += (utf - s);
        break;
      }
      n -= step;
      utf += step;
      nc++;
    }
  } else {
    // on error the decoder returns the (non-zero) first byte, so the loop
    // body still runs and sees *err
    while (utf8_to_ucs(utf, &step, err) != 0) {
      if (*err) {
        *err += (utf - s);
        break;
      }
      utf += step;
      nc++;
    }
  }
  return nc;
}

/*
  Converts n bytes of the UTF-8 string utf into the array ucsarr of UCS
  values; at most size values are stored.

  If n == 0 the conversion runs up to the '\0' in utf.
  A character cut in the middle by the n-byte limit is not converted, and
  no byte beyond utf[n-1] is read.

  Returns the number of UCS values stored.
  *ofs receives the byte offset in utf of the first unprocessed byte; on
  error (invalid UTF-8) it is negative: minus (offset of the offending
  byte + 1).
*/
int
utf8str_to_ucs (const char *utf, int n,
               int *ucsarr, int size, int *ofs)
{
  int ucs, nc = 0, step, err;
  const char *s = utf;

  *ofs = 0;
  if (n) {
    while (nc < size && n > 0) {
      int need = utf8_chrlen(utf);

      if (need > n) {
        // the character is cut by the limit: check only the bytes we own
        int i;
        for (i = 1; i < n; i++)
          if ((utf[i] & 0xC0) != 0x80) {
            *ofs = -((int)(utf - s) + i + 1);
            errno = EILSEQ;
            break;
          }
        break;
      }

      ucs = utf8_to_ucs(utf, &step, &err);
      if (err) {
        *ofs = -(err + (utf-s));
        break;
      }
      ucsarr[nc++] = ucs;
      *ofs += step;
      n -= step;
      utf += step;
    }
  } else {
    while ((ucs = utf8_to_ucs(utf, &step, &err)) != 0 && nc < size) {
      if (err) {
        *ofs = -(err + (utf-s));
        break;
      }
      ucsarr[nc++] = ucs;
      *ofs += step;
      utf += step;
    }
  }
  return nc;
}

/*
  Splits a supplementary-plane code point (U+10000..U+10FFFF) into a UTF-16
  surrogate pair: sp[0] = high surrogate (D800..DBFF),
                  sp[1] = low surrogate  (DC00..DFFF).

  Returns -1 if OK !!!
  For a value outside that range no pair exists: sp[0] and sp[1] are set
  to 0 and ucs itself is returned.
 */
int
ucs_to_surrpair (u_int ucs, int sp[2])
{
  sp[0] = sp[1] = 0;
  if (ucs < 0x10000 || ucs > 0x10FFFF)
    return ucs;

  // the pair carries the 20-bit offset from U+10000, 10 bits per half
  unsigned int v = ucs - 0x10000;
  sp[0] = 0xD800 | (v >> 10);
  sp[1] = 0xDC00 | (v & 0x3FF);
  return -1;
}

/*
  Combines a UTF-16 surrogate pair (sp[0] high D800..DBFF, sp[1] low
  DC00..DFFF) into the code point U+10000..U+10FFFF it stands for.
  Returns 0 if sp is not a valid surrogate pair.
 */
int
surpair_to_ucs (int sp[2])
{
  if ((sp[0] & 0xFFFFFC00) != 0xD800 || (sp[1] & 0xFFFFFC00) != 0xDC00)
    return 0;

  // the two 10-bit halves form the offset from U+10000
  return 0x10000 + (((sp[0] & 0x3FF) << 10) | (sp[1] & 0x3FF));
}


// Value of the hexadecimal digit c.
// The caller must have checked that c is a hex digit; no validation is done.
static inline int
hexdigit (int c)
{
  int base;

  if (c > 'F')
    base = 'a' - 10;            // 'a'..'f'
  else if (c > '9')
    base = 'A' - 10;            // 'A'..'F'
  else
    base = '0';                 // '0'..'9'
  return c - base;
}


/*
  Returns the UCS value read from the string
  HHHH
  (four hex digits at json; end is the limit of the input).
  If HHHH is the first half of a surrogate pair and is followed by \uHHHH,
  the second half is read too and the combined code point is returned.
  *step receives the number of bytes consumed, counted from json (4 or 10).
  On error 0 is returned and *err is the offset (from json) of the byte
  following the one where the error was detected; otherwise *err is 0.
 */
static int
getjhex (const char *json, const char *end, int *step, int *err)
{
  int i, ucs = 0;

  if (json+4 >= end) {
    *err = end-json+1;
    return 0;
  }
  *err = 0;
  for (i = 0; i < 4; i++) {
    // <ctype.h> functions need an unsigned char value, plain char may be < 0
    if (!isxdigit((unsigned char)json[i])) {
      *err = i+1;
      return 0;
    }
    ucs = (ucs << 4) | hexdigit(json[i]);
  }
  *step = 4;

  // Room for "\uHHHH" after the first four digits?  Compared as pointers,
  // like the check above, so it also holds when end is the "no limit"
  // sentinel at the top of the address space.
  if (json + 10 <= end &&
      (ucs & 0xFC00) == 0xD800 && strncmp("\\u", json+4, 2) == 0) {
    int sp[2];
    sp[0] = ucs; sp[1] = 0;
    for (i = 6; i < 10; i++) {
      if (!isxdigit((unsigned char)json[i])) {
        *err = i+1;
        return 0;
      }
      sp[1] = (sp[1] << 4) | hexdigit(json[i]);
    }
    if (!(ucs = surpair_to_ucs (sp)))
      *err = i;                 // second half is not a low surrogate
    *step = 10;
  }
  return ucs;
}

#define JSTR_INCR 512

/*
  Makes sure the buffer *utf (capacity *size) can take `used` bytes plus a
  terminating NUL, growing it by JSTR_INCR when it cannot.
  Returns 1 on success, 0 if realloc() failed (*utf and *size are then
  left as they were, so the old buffer stays valid).
 */
static int
jstr_reserve (char **utf, int *size, int used)
{
  if (used < *size)
    return 1;

  char *grown = realloc(*utf, *size + JSTR_INCR);
  if (!grown)
    return 0;
  *utf = grown;
  *size += JSTR_INCR;
  return 1;
}

/*
  Converts UTF-8 text in the JSON string format
    "\"\\\/\b\f\r\t\n\uHHHH"
  found in json (at most jmxl bytes long)
  into the NUL-terminated *utf, without the enclosing quotes.
  Bytes in front of the opening quote are skipped.

  On entry *utf either is NULL (a buffer is then allocated) or points to
  malloc()ed memory of at least *size bytes; it may be realloc()ed.
  jmxl is the maximum length of json (unless a 0 byte is found earlier);
  a negative jmxl means "no limit".

  Returns the number of UCS CHARACTERS in the result
  (negative - some error, e.g. no closing quote, invalid UTF-8, a raw
  control character, a bad escape or lack of memory; -(k + 1) where k
  characters had been converted).
  *ofs  receives the offset of the byte in json after the closing quote
        (or of the place where the error was found)
  *size receives the number of bytes placed in *utf (including the NUL);
        it is 0, with *utf == NULL, only if the initial allocation failed.
 */
int
utf8json_to_utf (const char *json, int *ofs, int jmxl, char **utf, int *size)
{
  if (!*utf || *size < 1) {
    *utf = malloc(JSTR_INCR);
    if (!*utf) {
      *size = 0;
      *ofs = 0;
      return -1;
    }
    *size = JSTR_INCR;
  }

  const char *s = json;
  // "no limit" is represented by the highest possible address
  const char *end = jmxl < 0 ? (const char *)~0L : json + jmxl;
  if (end < json)
    end = (const char *)~0L;
  unsigned char c;

  // look for the opening quote
  while ((c = *json++) != '"')
    if (!c || json >= end) {
      *ofs = json-s;
      **utf = 0;
      *size = 1;
      return -1;
    }

  int k = 0,    // UCS characters in *utf
    l = 0;      // bytes (without the NUL) in *utf
  int ucs, step, err, errofs;
  int closed = 0;               // closing quote seen

  while (json < end) {
    int need = utf8_chrlen(json);
    if (need > 1 && json + need > end)
      break;                    // character cut by the limit: unterminated

    ucs = utf8_to_ucs(json, &step, &err);
    if (ucs == '"') {
      closed = 1;
      break;
    }
    if (err || ucs < ' ') {
      errofs = json - s;
      goto fail;
    }

    if (ucs != '\\') {
      if (!jstr_reserve(utf, size, l + step)) {
        errofs = json - s;
        goto fail;
      }
      // advance by what was actually written, not by the input length
      l += ucs_to_utf8(ucs, *utf + l);
      json += step;
      k++;
      continue;
    }

    // escape sequence: look at the character after the backslash
    ++json;
    if (json < end)
      ucs = utf8_to_ucs(json, &step, &err);
    else
      err = 1;                  // the backslash was the last byte allowed
    if (err)
      ucs = 0; // simulate wrong input
    switch (ucs) {
    case '\\':
    case '/':
    case '"':
      break;
    case 'b':
      ucs = '\b';
      break;
    case 't':
      ucs = '\t';
      break;
    case 'n':
      ucs = '\n';
      break;
    case 'r':
      ucs = '\r';
      break;
    case 'f':
      ucs = '\f';
      break;
    case 'u':
      ucs = getjhex(++json, end, &step, &err);
      break;
    default:
      if (!err) // err may already hold an offset past the error, hence the if
        err = 1;
    }

    int lu = ucs_len(ucs);
    if (err || lu == 0) {
      errofs = (int)(json - s) + err;
      goto fail;
    }
    if (!jstr_reserve(utf, size, l + lu)) {
      errofs = json - s;
      goto fail;
    }
    l += ucs_to_utf8(ucs, *utf + l);
    json += step;
    k++;
  }

  (*utf)[l] = 0;
  *size = l+1;
  if (closed) {
    *ofs = json - s + 1;
    return k;
  }
  *ofs = json - s;              // input ended before the closing quote
  return -(k + 1);

 fail:
  *ofs = errofs;
  (*utf)[l] = 0;
  *size = l+1;
  return -(k + 1);
}

/*
  Writes the UTF-8 text utf as a JSON string literal into json
  (a buffer of size bytes): enclosing quotes are added, the characters
  \ / " and \b \r \n \f \t are escaped with a backslash, other bytes below
  0x20 are written as \u00XX and everything else is copied byte by byte.

  n is the number of bytes of utf to convert; if n == 0 the text is
  converted up to its terminating NUL.
  Conversion stops early when the next item no longer fits; the result is
  always closed with a quote and NUL-terminated.

  Returns the length of the result in bytes (including the "" but not the
  NUL), or 0 if size < 3 (nothing is written then, *ofs is untouched).
  *ofs receives the number of bytes of utf that were converted.
 */
int
utf8str_to_json (const char *utf, int n, char *json, int size, int *ofs)
{
  if (size < 3)
    return 0;

  *ofs = 0;
  json[0] = '"';
  size -= 2;                    // keep room for the closing quote and NUL
  int l = 1, i = 0;

  while (n ? i < n : utf[i] != 0) {
    int c = utf[i] & 0xff;
    int esc = 0;                // character to put after a backslash

    switch (c) {
    case '\\':
    case '/':
    case '"':
      esc = c;
      break;
    case '\b':
      esc = 'b';
      break;
    case '\r':
      esc = 'r';
      break;
    case '\n':
      esc = 'n';
      break;
    case '\f':
      esc = 'f';
      break;
    case '\t':
      esc = 't';
      break;
    default:
      break;
    }

    if (esc) {
      if (size < l+2)
        break;
      json[l++] = '\\';
      json[l++] = esc;
    } else if (c < ' ') {
      if (size < l+6)
        break;
      snprintf(json+l, 7, "\\u%04x", c);
      l += 6;
    } else {
      if (size <= l)
        break;
      json[l++] = c;
    }
    i++;
  }

  json[l++] = '"';
  json[l] = 0;
  *ofs = i;
  return l;
}