#include #include #include "utfcvutils.h" // todo: remember start of current input symbol, so we can return the // correct nr of items used when the output buf is too small // utf8 // 00000000-0000007F | 0xxxxxxx | 00-7f // 00000080-000007FF | 110xxxxx 10xxxxxx | c0-df 80-bf // 00000800-0000FFFF | 1110xxxx 10xxxxxx 10xxxxxx | e0-ef 80-bf 80-bf // 00010000-0010FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | f0-f7 80-bf 80-bf 80-bf // utf16 // d800-dbff 110110yy yyyyyyyy high half // dc00-dfff 110111xx xxxxxxxx low half // -> yyyy yyyyyyxx xxxxxxxx + 0x10000 size_t utf8bytesneeded(int32_t c) { if (c<0x80) return 1; if (c<0x800) return 2; if (c<0x10000) return 3; return 4; // < 0x110000 } size_t utf32toutf8bytesneeded(const int32_t *p) { size_t n=0; while (*p) n += utf8bytesneeded(*p++); return n; } size_t utf16toutf8bytesneeded(const int16_t *p) { size_t n=0; int32_t w; while (uint16_t c= *p++) { if (c<0xd800 || c>=0xe000) n += utf8bytesneeded(c); else if (c<0xdc00) { w = c&0x3ff; } else { w = 0x10000 + ((w<<10) | (c&0x3ff)); n += utf8bytesneeded(w); } } return n; } size_t utf8toutf32bytesneeded(const int8_t *p) { size_t n=0; while (uint8_t c= *p++) if (c<0x80 || c>=0xc0) n++; return n*sizeof(int32_t); } size_t utf8toutf16bytesneeded(const int8_t *p) { size_t n=0; while (uint8_t c= *p++) { if (c<0x80 || c>=0xc0) n++; if (c>=0xf0) n++; } return n*sizeof(int16_t); } size_t utf32toutf16bytesneeded(const int32_t *p) { size_t n=0; while (int32_t c= *p++) { n++; if (c>=0x10000) n++; } return n*sizeof(int32_t); } template bool checkend(T*p, T*end, size_t size) { if (size==AUTOSIZE) return true; return p>6); *p8++ = 0x80 + (c&0x3f); } else if (c<0x10000) { if (!checkend(p8+2,p8end,maxsize)) break; *p8++ = 0xe0 + (c>>12); *p8++ = 0x80 + ((c>>6)&0x3f); *p8++ = 0x80 + (c&0x3f); } else { if (!checkend(p8+3,p8end,maxsize)) break; *p8++ = 0xf0 + (c>>18); *p8++ = 0x80 + ((c>>12)&0x3f); *p8++ = 0x80 + ((c>>6)&0x3f); *p8++ = 0x80 + (c&0x3f); } } *p8++ = 0; return p32-p32start; } size_t utf16toutf32(const int16_t *p16, int32_t *p32, size_t maxsize) { const int16_t *p16start= p16; int32_t *p32end= p32+maxsize-1; int32_t w=0; uint16_t c; while (checkend(p32,p32end,maxsize) && (c= *p16++)!=0) { if (c<0xd800 || c>=0xe000) *p32++ = c; else if (c<0xdc00) { w = c&0x3ff; } else { // c=dc00 .. dfff w = 0x10000 + ((w<<10) | (c&0x3ff)); *p32++ = w; } } *p32++ = 0; return p16-p16start; } size_t utf32toutf16(const int32_t *p32, int16_t *p16, size_t maxsize) { const int32_t *p32start= p32; int16_t *p16end= p16+maxsize-1; int32_t c; while (checkend(p16,p16end,maxsize) && (c= *p32++)!=0) { if (c<0x10000) *p16++ = c; else { if (!checkend(p16+1,p16end,maxsize)) break; c -= 0x10000; *p16++ = 0xd800+(c>>10); *p16++ = 0xdc00+(c&0x3ff); } } *p16++ = 0; return p32-p32start; } size_t utf8toutf16(const int8_t *p8, int16_t *p16, size_t maxsize) { const int8_t *p8start= p8; int16_t *p16end= p16+maxsize-1; int n=0; int32_t w=0; uint8_t c; while (checkend(p16,p16end,maxsize) && (c= *p8++)!=0) { if (c<0x80) *p16++ = c; else if (c<0xc0) { w = (w<<6) | (c&0x3f); if (n==0) { if (w<0x10000) *p16++ = w; else { if (!checkend(p16+1,p16end,maxsize)) break; w -= 0x10000; *p16++ = 0xd800+(w>>10); *p16++ = 0xdc00+(w&0x3ff); } } else n--; } else if (c<0xe0) { w = (c&0x1f); n= 0; } else if (c<0xf0) { w = (c&0xf); n= 1; } else { // c < 0xf8 w = (c&0x7); n= 2; } } *p16++ = 0; return p8-p8start; } size_t utf16toutf8(const int16_t *p16, int8_t *p8, size_t maxsize) { const int16_t *p16start= p16; int8_t *p8end= p8+maxsize-1; int32_t w=0; bool emit=false; uint16_t c; while (checkend(p8,p8end,maxsize) && (c= *p16++)!=0) { if ((c<0xd800) || (c>=0xe000)) { w = c; emit= true; } else if (c<0xdc00) { w = c&0x3ff; emit= false; } else { // c=dc00 .. dfff w = 0x10000 + ((w<<10) | (c&0x3ff)); emit= true; } if (emit) { if (w<0x80) { *p8++ = w; } else if (w<0x800) { if (!checkend(p8+1,p8end,maxsize)) break; *p8++ = 0xc0 + (w>>6); *p8++ = 0x80 + (w&0x3f); } else if (w<0x10000) { if (!checkend(p8+2,p8end,maxsize)) break; *p8++ = 0xe0 + (w>>12); *p8++ = 0x80 + ((w>>6)&0x3f); *p8++ = 0x80 + (w&0x3f); } else { if (!checkend(p8+3,p8end,maxsize)) break; *p8++ = 0xf0 + (w>>18); *p8++ = 0x80 + ((w>>12)&0x3f); *p8++ = 0x80 + ((w>>6)&0x3f); *p8++ = 0x80 + (w&0x3f); } } } *p8++ = 0; return p16-p16start; } size_t utf16bytesneeded(int32_t c) { if (c<0x10000) return 2; return 4; // < 0x110000 } size_t utf16bytesneeded(const int32_t *p) { size_t n=0; while (*p) n += utf16bytesneeded(*p++); return n; } size_t utf16toutf32bytesneeded(const int16_t *p) { size_t n=0; while (uint16_t c= *p++) if (c<0xdc00 || c>=0xe000) n++; return n*sizeof(int32_t); }