Blender: string_utf8.c Source File

Blender V2.61 - r43446
00001 /*
00002  * ***** BEGIN GPL LICENSE BLOCK *****
00003  *
00004  * This program is free software; you can redistribute it and/or
00005  * modify it under the terms of the GNU General Public License
00006  * as published by the Free Software Foundation; either version 2
00007  * of the License, or (at your option) any later version.
00008  *
00009  * This program is distributed in the hope that it will be useful,
00010  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  * GNU General Public License for more details.
00013  *
00014  * You should have received a copy of the GNU General Public License
00015  * along with this program; if not, write to the Free Software Foundation,
00016  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
00017  *
00018  * The Original Code is Copyright (C) 2011 Blender Foundation.
00019  * Code from gutf8.c Copyright (C) 1999 Tom Tromey
00020  *                   Copyright (C) 2000 Red Hat, Inc.
00021  * All rights reserved.
00022  *
00023  * Contributor(s): Campbell Barton.
00024  *
00025  * ***** END GPL LICENSE BLOCK *****
00026  *
00027  */
00028 
00033 #include <string.h>
00034 #include <wchar.h>
00035 #include <wctype.h>
00036 
00037 #include "BLI_string_utf8.h"
00038 
/* from libswish3, originally called u8_isvalid(),
 * modified to return the index of the bad character
 * (a byte index, not a UTF-8 character index).
 * http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044 - campbell */
00042 
00043 /* based on the valid_utf8 routine from the PCRE library by Philip Hazel
00044 
00045    length is in bytes, since without knowing whether the string is valid
00046    it's hard to know how many characters there are! */
00047 
/* Number of continuation bytes expected after each possible lead byte:
 * 0x00-0xBF -> 0 (ASCII, or a stray continuation byte which the validator
 * rejects separately), 0xC0-0xDF -> 1, 0xE0-0xEF -> 2, 0xF0-0xF7 -> 3,
 * 0xF8-0xFB -> 4, 0xFC-0xFF -> 5. */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/* Scan the first 'length' bytes of 'str' for a malformed UTF-8 sequence.
 *
 * A sequence is rejected when its lead byte is a stray continuation byte,
 * when it is cut short by the end of the buffer, when a continuation byte
 * does not have the form 10xxxxxx, when it is an overlong encoding, or when
 * the lead byte is 0xFE / 0xFF.
 *
 * Returns the byte offset of the FIRST byte of the first bad sequence, or -1
 * when no bad sequence is found (also for a NULL 'str' or 'length' <= 0).
 * No byte at or beyond str + length is ever read. */
int BLI_utf8_invalid_byte(const char *str, int length)
{
    const unsigned char *p, *perr, *pend;
    unsigned char c;
    int ab;

    if (str == NULL || length <= 0)
        return -1;

    p = (const unsigned char *)str;
    perr = p;
    pend = p + length;

    for (; p < pend; p++) {
        c = *p;
        /* Remember where the current sequence starts, 'p' is advanced while
         * the sequence is checked but the error is reported at its start. */
        perr = p;

        if (c < 128)
            continue;
        if ((c & 0xc0) != 0xc0)
            goto utf8_error;

        ab = trailingBytesForUTF8[c];
        /* 'ab' more bytes are needed after 'p', so at least ab + 1 bytes
         * must remain in the buffer. */
        if ((pend - p) <= ab)
            goto utf8_error;

        p++;
        /* Check top bits in the second byte */
        if ((*p & 0xc0) != 0x80)
            goto utf8_error;

        /* Check for overlong sequences for each different length */
        switch (ab) {
            /* Check for xx00 000x */
        case 1:
            if ((c & 0x3e) == 0) goto utf8_error;
            continue;   /* We know there aren't any more bytes to check */

            /* Check for 1110 0000, xx0x xxxx */
        case 2:
            if (c == 0xe0 && (*p & 0x20) == 0) goto utf8_error;
            break;

            /* Check for 1111 0000, xx00 xxxx */
        case 3:
            if (c == 0xf0 && (*p & 0x30) == 0) goto utf8_error;
            break;

            /* Check for 1111 1000, xx00 0xxx */
        case 4:
            if (c == 0xf8 && (*p & 0x38) == 0) goto utf8_error;
            break;

            /* Check for leading 0xfe or 0xff,
               and then for 1111 1100, xx00 00xx */
        case 5:
            if (c == 0xfe || c == 0xff ||
                (c == 0xfc && (*p & 0x3c) == 0)) goto utf8_error;
            break;
        }

        /* Check for valid bytes after the 2nd, if any; all must start 10 */
        while (--ab > 0) {
            p++;
            if ((*p & 0xc0) != 0x80) goto utf8_error;
        }
    }

    return -1;

utf8_error:

    return (int)(perr - (const unsigned char *)str);
}
00124 
/* Remove, in place, every byte that BLI_utf8_invalid_byte() reports as bad.
 *
 * 'length' is the number of bytes to validate. The byte at str[length] is
 * shifted down together with the tail, so the buffer must extend at least one
 * byte past 'length' (presumably the string terminator - confirm with callers).
 *
 * Returns the number of bytes that were removed. */
int BLI_utf8_invalid_strip(char *str, int length)
{
    int bad_char, tot= 0;

    while((bad_char= BLI_utf8_invalid_byte(str, length)) != -1) {
        str += bad_char;
        /* bytes that remain AFTER the bad byte, it is about to be removed
         * so it must not be counted any more */
        length -= (bad_char + 1);

        if(length == 0) {
            /* last character bad, strip it */
            *str= '\0';
            tot++;
            break;
        }
        else {
            /* strip, keep looking; +1 so the byte at the end of the
             * validated range (the terminator) moves down as well */
            memmove(str, str + 1, (size_t)length + 1);
            tot++;
        }
    }

    return tot;
}
00148 
00149 
/* compatible with BLI_strncpy, but ensure no partial utf8 chars */
00151 
/* Array copied from glib's gutf8.c: total size in bytes of the sequence that
 * starts with each possible lead byte.
 * Compared with 'trailingBytesForUTF8' each entry is that table's value
 * plus one, except for 0xFE and 0xFF which are given a size of 1 here. */
static const size_t utf8_skip_data[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Copy whole UTF-8 sequences from 'src' to 'dst' while the sequence plus a
 * terminator still fits in 'maxncpy' bytes, then terminate 'dst'.
 *
 * Copying stops early when the last sequence of 'src' is cut short by the
 * string terminator, so 'src' is never read past its end.
 * When 'maxncpy' is 0 nothing is written at all.
 *
 * Returns a pointer to the terminator written into 'dst'
 * ('dst' itself when 'maxncpy' is 0). */
static char *bli_str_utf8_cpy(char *dst, const char *src, size_t maxncpy)
{
    if (maxncpy == 0) {
        return dst;
    }

    while (*src != '\0') {
        /* index as unsigned: plain 'char' may be signed, which would give
         * a negative index for every byte >= 0x80 */
        const size_t utf8_size = utf8_skip_data[(unsigned char)*src];
        size_t i;

        if (utf8_size >= maxncpy) {
            break; /* no room left for this char and the terminator */
        }

        for (i = 1; i < utf8_size && src[i] != '\0'; i++) {
            /* look for a terminator inside the sequence */
        }
        if (i < utf8_size) {
            break; /* truncated sequence at the end of 'src' */
        }

        memcpy(dst, src, utf8_size);
        dst += utf8_size;
        src += utf8_size;
        maxncpy -= utf8_size;
    }

    *dst = '\0';
    return dst;
}

/* Copy 'src' into 'dst', a buffer of 'maxncpy' bytes, without ever splitting
 * a multi-byte UTF-8 sequence; 'dst' is terminated whenever 'maxncpy' > 0.
 * Returns 'dst'.
 *
 * note: sequences are delimited by their lead byte only, the continuation
 * bytes are not validated. */
char *BLI_strncpy_utf8(char *dst, const char *src, size_t maxncpy)
{
    bli_str_utf8_cpy(dst, src, maxncpy);

    return dst;
}

/* Append 'src' to the string already in 'dst' without splitting a multi-byte
 * UTF-8 sequence. 'maxncpy' is the size of the whole 'dst' buffer: it is
 * reduced by one for every byte of the existing string that is skipped.
 *
 * Returns a pointer to the END of the result (the terminator that was
 * written), not to the start of 'dst'. When the existing string already
 * fills 'maxncpy' bytes nothing is written and the position where scanning
 * stopped is returned. */
char *BLI_strncat_utf8(char *dst, const char *src, size_t maxncpy)
{
    while (*dst && maxncpy > 0) {
        dst++;
        maxncpy--;
    }

    return bli_str_utf8_cpy(dst, src, maxncpy);
}
00206 
00207 /* --------------------------------------------------------------------------*/
00208 /* wchar_t / utf8 functions  */
00209 
/* Convert the wide string 'src' to UTF-8 in 'dst'.
 *
 * 'maxcpy' is the size of 'dst' in bytes including the terminator: a
 * character is only written when its complete encoding and the terminator
 * still fit, so 'dst' is never overrun and never ends in a partial sequence.
 * When 'maxcpy' is 0 nothing is written.
 *
 * Returns the number of bytes written, not counting the terminator. */
size_t BLI_strncpy_wchar_as_utf8(char *dst, const wchar_t *src, const size_t maxcpy)
{
    size_t len = 0;

    if (maxcpy == 0) {
        return 0;
    }

    while (*src) {
        /* a NULL output buffer only measures the encoded size */
        const size_t utf8_size = BLI_str_utf8_from_unicode((unsigned int)*src, NULL);

        if (len + utf8_size >= maxcpy) {
            break; /* keep one byte for the terminator */
        }

        len += BLI_str_utf8_from_unicode((unsigned int)*src++, dst + len);
    }

    dst[len]= '\0';

    return len;
}
00221 
/* Number of bytes the wide string 'src' occupies once encoded as UTF-8,
 * the terminator is not counted. */
size_t BLI_wstrlen_utf8(const wchar_t *src)
{
    const wchar_t *wc;
    size_t total = 0;

    for (wc = src; *wc; wc++) {
        /* with a NULL buffer only the encoded size is returned */
        total += BLI_str_utf8_from_unicode(*wc, NULL);
    }

    return total;
}
00233 
/* Count the characters (not bytes) of the NUL terminated UTF-8 string 'strc'.
 *
 * A lead byte announcing a 2, 3 or 4 byte sequence is counted together with
 * its continuation bytes only when ALL of them have the form 10xxxxxx;
 * otherwise the lead byte counts as one character on its own and scanning
 * resumes at the following byte. Every other byte counts as one character.
 *
 * The continuation bytes are tested one at a time, so a sequence cut short
 * by the terminator never causes a read past the end of the string. */
size_t BLI_strlen_utf8(const char *strc)
{
    size_t len = 0;

    while (*strc) {
        const unsigned char lead = (unsigned char)*strc;
        size_t trail = 0;
        size_t i;

        if ((lead & 0xe0) == 0xc0) {
            trail = 1;
        }
        else if ((lead & 0xf0) == 0xe0) {
            trail = 2;
        }
        else if ((lead & 0xf8) == 0xf0) {
            trail = 3;
        }

        /* stops at the first byte that is not a continuation byte,
         * which includes the terminator */
        for (i = 1; i <= trail; i++) {
            if ((strc[i] & 0xc0) != 0x80) {
                break;
            }
        }

        if (i > trail) {
            /* complete sequence: skip its continuation bytes */
            strc += trail;
        }

        strc++;
        len++;
    }

    return len;
}
00257 
00258 size_t BLI_strncpy_wchar_from_utf8(wchar_t *dst_w, const char *src_c, const size_t maxcpy)
00259 {
00260     int len=0;
00261 
00262     if(dst_w==NULL || src_c==NULL) return(0);
00263 
00264     while(*src_c && len < maxcpy) {
00265         size_t step= 0;
00266         unsigned int unicode= BLI_str_utf8_as_unicode_and_size(src_c, &step);
00267         if (unicode != BLI_UTF8_ERR) {
00268             *dst_w= (wchar_t)unicode;
00269             src_c += step;
00270         }
00271         else {
00272             *dst_w = '?';
00273             src_c= BLI_str_find_next_char_utf8(src_c, NULL);
00274         }
00275         dst_w++;
00276         len++;
00277     }
00278     return len;
00279 }
00280 
00281 /* end wchar_t / utf8 functions  */
00282 /* --------------------------------------------------------------------------*/
00283 
00284 /* copied from glib's gutf8.c */
00285 
/* note: glib uses unsigned int for unicode values, so we do the same,
 * although we don't typedef it - campbell */
00288 
/* Classify the lead byte 'Char': 'Len' receives the total length in bytes of
 * the sequence it starts (1 to 6) and 'Mask' the bits of the lead byte that
 * carry payload. For a byte that can not start a sequence 'Len' is set to -1
 * and 'Mask' is left untouched. */
#define UTF8_COMPUTE(Char, Mask, Len)                                         \
    do {                                                                      \
        if ((Char) < 128) {                                                   \
            (Len) = 1;                                                        \
            (Mask) = 0x7f;                                                    \
        }                                                                     \
        else if (((Char) & 0xe0) == 0xc0) {                                   \
            (Len) = 2;                                                        \
            (Mask) = 0x1f;                                                    \
        }                                                                     \
        else if (((Char) & 0xf0) == 0xe0) {                                   \
            (Len) = 3;                                                        \
            (Mask) = 0x0f;                                                    \
        }                                                                     \
        else if (((Char) & 0xf8) == 0xf0) {                                   \
            (Len) = 4;                                                        \
            (Mask) = 0x07;                                                    \
        }                                                                     \
        else if (((Char) & 0xfc) == 0xf8) {                                   \
            (Len) = 5;                                                        \
            (Mask) = 0x03;                                                    \
        }                                                                     \
        else if (((Char) & 0xfe) == 0xfc) {                                   \
            (Len) = 6;                                                        \
            (Mask) = 0x01;                                                    \
        }                                                                     \
        else {                                                                \
            (Len) = -1;                                                       \
        }                                                                     \
    } while (0)
00317 
/* Same as the glib define, with an added 'Err' argument.
 * Decodes the 'Len' byte sequence at 'Chars' into 'Result', 'Mask' selects
 * the payload bits of the lead byte and 'Count' is used as the byte index.
 * As soon as a byte after the first is not of the form 10xxxxxx, 'Result' is
 * set to 'Err' and decoding stops with 'Count' at the offending byte. */
#define UTF8_GET(Result, Chars, Count, Mask, Len, Err)                        \
    do {                                                                      \
        (Result) = (Chars)[0] & (Mask);                                       \
        (Count) = 1;                                                          \
        while ((Count) < (Len)) {                                             \
            if (((Chars)[(Count)] & 0xc0) != 0x80) {                          \
                (Result) = (Err);                                             \
                break;                                                        \
            }                                                                 \
            (Result) = ((Result) << 6) | ((Chars)[(Count)] & 0x3f);           \
            ++(Count);                                                        \
        }                                                                     \
    } while (0)
00329 
00330 
/* Uses the glib macros but is not a glib function.
 * Returns the size in bytes (1 to 6) of the UTF-8 sequence announced by the
 * lead byte at 'p', or -1 when that byte can not start a sequence.
 * Only the first byte is inspected. */
int BLI_str_utf8_size(const char *p)
{
    const unsigned char lead = (unsigned char)*p;
    int seq_mask = 0;
    int seq_len;

    UTF8_COMPUTE(lead, seq_mask, seq_len);

    /* the mask is a by-product of the macro, silence the unused warning */
    (void)seq_mask;

    return seq_len;
}
00344 
00345 /* was g_utf8_get_char */
00358 unsigned int BLI_str_utf8_as_unicode(const char *p)
00359 {
00360   int i, mask = 0, len;
00361   unsigned int result;
00362   unsigned char c = (unsigned char) *p;
00363 
00364   UTF8_COMPUTE (c, mask, len);
00365   if (len == -1)
00366     return BLI_UTF8_ERR;
00367   UTF8_GET (result, p, i, mask, len, BLI_UTF8_ERR);
00368 
00369   return result;
00370 }
00371 
00372 /* varient that increments the length */
00373 unsigned int BLI_str_utf8_as_unicode_and_size(const char *p, size_t *index)
00374 {
00375     int i, mask = 0, len;
00376     unsigned int result;
00377     unsigned char c = (unsigned char) *p;
00378 
00379     UTF8_COMPUTE (c, mask, len);
00380     if (len == -1)
00381         return BLI_UTF8_ERR;
00382     UTF8_GET (result, p, i, mask, len, BLI_UTF8_ERR);
00383     *index += len;
00384     return result;
00385 }
00386 
00387 /* another varient that steps over the index,
00388  * note, currently this also falls back to latin1 for text drawing. */
00389 unsigned int BLI_str_utf8_as_unicode_step(const char *p, size_t *index)
00390 {
00391     int i, mask = 0, len;
00392     unsigned int result;
00393     unsigned char c;
00394 
00395     p += *index;
00396     c= (unsigned char) *p;
00397 
00398     UTF8_COMPUTE (c, mask, len);
00399     if (len == -1) {
00400         /* when called with NULL end, result will never be NULL,
00401          * checks for a NULL character */
00402         char *p_next= BLI_str_find_next_char_utf8(p, NULL);
00403         /* will never return the same pointer unless '\0',
00404          * eternal loop is prevented */
00405         *index += (size_t)(p_next - p);
00406         return BLI_UTF8_ERR;
00407     }
00408 
00409     /* this is tricky since there are a few ways we can bail out of bad unicode
00410      * values, 3 possible solutions. */
00411 #if 0
00412     UTF8_GET (result, p, i, mask, len, BLI_UTF8_ERR);
00413 #elif 1
00414     /* WARNING: this is NOT part of glib, or supported by similar functions.
00415      * this is added for text drawing because some filepaths can have latin1
00416      * characters */
00417     UTF8_GET (result, p, i, mask, len, BLI_UTF8_ERR);
00418     if(result == BLI_UTF8_ERR) {
00419         len= 1;
00420         result= *p;
00421     }
00422     /* end warning! */
00423 #else
00424     /* without a fallback like '?', text drawing will stop on this value */
00425     UTF8_GET (result, p, i, mask, len, '?');
00426 #endif
00427 
00428     *index += len;
00429     return result;
00430 }
00431 
/* Was g_unichar_to_utf8.
 * Encodes the code point 'c' as UTF-8 using 1 to 6 bytes.
 *
 * When 'outbuf' is not NULL the bytes are stored there (no terminator is
 * added, the buffer must have room for up to 6 bytes); when it is NULL only
 * the size is computed.
 *
 * Returns the number of bytes of the encoding. */
size_t BLI_str_utf8_from_unicode(unsigned int c, char *outbuf)
{
    unsigned int nbytes;
    int lead_bits;

    /* pick the sequence size and the marker bits of its lead byte */
    if (c < 0x80) {
        nbytes = 1;
        lead_bits = 0;
    }
    else if (c < 0x800) {
        nbytes = 2;
        lead_bits = 0xc0;
    }
    else if (c < 0x10000) {
        nbytes = 3;
        lead_bits = 0xe0;
    }
    else if (c < 0x200000) {
        nbytes = 4;
        lead_bits = 0xf0;
    }
    else if (c < 0x4000000) {
        nbytes = 5;
        lead_bits = 0xf8;
    }
    else {
        nbytes = 6;
        lead_bits = 0xfc;
    }

    if (outbuf) {
        unsigned int pos = nbytes;

        /* fill the continuation bytes from the last one backwards,
         * 6 payload bits each */
        while (--pos > 0) {
            outbuf[pos] = (c & 0x3f) | 0x80;
            c >>= 6;
        }
        /* what is left of 'c' goes into the lead byte */
        outbuf[0] = c | lead_bits;
    }

    return nbytes;
}
00486 
/* Was g_utf8_find_prev_char.
 * Searches backwards from 'p' (exclusive) down to 'str' (inclusive) for the
 * closest byte that is not a continuation byte (10xxxxxx), i.e. the start of
 * the previous character.
 *
 * Returns a pointer to that byte, or NULL when there is none (which includes
 * 'p' being at or before 'str'). 'p' is only stepped back while it is beyond
 * 'str', so no pointer before the start of the string is ever formed. */
char * BLI_str_find_prev_char_utf8(const char *str, const char *p)
{
    while (p > str) {
        --p;
        if ((*p & 0xc0) != 0x80) {
            return (char *)p;
        }
    }
    return NULL;
}
00512 
/* Was g_utf8_find_next_char.
 * Steps from 'p' to the start of the next character: one byte forward, then
 * over every continuation byte (10xxxxxx). When 'p' points at the string
 * terminator no step is made.
 *
 * 'end' optionally limits the search, pass NULL for a NUL terminated string.
 * Returns the position that was reached, or NULL when that position is 'end'. */
char *BLI_str_find_next_char_utf8(const char *p, const char *end)
{
    const char *cursor = p;

    if (*cursor != '\0') {
        ++cursor;
        /* without 'end' only the byte pattern stops the scan */
        while ((end == NULL || cursor < end) && (*cursor & 0xc0) == 0x80) {
            ++cursor;
        }
    }

    if (cursor == end) {
        return NULL;
    }
    return (char *)cursor;
}
00544 
/* Was g_utf8_prev_char.
 * Steps backwards from 'p' to the closest byte that is not a continuation
 * byte (10xxxxxx) and returns a pointer to it.
 *
 * There is no lower bound: the caller has to guarantee that such a byte
 * exists before 'p', otherwise the scan runs off the start of the buffer. */
char *BLI_str_prev_char_utf8(const char *p)
{
    do {
        --p;
    } while ((*p & 0xc0) == 0x80);

    return (char *)p;
}
00568 /* end glib copy */