/* Blender V2.61 - r43446 */
00001 /* 00002 * ***** BEGIN GPL LICENSE BLOCK ***** 00003 * 00004 * This program is free software; you can redistribute it and/or 00005 * modify it under the terms of the GNU General Public License 00006 * as published by the Free Software Foundation; either version 2 00007 * of the License, or (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program; if not, write to the Free Software Foundation, 00016 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 00017 * 00018 * The Original Code is Copyright (C) 2011 Blender Foundation. 00019 * Code from gutf8.c Copyright (C) 1999 Tom Tromey 00020 * Copyright (C) 2000 Red Hat, Inc. 00021 * All rights reserved. 00022 * 00023 * Contributor(s): Campbell Barton. 00024 * 00025 * ***** END GPL LICENSE BLOCK ***** 00026 * 00027 */ 00028 00033 #include <string.h> 00034 #include <wchar.h> 00035 #include <wctype.h> 00036 00037 #include "BLI_string_utf8.h" 00038 00039 /* from libswish3, originally called u8_isvalid(), 00040 * modified to return the index of the bad character (byte index not utf). 00041 * http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044 - campbell */ 00042 00043 /* based on the valid_utf8 routine from the PCRE library by Philip Hazel 00044 00045 length is in bytes, since without knowing whether the string is valid 00046 it's hard to know how many characters there are! 
*/ 00047 00048 static const char trailingBytesForUTF8[256] = { 00049 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00050 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00051 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00052 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00053 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00054 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00055 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00056 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 00057 }; 00058 00059 int BLI_utf8_invalid_byte(const char *str, int length) 00060 { 00061 const unsigned char *p, *pend = (unsigned char*)str + length; 00062 unsigned char c; 00063 int ab; 00064 00065 for (p = (unsigned char*)str; p < pend; p++) { 00066 c = *p; 00067 if (c < 128) 00068 continue; 00069 if ((c & 0xc0) != 0xc0) 00070 goto utf8_error; 00071 ab = trailingBytesForUTF8[c]; 00072 if (length < ab) 00073 goto utf8_error; 00074 length -= ab; 00075 00076 p++; 00077 /* Check top bits in the second byte */ 00078 if ((*p & 0xc0) != 0x80) 00079 goto utf8_error; 00080 00081 /* Check for overlong sequences for each different length */ 00082 switch (ab) { 00083 /* Check for xx00 000x */ 00084 case 1: 00085 if ((c & 0x3e) == 0) goto utf8_error; 00086 continue; /* We know there aren't any more bytes to check */ 00087 00088 /* Check for 1110 0000, xx0x xxxx */ 00089 case 2: 00090 if (c == 0xe0 && (*p & 0x20) == 0) goto utf8_error; 00091 break; 00092 00093 /* Check for 1111 0000, xx00 xxxx */ 00094 case 3: 00095 if (c == 0xf0 && (*p & 0x30) == 0) goto utf8_error; 00096 break; 00097 00098 /* Check for 1111 1000, xx00 0xxx */ 00099 case 4: 00100 if (c == 0xf8 && (*p & 0x38) == 0) goto utf8_error; 00101 break; 00102 00103 /* Check for leading 0xfe or 0xff, 00104 and then for 1111 1100, xx00 00xx */ 00105 case 5: 00106 if (c == 0xfe || c == 0xff || 00107 (c 
== 0xfc && (*p & 0x3c) == 0)) goto utf8_error; 00108 break; 00109 } 00110 00111 /* Check for valid bytes after the 2nd, if any; all must start 10 */ 00112 while (--ab > 0) { 00113 if ((*(p+1) & 0xc0) != 0x80) goto utf8_error; 00114 p++; /* do this after so we get usable offset - campbell */ 00115 } 00116 } 00117 00118 return -1; 00119 00120 utf8_error: 00121 00122 return (int)((char *)p - (char *)str) - 1; 00123 } 00124 00125 int BLI_utf8_invalid_strip(char *str, int length) 00126 { 00127 int bad_char, tot= 0; 00128 00129 while((bad_char= BLI_utf8_invalid_byte(str, length)) != -1) { 00130 str += bad_char; 00131 length -= bad_char; 00132 00133 if(length == 0) { 00134 /* last character bad, strip it */ 00135 *str= '\0'; 00136 tot++; 00137 break; 00138 } 00139 else { 00140 /* strip, keep looking */ 00141 memmove(str, str + 1, length); 00142 tot++; 00143 } 00144 } 00145 00146 return tot; 00147 } 00148 00149 00150 /* compatible with BLI_strncpy, but esnure no partial utf8 chars */ 00151 00152 /* array copied from glib's gutf8.c, 00153 * note: this looks to be at odd's with 'trailingBytesForUTF8', 00154 * need to find out what gives here! 
- campbell */ 00155 static const size_t utf8_skip_data[256] = { 00156 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00157 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00158 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00159 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00160 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00161 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00162 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 00163 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 00164 }; 00165 00166 #define BLI_STR_UTF8_CPY(dst, src, maxncpy) \ 00167 { \ 00168 size_t utf8_size; \ 00169 while(*src != '\0' && (utf8_size= utf8_skip_data[*src]) < maxncpy) { \ 00170 maxncpy -= utf8_size; \ 00171 switch(utf8_size) { \ 00172 case 6: *dst ++ = *src ++; \ 00173 case 5: *dst ++ = *src ++; \ 00174 case 4: *dst ++ = *src ++; \ 00175 case 3: *dst ++ = *src ++; \ 00176 case 2: *dst ++ = *src ++; \ 00177 case 1: *dst ++ = *src ++; \ 00178 } \ 00179 } \ 00180 *dst= '\0'; \ 00181 } 00182 00183 char *BLI_strncpy_utf8(char *dst, const char *src, size_t maxncpy) 00184 { 00185 char *dst_r= dst; 00186 00187 /* note: currently we dont attempt to deal with invalid utf8 chars */ 00188 BLI_STR_UTF8_CPY(dst, src, maxncpy) 00189 00190 return dst_r; 00191 } 00192 00193 char *BLI_strncat_utf8(char *dst, const char *src, size_t maxncpy) 00194 { 00195 while (*dst && maxncpy > 0) { 00196 dst++; 00197 maxncpy--; 00198 } 00199 00200 BLI_STR_UTF8_CPY(dst, src, maxncpy) 00201 00202 return dst; 00203 } 00204 00205 #undef BLI_STR_UTF8_CPY 00206 00207 /* --------------------------------------------------------------------------*/ 00208 /* wchar_t / utf8 functions */ 00209 00210 size_t BLI_strncpy_wchar_as_utf8(char *dst, const wchar_t *src, const size_t maxcpy) 00211 { 00212 size_t len = 0; 00213 while(*src && len < maxcpy) { /* XXX can still run over the 
buffer because utf8 size isnt known :| */ 00214 len += BLI_str_utf8_from_unicode(*src++, dst+len); 00215 } 00216 00217 dst[len]= '\0'; 00218 00219 return len; 00220 } 00221 00222 /* wchar len in utf8 */ 00223 size_t BLI_wstrlen_utf8(const wchar_t *src) 00224 { 00225 size_t len = 0; 00226 00227 while(*src) { 00228 len += BLI_str_utf8_from_unicode(*src++, NULL); 00229 } 00230 00231 return len; 00232 } 00233 00234 // utf8slen 00235 size_t BLI_strlen_utf8(const char *strc) 00236 { 00237 int len=0; 00238 00239 while(*strc) { 00240 if ((*strc & 0xe0) == 0xc0) { 00241 if((strc[1] & 0x80) && (strc[1] & 0x40) == 0x00) 00242 strc++; 00243 } else if ((*strc & 0xf0) == 0xe0) { 00244 if((strc[1] & strc[2] & 0x80) && ((strc[1] | strc[2]) & 0x40) == 0x00) 00245 strc += 2; 00246 } else if ((*strc & 0xf8) == 0xf0) { 00247 if((strc[1] & strc[2] & strc[3] & 0x80) && ((strc[1] | strc[2] | strc[3]) & 0x40) == 0x00) 00248 strc += 3; 00249 } 00250 00251 strc++; 00252 len++; 00253 } 00254 00255 return len; 00256 } 00257 00258 size_t BLI_strncpy_wchar_from_utf8(wchar_t *dst_w, const char *src_c, const size_t maxcpy) 00259 { 00260 int len=0; 00261 00262 if(dst_w==NULL || src_c==NULL) return(0); 00263 00264 while(*src_c && len < maxcpy) { 00265 size_t step= 0; 00266 unsigned int unicode= BLI_str_utf8_as_unicode_and_size(src_c, &step); 00267 if (unicode != BLI_UTF8_ERR) { 00268 *dst_w= (wchar_t)unicode; 00269 src_c += step; 00270 } 00271 else { 00272 *dst_w = '?'; 00273 src_c= BLI_str_find_next_char_utf8(src_c, NULL); 00274 } 00275 dst_w++; 00276 len++; 00277 } 00278 return len; 00279 } 00280 00281 /* end wchar_t / utf8 functions */ 00282 /* --------------------------------------------------------------------------*/ 00283 00284 /* copied from glib's gutf8.c */ 00285 00286 /* note, glib uses unsigned int for unicode, best we do the same, 00287 * though we dont typedef it - campbell */ 00288 00289 #define UTF8_COMPUTE(Char, Mask, Len) \ 00290 if (Char < 128) { \ 00291 Len = 1; \ 00292 Mask = 
0x7f; \ 00293 } \ 00294 else if ((Char & 0xe0) == 0xc0) { \ 00295 Len = 2; \ 00296 Mask = 0x1f; \ 00297 } \ 00298 else if ((Char & 0xf0) == 0xe0) { \ 00299 Len = 3; \ 00300 Mask = 0x0f; \ 00301 } \ 00302 else if ((Char & 0xf8) == 0xf0) { \ 00303 Len = 4; \ 00304 Mask = 0x07; \ 00305 } \ 00306 else if ((Char & 0xfc) == 0xf8) { \ 00307 Len = 5; \ 00308 Mask = 0x03; \ 00309 } \ 00310 else if ((Char & 0xfe) == 0xfc) { \ 00311 Len = 6; \ 00312 Mask = 0x01; \ 00313 } \ 00314 else { \ 00315 Len = -1; \ 00316 } 00317 00318 /* same as glib define but added an 'Err' arg */ 00319 #define UTF8_GET(Result, Chars, Count, Mask, Len, Err) \ 00320 (Result) = (Chars)[0] & (Mask); \ 00321 for ((Count) = 1; (Count) < (Len); ++(Count)) { \ 00322 if (((Chars)[(Count)] & 0xc0) != 0x80) { \ 00323 (Result) = Err; \ 00324 break; \ 00325 } \ 00326 (Result) <<= 6; \ 00327 (Result) |= ((Chars)[(Count)] & 0x3f); \ 00328 } 00329 00330 00331 /* uses glib functions but not from glib */ 00332 /* gets the size of a single utf8 char */ 00333 int BLI_str_utf8_size(const char *p) 00334 { 00335 int mask = 0, len; 00336 unsigned char c = (unsigned char) *p; 00337 00338 UTF8_COMPUTE (c, mask, len); 00339 00340 (void)mask; /* quiet warning */ 00341 00342 return len; 00343 } 00344 00345 /* was g_utf8_get_char */ 00358 unsigned int BLI_str_utf8_as_unicode(const char *p) 00359 { 00360 int i, mask = 0, len; 00361 unsigned int result; 00362 unsigned char c = (unsigned char) *p; 00363 00364 UTF8_COMPUTE (c, mask, len); 00365 if (len == -1) 00366 return BLI_UTF8_ERR; 00367 UTF8_GET (result, p, i, mask, len, BLI_UTF8_ERR); 00368 00369 return result; 00370 } 00371 00372 /* varient that increments the length */ 00373 unsigned int BLI_str_utf8_as_unicode_and_size(const char *p, size_t *index) 00374 { 00375 int i, mask = 0, len; 00376 unsigned int result; 00377 unsigned char c = (unsigned char) *p; 00378 00379 UTF8_COMPUTE (c, mask, len); 00380 if (len == -1) 00381 return BLI_UTF8_ERR; 00382 UTF8_GET (result, p, i, 
mask, len, BLI_UTF8_ERR); 00383 *index += len; 00384 return result; 00385 } 00386 00387 /* another varient that steps over the index, 00388 * note, currently this also falls back to latin1 for text drawing. */ 00389 unsigned int BLI_str_utf8_as_unicode_step(const char *p, size_t *index) 00390 { 00391 int i, mask = 0, len; 00392 unsigned int result; 00393 unsigned char c; 00394 00395 p += *index; 00396 c= (unsigned char) *p; 00397 00398 UTF8_COMPUTE (c, mask, len); 00399 if (len == -1) { 00400 /* when called with NULL end, result will never be NULL, 00401 * checks for a NULL character */ 00402 char *p_next= BLI_str_find_next_char_utf8(p, NULL); 00403 /* will never return the same pointer unless '\0', 00404 * eternal loop is prevented */ 00405 *index += (size_t)(p_next - p); 00406 return BLI_UTF8_ERR; 00407 } 00408 00409 /* this is tricky since there are a few ways we can bail out of bad unicode 00410 * values, 3 possible solutions. */ 00411 #if 0 00412 UTF8_GET (result, p, i, mask, len, BLI_UTF8_ERR); 00413 #elif 1 00414 /* WARNING: this is NOT part of glib, or supported by similar functions. 00415 * this is added for text drawing because some filepaths can have latin1 00416 * characters */ 00417 UTF8_GET (result, p, i, mask, len, BLI_UTF8_ERR); 00418 if(result == BLI_UTF8_ERR) { 00419 len= 1; 00420 result= *p; 00421 } 00422 /* end warning! 
*/ 00423 #else 00424 /* without a fallback like '?', text drawing will stop on this value */ 00425 UTF8_GET (result, p, i, mask, len, '?'); 00426 #endif 00427 00428 *index += len; 00429 return result; 00430 } 00431 00432 /* was g_unichar_to_utf8 */ 00444 size_t BLI_str_utf8_from_unicode(unsigned int c, char *outbuf) 00445 { 00446 /* If this gets modified, also update the copy in g_string_insert_unichar() */ 00447 unsigned int len = 0; 00448 int first; 00449 int i; 00450 00451 if (c < 0x80) { 00452 first = 0; 00453 len = 1; 00454 } 00455 else if (c < 0x800) { 00456 first = 0xc0; 00457 len = 2; 00458 } 00459 else if (c < 0x10000) { 00460 first = 0xe0; 00461 len = 3; 00462 } 00463 else if (c < 0x200000) { 00464 first = 0xf0; 00465 len = 4; 00466 } 00467 else if (c < 0x4000000) { 00468 first = 0xf8; 00469 len = 5; 00470 } 00471 else { 00472 first = 0xfc; 00473 len = 6; 00474 } 00475 00476 if (outbuf) { 00477 for (i = len - 1; i > 0; --i) { 00478 outbuf[i] = (c & 0x3f) | 0x80; 00479 c >>= 6; 00480 } 00481 outbuf[0] = c | first; 00482 } 00483 00484 return len; 00485 } 00486 00487 /* was g_utf8_find_prev_char */ 00503 char * BLI_str_find_prev_char_utf8(const char *str, const char *p) 00504 { 00505 for (--p; p >= str; --p) { 00506 if ((*p & 0xc0) != 0x80) { 00507 return (char *)p; 00508 } 00509 } 00510 return NULL; 00511 } 00512 00513 /* was g_utf8_find_next_char */ 00528 char *BLI_str_find_next_char_utf8(const char *p, const char *end) 00529 { 00530 if (*p) { 00531 if (end) { 00532 for (++p; p < end && (*p & 0xc0) == 0x80; ++p) { 00533 /* do nothing */ 00534 } 00535 } 00536 else { 00537 for (++p; (*p & 0xc0) == 0x80; ++p) { 00538 /* do nothing */ 00539 } 00540 } 00541 } 00542 return (p == end) ? NULL : (char *)p; 00543 } 00544 00545 /* was g_utf8_prev_char */ 00559 char *BLI_str_prev_char_utf8(const char *p) 00560 { 00561 while (1) { 00562 p--; 00563 if ((*p & 0xc0) != 0x80) { 00564 return (char *)p; 00565 } 00566 } 00567 } 00568 /* end glib copy */