Main Page | Class Hierarchy | Alphabetical List | Data Structures | Directories | File List | Data Fields | Globals | Related Pages

utf16.h

Go to the documentation of this file.
00001 /*
00002 *******************************************************************************
00003 *
00004 *   Copyright (C) 1999-2001, International Business Machines
00005 *   Corporation and others.  All Rights Reserved.
00006 *
00007 *******************************************************************************
00008 *   file name:  utf16.h
00009 *   encoding:   US-ASCII
00010 *   tab size:   8 (not used)
00011 *   indentation:4
00012 *
00013 *   created on: 1999sep09
00014 *   created by: Markus W. Scherer
00015 */
00016 
00032 #ifndef __UTF16_H__
00033 #define __UTF16_H__
00034 
00035 /* single-code point definitions -------------------------------------------- */
00036 
00037 /* handle surrogate pairs */
/*
 * Surrogate range tests for one code unit (or code point) value.
 *
 * UTF_IS_FIRST_SURROGATE(uchar)  is nonzero for 0xd800..0xdbff (lead).
 * UTF_IS_SECOND_SURROGATE(uchar) is nonzero for 0xdc00..0xdfff (trail).
 *
 * The mask keeps every bit above the low 10, so a 32-bit value with any
 * higher bit set (for example 0x1d800) is not reported as a surrogate.
 * The argument is evaluated exactly once.
 */
#define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800)
#define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00)
00040 
/*
 * Distinguish lead from trail for a value that is ALREADY KNOWN to be a
 * surrogate: only bit 0x400 is inspected, which is clear for 0xd800..0xdbff
 * and set for 0xdc00..0xdfff. The result is meaningless for other values.
 */
#define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0)
00042 
/*
 * Get the UTF-32 value directly from the surrogate pseudo-characters.
 *
 * UTF_SURROGATE_OFFSET is the constant to subtract from (lead<<10)+trail:
 * it folds together the removal of the 0xd800 and 0xdc00 bases and the
 * addition of 0x10000, the first supplementary code point.
 *
 * UTF16_GET_PAIR_VALUE(first, second) combines a first (lead) and a second
 * (trail) surrogate into a code point, e.g. (0xd800, 0xdc00) -> 0x10000.
 * No validation is performed; each argument is evaluated once.
 * The shift count carries no UL suffix: the type of a shift result is the
 * promoted type of its left operand, so a suffix on the count has no effect.
 */
#define UTF_SURROGATE_OFFSET ((0xd800<<10)+0xdc00-0x10000)

#define UTF16_GET_PAIR_VALUE(first, second) \
    (((first)<<10)+(second)-UTF_SURROGATE_OFFSET)
00048 
/*
 * Get the first (lead) and second (trail) surrogates for a supplementary
 * code point (0x10000..0x10ffff). The argument is not range-checked.
 *
 * 0xd7c0 is 0xd800-(0x10000>>10), so the shift and the base adjustment are
 * done in one addition. The result is cast to UChar, which is declared
 * outside this file.
 *
 * The whole expansion is parenthesized so that the cast cannot bind to
 * neighbouring tokens (e.g. "sizeof UTF_FIRST_SURROGATE(c)" parses).
 */
#define UTF_FIRST_SURROGATE(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))

#define UTF_SECOND_SURROGATE(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))
00063 
/* Alias: lead surrogate of a supplementary code point; forwards to UTF_FIRST_SURROGATE(). */
#define UTF16_LEAD(supplementary) UTF_FIRST_SURROGATE(supplementary)

/* Alias: trail surrogate of a supplementary code point; forwards to UTF_SECOND_SURROGATE(). */
#define UTF16_TRAIL(supplementary) UTF_SECOND_SURROGATE(supplementary)
00069 
/*
 * Classes of code unit values.
 *
 * UTF16_IS_SINGLE(uchar): the unit is not a surrogate, as decided by
 *                         UTF_IS_SURROGATE() (defined outside this file).
 * UTF16_IS_LEAD(uchar):   the unit is a first (lead) surrogate.
 * UTF16_IS_TRAIL(uchar):  the unit is a second (trail) surrogate.
 *
 * The negation in UTF16_IS_SINGLE is wrapped in parentheses so that the
 * expansion is a single primary expression.
 */
#define UTF16_IS_SINGLE(uchar) (!UTF_IS_SURROGATE(uchar))
#define UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar)
#define UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar)
00074 
/*
 * Number of code units per code point.
 *
 * UTF16_NEED_MULTIPLE_UCHAR(c): nonzero if c is above 0xffff.
 * UTF16_CHAR_LENGTH(c):         1 for c<=0xffff, otherwise 2.
 * UTF16_MAX_CHAR_LENGTH:        largest value UTF16_CHAR_LENGTH() can yield.
 *
 * c is converted to uint32_t before the comparison, so a negative argument
 * compares as a large value (length 2). No upper bound is checked.
 */
#define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff)
#define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
#define UTF16_MAX_CHAR_LENGTH 2
00079 
/*
 * Average number of code units compared to UTF-16.
 * For UTF-16 itself the factor is 1: the macro expands to its argument
 * (parenthesized) and nothing else.
 */
#define UTF16_ARRAY_SIZE(size) (size)
00082 
/*
 * Get a single code point from an offset that points to any
 * of the code units that belong to that code point.
 * Assume 0<=i<length.
 *
 * s: array of 16-bit code units; i: offset (not modified);
 * c: lvalue that receives the code point.
 *
 * No checking: if s[i] is a lead surrogate, s[i+1] is read and combined;
 * if it is a trail surrogate, s[i-1] is read and combined. The text must
 * therefore be well-formed and the neighbour must be inside the buffer.
 *
 * Expands to a brace-enclosed block, not an expression; the arguments are
 * evaluated more than once.
 *
 * This could be used for iteration together with
 * UTF16_CHAR_LENGTH() and UTF_IS_ERROR(),
 * but the use of UTF16_NEXT_CHAR_[UN]SAFE() and
 * UTF16_PREV_CHAR_[UN]SAFE() is more efficient for that.
 */
#define UTF16_GET_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[i]; \
    if(UTF_IS_SURROGATE(c)) { \
        if(UTF_IS_SURROGATE_FIRST(c)) { \
            (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); \
        } else { \
            (c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); \
        } \
    } \
}
00103 
/*
 * Bounds-checked variant of UTF16_GET_CHAR_UNSAFE(): get the code point that
 * the code unit at offset i belongs to, without modifying i.
 *
 * s:      array of 16-bit code units
 * start:  lowest valid offset; s[start-1] is never read
 * i:      offset of any code unit of the code point, start<=i<length
 * length: offset behind the last valid code unit; s[length] is never read
 * c:      lvalue that receives the result
 * strict: if nonzero, an unpaired surrogate or a value rejected by
 *         UTF_IS_UNICODE_CHAR() yields UTF_ERROR_VALUE; if zero, an
 *         unpaired surrogate is returned as is
 *
 * The backward test is written (i)>(start), like the other *_SAFE macros,
 * instead of (i)-1>=(start): with an unsigned offset i==0 the subtraction
 * would wrap around, pass the test and read in front of the buffer.
 *
 * Expands to a brace-enclosed block; arguments are evaluated more than once.
 */
#define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
    (c)=(s)[i]; \
    if(UTF_IS_SURROGATE(c)) { \
        uint16_t utf16_c2_; \
        if(UTF_IS_SURROGATE_FIRST(c)) { \
            if((i)+1<(length) && UTF_IS_SECOND_SURROGATE(utf16_c2_=(s)[(i)+1])) { \
                (c)=UTF16_GET_PAIR_VALUE((c), utf16_c2_); \
                /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
            } else if(strict) { \
                /* unmatched first surrogate */ \
                (c)=UTF_ERROR_VALUE; \
            } \
        } else { \
            if((i)>(start) && UTF_IS_FIRST_SURROGATE(utf16_c2_=(s)[(i)-1])) { \
                (c)=UTF16_GET_PAIR_VALUE(utf16_c2_, (c)); \
                /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
            } else if(strict) { \
                /* unmatched second surrogate */ \
                (c)=UTF_ERROR_VALUE; \
            } \
        } \
    } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
        (c)=UTF_ERROR_VALUE; \
    } \
}
00129 
00130 /* definitions with forward iteration --------------------------------------- */
00131 
00132 /*
00133  * all the macros that go forward assume that
00134  * the initial offset is 0<=i<length;
00135  * they update the offset
00136  */
00137 
00138 /* fast versions, no error-checking */
00139 
/*
 * Get a single code point from an offset that points to the first
 * of the code units that belong to that code point.
 * Assume 0<=i<length.
 *
 * s: array of 16-bit code units; i: offset lvalue, advanced behind the
 * code point (by 1 or 2); c: lvalue that receives the code point.
 *
 * No checking: after a lead surrogate the following unit is always consumed
 * and combined, whatever it is. Expands to a brace-enclosed block.
 */
#define UTF16_NEXT_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[(i)++]; \
    if(UTF_IS_FIRST_SURROGATE(c)) { \
        (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); \
    } \
}
00151 
/*
 * Append code point c at s[i] and advance i behind it.
 *
 * s: writable array of 16-bit code units; i: offset lvalue; c: code point.
 * c<=0xffff is written as one unit, anything larger as a lead/trail pair
 * (0xd7c0 is 0xd800-(0x10000>>10)).
 *
 * No checking: neither the remaining capacity of s nor c<=0x10ffff is
 * verified. Expands to a brace-enclosed block; c is evaluated several times.
 */
#define UTF16_APPEND_CHAR_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } \
}
00160 
/*
 * Advance offset i by one code point without reading its value.
 * i moves by 2 if s[i] is a lead surrogate, otherwise by 1.
 * No checking: the unit after a lead surrogate is skipped unseen and no
 * length limit is applied. Expands to a brace-enclosed block.
 */
#define UTF16_FWD_1_UNSAFE(s, i) { \
    if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \
        ++(i); \
    } \
}
00166 
/*
 * Advance offset i by n code points by repeating UTF16_FWD_1_UNSAFE().
 * n is evaluated once and copied into an int32_t counter; n<=0 leaves i
 * unchanged. No length limit is applied.
 * The counter name avoids the double-underscore prefix, which is reserved
 * for the C implementation.
 */
#define UTF16_FWD_N_UNSAFE(s, i, n) { \
    int32_t utf16_n_=(n); \
    while(utf16_n_>0) { \
        UTF16_FWD_1_UNSAFE(s, i); \
        --utf16_n_; \
    } \
}
00174 
/*
 * Set a random-access offset and adjust it so that
 * it points to the beginning of a Unicode character.
 * The offset that is passed in points to
 * any code unit of a code point
 * and will point to the first code unit after
 * the macro invocation.
 * Never increments the offset.
 *
 * No checking: if s[i] is a trail surrogate, i is decremented without
 * looking at s[i-1] and without a lower bound.
 */
#define UTF16_SET_CHAR_START_UNSAFE(s, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[i])) { \
        --(i); \
    } \
}
00189 
00190 /* safe versions with error-checking and optional regularity-checking */
00191 
/*
 * Bounds-checked variant of UTF16_NEXT_CHAR_UNSAFE(): read the code point
 * that starts at s[i] and advance i behind it.
 *
 * s:      array of 16-bit code units
 * i:      offset lvalue, i<length on entry
 * length: offset behind the last valid code unit; s[length] is never read
 * c:      lvalue that receives the result
 * strict: if nonzero, an unpaired surrogate or a value rejected by
 *         UTF_IS_UNICODE_CHAR() yields UTF_ERROR_VALUE; if zero, the single
 *         unit is returned as is
 *
 * A lead surrogate consumes the next unit only if that unit is inside the
 * buffer and is a trail surrogate; otherwise i advances by one.
 * The temporary's name avoids the reserved double-underscore prefix.
 */
#define UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
    (c)=(s)[(i)++]; \
    if(UTF_IS_FIRST_SURROGATE(c)) { \
        uint16_t utf16_c2_; \
        if((i)<(length) && UTF_IS_SECOND_SURROGATE(utf16_c2_=(s)[(i)])) { \
            ++(i); \
            (c)=UTF16_GET_PAIR_VALUE((c), utf16_c2_); \
            /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
        } else if(strict) { \
            /* unmatched first surrogate */ \
            (c)=UTF_ERROR_VALUE; \
        } \
    } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
        /* unmatched second surrogate or other non-character */ \
        (c)=UTF_ERROR_VALUE; \
    } \
}
00209 
/*
 * Bounds-checked variant of UTF16_APPEND_CHAR_UNSAFE(): append code point c
 * at s[i] and advance i.
 *
 * s: writable array of 16-bit code units; i: offset lvalue, i<length on
 * entry; length: capacity of s in code units; c: code point.
 *
 * c<=0xffff is written as one unit. A supplementary c (<=0x10ffff) is
 * written as a surrogate pair if two units fit; if only one fits, a single
 * UTF_ERROR_VALUE is written instead. c>0x10ffff also writes one
 * UTF_ERROR_VALUE. In every case at least one unit is stored at s[i], so the
 * caller must guarantee i<length.
 */
#define UTF16_APPEND_CHAR_SAFE(s, i, length, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff) { \
        if((i)+1<(length)) { \
            (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
            (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
        } else /* not enough space */ { \
            (s)[(i)++]=UTF_ERROR_VALUE; \
        } \
    } else /* c>0x10ffff, write error value */ { \
        (s)[(i)++]=UTF_ERROR_VALUE; \
    } \
}
00224 
/*
 * Bounds-checked variant of UTF16_FWD_1_UNSAFE(): advance i by one code
 * point. i always moves by one unit; it moves by a second unit only if the
 * first was a lead surrogate, the new i is still below length, and s[i] is a
 * trail surrogate. s[length] is never read.
 */
#define UTF16_FWD_1_SAFE(s, i, length) { \
    if(UTF_IS_FIRST_SURROGATE((s)[(i)++]) && (i)<(length) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
        ++(i); \
    } \
}
00230 
/*
 * Advance offset i by up to n code points by repeating UTF16_FWD_1_SAFE().
 * Stops early once i reaches length. n is evaluated once and copied into an
 * int32_t counter; n<=0 leaves i unchanged.
 * The counter name avoids the reserved double-underscore prefix.
 */
#define UTF16_FWD_N_SAFE(s, i, length, n) { \
    int32_t utf16_n_=(n); \
    while(utf16_n_>0 && (i)<(length)) { \
        UTF16_FWD_1_SAFE(s, i, length); \
        --utf16_n_; \
    } \
}
00238 
/*
 * Bounds-checked variant of UTF16_SET_CHAR_START_UNSAFE(): move i back to
 * the first code unit of the code point it points into.
 * i is decremented (by one) only if s[i] is a trail surrogate, i is above
 * start, and s[i-1] is a lead surrogate; s[start-1] is never read.
 * s[i] itself is read unconditionally, so i must be a valid offset.
 */
#define UTF16_SET_CHAR_START_SAFE(s, start, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        --(i); \
    } \
}
00244 
00245 /* definitions with backward iteration -------------------------------------- */
00246 
00247 /*
00248  * all the macros that go backward assume that
00249  * the valid buffer range starts at offset 0
00250  * and that the initial offset is 0<i<=length;
00251  * they update the offset
00252  */
00253 
00254 /* fast versions, no error-checking */
00255 
/*
 * Get a single code point from an offset that points behind the last
 * of the code units that belong to that code point.
 * Assume 0<i<=length (i is decremented before s[i] is read).
 *
 * s: array of 16-bit code units; i: offset lvalue, moved back to the start
 * of the code point (by 1 or 2); c: lvalue that receives the code point.
 *
 * No checking: before a trail surrogate the preceding unit is always
 * consumed and combined, whatever it is. Expands to a brace-enclosed block.
 */
#define UTF16_PREV_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[--(i)]; \
    if(UTF_IS_SECOND_SURROGATE(c)) { \
        (c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); \
    } \
}
00267 
/*
 * Move offset i back by one code point without reading its value.
 * i moves by 2 if the unit in front of it is a trail surrogate, otherwise
 * by 1. No checking: the unit before a trail surrogate is skipped unseen and
 * no lower bound is applied. Requires i>0 on entry.
 */
#define UTF16_BACK_1_UNSAFE(s, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \
        --(i); \
    } \
}
00273 
/*
 * Move offset i back by n code points by repeating UTF16_BACK_1_UNSAFE().
 * n is evaluated once and copied into an int32_t counter; n<=0 leaves i
 * unchanged. No lower bound is applied.
 * The counter name avoids the reserved double-underscore prefix.
 */
#define UTF16_BACK_N_UNSAFE(s, i, n) { \
    int32_t utf16_n_=(n); \
    while(utf16_n_>0) { \
        UTF16_BACK_1_UNSAFE(s, i); \
        --utf16_n_; \
    } \
}
00281 
/*
 * Set a random-access offset and adjust it so that
 * it points after the end of a Unicode character.
 * The offset that is passed in points behind
 * any code unit of a code point
 * and will point behind the last code unit after
 * the macro invocation.
 * Never decrements the offset.
 *
 * No checking: if s[i-1] is a lead surrogate, i is incremented without
 * looking at s[i] and without an upper bound. Requires i>0 on entry.
 */
#define UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) { \
    if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        ++(i); \
    } \
}
00296 
00297 /* safe versions with error-checking and optional regularity-checking */
00298 
/*
 * Bounds-checked variant of UTF16_PREV_CHAR_UNSAFE(): read the code point
 * that ends just before offset i and move i back to its first code unit.
 *
 * s:      array of 16-bit code units
 * start:  lowest valid offset; s[start-1] is never read
 * i:      offset lvalue, i>start on entry
 * c:      lvalue that receives the result
 * strict: if nonzero, an unpaired surrogate or a value rejected by
 *         UTF_IS_UNICODE_CHAR() yields UTF_ERROR_VALUE; if zero, the single
 *         unit is returned as is
 *
 * A trail surrogate consumes the preceding unit only if that unit is at or
 * above start and is a lead surrogate; otherwise i moves back by one.
 * The temporary's name avoids the reserved double-underscore prefix.
 */
#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \
    (c)=(s)[--(i)]; \
    if(UTF_IS_SECOND_SURROGATE(c)) { \
        uint16_t utf16_c2_; \
        if((i)>(start) && UTF_IS_FIRST_SURROGATE(utf16_c2_=(s)[(i)-1])) { \
            --(i); \
            (c)=UTF16_GET_PAIR_VALUE(utf16_c2_, (c)); \
            /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
        } else if(strict) { \
            /* unmatched second surrogate */ \
            (c)=UTF_ERROR_VALUE; \
        } \
    } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
        /* unmatched first surrogate or other non-character */ \
        (c)=UTF_ERROR_VALUE; \
    } \
}
00316 
/*
 * Bounds-checked variant of UTF16_BACK_1_UNSAFE(): move i back by one code
 * point. i always moves by one unit; it moves by a second unit only if the
 * first was a trail surrogate, the new i is still above start, and s[i-1]
 * is a lead surrogate. s[start-1] is never read. Requires i>start on entry.
 */
#define UTF16_BACK_1_SAFE(s, start, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        --(i); \
    } \
}
00322 
/*
 * Move offset i back by up to n code points by repeating
 * UTF16_BACK_1_SAFE(). Stops early once i reaches start. n is evaluated once
 * and copied into an int32_t counter; n<=0 leaves i unchanged.
 * The counter name avoids the reserved double-underscore prefix.
 */
#define UTF16_BACK_N_SAFE(s, start, i, n) { \
    int32_t utf16_n_=(n); \
    while(utf16_n_>0 && (i)>(start)) { \
        UTF16_BACK_1_SAFE(s, start, i); \
        --utf16_n_; \
    } \
}
00330 
/*
 * Bounds-checked variant of UTF16_SET_CHAR_LIMIT_UNSAFE(): move i forward so
 * that it does not split a surrogate pair.
 * i is incremented (by one) only if start<i<length, s[i-1] is a lead
 * surrogate and s[i] is a trail surrogate. Because the range tests come
 * first, neither s[start-1] nor s[length] is read.
 */
#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
    if((start)<(i) && (i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
        ++(i); \
    } \
}
00336 
00337 #endif

Generated on Mon May 23 13:34:29 2005 for ICU 2.1 by  doxygen 1.4.2