LLVM  9.0.0svn
ConvertUTF.cpp
Go to the documentation of this file.
1 /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===------------------------------------------------------------------------=*/
8 /*
9  * Copyright 2001-2004 Unicode, Inc.
10  *
11  * Disclaimer
12  *
13  * This source code is provided as is by Unicode, Inc. No claims are
14  * made as to fitness for any particular purpose. No warranties of any
15  * kind are expressed or implied. The recipient agrees to determine
16  * applicability of information provided. If this file has been
17  * purchased on magnetic or optical media from Unicode, Inc., the
18  * sole remedy for any claim will be exchange of defective media
19  * within 90 days of receipt.
20  *
21  * Limitations on Rights to Redistribute This Code
22  *
23  * Unicode, Inc. hereby grants the right to freely use the information
24  * supplied in this file in the creation of products supporting the
25  * Unicode Standard, and to make copies of this file in any form
26  * for internal or external distribution as long as this notice
27  * remains attached.
28  */
29 
30 /* ---------------------------------------------------------------------
31 
32  Conversions between UTF32, UTF-16, and UTF-8. Source code file.
33  Author: Mark E. Davis, 1994.
34  Rev History: Rick McGowan, fixes & updates May 2001.
35  Sept 2001: fixed const & error conditions per
36  mods suggested by S. Parent & A. Lillich.
37  June 2002: Tim Dodd added detection and handling of incomplete
38  source sequences, enhanced error detection, added casts
39  to eliminate compiler warnings.
40  July 2003: slight mods to back out aggressive FFFE detection.
41  Jan 2004: updated switches in from-UTF8 conversions.
42  Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
43 
44  See the header file "ConvertUTF.h" for complete documentation.
45 
46 ------------------------------------------------------------------------ */
47 
49 #ifdef CVTUTF_DEBUG
50 #include <stdio.h>
51 #endif
52 #include <assert.h>
53 
54 /*
55  * This code extensively uses fall-through switches.
56  * Keep the compiler from warning about that.
57  */
58 #if defined(__clang__) && defined(__has_warning)
59 # if __has_warning("-Wimplicit-fallthrough")
60 # define ConvertUTF_DISABLE_WARNINGS \
61  _Pragma("clang diagnostic push") \
62  _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
63 # define ConvertUTF_RESTORE_WARNINGS \
64  _Pragma("clang diagnostic pop")
65 # endif
66 #elif defined(__GNUC__) && __GNUC__ > 6
67 # define ConvertUTF_DISABLE_WARNINGS \
68  _Pragma("GCC diagnostic push") \
69  _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
70 # define ConvertUTF_RESTORE_WARNINGS \
71  _Pragma("GCC diagnostic pop")
72 #endif
73 #ifndef ConvertUTF_DISABLE_WARNINGS
74 # define ConvertUTF_DISABLE_WARNINGS
75 #endif
76 #ifndef ConvertUTF_RESTORE_WARNINGS
77 # define ConvertUTF_RESTORE_WARNINGS
78 #endif
79 
81 
82 namespace llvm {
83 
/*
 * Constants used when a code point above the BMP is split into, or rebuilt
 * from, a UTF-16 surrogate pair: halfBase is subtracted from the code point,
 * the result is shifted right by halfShift for the high half and masked with
 * halfMask for the low half (see the ConvertUTF*toUTF16 routines below).
 */
84 static const int halfShift = 10; /* used for shifting by 10 bits */
85 
86 static const UTF32 halfBase = 0x0010000UL;
87 static const UTF32 halfMask = 0x3FFUL;
88 
/* Inclusive bounds of the high (leading) and low (trailing) surrogate ranges. */
89 #define UNI_SUR_HIGH_START (UTF32)0xD800
90 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
91 #define UNI_SUR_LOW_START (UTF32)0xDC00
92 #define UNI_SUR_LOW_END (UTF32)0xDFFF
93 
94 /* --------------------------------------------------------------------- */
95 
96 /*
97  * Index into the table below with the first byte of a UTF-8 sequence to
98  * get the number of trailing bytes that are supposed to follow it.
99  * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
100  * left as-is for anyone who may want to do such conversion, which was
101  * allowed in earlier algorithms.
102  */
103 static const char trailingBytesForUTF8[256] = {
104  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
105  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
106  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
107  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
108  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
109  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
110  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
111  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
112 };
113 
114 /*
115  * Magic values subtracted from a buffer value during UTF8 conversion.
116  * This table contains as many values as there might be trailing bytes
117  * in a UTF-8 sequence.
118  */
119 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
120  0x03C82080UL, 0xFA082080UL, 0x82082080UL };
121 
122 /*
123  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
124  * into the first byte, depending on how many bytes follow. There are
125  * as many entries in this table as there are UTF-8 sequence types.
126  * (I.e., one byte sequence, two byte... etc.). Remember that sequences
127  * for *legal* UTF-8 will be 4 or fewer bytes total.
128  */
129 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
130 
131 /* --------------------------------------------------------------------- */
132 
133 /* The interface converts a whole buffer to avoid function-call overhead.
134  * Constants have been gathered. Loops & conditionals have been removed as
135  * much as possible for efficiency, in favor of drop-through switches.
136  * (See "Note A" at the bottom of the file for equivalent code.)
137  * If your compiler supports it, the "isLegalUTF8" call can be turned
138  * into an inline function.
139  */
140 
141 
142 /* --------------------------------------------------------------------- */
143 
145  const UTF32** sourceStart, const UTF32* sourceEnd,
146  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
148  const UTF32* source = *sourceStart;
149  UTF16* target = *targetStart;
150  while (source < sourceEnd) {
151  UTF32 ch;
152  if (target >= targetEnd) {
153  result = targetExhausted; break;
154  }
155  ch = *source++;
156  if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
157  /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
158  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
159  if (flags == strictConversion) {
160  --source; /* return to the illegal value itself */
161  result = sourceIllegal;
162  break;
163  } else {
164  *target++ = UNI_REPLACEMENT_CHAR;
165  }
166  } else {
167  *target++ = (UTF16)ch; /* normal case */
168  }
169  } else if (ch > UNI_MAX_LEGAL_UTF32) {
170  if (flags == strictConversion) {
171  result = sourceIllegal;
172  } else {
173  *target++ = UNI_REPLACEMENT_CHAR;
174  }
175  } else {
176  /* target is a character in range 0xFFFF - 0x10FFFF. */
177  if (target + 1 >= targetEnd) {
178  --source; /* Back up source pointer! */
179  result = targetExhausted; break;
180  }
181  ch -= halfBase;
182  *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
183  *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
184  }
185  }
186  *sourceStart = source;
187  *targetStart = target;
188  return result;
189 }
190 
191 /* --------------------------------------------------------------------- */
192 
194  const UTF16** sourceStart, const UTF16* sourceEnd,
195  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
197  const UTF16* source = *sourceStart;
198  UTF32* target = *targetStart;
199  UTF32 ch, ch2;
200  while (source < sourceEnd) {
201  const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
202  ch = *source++;
203  /* If we have a surrogate pair, convert to UTF32 first. */
204  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
205  /* If the 16 bits following the high surrogate are in the source buffer... */
206  if (source < sourceEnd) {
207  ch2 = *source;
208  /* If it's a low surrogate, convert to UTF32. */
209  if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
210  ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
211  + (ch2 - UNI_SUR_LOW_START) + halfBase;
212  ++source;
213  } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
214  --source; /* return to the illegal value itself */
215  result = sourceIllegal;
216  break;
217  }
218  } else { /* We don't have the 16 bits following the high surrogate. */
219  --source; /* return to the high surrogate */
220  result = sourceExhausted;
221  break;
222  }
223  } else if (flags == strictConversion) {
224  /* UTF-16 surrogate values are illegal in UTF-32 */
225  if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
226  --source; /* return to the illegal value itself */
227  result = sourceIllegal;
228  break;
229  }
230  }
231  if (target >= targetEnd) {
232  source = oldSource; /* Back up source pointer! */
233  result = targetExhausted; break;
234  }
235  *target++ = ch;
236  }
237  *sourceStart = source;
238  *targetStart = target;
239 #ifdef CVTUTF_DEBUG
240 if (result == sourceIllegal) {
241  fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
242  fflush(stderr);
243 }
244 #endif
245  return result;
246 }
248  const UTF16** sourceStart, const UTF16* sourceEnd,
249  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
251  const UTF16* source = *sourceStart;
252  UTF8* target = *targetStart;
253  while (source < sourceEnd) {
254  UTF32 ch;
255  unsigned short bytesToWrite = 0;
256  const UTF32 byteMask = 0xBF;
257  const UTF32 byteMark = 0x80;
258  const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
259  ch = *source++;
260  /* If we have a surrogate pair, convert to UTF32 first. */
261  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
262  /* If the 16 bits following the high surrogate are in the source buffer... */
263  if (source < sourceEnd) {
264  UTF32 ch2 = *source;
265  /* If it's a low surrogate, convert to UTF32. */
266  if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
267  ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
268  + (ch2 - UNI_SUR_LOW_START) + halfBase;
269  ++source;
270  } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
271  --source; /* return to the illegal value itself */
272  result = sourceIllegal;
273  break;
274  }
275  } else { /* We don't have the 16 bits following the high surrogate. */
276  --source; /* return to the high surrogate */
277  result = sourceExhausted;
278  break;
279  }
280  } else if (flags == strictConversion) {
281  /* UTF-16 surrogate values are illegal in UTF-32 */
282  if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
283  --source; /* return to the illegal value itself */
284  result = sourceIllegal;
285  break;
286  }
287  }
288  /* Figure out how many bytes the result will require */
289  if (ch < (UTF32)0x80) { bytesToWrite = 1;
290  } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
291  } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
292  } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
293  } else { bytesToWrite = 3;
295  }
296 
297  target += bytesToWrite;
298  if (target > targetEnd) {
299  source = oldSource; /* Back up source pointer! */
300  target -= bytesToWrite; result = targetExhausted; break;
301  }
302  switch (bytesToWrite) { /* note: everything falls through. */
303  case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
304  case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
305  case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
306  case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
307  }
308  target += bytesToWrite;
309  }
310  *sourceStart = source;
311  *targetStart = target;
312  return result;
313 }
314 
315 /* --------------------------------------------------------------------- */
316 
318  const UTF32** sourceStart, const UTF32* sourceEnd,
319  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
321  const UTF32* source = *sourceStart;
322  UTF8* target = *targetStart;
323  while (source < sourceEnd) {
324  UTF32 ch;
325  unsigned short bytesToWrite = 0;
326  const UTF32 byteMask = 0xBF;
327  const UTF32 byteMark = 0x80;
328  ch = *source++;
329  if (flags == strictConversion ) {
330  /* UTF-16 surrogate values are illegal in UTF-32 */
331  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
332  --source; /* return to the illegal value itself */
333  result = sourceIllegal;
334  break;
335  }
336  }
337  /*
338  * Figure out how many bytes the result will require. Turn any
339  * illegally large UTF32 things (> Plane 17) into replacement chars.
340  */
341  if (ch < (UTF32)0x80) { bytesToWrite = 1;
342  } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
343  } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
344  } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
345  } else { bytesToWrite = 3;
347  result = sourceIllegal;
348  }
349 
350  target += bytesToWrite;
351  if (target > targetEnd) {
352  --source; /* Back up source pointer! */
353  target -= bytesToWrite; result = targetExhausted; break;
354  }
355  switch (bytesToWrite) { /* note: everything falls through. */
356  case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
357  case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
358  case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
359  case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
360  }
361  target += bytesToWrite;
362  }
363  *sourceStart = source;
364  *targetStart = target;
365  return result;
366 }
367 
368 /* --------------------------------------------------------------------- */
369 
370 /*
371  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
372  * This must be called with the length pre-determined by the first byte.
373  * If not calling this from ConvertUTF8to*, then the length can be set by:
374  * length = trailingBytesForUTF8[*source]+1;
375  * and the sequence is illegal right away if there aren't that many bytes
376  * available.
377  * If presented with a length > 4, this returns false. The Unicode
378  * definition of UTF-8 goes up to 4-byte sequences.
379  */
380 
381 static Boolean isLegalUTF8(const UTF8 *source, int length) {
382  UTF8 a;
383  const UTF8 *srcptr = source+length;
384  switch (length) {
385  default: return false;
386  /* Everything else falls through when "true"... */
387  case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
388  case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
389  case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
390 
391  switch (*source) {
392  /* no fall-through in this inner switch */
393  case 0xE0: if (a < 0xA0) return false; break;
394  case 0xED: if (a > 0x9F) return false; break;
395  case 0xF0: if (a < 0x90) return false; break;
396  case 0xF4: if (a > 0x8F) return false; break;
397  default: if (a < 0x80) return false;
398  }
399 
400  case 1: if (*source >= 0x80 && *source < 0xC2) return false;
401  }
402  if (*source > 0xF4) return false;
403  return true;
404 }
405 
406 /* --------------------------------------------------------------------- */
407 
408 /*
409  * Exported function to return whether a UTF-8 sequence is legal or not.
410  * Apart from an assertion, this is not used here; it's just exported.
411  */
412 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
413  int length = trailingBytesForUTF8[*source]+1;
414  if (length > sourceEnd - source) {
415  return false;
416  }
417  return isLegalUTF8(source, length);
418 }
419 
420 /* --------------------------------------------------------------------- */
421 
422 static unsigned
424  const UTF8 *sourceEnd) {
425  UTF8 b1, b2, b3;
426 
427  assert(!isLegalUTF8Sequence(source, sourceEnd));
428 
429  /*
430  * Unicode 6.3.0, D93b:
431  *
432  * Maximal subpart of an ill-formed subsequence: The longest code unit
433  * subsequence starting at an unconvertible offset that is either:
434  * a. the initial subsequence of a well-formed code unit sequence, or
435  * b. a subsequence of length one.
436  */
437 
438  if (source == sourceEnd)
439  return 0;
440 
441  /*
442  * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
443  * Byte Sequences.
444  */
445 
446  b1 = *source;
447  ++source;
448  if (b1 >= 0xC2 && b1 <= 0xDF) {
449  /*
450  * First byte is valid, but we know that this code unit sequence is
451  * invalid, so the maximal subpart has to end after the first byte.
452  */
453  return 1;
454  }
455 
456  if (source == sourceEnd)
457  return 1;
458 
459  b2 = *source;
460  ++source;
461 
462  if (b1 == 0xE0) {
463  return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
464  }
465  if (b1 >= 0xE1 && b1 <= 0xEC) {
466  return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
467  }
468  if (b1 == 0xED) {
469  return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
470  }
471  if (b1 >= 0xEE && b1 <= 0xEF) {
472  return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
473  }
474  if (b1 == 0xF0) {
475  if (b2 >= 0x90 && b2 <= 0xBF) {
476  if (source == sourceEnd)
477  return 2;
478 
479  b3 = *source;
480  return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
481  }
482  return 1;
483  }
484  if (b1 >= 0xF1 && b1 <= 0xF3) {
485  if (b2 >= 0x80 && b2 <= 0xBF) {
486  if (source == sourceEnd)
487  return 2;
488 
489  b3 = *source;
490  return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
491  }
492  return 1;
493  }
494  if (b1 == 0xF4) {
495  if (b2 >= 0x80 && b2 <= 0x8F) {
496  if (source == sourceEnd)
497  return 2;
498 
499  b3 = *source;
500  return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
501  }
502  return 1;
503  }
504 
505  assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
506  /*
507  * There are no valid sequences that start with these bytes. Maximal subpart
508  * is defined to have length 1 in these cases.
509  */
510  return 1;
511 }
512 
513 /* --------------------------------------------------------------------- */
514 
515 /*
516  * Exported function to return the total number of bytes in a codepoint
517  * represented in UTF-8, given the value of the first byte.
518  */
520  return trailingBytesForUTF8[first] + 1;
521 }
522 
523 /* --------------------------------------------------------------------- */
524 
525 /*
526  * Exported function to return whether a UTF-8 string is legal or not.
527  * This is not used here; it's just exported.
528  */
529 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
530  while (*source != sourceEnd) {
531  int length = trailingBytesForUTF8[**source] + 1;
532  if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
533  return false;
534  *source += length;
535  }
536  return true;
537 }
538 
539 /* --------------------------------------------------------------------- */
540 
542  const UTF8** sourceStart, const UTF8* sourceEnd,
543  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
545  const UTF8* source = *sourceStart;
546  UTF16* target = *targetStart;
547  while (source < sourceEnd) {
548  UTF32 ch = 0;
549  unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
550  if (extraBytesToRead >= sourceEnd - source) {
551  result = sourceExhausted; break;
552  }
553  /* Do this check whether lenient or strict */
554  if (!isLegalUTF8(source, extraBytesToRead+1)) {
555  result = sourceIllegal;
556  break;
557  }
558  /*
559  * The cases all fall through. See "Note A" below.
560  */
561  switch (extraBytesToRead) {
562  case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
563  case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
564  case 3: ch += *source++; ch <<= 6;
565  case 2: ch += *source++; ch <<= 6;
566  case 1: ch += *source++; ch <<= 6;
567  case 0: ch += *source++;
568  }
569  ch -= offsetsFromUTF8[extraBytesToRead];
570 
571  if (target >= targetEnd) {
572  source -= (extraBytesToRead+1); /* Back up source pointer! */
573  result = targetExhausted; break;
574  }
575  if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
576  /* UTF-16 surrogate values are illegal in UTF-32 */
577  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
578  if (flags == strictConversion) {
579  source -= (extraBytesToRead+1); /* return to the illegal value itself */
580  result = sourceIllegal;
581  break;
582  } else {
583  *target++ = UNI_REPLACEMENT_CHAR;
584  }
585  } else {
586  *target++ = (UTF16)ch; /* normal case */
587  }
588  } else if (ch > UNI_MAX_UTF16) {
589  if (flags == strictConversion) {
590  result = sourceIllegal;
591  source -= (extraBytesToRead+1); /* return to the start */
592  break; /* Bail out; shouldn't continue */
593  } else {
594  *target++ = UNI_REPLACEMENT_CHAR;
595  }
596  } else {
597  /* target is a character in range 0xFFFF - 0x10FFFF. */
598  if (target + 1 >= targetEnd) {
599  source -= (extraBytesToRead+1); /* Back up source pointer! */
600  result = targetExhausted; break;
601  }
602  ch -= halfBase;
603  *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
604  *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
605  }
606  }
607  *sourceStart = source;
608  *targetStart = target;
609  return result;
610 }
611 
612 /* --------------------------------------------------------------------- */
613 
615  const UTF8** sourceStart, const UTF8* sourceEnd,
616  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
617  Boolean InputIsPartial) {
619  const UTF8* source = *sourceStart;
620  UTF32* target = *targetStart;
621  while (source < sourceEnd) {
622  UTF32 ch = 0;
623  unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
624  if (extraBytesToRead >= sourceEnd - source) {
625  if (flags == strictConversion || InputIsPartial) {
626  result = sourceExhausted;
627  break;
628  } else {
629  result = sourceIllegal;
630 
631  /*
632  * Replace the maximal subpart of ill-formed sequence with
633  * replacement character.
634  */
636  sourceEnd);
637  *target++ = UNI_REPLACEMENT_CHAR;
638  continue;
639  }
640  }
641  if (target >= targetEnd) {
642  result = targetExhausted; break;
643  }
644 
645  /* Do this check whether lenient or strict */
646  if (!isLegalUTF8(source, extraBytesToRead+1)) {
647  result = sourceIllegal;
648  if (flags == strictConversion) {
649  /* Abort conversion. */
650  break;
651  } else {
652  /*
653  * Replace the maximal subpart of ill-formed sequence with
654  * replacement character.
655  */
657  sourceEnd);
658  *target++ = UNI_REPLACEMENT_CHAR;
659  continue;
660  }
661  }
662  /*
663  * The cases all fall through. See "Note A" below.
664  */
665  switch (extraBytesToRead) {
666  case 5: ch += *source++; ch <<= 6;
667  case 4: ch += *source++; ch <<= 6;
668  case 3: ch += *source++; ch <<= 6;
669  case 2: ch += *source++; ch <<= 6;
670  case 1: ch += *source++; ch <<= 6;
671  case 0: ch += *source++;
672  }
673  ch -= offsetsFromUTF8[extraBytesToRead];
674 
675  if (ch <= UNI_MAX_LEGAL_UTF32) {
676  /*
677  * UTF-16 surrogate values are illegal in UTF-32, and anything
678  * over Plane 17 (> 0x10FFFF) is illegal.
679  */
680  if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
681  if (flags == strictConversion) {
682  source -= (extraBytesToRead+1); /* return to the illegal value itself */
683  result = sourceIllegal;
684  break;
685  } else {
686  *target++ = UNI_REPLACEMENT_CHAR;
687  }
688  } else {
689  *target++ = ch;
690  }
691  } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
692  result = sourceIllegal;
693  *target++ = UNI_REPLACEMENT_CHAR;
694  }
695  }
696  *sourceStart = source;
697  *targetStart = target;
698  return result;
699 }
700 
702  const UTF8 *sourceEnd,
703  UTF32 **targetStart,
704  UTF32 *targetEnd,
705  ConversionFlags flags) {
706  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
707  flags, /*InputIsPartial=*/true);
708 }
709 
711  const UTF8 *sourceEnd, UTF32 **targetStart,
712  UTF32 *targetEnd, ConversionFlags flags) {
713  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
714  flags, /*InputIsPartial=*/false);
715 }
716 
717 /* ---------------------------------------------------------------------
718 
719  Note A.
720  The fall-through switches in UTF-8 reading code save a
721  temp variable, some decrements & conditionals. The switches
722  are equivalent to the following loop:
723  {
724  int tmpBytesToRead = extraBytesToRead+1;
725  do {
726  ch += *source++;
727  --tmpBytesToRead;
728  if (tmpBytesToRead) ch <<= 6;
729  } while (tmpBytesToRead > 0);
730  }
731  In UTF-8 writing code, the switches on "bytesToWrite" are
732  similarly unrolled loops.
733 
734  --------------------------------------------------------------------- */
735 
736 } // namespace llvm
737 
unsigned int UTF32
Definition: ConvertUTF.h:109
static const UTF32 halfBase
Definition: ConvertUTF.cpp:86
This class represents lattice values for constants.
Definition: AllocatorList.h:23
#define UNI_MAX_BMP
Definition: ConvertUTF.h:116
ConversionResult
Definition: ConvertUTF.h:126
static const char trailingBytesForUTF8[256]
Definition: ConvertUTF.cpp:103
#define UNI_MAX_UTF16
Definition: ConvertUTF.h:117
unsigned short UTF16
Definition: ConvertUTF.h:110
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Convert a partial UTF8 sequence to UTF32.
Definition: ConvertUTF.cpp:710
#define UNI_MAX_LEGAL_UTF32
Definition: ConvertUTF.h:119
#define UNI_REPLACEMENT_CHAR
Definition: ConvertUTF.h:115
ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:247
static ConversionResult ConvertUTF8toUTF32Impl(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags, Boolean InputIsPartial)
Definition: ConvertUTF.cpp:614
ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:144
unsigned char UTF8
Definition: ConvertUTF.h:111
unsigned getNumBytesForUTF8(UTF8 firstByte)
Definition: ConvertUTF.cpp:519
#define UNI_SUR_HIGH_END
Definition: ConvertUTF.cpp:90
static Boolean isLegalUTF8(const UTF8 *source, int length)
Definition: ConvertUTF.cpp:381
#define ConvertUTF_DISABLE_WARNINGS
Definition: ConvertUTF.cpp:74
ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:541
ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:317
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
Definition: ConvertUTF.cpp:412
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd)
Definition: ConvertUTF.cpp:529
ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Convert a partial UTF8 sequence to UTF32.
Definition: ConvertUTF.cpp:701
static const int halfShift
Definition: ConvertUTF.cpp:84
unsigned char Boolean
Definition: ConvertUTF.h:112
static const UTF32 offsetsFromUTF8[6]
Definition: ConvertUTF.cpp:119
unsigned first
ConversionFlags
Definition: ConvertUTF.h:133
#define UNI_SUR_LOW_START
Definition: ConvertUTF.cpp:91
ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Definition: ConvertUTF.cpp:193
static unsigned findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
Definition: ConvertUTF.cpp:423
static const UTF8 firstByteMark[7]
Definition: ConvertUTF.cpp:129
#define ConvertUTF_RESTORE_WARNINGS
Definition: ConvertUTF.cpp:77
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
#define UNI_SUR_HIGH_START
Definition: ConvertUTF.cpp:89
#define UNI_SUR_LOW_END
Definition: ConvertUTF.cpp:92
static const UTF32 halfMask
Definition: ConvertUTF.cpp:87