00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043 module mango.convert.Unicode;
00044
00045 private import mango.convert.Type;
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089 struct Unicode
00090 {
00091
/***********************************************************************

        Identifiers for the encodings this module knows about.

        The values are consecutive: Unknown is the lowest (zero) and
        UTF_32LE the highest. isValid() relies on that ordering, so
        new members must be added between those two.

***********************************************************************/

enum
{
    Unknown  = 0,
    UTF_8    = 1,
    UTF_8N   = 2,
    UTF_16   = 3,
    UTF_16BE = 4,
    UTF_16LE = 5,
    UTF_32   = 6,
    UTF_32BE = 7,
    UTF_32LE = 8
}
00103
00104
00105
00106
00107
/***********************************************************************

        Tell whether 'encoding' is one of the identifiers declared in
        the enum above, i.e. lies within Unknown .. UTF_32LE inclusive.

***********************************************************************/

static bool isValid (int encoding)
{
    // below the first enum member
    if (encoding < Unknown)
        return false;

    // valid as long as it does not exceed the last enum member
    return cast(bool) (encoding <= UTF_32LE);
}
00112
00113
00114
00115
00116
/***********************************************************************

        Report a conversion failure: always throws an exception that
        carries 'msg', and therefore never returns to the caller.

***********************************************************************/

private static final void error (char[] msg)
{
    // exception type local to this function; callers can only
    // catch it through its Exception base class
    static class UnicodeException : Exception
    {
        this (char[] text)
        {
            super (text);
        }
    }

    Exception failure = new UnicodeException (msg);
    throw failure;
}
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
/***********************************************************************

        Encode utf16 text as utf8.

        input   the utf16 code units to convert
        output  optional destination buffer
        ate     optional; selects "streaming" mode when non-null

        Without 'ate' the output buffer is enlarged as required, so
        the returned slice may refer to a new allocation rather than
        to the buffer that was passed in.

        With 'ate' the output buffer is never resized: conversion
        stops as soon as fewer than three free bytes remain (four in
        the surrogate path), and *ate receives the number of utf16
        code units consumed.

        Input containing surrogates is converted by way of utf32.
        That path now honours 'ate' too; it used to convert nothing
        and report zero units consumed in streaming mode.

        Returns the slice of output that was filled.

***********************************************************************/

static final char[] toUtf8 (wchar[] input, char[] output=null, uint* ate=null)
{
    if (ate)
    {
        *ate = input.length;

        // a streaming buffer smaller than the three-byte reserve
        // cannot accept anything. Return before computing pMax:
        // with a null buffer that pointer would wrap around and
        // the loop below would write through a null pointer
        if (output.length < 3)
        {
            *ate = 0;
            return output [0..0];
        }
    }
    else
    {
        // potentially reallocate the output
        int estimate = input.length * 2 + 3;
        if (output.length < estimate)
            output.length = estimate;
    }

    char* pOut = output.ptr;
    char* pMax = pOut + output.length - 3;

    foreach (int eaten, wchar b; input)
    {
        // about to overflow the output?
        if (pOut > pMax)
        {
            // when streaming, report what was consumed and stop
            if (ate)
            {
                *ate = eaten;
                break;
            }

            // otherwise grow the buffer and re-anchor the pointers
            int len = pOut - output.ptr;
            output.length = len + len / 2;
            pOut = output.ptr + len;
            pMax = output.ptr + output.length - 3;
        }

        if (b < 0x80)
            *pOut++ = b;
        else
        if (b < 0x0800)
        {
            pOut[0] = 0xc0 | ((b >> 6) & 0x3f);
            pOut[1] = 0x80 | (b & 0x3f);
            pOut += 2;
        }
        else
        if (b < 0xd800 || b > 0xdfff)
        {
            pOut[0] = 0xe0 | ((b >> 12) & 0x0f);
            pOut[1] = 0x80 | ((b >> 6) & 0x3f);
            pOut[2] = 0x80 | (b & 0x3f);
            pOut += 3;
        }
        else
        {
            // surrogates: restart and convert everything via utf32
            if (ate is null)
                return toUtf8 (toUtf32 (input, null, null), output);

            // streaming: decode into a private buffer (toUtf32
            // never resizes when given 'ate'), then encode as much
            // as fits into the caller's buffer
            uint used;
            dchar[] wide = toUtf32 (input, new dchar[input.length], &used);
            char[] result = toUtf8 (wide, output, &used);

            // 'used' now counts code points; translate it back to
            // utf16 units the same way toUtf32 walked the input
            uint units = 0;
            for (uint n = used; n > 0; --n)
            {
                wchar lead = input[units];
                units += (lead >= 0xd800 && lead <= 0xdfff) ? 2 : 1;
            }
            *ate = units;
            return result;
        }
    }

    // return the produced output
    return output [0..(pOut - output.ptr)];
}
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
/***********************************************************************

        Decode utf8 text into utf16.

        input   the utf8 bytes to convert
        output  optional destination buffer
        ate     optional; selects "streaming" mode when non-null

        Without 'ate' the output is enlarged to at least input.length
        elements, and truncated input raises an exception via error().

        With 'ate' the output is never resized: conversion stops when
        the output is full or when the input ends in the middle of a
        sequence, and *ate receives the number of utf8 bytes consumed.

        Four-byte sequences need surrogate pairs and are converted by
        way of utf32; that path now honours 'ate' as well.

        Continuation bytes are only read after checking that they lie
        inside 'input'. Their bit patterns are not validated.

        Returns the slice of output that was filled.

***********************************************************************/

static final wchar[] toUtf16 (char[] input, wchar[] output=null, uint* ate=null)
{
    int   produced;
    char* pIn  = input.ptr;
    char* pMax = pIn + input.length;

    if (ate is null)
        if (input.length > output.length)
            output.length = input.length;

    if (input.length)
        foreach (inout wchar d; output)
        {
            wchar b = cast(wchar) *pIn;

            // number of continuation bytes implied by the lead byte
            int extra = 0;

            if (b & 0x80)
            {
                if (b < 0xe0)
                    extra = 1;
                else
                if (b < 0xf0)
                    extra = 2;
                else
                {
                    // beyond the BMP: restart and go through utf32
                    if (ate is null)
                        return toUtf16 (toUtf32 (input, null, null), output);

                    // streaming: decode into a private buffer, then
                    // encode what fits into the caller's buffer
                    uint used;
                    dchar[] wide = toUtf32 (input, new dchar[input.length], &used);
                    wchar[] result = toUtf16 (wide, output, &used);

                    // 'used' now counts code points; translate it
                    // back to utf8 bytes by walking the lead bytes
                    // the same way toUtf32 does
                    uint bytes = 0;
                    for (uint n = used; n > 0; --n)
                    {
                        char lead = input[bytes];
                        if (lead < 0x80)
                            bytes += 1;
                        else
                        if (lead < 0xe0)
                            bytes += 2;
                        else
                        if (lead < 0xf0)
                            bytes += 3;
                        else
                            bytes += 4;
                    }
                    *ate = bytes;
                    return result;
                }
            }

            // is the whole sequence present? check before reading
            if (pIn + extra >= pMax)
            {
                // streaming: leave the partial sequence unconsumed
                if (ate)
                    break;
                error ("Unicode.toUtf16 : incomplete utf8 input");
            }

            if (extra == 1)
            {
                b &= 0x1f;
                b = (b << 6) | (pIn[1] & 0x3f);
            }
            else
            if (extra == 2)
            {
                b &= 0x0f;
                b = (b << 6) | (pIn[1] & 0x3f);
                b = (b << 6) | (pIn[2] & 0x3f);
            }

            d = b;
            ++produced;

            // step over the sequence; done when the input is used up
            pIn += extra + 1;
            if (pIn >= pMax)
                break;
        }

    // report consumption when streaming; otherwise leftover input
    // means the output was too small
    if (ate)
        *ate = pIn - input.ptr;
    else
    if (pIn < pMax)
        error ("Unicode.toUtf16 : utf8 overflow");

    // return the produced output
    return output [0..produced];
}
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
/***********************************************************************

        Encode utf32 text as utf8.

        input   the code points to convert
        output  optional destination buffer
        ate     optional; selects "streaming" mode when non-null

        Without 'ate' the output buffer is enlarged as required, so
        the returned slice may refer to a new allocation.

        With 'ate' the output buffer is never resized: conversion
        stops as soon as fewer than four free bytes remain, and *ate
        receives the number of code points consumed.

        Values of 0x110000 and above raise an exception via error().
        Values inside the surrogate range are not rejected; they are
        encoded as three bytes like any other value below 0x10000.

        Returns the slice of output that was filled.

***********************************************************************/

static final char[] toUtf8 (dchar[] input, char[] output=null, uint* ate=null)
{
    if (ate)
    {
        *ate = input.length;

        // a streaming buffer smaller than the four-byte reserve
        // cannot accept anything. Return before computing pMax:
        // with a null buffer that pointer would wrap around and
        // the loop below would write through a null pointer
        if (output.length < 4)
        {
            *ate = 0;
            return output [0..0];
        }
    }
    else
    {
        // potentially reallocate the output
        int estimate = input.length * 2 + 4;
        if (output.length < estimate)
            output.length = estimate;
    }

    char* pOut = output.ptr;
    char* pMax = pOut + output.length - 4;

    foreach (int eaten, dchar b; input)
    {
        // about to overflow the output?
        if (pOut > pMax)
        {
            // when streaming, report what was consumed and stop
            if (ate)
            {
                *ate = eaten;
                break;
            }

            // grow the buffer. The extra four bytes guarantee room
            // for the next sequence: growing by len/2 alone can
            // leave the length unchanged for small buffers (e.g.
            // len == 7 in a 10 byte buffer), letting a four-byte
            // write run past the end
            int len = pOut - output.ptr;
            output.length = len + len / 2 + 4;
            pOut = output.ptr + len;
            pMax = output.ptr + output.length - 4;
        }

        if (b < 0x80)
            *pOut++ = b;
        else
        if (b < 0x0800)
        {
            pOut[0] = 0xc0 | ((b >> 6) & 0x3f);
            pOut[1] = 0x80 | (b & 0x3f);
            pOut += 2;
        }
        else
        if (b < 0x10000)
        {
            pOut[0] = 0xe0 | ((b >> 12) & 0x0f);
            pOut[1] = 0x80 | ((b >> 6) & 0x3f);
            pOut[2] = 0x80 | (b & 0x3f);
            pOut += 3;
        }
        else
        if (b < 0x110000)
        {
            pOut[0] = 0xf0 | ((b >> 18) & 0x07);
            pOut[1] = 0x80 | ((b >> 12) & 0x3f);
            pOut[2] = 0x80 | ((b >> 6) & 0x3f);
            pOut[3] = 0x80 | (b & 0x3f);
            pOut += 4;
        }
        else
            error ("Unicode.toUtf8 : invalid dchar");
    }

    // return the produced output
    return output [0..(pOut - output.ptr)];
}
00380
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
/***********************************************************************

        Decode utf8 text into utf32.

        input   the utf8 bytes to convert
        output  optional destination buffer
        ate     optional; selects "streaming" mode when non-null

        Without 'ate' the output is enlarged to at least input.length
        elements, and truncated input raises an exception via error().

        With 'ate' the output is never resized: conversion stops when
        the output is full or when the input ends in the middle of a
        sequence, and *ate receives the number of utf8 bytes consumed.

        A decoded value of 0x110000 or above raises an exception in
        either mode. Continuation bytes are only read after checking
        that they lie inside 'input'; their bit patterns are not
        validated.

        Returns the slice of output that was filled.

***********************************************************************/

static final dchar[] toUtf32 (char[] input, dchar[] output=null, uint* ate=null)
{
    int   produced;
    char* pIn  = input.ptr;
    char* pMax = pIn + input.length;

    if (ate is null)
        if (input.length > output.length)
            output.length = input.length;

    if (input.length)
        foreach (inout dchar d; output)
        {
            dchar b = cast(dchar) *pIn;

            // number of continuation bytes implied by the lead byte
            int extra = 0;

            if (b & 0x80)
            {
                if (b < 0xe0)
                    extra = 1;
                else
                if (b < 0xf0)
                    extra = 2;
                else
                    extra = 3;
            }

            // is the whole sequence present? check before reading
            if (pIn + extra >= pMax)
            {
                // streaming: leave the partial sequence unconsumed
                if (ate)
                    break;
                error ("Unicode.toUtf32 : incomplete utf8 input");
            }

            if (extra == 1)
            {
                b &= 0x1f;
                b = (b << 6) | (pIn[1] & 0x3f);
            }
            else
            if (extra == 2)
            {
                b &= 0x0f;
                b = (b << 6) | (pIn[1] & 0x3f);
                b = (b << 6) | (pIn[2] & 0x3f);
            }
            else
            if (extra == 3)
            {
                b &= 0x07;
                b = (b << 6) | (pIn[1] & 0x3f);
                b = (b << 6) | (pIn[2] & 0x3f);
                b = (b << 6) | (pIn[3] & 0x3f);

                if (b >= 0x110000)
                    error ("Unicode.toUtf32 : invalid utf8 input");
            }

            d = b;
            ++produced;

            // step over the sequence; done when the input is used up
            pIn += extra + 1;
            if (pIn >= pMax)
                break;
        }

    // report consumption when streaming; otherwise leftover input
    // means the output was too small
    if (ate)
        *ate = pIn - input.ptr;
    else
    if (pIn < pMax)
        error ("Unicode.toUtf32 : utf8 overflow");

    // return the produced output
    return output [0..produced];
}
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
/***********************************************************************

        Encode utf32 text as utf16.

        input   the code points to convert
        output  optional destination buffer
        ate     optional; selects "streaming" mode when non-null

        Without 'ate' the output buffer is enlarged as required, so
        the returned slice may refer to a new allocation.

        With 'ate' the output buffer is never resized: conversion
        stops as soon as fewer than two free elements remain, and
        *ate receives the number of code points consumed.

        Values of 0x110000 and above raise an exception via error().
        Values below 0x10000 are copied as they are, including any
        that fall inside the surrogate range.

        Returns the slice of output that was filled.

***********************************************************************/

static final wchar[] toUtf16 (dchar[] input, wchar[] output=null, uint* ate=null)
{
    if (ate)
    {
        *ate = input.length;

        // a streaming buffer smaller than the two-element reserve
        // cannot accept anything. Return before computing pMax:
        // with a null buffer that pointer would wrap around and
        // the loop below would write through a null pointer
        if (output.length < 2)
        {
            *ate = 0;
            return output [0..0];
        }
    }
    else
    {
        // two elements per code point is the worst case
        int estimate = input.length * 2 + 2;
        if (output.length < estimate)
            output.length = estimate;
    }

    wchar* pOut = output.ptr;
    wchar* pMax = pOut + output.length - 2;

    foreach (int eaten, dchar b; input)
    {
        // about to overflow the output?
        if (pOut > pMax)
        {
            // when streaming, report what was consumed and stop
            if (ate)
            {
                *ate = eaten;
                break;
            }

            // otherwise grow the buffer and re-anchor the pointers
            int len = pOut - output.ptr;
            output.length = len + len / 2;
            pOut = output.ptr + len;
            pMax = output.ptr + output.length - 2;
        }

        if (b < 0x10000)
            *pOut++ = b;
        else
        if (b < 0x110000)
        {
            // split into a high/low surrogate pair
            pOut[0] = 0xd800 | (((b - 0x10000) >> 10) & 0x3ff);
            pOut[1] = 0xdc00 | ((b - 0x10000) & 0x3ff);
            pOut += 2;
        }
        else
            error ("Unicode.toUtf16 : invalid dchar");
    }

    // return the produced output
    return output [0..(pOut - output.ptr)];
}
00535
00536
00537
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547
00548
00549
/***********************************************************************

        Decode utf16 text into utf32.

        input   the utf16 code units to convert
        output  optional destination buffer
        ate     optional; selects "streaming" mode when non-null

        Without 'ate' the output is enlarged to at least input.length
        elements, and input that ends in the middle of a surrogate
        pair raises an exception via error().

        With 'ate' the output is never resized: conversion stops when
        the output is full or when the input ends in the middle of a
        pair, and *ate receives the number of code units consumed.

        Any unit in the range 0xd800 .. 0xdfff is treated as the
        first half of a pair; the second half is only read after
        checking that it lies inside 'input'. A combined value of
        0x110000 or above raises an exception in either mode.

        Returns the slice of output that was filled.

***********************************************************************/

static final dchar[] toUtf32 (wchar[] input, dchar[] output=null, uint* ate=null)
{
    int    produced;
    wchar* pIn  = input.ptr;
    wchar* pMax = pIn + input.length;

    if (ate is null)
        if (input.length > output.length)
            output.length = input.length;

    if (input.length)
        foreach (inout dchar d; output)
        {
            dchar b = cast(dchar) *pIn;

            // surrogate: combine with the unit that follows
            if (b >= 0xd800 && b <= 0xdfff)
            {
                // is the second half present? check before reading
                if (pIn + 1 >= pMax)
                {
                    // streaming: leave the half pair unconsumed
                    if (ate)
                        break;
                    error ("Unicode.toUtf32 : incomplete utf16 input");
                }
                b = ((b - 0xd7c0) << 10) + (*++pIn - 0xdc00);
            }

            if (b >= 0x110000)
                error ("Unicode.toUtf32 : invalid utf16 input");

            d = b;
            ++produced;

            // done when the input is used up
            if (++pIn >= pMax)
                break;
        }

    // report consumption when streaming; otherwise leftover input
    // means the output was too small
    if (ate)
        *ate = pIn - input.ptr;
    else
    if (pIn < pMax)
        error ("Unicode.toUtf32 : utf16 overflow");

    // return the produced output
    return output [0..produced];
}
00604
00605
00606
00607
00608
00609
00610
00611
00612
00613
00614
00615
/***********************************************************************

        Conversions whose result is text made of T (char, wchar or
        dchar), selected at compile time.

***********************************************************************/

struct Into(T)
{
    /***************************************************************

            Return the Type identifier matching T.

    ***************************************************************/

    static uint type ()
    {
        static if (is (T == char))
            return Type.Utf8;
        else
        static if (is (T == wchar))
            return Type.Utf16;
        else
        static if (is (T == dchar))
            return Type.Utf32;
    }

    /***************************************************************

            Convert 'x', whose encoding is given by 'type', into
            text made of T.

            When 'type' already matches T the input is returned
            as is and 'ate' is left untouched. Otherwise 'dst' and
            'ate' are forwarded to the matching toUtf* converter,
            and a non-null *ate is then multiplied by
            Type.widths[type] (presumably the byte width of the
            source encoding, turning a unit count into a byte
            count; confirm against mango.convert.Type).

            A 'type' that matches none of the three encodings
            yields a null result.

    ***************************************************************/

    static void[] convert (void[] x, uint type, void[] dst=null, uint* ate=null)
    {
        void[] result;

        static if (is (T == char))
        {
            if (type == Type.Utf8)
                return x;
            else
            if (type == Type.Utf16)
                result = toUtf8 (cast(wchar[]) x, cast(char[]) dst, ate);
            else
            if (type == Type.Utf32)
                result = toUtf8 (cast(dchar[]) x, cast(char[]) dst, ate);
        }

        static if (is (T == wchar))
        {
            if (type == Type.Utf16)
                return x;
            else
            if (type == Type.Utf8)
                result = toUtf16 (cast(char[]) x, cast(wchar[]) dst, ate);
            else
            if (type == Type.Utf32)
                result = toUtf16 (cast(dchar[]) x, cast(wchar[]) dst, ate);
        }

        static if (is (T == dchar))
        {
            if (type == Type.Utf32)
                return x;
            else
            if (type == Type.Utf8)
                result = toUtf32 (cast(char[]) x, cast(dchar[]) dst, ate);
            else
            if (type == Type.Utf16)
                result = toUtf32 (cast(wchar[]) x, cast(dchar[]) dst, ate);
        }

        // nothing to scale unless the caller asked for a count
        if (ate is null)
            return result;

        *ate *= Type.widths[type];
        return result;
    }
}
00680
00681
00682
00683
00684
00685
00686
00687
00688
00689
00690
/***********************************************************************

        Conversions whose source is text made of T (char, wchar or
        dchar), selected at compile time.

***********************************************************************/

struct From(T)
{
    /***************************************************************

            Return the Type identifier matching T.

    ***************************************************************/

    static uint type ()
    {
        static if (is (T == char))
            return Type.Utf8;
        else
        static if (is (T == wchar))
            return Type.Utf16;
        else
        static if (is (T == dchar))
            return Type.Utf32;
    }

    /***************************************************************

            Convert 'x', which holds text made of T, into the
            encoding given by 'type'.

            When 'type' already matches T the input is returned
            as is and 'ate' is left untouched. Otherwise 'dst' and
            'ate' are forwarded to the matching toUtf* converter,
            and a non-null *ate is then multiplied by 2 for wchar
            sources or by 4 for dchar sources (char sources are
            left unscaled).

            A 'type' that matches none of the three encodings
            yields a null result.

    ***************************************************************/

    static void[] convert (void[] x, uint type, void[] dst=null, uint* ate=null)
    {
        void[] result;

        static if (is (T == char))
        {
            if (type == Type.Utf8)
                return x;
            else
            if (type == Type.Utf16)
                result = toUtf16 (cast(char[]) x, cast(wchar[]) dst, ate);
            else
            if (type == Type.Utf32)
                result = toUtf32 (cast(char[]) x, cast(dchar[]) dst, ate);
        }

        static if (is (T == wchar))
        {
            if (type == Type.Utf16)
                return x;
            else
            if (type == Type.Utf8)
                result = toUtf8 (cast(wchar[]) x, cast(char[]) dst, ate);
            else
            if (type == Type.Utf32)
                result = toUtf32 (cast(wchar[]) x, cast(dchar[]) dst, ate);
        }

        static if (is (T == dchar))
        {
            if (type == Type.Utf32)
                return x;
            else
            if (type == Type.Utf8)
                result = toUtf8 (cast(dchar[]) x, cast(char[]) dst, ate);
            else
            if (type == Type.Utf16)
                result = toUtf16 (cast(dchar[]) x, cast(wchar[]) dst, ate);
        }

        // nothing to scale unless the caller asked for a count
        if (ate is null)
            return result;

        static if (is (T == wchar))
            *ate *= 2;

        static if (is (T == dchar))
            *ate *= 4;

        return result;
    }
}
00764 }
00765
00766
00767
00768
00769
00770
00771 /+
00772
00773 version=QTEMPLATE;
00774 version (TEMPLATE)
00775 {
00776
00777
00778
00779
00780
00781
00782
00783
00784
00785 private import mango.convert.Type;
00786
00787 struct UtfCodec1(T)
00788 {
00789 private void[] tmp;
00790
00791 void dthis (int size = 0)
00792 {
00793 tmp = new ubyte[size];
00794 }
00795
00796 private void[] update (void[] t)
00797 {
00798 if (t.length > tmp.length)
00799 tmp = t;
00800 return t;
00801 }
00802
00803 uint type ()
00804 {
00805 static if (is (T == char))
00806 return Type.Utf8;
00807 static if (is (T == wchar))
00808 return Type.Utf16;
00809 static if (is (T == dchar))
00810 return Type.Utf32;
00811 }
00812
00813 void[] from (void[] x, uint type)
00814 {
00815 switch (type)
00816 {
00817 static if (is (T == char))
00818 {
00819 case Type.Utf8:
00820 return cast(char[]) x;
00821 case Type.Utf16:
00822 return update (Unicode.toUtf8 (cast(wchar[]) x, cast(char[]) tmp));
00823 case Type.Utf32:
00824 return update (Unicode.toUtf8 (cast(dchar[]) x, cast(char[]) tmp));
00825 }
00826
00827 static if (is (T == wchar))
00828 {
00829 case Type.Utf8:
00830 return update (Unicode.toUtf16 (cast(char[]) x, cast(wchar[]) tmp));
00831 case Type.Utf16:
00832 return cast(wchar[]) x;
00833 case Type.Utf32:
00834 return update (Unicode.toUtf16 (cast(dchar[]) x, cast(wchar[]) tmp));
00835 }
00836
00837 static if (is (T == dchar))
00838 {
00839 case Type.Utf8:
00840 return update (Unicode.toUtf32 (cast(char[]) x, cast(dchar[]) tmp));
00841 case Type.Utf16:
00842 return update (Unicode.toUtf32 (cast(wchar[]) x, cast(dchar[]) tmp));
00843 case Type.Utf32:
00844 return cast(dchar[]) x;
00845 }
00846 default:
00847 break;
00848 }
00849 }
00850
00851
00852 void[] into (void[] src, uint type, void[] dst=null, uint* ate=null)
00853 {
00854 if (dst is null)
00855 dst = tmp;
00856
00857 switch (type)
00858 {
00859 static if (is (T == char))
00860 {
00861 case Type.Utf8:
00862 return src;
00863 case Type.Utf16:
00864 return update (Unicode.toUtf16 (cast(char[]) src, cast(wchar[]) dst, ate));
00865 case Type.Utf32:
00866 return update (Unicode.toUtf32 (cast(char[]) src, cast(dchar[]) dst, ate));
00867 }
00868
00869 static if (is (T == wchar))
00870 {
00871 case Type.Utf8:
00872 return update (Unicode.toUtf8 (cast(wchar[]) src, cast(char[]) dst, ate));
00873 case Type.Utf16:
00874 return src;
00875 case Type.Utf32:
00876 return update (Unicode.toUtf32 (cast(wchar[]) src, cast(dchar[]) dst, ate));
00877 }
00878
00879 static if (is (T == dchar))
00880 {
00881 case Type.Utf8:
00882 return update (Unicode.toUtf8 (cast(dchar[]) src, cast(char[]) dst, ate));
00883 case Type.Utf16:
00884 return update (Unicode.toUtf16 (cast(dchar[]) src, cast(wchar[]) dst, ate));
00885 case Type.Utf32:
00886 return src;
00887 }
00888 default:
00889 break;
00890 }
00891 }
00892 }
00893
00894 }
00895
00896 version (FUNCTION)
00897 {
00898
00899 private import mango.convert.Type;
00900
00901
00902
00903
00904
00905 static final void[] convert (void[] src, void[] dst, uint srcType, uint dstType, uint*ate)
00906 {
00907 enum : ubyte {char2char, char2wchar, char2dchar,
00908 wchar2char, wchar2wchar, wchar2dchar,
00909 dchar2char, dchar2wchar, dchar2dchar};
00910
00911 const int[][4] router = [
00912 [char2char, char2wchar, char2dchar, 0],
00913 [wchar2char, wchar2wchar, wchar2dchar, 0],
00914 [dchar2char, dchar2wchar, dchar2dchar, 0],
00915 [0, 0, 0, 0],
00916 ];
00917
00918
00919 srcType -= Type.Utf8;
00920 dstType -= Type.Utf8;
00921 assert (srcType < 3);
00922 assert (dstType < 3);
00923
00924 switch (router[srcType][dstType])
00925 {
00926 case char2char:
00927 return src;
00928
00929 case char2wchar:
00930 return Unicode.toUtf16 (cast(char[]) src, cast(wchar[]) dst, ate);
00931
00932 case char2dchar:
00933 return Unicode.toUtf32 (cast(char[]) src, cast(dchar[]) dst, ate);
00934
00935
00936 case wchar2char:
00937 return Unicode.toUtf8 (cast(wchar[]) src, cast(char[]) dst, ate);
00938
00939 case wchar2wchar:
00940 return src;
00941
00942 case wchar2dchar:
00943 return Unicode.toUtf32 (cast(wchar[]) src, cast(dchar[]) dst, ate);
00944
00945
00946 case dchar2char:
00947 return Unicode.toUtf8 (cast(dchar[]) src, cast(char[]) dst, ate);
00948
00949 case dchar2wchar:
00950 return Unicode.toUtf16 (cast(dchar[]) src, cast(wchar[]) dst, ate);
00951
00952 case dchar2dchar:
00953 return src;
00954
00955 default:
00956 return null;
00957 }
00958 }
00959 }
00960 +/