Main Page | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Class Members | File Members | Related Pages

Unicode.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file Unicode.d
00004         
00005         Copyright: (c) 2004 Kris Bell
00006         
00007         License: 
00008 
00009         This software is provided 'as-is', without any express or implied
00010         warranty. In no event will the authors be held liable for damages
00011         of any kind arising from the use of this software.
00012         
00013         Permission is hereby granted to anyone to use this software for any 
00014         purpose, including commercial applications, and to alter it and/or 
00015         redistribute it freely, subject to the following restrictions:
00016         
00017         1. The origin of this software must not be misrepresented; you must 
00018            not claim that you wrote the original software. If you use this 
00019            software in a product, an acknowledgment within documentation of 
00020            said product would be appreciated but is not required.
00021 
00022         2. Altered source versions must be plainly marked as such, and must 
00023            not be misrepresented as being the original software.
00024 
00025         3. This notice may not be removed or altered from any distribution
00026            of the source.
00027 
00028         4. Derivative works are permitted, but they must carry this notice
00029            in full and credit the original source.
00030 
00031 
00032                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00033 
00034 
00035         History:        Initial version; Oct 2004      
00036         History:        Moved to mango.convert; Nov 2005
00037 
00038         Authors:        Kris
00039 
00040 
00041 *******************************************************************************/
00042 
00043 module mango.convert.Unicode;
00044 
00045 private import mango.convert.Type;
00046 
00047 /*******************************************************************************
00048 
00049         Fast Unicode transcoders. These are particularly sensitive to
00050         minor changes on 32bit x86 devices, because the register set of
00051         those devices is so small. Beware of subtle changes which might
00052         extend the execution-period by as much as 200%. Because of this, 
00053         three of the six transcoders might read past the end of input by 
00054         one, two, or three bytes before arresting themselves. Note that 
00055         support for streaming adds a 15% overhead to the dchar => char 
00056         conversion, but has little effect on the others.
00057 
00058         These routines were tuned on an Intel P4; other devices may work
00059         more efficiently with a slightly different approach, though this
00060         is likely to be reasonably optimal on AMD x86 CPUs also. These
00061         algorithms would benefit significantly from those extra AMD64 
00062         registers. On a 3GHz P4, the dchar/char conversions take around
00063         2500ns to process an array of 1000 ASCII elements. Invoking the
00064         memory manager doubles that period, and quadruples the time for 
00065         arrays of 100 elements. Memory allocation can slow down notably 
00066         in a multi-threaded environment, so avoid that where possible.
00067 
00068         Surrogate-pairs are dealt with in a non-optimal fashion when
00069         transcoding between utf16 and utf8. Such cases are considered 
00070         to be boundary-conditions for this module.
00071 
00072         There are three common cases where the input may be incomplete, 
00073         including each 'widening' case of utf8 => utf16, utf8 => utf32,
00074         and utf16 => utf32. An edge-case is utf16 => utf8, if surrogate
00075         pairs are present. Such cases will throw an exception, unless 
00076         streaming-mode is enabled ~ in the latter mode, an additional 
00077         integer is returned indicating how many elements of the input 
00078         have been consumed. In all cases, a correct slice of the output 
00079         is returned.
00080                 
00081         For details on Unicode processing see 
00082         $(LINK http://www.utf-8.com/)
00083         $(LINK http://www.hackcraft.net/xmlUnicode/)
00084         $(LINK http://www.azillionmonkeys.com/qed/unicode.html/)
00085         $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)
00086 
00087 *******************************************************************************/
00088 
00089 struct Unicode
00090 {
00091         // see http://icu.sourceforge.net/docs/papers/forms_of_unicode/#t2
00092         enum    {
00093                 Unknown, 
00094                 UTF_8, 
00095                 UTF_8N, 
00096                 UTF_16, 
00097                 UTF_16BE, 
00098                 UTF_16LE, 
00099                 UTF_32, 
00100                 UTF_32BE,
00101                 UTF_32LE, 
00102                 };
00103 
00104         /***********************************************************************
00105 
00106         ***********************************************************************/
00107 
00108         static bool isValid (int encoding)
00109         {
00110                 return cast(bool) (encoding >= Unknown && encoding <= UTF_32LE);
00111         }
00112 
00113         /***********************************************************************
00114 
00115         ***********************************************************************/
00116 
00117         private static final void error (char[] msg)
00118         {
00119                 static class UnicodeException : Exception
00120                 {
00121                         this (char[] msg)
00122                         {
00123                                 super (msg);
00124                         }
00125                 }
00126 
00127                 throw new UnicodeException (msg);
00128         }
00129 
00130         /***********************************************************************
00131 
00132                 Encode Utf8 up to a maximum of 4 bytes long (five & six byte 
00133                 variations are not supported). 
00134 
00135                 If the output is provided off the stack, it should be large 
00136                 enough to encompass the entire transcoding; failing to do 
00137                 so will cause the output to be moved onto the heap instead.
00138 
00139                 Returns a slice of the output buffer, corresponding to the 
00140                 converted characters. For optimum performance, the returned
00141                 buffer should be specified as 'output' on subsequent calls.
00142                 For example:
00143 
00144                 char[] output;
00145 
00146                 char[] result = toUtf8 (input, output);
00147 
00148                 // reset output after a realloc
00149                 if (result.length > output.length)
00150                     output = result;
00151 
00152         ***********************************************************************/
00153 
00154         static final char[] toUtf8 (wchar[] input, char[] output=null, uint* ate=null)
00155         {
00156                 if (ate)
00157                     *ate = input.length;
00158                 else
00159                    {
00160                    // potentially reallocate output
00161                    int estimate = input.length * 2 + 3;
00162                    if (output.length < estimate)
00163                        output.length = estimate;
00164                    }
00165 
00166                 char* pOut = output.ptr;
00167                 char* pMax = pOut + output.length - 3;
00168 
00169                 foreach (int eaten, wchar b; input)
00170                         { 
00171                         // about to overflow the output?
00172                         if (pOut > pMax)
00173                            {
00174                            // if streaming, just return the unused input
00175                            if (ate)
00176                               {
00177                               *ate = eaten;
00178                               break;
00179                               }
00180 
00181                            // reallocate the output buffer
00182                            int len = pOut - output.ptr;
00183                            output.length = len + len / 2;
00184                            pOut = output.ptr + len;
00185                            pMax = output.ptr + output.length - 3;
00186                            }
00187 
00188                         if (b < 0x80)
00189                             *pOut++ = b;
00190                         else
00191                            if (b < 0x0800)
00192                               {
00193                               pOut[0] = 0xc0 | ((b >> 6) & 0x3f);
00194                               pOut[1] = 0x80 | (b & 0x3f);
00195                               pOut += 2;
00196                               }
00197                            else
00198                               if (b < 0xd800 || b > 0xdfff)
00199                                  {
00200                                  pOut[0] = 0xe0 | ((b >> 12) & 0x3f);
00201                                  pOut[1] = 0x80 | ((b >> 6)  & 0x3f);
00202                                  pOut[2] = 0x80 | (b & 0x3f);
00203                                  pOut += 3;
00204                                  }
00205                               else
00206                                  // deal with surrogate-pairs
00207                                  return toUtf8 (toUtf32(input, null, ate), output);
00208                         }
00209                 
00210                 // return the produced output
00211                 return output [0..(pOut - output.ptr)];
00212         }
00213 
00214 
00215         /***********************************************************************
00216 
00217                 Decode Utf8 produced by the above toUtf8() method. 
00218         
00219                 If the output is provided off the stack, it should be large 
00220                 enough to encompass the entire transcoding; failing to do 
00221                 so will cause the output to be moved onto the heap instead.
00222 
00223                 Returns a slice of the output buffer, corresponding to the 
00224                 converted characters. For optimum performance, the returned
00225                 buffer should be specified as 'output' on subsequent calls.
00226 
00227         ***********************************************************************/
00228 
00229         static final wchar[] toUtf16 (char[] input, wchar[] output=null, uint* ate=null)
00230         {
00231                 int     produced;
00232                 char*   pIn = input;
00233                 char*   pMax = pIn + input.length;
00234                 char*   pValid;
00235 
00236                 if (ate is null)
00237                     if (input.length > output.length)
00238                         output.length = input.length;
00239 
00240                 if (input.length)
00241                 foreach (inout wchar d; output)
00242                         {
00243                         pValid = pIn;
00244                         wchar b = cast(wchar) *pIn;
00245 
00246                         if (b & 0x80)
00247                             if (b < 0xe0)
00248                                {
00249                                b &= 0x1f;
00250                                b = (b << 6) | (*++pIn & 0x3f);
00251                                }
00252                             else
00253                                if (b < 0xf0)
00254                                   {
00255                                   b &= 0x0f;
00256                                   b = (b << 6) | (pIn[1] & 0x3f);
00257                                   b = (b << 6) | (pIn[2] & 0x3f);
00258                                   pIn += 2;
00259                                   }
00260                                else
00261                                   // deal with surrogate-pairs
00262                                   return toUtf16 (toUtf32(input, null, ate), output);
00263 
00264                         d = b;          
00265                         ++produced;
00266 
00267                         // did we read past the end of the input?
00268                         if (++pIn >= pMax)
00269                             if (pIn > pMax)    
00270                                {
00271                                // yep ~ return tail or throw error?
00272                                if (ate)
00273                                   {
00274                                   pIn = pValid; 
00275                                   --produced;
00276                                   break;
00277                                   }
00278                                error ("Unicode.toUtf16 : incomplete utf8 input");  
00279                                }
00280                             else
00281                                break;
00282                         }
00283                        
00284                 // do we still have some input left?
00285                 if (ate)
00286                     *ate = pIn - input.ptr;
00287                 else
00288                    if (pIn < pMax)
00289                        // this should never happen!
00290                        error ("Unicode.toUtf16 : utf8 overflow");
00291 
00292                 // return the produced output
00293                 return output [0..produced];
00294         }
00295 
00296 
00297         /***********************************************************************
00298 
00299                 Encode Utf8 up to a maximum of 4 bytes long (five & six
00300                 byte variations are not supported). Throws an exception
00301                 where the input dchar is greater than 0x10ffff.
00302 
00303                 If the output is provided off the stack, it should be large 
00304                 enough to encompass the entire transcoding; failing to do 
00305                 so will cause the output to be moved onto the heap instead.
00306 
00307                 Returns a slice of the output buffer, corresponding to the 
00308                 converted characters. For optimum performance, the returned
00309                 buffer should be specified as 'output' on subsequent calls.
00310 
00311         ***********************************************************************/
00312 
00313         static final char[] toUtf8 (dchar[] input, char[] output=null, uint* ate=null)
00314         {
00315                 if (ate)
00316                     *ate = input.length;
00317                 else
00318                    {
00319                    // potentially reallocate output
00320                    int estimate = input.length * 2 + 4;
00321                    if (output.length < estimate)
00322                        output.length = estimate;
00323                    }
00324 
00325                 char* pOut = output.ptr;
00326                 char* pMax = pOut + output.length - 4;
00327 
00328                 foreach (int eaten, dchar b; input)
00329                         { 
00330                         // about to overflow the output?
00331                         if (pOut > pMax)
00332                            {
00333                            // if streaming, just return the unused input
00334                            if (ate)
00335                               {
00336                               *ate = eaten;
00337                               break;
00338                               }
00339 
00340                            // reallocate the output buffer
00341                            int len = pOut - output.ptr;
00342                            output.length = len + len / 2;
00343                            pOut = output.ptr + len;
00344                            pMax = output.ptr + output.length - 4;
00345                            }
00346 
00347                         if (b < 0x80)
00348                             *pOut++ = b;
00349                         else
00350                            if (b < 0x0800)
00351                               {
00352                               pOut[0] = 0xc0 | ((b >> 6) & 0x3f);
00353                               pOut[1] = 0x80 | (b & 0x3f);
00354                               pOut += 2;
00355                               }
00356                            else
00357                               if (b < 0x10000)
00358                                  {
00359                                  pOut[0] = 0xe0 | ((b >> 12) & 0x3f);
00360                                  pOut[1] = 0x80 | ((b >> 6)  & 0x3f);
00361                                  pOut[2] = 0x80 | (b & 0x3f);
00362                                  pOut += 3;
00363                                  }
00364                               else
00365                                  if (b < 0x110000)
00366                                     {
00367                                     pOut[0] = 0xf0 | ((b >> 18) & 0x3f);
00368                                     pOut[1] = 0x80 | ((b >> 12) & 0x3f);
00369                                     pOut[2] = 0x80 | ((b >> 6)  & 0x3f);
00370                                     pOut[3] = 0x80 | (b & 0x3f);
00371                                     pOut += 4;
00372                                     }
00373                                  else
00374                                     error ("Unicode.toUtf8 : invalid dchar");
00375                         }
00376                 
00377                 // return the produced output
00378                 return output [0..(pOut - output.ptr)];
00379         }
00380 
00381 
00382         /***********************************************************************
00383 
00384                 Decode Utf8 produced by the above toUtf8() method. 
00385         
00386                 If the output is provided off the stack, it should be large 
00387                 enough to encompass the entire transcoding; failing to do 
00388                 so will cause the output to be moved onto the heap instead.
00389 
00390                 Returns a slice of the output buffer, corresponding to the 
00391                 converted characters. For optimum performance, the returned
00392                 buffer should be specified as 'output' on subsequent calls.
00393 
00394         ***********************************************************************/
00395 
00396         static final dchar[] toUtf32 (char[] input, dchar[] output=null, uint* ate=null)
00397         {
00398                 int     produced;
00399                 char*   pIn = input;
00400                 char*   pMax = pIn + input.length;
00401                 char*   pValid;
00402 
00403                 if (ate is null)
00404                     if (input.length > output.length)
00405                         output.length = input.length;
00406 
00407                 if (input.length)
00408                 foreach (inout dchar d; output)
00409                         {
00410                         pValid = pIn;
00411                         dchar b = cast(dchar) *pIn;
00412 
00413                         if (b & 0x80)
00414                             if (b < 0xe0)
00415                                {
00416                                b &= 0x1f;
00417                                b = (b << 6) | (*++pIn & 0x3f);
00418                                }
00419                             else
00420                                if (b < 0xf0)
00421                                   {
00422                                   b &= 0x0f;
00423                                   b = (b << 6) | (pIn[1] & 0x3f);
00424                                   b = (b << 6) | (pIn[2] & 0x3f);
00425                                   pIn += 2;
00426                                   }
00427                                else
00428                                   {
00429                                   b &= 0x07;
00430                                   b = (b << 6) | (pIn[1] & 0x3f);
00431                                   b = (b << 6) | (pIn[2] & 0x3f);
00432                                   b = (b << 6) | (pIn[3] & 0x3f);
00433 
00434                                   if (b >= 0x110000)
00435                                       error ("Unicode.toUtf32 : invalid utf8 input");
00436                                   pIn += 3;
00437                                   }
00438 
00439                         d = b;
00440                         ++produced;
00441 
00442                         // did we read past the end of the input?
00443                         if (++pIn >= pMax)
00444                             if (pIn > pMax)   
00445                                {
00446                                // yep ~ return tail or throw error?
00447                                if (ate)
00448                                   {
00449                                   pIn = pValid; 
00450                                   --produced;
00451                                   break;
00452                                   }
00453                                error ("Unicode.toUtf32 : incomplete utf8 input");  
00454                                }
00455                             else
00456                                break;
00457                         }
00458 
00459                 // do we still have some input left?
00460                 if (ate)
00461                     *ate = pIn - input.ptr;
00462                 else
00463                    if (pIn < pMax)
00464                        // this should never happen!
00465                        error ("Unicode.toUtf32 : utf8 overflow");
00466 
00467                 // return the produced output
00468                 return output [0..produced];
00469         }
00470 
00471         /***********************************************************************
00472 
00473                 Encode Utf16 up to a maximum of 2 wchars long (a surrogate pair).
00474                 Throws an exception where the input dchar is greater than 0x10ffff.
00475 
00476                 If the output is provided off the stack, it should be large 
00477                 enough to encompass the entire transcoding; failing to do 
00478                 so will cause the output to be moved onto the heap instead.
00479 
00480                 Returns a slice of the output buffer, corresponding to the 
00481                 converted characters. For optimum performance, the returned
00482                 buffer should be specified as 'output' on subsequent calls.
00483 
00484         ***********************************************************************/
00485 
00486         static final wchar[] toUtf16 (dchar[] input, wchar[] output=null, uint* ate=null)
00487         {
00488                 if (ate)
00489                     *ate = input.length;
00490                 else
00491                    {
00492                    int estimate = input.length * 2 + 2;
00493                    if (output.length < estimate)
00494                        output.length = estimate;
00495                    }
00496 
00497                 wchar* pOut = output.ptr;
00498                 wchar* pMax = pOut + output.length - 2;
00499 
00500                 foreach (int eaten, dchar b; input)
00501                         { 
00502                         // about to overflow the output?
00503                         if (pOut > pMax)
00504                            {
00505                            // if streaming, just return the unused input
00506                            if (ate)
00507                               {
00508                               *ate = eaten;
00509                               break;
00510                               }
00511 
00512                            // reallocate the output buffer
00513                            int len = pOut - output.ptr;
00514                            output.length = len + len / 2;
00515                            pOut = output.ptr + len;
00516                            pMax = output.ptr + output.length - 2;
00517                            }
00518 
00519                         if (b < 0x10000)
00520                             *pOut++ = b;
00521                         else
00522                            if (b < 0x110000)
00523                               {
00524                               pOut[0] = 0xd800 | (((b - 0x10000) >> 10) & 0x3ff);
00525                               pOut[1] = 0xdc00 | ((b - 0x10000) & 0x3ff);
00526                               pOut += 2;
00527                               }
00528                            else
00529                               error ("Unicode.toUtf16 : invalid dchar");
00530                         }
00531                 
00532                 // return the produced output
00533                 return output [0..(pOut - output.ptr)];
00534         }
00535 
00536         /***********************************************************************
00537 
00538                 Decode Utf16 produced by the above toUtf16() method. 
00539         
00540                 If the output is provided off the stack, it should be large 
00541                 enough to encompass the entire transcoding; failing to do 
00542                 so will cause the output to be moved onto the heap instead.
00543 
00544                 Returns a slice of the output buffer, corresponding to the 
00545                 converted characters. For optimum performance, the returned
00546                 buffer should be specified as 'output' on subsequent calls.
00547 
00548         ***********************************************************************/
00549 
00550         static final dchar[] toUtf32 (wchar[] input, dchar[] output=null, uint* ate=null)
00551         {
00552                 int     produced;
00553                 wchar*  pIn = input;
00554                 wchar*  pMax = pIn + input.length;
00555                 wchar*  pValid;
00556 
00557                 if (ate is null)
00558                     if (input.length > output.length)
00559                         output.length = input.length;
00560 
00561                 if (input.length)
00562                 foreach (inout dchar d; output)
00563                         {
00564                         pValid = pIn;
00565                         dchar b = cast(dchar) *pIn;
00566 
00567                         // simple conversion ~ see http://www.unicode.org/faq/utf_bom.html#35
00568                         if (b >= 0xd800 && b <= 0xdfff)
00569                             b = ((b - 0xd7c0) << 10) + (*++pIn - 0xdc00);
00570 
00571                         if (b >= 0x110000)
00572                             error ("Unicode.toUtf32 : invalid utf16 input");
00573 
00574                         d = b;
00575                         ++produced;
00576 
00577                         if (++pIn >= pMax)
00578                             if (pIn > pMax)   
00579                                {
00580                                // yep ~ return tail or throw error?
00581                                if (ate)
00582                                   {
00583                                   pIn = pValid; 
00584                                   --produced;
00585                                   break;
00586                                   }
00587                                error ("Unicode.toUtf32 : incomplete utf16 input");  
00588                                }
00589                             else
00590                                break;
00591                         }
00592 
00593                 // do we still have some input left?
00594                 if (ate)
00595                     *ate = pIn - input.ptr;
00596                 else
00597                    if (pIn < pMax)
00598                        // this should never happen!
00599                        error ("Unicode.toUtf32 : utf16 overflow");
00600                 
00601                 // return the produced output
00602                 return output [0..produced];
00603         }
00604 
00605 
00606         /***********************************************************************
00607 
00608                 Convert from an external coding of 'type' to an internally
00609                 normalized representation of T.
00610 
00611                 T refers to the destination, whereas 'type' refers to the 
00612                 source.
00613 
00614         ***********************************************************************/
00615 
00616         struct Into(T)
00617         {
00618                 /***************************************************************
00619 
00620                 ***************************************************************/
00621 
00622                 static uint type ()
00623                 {
00624                         static if (is (T == char))
00625                                    return Type.Utf8;
00626                         static if (is (T == wchar))
00627                                    return Type.Utf16;
00628                         static if (is (T == dchar))
00629                                    return Type.Utf32;
00630                 }
00631 
00632                 /***************************************************************
00633 
00634                 ***************************************************************/
00635 
00636                 static void[] convert (void[] x, uint type, void[] dst=null, uint* ate=null)
00637                 {
00638                         // Converts 'x', held in the coding 'type', into the coding of T by way
00639                         // of toUtf8/toUtf16/toUtf32; any other 'type' produces a null result.
00640                         void[] ret;
00641                         // Already in the coding of T: return 'x' itself and report all of its bytes as consumed.
00642                         if ((is (T == char)  && type == Type.Utf8)  ||
00643                             (is (T == wchar) && type == Type.Utf16) ||
00644                             (is (T == dchar) && type == Type.Utf32))
00645                            {
00646                            if (ate)
00647                                *ate = cast(uint) x.length;
00648                            return x;
00649                            }
00650 
00651                         static if (is (T == char))
00652                                   {
00653                                   if (type == Type.Utf16)
00654                                       ret = toUtf8 (cast(wchar[]) x, cast(char[]) dst, ate);
00655                                   else if (type == Type.Utf32)
00656                                       ret = toUtf8 (cast(dchar[]) x, cast(char[]) dst, ate);
00657                                   }
00658 
00659                         static if (is (T == wchar))
00660                                   {
00661                                   if (type == Type.Utf8)
00662                                       ret = toUtf16 (cast(char[]) x, cast(wchar[]) dst, ate);
00663                                   else if (type == Type.Utf32)
00664                                       ret = toUtf16 (cast(dchar[]) x, cast(wchar[]) dst, ate);
00665                                   }
00666 
00667                         static if (is (T == dchar))
00668                                   {
00669                                   if (type == Type.Utf8)
00670                                       ret = toUtf32 (cast(char[]) x, cast(dchar[]) dst, ate);
00671                                   else if (type == Type.Utf16)
00672                                       ret = toUtf32 (cast(wchar[]) x, cast(dchar[]) dst, ate);
00673                                   }
00674                         // Scale '*ate' by the width of the source coding (presumably elements -> bytes; confirm in toUtfN).
00675                         if (ate)
00676                             *ate *= Type.widths[type];
00677                         return ret;
00678                 }
00679         }
00680 
00681 
00682         /***********************************************************************
00683 
00684                 Convert to an external coding of 'type' from an internally 
00685                 normalized representation of T.
00686 
00687                 T refers to the source, whereas 'type' is the destination.
00688 
00689         ***********************************************************************/
00690 
00691         struct From(T)
00692         {
00693                 /***************************************************************
00694 
00695                         Returns the Type code that corresponds to T: Utf8 for char, Utf16 for wchar, Utf32 for dchar.
00696 
00697                 ***************************************************************/
00698 
00699                 static uint type ()
00700                 {
00701                         static if (is (T == char))
00702                                    return Type.Utf8;
00703                         static if (is (T == wchar))
00704                                    return Type.Utf16;
00705                         static if (is (T == dchar))
00706                                    return Type.Utf32;
00707                 }
00708 
00709                 /***************************************************************
00710 
00711                         Converts 'x' from the coding of T into the coding selected
00712                         by 'type', forwarding 'dst' and 'ate' to the toUtfN routines.
00713                         Yields 'x' itself when no conversion is needed, and null for
00714                         an unrecognised 'type'. A non-null 'ate' is scaled by T.sizeof.
00715 
00716                 ***************************************************************/
00717 
00718                 static void[] convert (void[] x, uint type, void[] dst=null, uint* ate=null)
00719                 {
00720                         void[] ret;
00721 
00722                         // Same coding on both sides: return the input untouched and report all of
00723                         // its bytes as consumed. ('type' here is the parameter, not type() above.)
00724                         if ((is (T == char)  && type == Type.Utf8)  ||
00725                             (is (T == wchar) && type == Type.Utf16) ||
00726                             (is (T == dchar) && type == Type.Utf32))
00727                            {
00728                            if (ate)
00729                                *ate = cast(uint) x.length;
00730                            return x;
00731                            }
00732 
00733                         static if (is (T == char))
00734                                   {
00735                                   if (type == Type.Utf16)
00736                                       ret = toUtf16 (cast(char[]) x, cast(wchar[]) dst, ate);
00737                                   else if (type == Type.Utf32)
00738                                       ret = toUtf32 (cast(char[]) x, cast(dchar[]) dst, ate);
00739                                   }
00740 
00741                         static if (is (T == wchar))
00742                                   {
00743                                   if (type == Type.Utf8)
00744                                       ret = toUtf8 (cast(wchar[]) x, cast(char[]) dst, ate);
00745                                   else if (type == Type.Utf32)
00746                                       ret = toUtf32 (cast(wchar[]) x, cast(dchar[]) dst, ate);
00747                                   }
00748 
00749                         static if (is (T == dchar))
00750                                   {
00751                                   if (type == Type.Utf8)
00752                                       ret = toUtf8 (cast(dchar[]) x, cast(char[]) dst, ate);
00753                                   else if (type == Type.Utf16)
00754                                       ret = toUtf16 (cast(dchar[]) x, cast(wchar[]) dst, ate);
00755                                   }
00756 
00757                         // T is the source coding here, so scale '*ate' by the size of one T
00758                         // (1, 2 or 4 bytes); presumably this turns an element count into bytes.
00759                         if (ate)
00760                             *ate *= T.sizeof;
00761                         return ret;
00762                 }
00763         }
00764 }
00765 
00766 
00767 
00768 
00769 
00770 
00771 /+
00772 
00773 version=QTEMPLATE;
00774 version (TEMPLATE)
00775 {
00776 
00777 /*******************************************************************************
00778 
00779         Convert from an external coding of 'type' to an internally normalized
00780         representation of T.
00781 
00782         T refers to the destination, whereas 'type' refers to the source.
00783 
00784 *******************************************************************************/
00785 private import mango.convert.Type;
00786 
00787 struct UtfCodec1(T)       // converter between T and the other Utf codings; not compiled, as it sits inside a nesting comment
00788 {
00789         private void[] tmp;     // workspace handed to the converters as their output buffer
00790         // Replaces the workspace with a new array of 'size' bytes.
00791         void dthis (int size = 0)
00792         {
00793                 tmp = new ubyte[size];
00794         }
00795         // Adopts 't' as the workspace when it is longer than the current one; always returns 't'.
00796         private void[] update (void[] t)
00797         {
00798                 if (t.length > tmp.length)
00799                     tmp = t;
00800                 return t;
00801         }
00802         // Returns the Type code that matches T.
00803         uint type ()
00804         {
00805                 static if (is (T == char))
00806                            return Type.Utf8;
00807                 static if (is (T == wchar))
00808                            return Type.Utf16;
00809                 static if (is (T == dchar))
00810                            return Type.Utf32;
00811         }
00812         // Converts 'x' from the coding 'type' into that of T; 'x' is returned as is when already in T.
00813         void[] from (void[] x, uint type)
00814         {
00815                 switch (type)
00816                        {
00817                        static if (is (T == char))
00818                                  {
00819                                  case Type.Utf8:
00820                                       return cast(char[]) x;
00821                                  case Type.Utf16:
00822                                       return update (Unicode.toUtf8 (cast(wchar[]) x, cast(char[]) tmp));
00823                                  case Type.Utf32:
00824                                       return update (Unicode.toUtf8 (cast(dchar[]) x, cast(char[]) tmp));
00825                                  }
00826 
00827                        static if (is (T == wchar))
00828                                  {
00829                                  case Type.Utf8:
00830                                       return update (Unicode.toUtf16 (cast(char[]) x, cast(wchar[]) tmp));
00831                                  case Type.Utf16:
00832                                       return cast(wchar[]) x;
00833                                  case Type.Utf32:
00834                                       return update (Unicode.toUtf16 (cast(dchar[]) x, cast(wchar[]) tmp));
00835                                  }
00836 
00837                        static if (is (T == dchar))
00838                                  {
00839                                  case Type.Utf8:
00840                                       return update (Unicode.toUtf32 (cast(char[]) x, cast(dchar[]) tmp));
00841                                  case Type.Utf16:
00842                                       return update (Unicode.toUtf32 (cast(wchar[]) x, cast(dchar[]) tmp));
00843                                  case Type.Utf32:
00844                                       return cast(dchar[]) x;
00845                                  }
00846                                  default:
00847                                       break;    // NOTE(review): leaves the switch, after which the function ends without a return
00848                         }
00849          }
00850 
00851         // Converts 'src' from the coding of T into 'type'; 'ate' is handed on to the converter.
00852         void[] into (void[] src, uint type, void[] dst=null, uint* ate=null)
00853         {
00854                 if (dst is null)
00855                     dst = tmp;          // no buffer supplied: use the workspace
00856 
00857                 switch (type)
00858                        {
00859                        static if (is (T == char))
00860                                  {
00861                                  case Type.Utf8:
00862                                       return src;
00863                                  case Type.Utf16:
00864                                       return update (Unicode.toUtf16 (cast(char[]) src, cast(wchar[]) dst, ate));
00865                                  case Type.Utf32:
00866                                       return update (Unicode.toUtf32 (cast(char[]) src, cast(dchar[]) dst, ate));
00867                        }
00868 
00869                        static if (is (T == wchar))
00870                                  {
00871                                  case Type.Utf8:
00872                                       return update (Unicode.toUtf8 (cast(wchar[]) src, cast(char[]) dst, ate));
00873                                  case Type.Utf16:
00874                                        return src;
00875                                  case Type.Utf32:
00876                                       return update (Unicode.toUtf32 (cast(wchar[]) src, cast(dchar[]) dst, ate));
00877                                  }
00878 
00879                        static if (is (T == dchar))
00880                                  {
00881                                  case Type.Utf8:
00882                                       return update (Unicode.toUtf8 (cast(dchar[]) src, cast(char[]) dst, ate));
00883                                  case Type.Utf16:
00884                                       return update (Unicode.toUtf16 (cast(dchar[]) src, cast(wchar[]) dst, ate));
00885                                  case Type.Utf32:
00886                                       return src;
00887                                  }
00888                                  default:
00889                                       break;    // NOTE(review): as in from(), no value is returned on this path
00890                        }
00891         }
00892 }
00893 
00894 }
00895 
00896 version (FUNCTION)
00897 {
00898 
00899 private import mango.convert.Type;
00900 
00901         /***********************************************************************
00902 
00903         ***********************************************************************/
00904 
00905         static final void[] convert (void[] src, void[] dst, uint srcType, uint dstType, uint*ate)
00906         {       // Converts 'src' from coding srcType to dstType through the matching Unicode.toUtfN call; same-coding input is returned as is.
00907                 enum : ubyte {char2char, char2wchar, char2dchar, 
00908                               wchar2char, wchar2wchar, wchar2dchar, 
00909                               dchar2char, dchar2wchar, dchar2dchar};
00910                 // router[src][dst] names the conversion to run; the fourth row and column are zero padding.
00911                 const int[][4] router = [
00912                                         [char2char,  char2wchar,  char2dchar, 0], 
00913                                         [wchar2char, wchar2wchar, wchar2dchar, 0], 
00914                                         [dchar2char, dchar2wchar, dchar2dchar, 0], 
00915                                         [0, 0, 0, 0], 
00916                                         ];
00917                 // Rebase both codes relative to Type.Utf8; the asserts below require the results to be 0, 1 or 2.
00918 
00919                 srcType -= Type.Utf8;
00920                 dstType -= Type.Utf8;
00921                 assert (srcType < 3);
00922                 assert (dstType < 3);
00923                 // NOTE(review): the zero padding in 'router' equals char2char (the first enum member), so only the asserts keep index 3 from routing there.
00924                 switch (router[srcType][dstType])
00925                        {
00926                        case char2char: 
00927                             return src;
00928 
00929                        case char2wchar: 
00930                             return Unicode.toUtf16 (cast(char[]) src, cast(wchar[]) dst, ate);
00931 
00932                        case char2dchar: 
00933                             return Unicode.toUtf32 (cast(char[]) src, cast(dchar[]) dst, ate);
00934 
00935 
00936                        case wchar2char: 
00937                             return Unicode.toUtf8 (cast(wchar[]) src, cast(char[]) dst, ate);
00938 
00939                        case wchar2wchar:
00940                             return src; 
00941 
00942                        case wchar2dchar: 
00943                             return Unicode.toUtf32 (cast(wchar[]) src, cast(dchar[]) dst, ate);
00944 
00945 
00946                        case dchar2char: 
00947                             return Unicode.toUtf8 (cast(dchar[]) src, cast(char[]) dst, ate);
00948 
00949                        case dchar2wchar: 
00950                             return Unicode.toUtf16 (cast(dchar[]) src, cast(wchar[]) dst, ate);
00951 
00952                        case dchar2dchar: 
00953                             return src;
00954 
00955                        default:
00956                             return null;
00957                        }
00958         }
00959 }
00960 +/

Generated on Sat Dec 24 17:28:34 2005 for Mango by  doxygen 1.4.0