Main Page | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Class Members | File Members | Related Pages

Utf.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file Utf.d
00004         
00005         Copyright (c) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026         4. Derivative works are permitted, but they must carry this notice
00027            in full and credit the original source.
00028 
00029 
00030                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00031 
00032 
00033         @version        Initial version, October 2004      
00034         @author         Kris
00035 
00036 
00037 *******************************************************************************/
00038 
00039 module mango.io.Utf;
00040 
00041 private import mango.io.Exception;
00042 
00043 /*******************************************************************************
00044 
00045         Fast UTF-8 to Unicode transcoder. These are really sensitive to
00046         small changes on 32bit x86 devices, because the register set of
00047         those devices is so small. Beware of subtle changes which might
00048         extend the execution-period by as much as 200% ...
00049 
00050         These routines were tuned on an Intel P3; other devices may work
00051         more efficiently with a slightly different approach, though this
00052         is likely to be reasonably optimal on AMD x86 CPUs also. These
00053         algorithms could benefit significantly from those extra AMD64 
00054         registers.
00055 
00056         Note that foreach can produce noticeably more efficient code than 
00057         equivalent for() loops, with either indices or pointers. The 0.98
00058         compiler version exhibited some rather erratic behavior over the
00059         course of testing: in particular, elapsed time of method execution
00060         is noticeably dependent upon its physical location within the file
00061         (or, more specifically, the enclosing class). Yes, it sure sounds 
00062         crazy that if you switch the order of encode() with decode() that 
00063         they will consistently execute slower than as currently arranged.        
00064 
00065         Finally, please note that these are between 5 and 30 times faster 
00066         than equivalent functions in the std.utf Phobos module (dependent
00067         upon the mix of char values). Those functions (strangely) often
00068         allocate memory on a character basis, so will become significantly 
00069         slower where there's heap-contention by multiple threads.
00070         
00071 *******************************************************************************/
00072 
class Utf
{
        /***********************************************************************

                Report a transcoding failure by throwing an IOException
                that carries the provided message.

        ***********************************************************************/

        private static final void error (char[] msg)
        {
                throw new IOException (msg);
        }

        /***********************************************************************

                Encode Utf8 up to a maximum of 3 bytes long (four, five & 
                six byte variations are not supported). Throws an exception
                where the input wchar is 0xd800 or greater.

                If the output is provided off the stack, it should be large 
                enough to encompass the entire utf8 encoding. This option is 
                provided purely as an optimization for those cases where all 
                boundary conditions are explicitly checked for by the caller.
                Where the provided (or default) output turns out to be too 
                small, it is resized as required.

                Returns the slice of output holding the encoded content.

        ***********************************************************************/

        static final char[] toUtf8 (wchar[] input, char[] output = null)
        {
                if (! output.length)
                      output = new char [input.length * 2 + 3];

                char* pOut = output.ptr;
                char* pEnd = pOut + output.length;

                foreach (wchar b; input)
                        { 
                        // always retain space for the longest (3 byte) form
                        if (pEnd - pOut < 3)
                           {
                           // resizing may relocate the array, so both 
                           // pointers are rebuilt from the new array
                           int used = pOut - output.ptr;
                           output.length = output.length + output.length / 2 + 3;
                           pOut = output.ptr + used;
                           pEnd = output.ptr + output.length;
                           }

                        if (b < 0x80)
                            *pOut++ = b;
                        else
                           if (b < 0x0800)
                              {
                              pOut[0] = 0xc0 | ((b >> 6) & 0x1f);
                              pOut[1] = 0x80 | (b & 0x3f);
                              pOut += 2;
                              }
                           else
                              if (b < 0xd800)
                                 {
                                 pOut[0] = 0xe0 | ((b >> 12) & 0x0f);
                                 pOut[1] = 0x80 | ((b >> 6)  & 0x3f);
                                 pOut[2] = 0x80 | (b & 0x3f);
                                 pOut += 3;
                                 }
                              else
                                 error ("invalid wchar");
                        }
                
                return output [0..(pOut - output.ptr)];
        }


        /***********************************************************************

                Decode Utf8 produced by the above toUtf8() method. 
        
                If the output is provided off the stack, it should be large 
                enough to encompass the entire utf8 encoding. This option is 
                provided purely as an optimization for those cases where all 
                boundary conditions are explicitly checked for by the caller.

                Throws an exception ("invalid utf8") where a multi-byte 
                sequence is cut short by the end of the input, or where a 
                lead byte of 0xf0 or above is found (such a sequence does 
                not fit within a single wchar). Throws "utf8 overflow" 
                where the provided output is too small for the input.

                Returns the slice of output holding the decoded content.

        ***********************************************************************/

        static final wchar[] toUtf16 (char[] input, wchar[] output = null)
        {
                int     produced;
                char*   pIn = input.ptr;
                char*   pMax = pIn + input.length;

                // nothing to decode: avoid touching memory beyond the input
                if (! input.length)
                      return output [0..0];

                if (! output.length)
                      output = new wchar[input.length];

                foreach (inout wchar d; output)
                        {
                        wchar b = cast(wchar) *pIn;
                        if (b & 0x80)
                            if (b < 0xe0)
                               {
                               // two byte form: one trailing byte required
                               if (pMax - pIn < 2)
                                   error ("invalid utf8");

                               b &= 0x1f;
                               b = (b << 6) | (pIn[1] & 0x3f);
                               pIn += 1;
                               }
                            else
                               if (b < 0xf0)
                                  {
                                  // three byte form: two trailing bytes
                                  if (pMax - pIn < 3)
                                      error ("invalid utf8");

                                  b &= 0x0f;
                                  b = (b << 6) | (pIn[1] & 0x3f);
                                  b = (b << 6) | (pIn[2] & 0x3f);
                                  pIn += 2;
                                  }
                               else
                                  // four byte (and longer) forms cannot be
                                  // held within a single wchar
                                  error ("invalid utf8");

                        d = b;
                        ++produced;

                        // the checks above ensure pIn never passes pMax
                        if (++pIn >= pMax)
                            break;
                        }
                       
                if (pIn < pMax)
                    error ("utf8 overflow");
                return output [0..produced];
        }


        /***********************************************************************

                Encode Utf8 up to a maximum of 4 bytes long (five & six
                byte variations are not supported). Throws an exception
                where the input dchar is greater than 0x10ffff.

                If the output is provided off the stack, it should be large 
                enough to encompass the entire utf8 encoding. This option is 
                provided purely as an optimization for those cases where all 
                boundary conditions are explicitly checked for by the caller.
                Where the provided (or default) output turns out to be too 
                small, it is resized as required.

                Returns the slice of output holding the encoded content.

        ***********************************************************************/

        static final char[] toUtf8 (dchar[] input, char[] output = null)
        {
                if (! output.length)
                      output = new char [input.length * 2 + 4];

                char* pOut = output.ptr;
                char* pEnd = pOut + output.length;

                foreach (dchar b; input)
                        { 
                        // always retain space for the longest (4 byte) form
                        if (pEnd - pOut < 4)
                           {
                           // resizing may relocate the array, so both 
                           // pointers are rebuilt from the new array
                           int used = pOut - output.ptr;
                           output.length = output.length + output.length / 2 + 4;
                           pOut = output.ptr + used;
                           pEnd = output.ptr + output.length;
                           }

                        if (b < 0x80)
                            *pOut++ = b;
                        else
                           if (b < 0x0800)
                              {
                              pOut[0] = 0xc0 | ((b >> 6) & 0x1f);
                              pOut[1] = 0x80 | (b & 0x3f);
                              pOut += 2;
                              }
                           else
                              // three bytes carry 16 bits: 0x0800 - 0xffff
                              if (b < 0x10000)
                                 {
                                 pOut[0] = 0xe0 | ((b >> 12) & 0x0f);
                                 pOut[1] = 0x80 | ((b >> 6)  & 0x3f);
                                 pOut[2] = 0x80 | (b & 0x3f);
                                 pOut += 3;
                                 }
                              else
                                 if (b < 0x110000)
                                    {
                                    pOut[0] = 0xf0 | ((b >> 18) & 0x07);
                                    pOut[1] = 0x80 | ((b >> 12) & 0x3f);
                                    pOut[2] = 0x80 | ((b >> 6)  & 0x3f);
                                    pOut[3] = 0x80 | (b & 0x3f);
                                    pOut += 4;
                                    }
                                 else
                                    error ("invalid dchar");
                        }
                
                return output [0..(pOut - output.ptr)];
        }


        /***********************************************************************

                Decode Utf8 produced by the above toUtf8() method. 
        
                If the output is provided off the stack, it should be large 
                enough to encompass the entire utf8 encoding. This option is 
                provided purely as an optimization for those cases where all 
                boundary conditions are explicitly checked for by the caller.

                Throws an exception ("invalid utf8") where a multi-byte 
                sequence is cut short by the end of the input, and throws
                "utf8 overflow" where the provided output is too small for 
                the input. Any lead byte of 0xf0 or above is decoded as a
                four byte sequence.

                Returns the slice of output holding the decoded content.

        ***********************************************************************/

        static final dchar[] toUtf32 (char[] input, dchar[] output = null)
        {
                int     produced;
                char*   pIn = input.ptr;
                char*   pMax = pIn + input.length;

                // nothing to decode: avoid touching memory beyond the input
                if (! input.length)
                      return output [0..0];
                
                if (! output.length)
                      output = new dchar[input.length];

                foreach (inout dchar d; output)
                        {
                        dchar b = cast(dchar) *pIn;
                        if (b & 0x80)
                            if (b < 0xe0)
                               {
                               // two byte form: one trailing byte required
                               if (pMax - pIn < 2)
                                   error ("invalid utf8");

                               b &= 0x1f;
                               b = (b << 6) | (pIn[1] & 0x3f);
                               pIn += 1;
                               }
                            else
                               if (b < 0xf0)
                                  {
                                  // three byte form: two trailing bytes
                                  if (pMax - pIn < 3)
                                      error ("invalid utf8");

                                  b &= 0x0f;
                                  b = (b << 6) | (pIn[1] & 0x3f);
                                  b = (b << 6) | (pIn[2] & 0x3f);
                                  pIn += 2;
                                  }
                               else
                                  {
                                  // four byte form: three trailing bytes
                                  if (pMax - pIn < 4)
                                      error ("invalid utf8");

                                  b &= 0x07;
                                  b = (b << 6) | (pIn[1] & 0x3f);
                                  b = (b << 6) | (pIn[2] & 0x3f);
                                  b = (b << 6) | (pIn[3] & 0x3f);
                                  pIn += 3;
                                  }

                        d = b;
                        ++produced;

                        // the checks above ensure pIn never passes pMax
                        if (++pIn >= pMax)
                            break;
                        }

                if (pIn < pMax)
                    error ("utf8 overflow");
                return output [0..produced];
        }
}

Generated on Fri Nov 11 18:44:24 2005 for Mango by  doxygen 1.4.0