Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members | Related Pages

Utf8.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file Utf8.d
00004         
00005         Copyright (C) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026 
00027                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00028 
00029 
00030         @version        Initial version, October 2004      
00031         @author         Kris
00032 
00033 
00034 *******************************************************************************/
00035 
00036 module mango.io.Utf8;
00037 
00038 private import mango.io.Exception;
00039 
00040 /*******************************************************************************
00041 
00042         Fast UTF-8 to Unicode transcoder. These are really sensitive to
00043         small changes on 32bit x86 devices, because the register set of
00044         those devices is so small. Beware of subtle changes which might
00045         extend the execution-period by as much as 200% ...
00046 
00047         These routines were tuned on an Intel P3; other devices may work
00048         more efficiently with a slightly different approach, though this
00049         is likely to be reasonably optimal on AMD x86 CPUs also. These
00050         algorithms could benefit significantly from those extra AMD64 
00051         registers.
00052 
00053         Note that foreach can produce noticeably more efficient code than 
00054         equivalent for() loops, with either indices or pointers. The 0.98
00055         compiler version exhibited some rather erratic behavior over the
00056         course of testing: in particular, elapsed time of method execution
00057         is noticeably dependent upon its physical location within the file
00058         (or, more specifically, the enclosing class). Yes, it sure sounds 
00059         crazy that if you switch the order of encode() with decode() that 
00060         they will consistently execute slower than as currently arranged.        
00061 
00062         Finally, please note that these are between 5 and 30 times faster 
00063         than equivalent functions in the std.utf Phobos module (dependent
00064         upon the mix of char values). Those functions (strangely) often
00065         allocate memory on a character basis, so will become significantly 
00066         slower where there's heap-contention by multiple threads.
00067         
00068 *******************************************************************************/
00069 
00070 class Utf8
00071 {
00072         private static IOException InvalidEncoding;
00073         private static IOException InvalidDecoding;
00074 
00075         /***********************************************************************
00076 
00077                 Create the exception instances once; the transcoders
00078                 throw these preallocated objects on invalid input.
00079 
00080         ***********************************************************************/
00081 
00082         private static this ()
00083         {
00084                 InvalidEncoding = new IOException ("invalid input while encoding");
00085                 InvalidDecoding = new IOException ("invalid input while decoding");
00086         }
00087 
00088         /***********************************************************************
00089 
00090                 Encode UTF-8 up to a maximum of 3 bytes long (four, five & six
00091                 byte variations are not supported). Throws an exception where 
00092                 the input wchar is greater than 0xd7ff.
00093 
00094                 Encoding stops, without error, at the first wchar that does
00095                 not fit within the remaining output space.
00096 
00097                 @param input     wchars to encode
00098                 @param output    buffer receiving the encoded bytes
00099                 @param consumed  set to the number of wchars encoded
00100 
00101                 @return the leading slice of output that was populated
00102 
00103         ***********************************************************************/
00104 
00105         static final char[] encode (wchar[] input, char[] output, inout uint consumed)
00106         {
00107                 char*   pOut;
00108                 uint    eaten;
00109                 int     space;
00110         
00111                 pOut = output;
00112                 space = output.length;
00113                 foreach (wchar b; input)
00114                         {                        
00115                         if (b < 0x80)
00116                            {
00117                            if (space--)
00118                                *pOut++ = b;
00119                            else
00120                               break;
00121                            }
00122                         else
00123                            // two bytes hold 11 bits only: 0x80 thru 0x7ff
00124                            if (b < 0x0800)
00125                               {
00126                               if ((space -= 2) < 0)
00127                                    break;
00128 
00129                               pOut[0] = 0xc0 | ((b >> 6) & 0x1f);
00130                               pOut[1] = 0x80 | (b & 0x3f);
00131                               pOut += 2;
00132                               }
00133                            else
00134                               if (b < 0xd800)
00135                                  {
00136                                  if ((space -= 3) < 0)
00137                                       break;
00138 
00139                                  pOut[0] = 0xe0 | ((b >> 12) & 0x0f);
00140                                  pOut[1] = 0x80 | ((b >> 6)  & 0x3f);
00141                                  pOut[2] = 0x80 | (b & 0x3f);
00142                                  pOut += 3;
00143                                  }
00144                               else
00145                                  throw InvalidEncoding;
00146                         ++eaten;
00147                         }
00148                 
00149                 consumed = eaten;
00150 
00151                 // measure from the array base pointer: indexing output[0]
00152                 // would trip the bounds check when output is empty
00153                 return output [0..(pOut - cast(char*) output)];
00154         }
00155 
00156 
00157         /***********************************************************************
00158 
00159                 Decode UTF-8 produced by the above encode() method. This
00160                 executes notably faster than the validating version.
00161 
00162                 Decoding stops when output is full, or when the input ends
00163                 (a trailing partial sequence is left unconsumed). Throws 
00164                 an exception upon a lead byte of 0xf0 or greater.
00165 
00166                 @param input     UTF-8 bytes to decode
00167                 @param output    buffer receiving the decoded wchars
00168                 @param consumed  set to the number of input bytes decoded
00169 
00170                 @return the leading slice of output that was populated
00171         
00172         ***********************************************************************/
00173 
00174         static final wchar[] decode (char[] input, wchar[] output, inout uint consumed)
00175         {
00176                 uint    produced;
00177                 uint    available;
00178                 char*   pIn = input;
00179 
00180                 available = input.length;
00181                 foreach (inout wchar d; output)
00182                         {
00183                         if (! available--)
00184                               break;
00185 
00186                         wchar b = cast(wchar) *pIn;
00187                         if (b & 0x80)
00188                            {
00189                            if (b < 0xe0)
00190                               {
00191                               if (! available--)
00192                                     break;
00193 
00194                               b &= 0x1f;
00195                               b = (b << 6) | (*++pIn & 0x3f);
00196                               }
00197                            else
00198                               if (b < 0xf0)
00199                                  {
00200                                  if (available < 2)
00201                                      break;
00202 
00203                                  b &= 0x0f;
00204                                  b = (b << 6) | (pIn[1] & 0x3f);
00205                                  b = (b << 6) | (pIn[2] & 0x3f);
00206                                  available -= 2;
00207                                  pIn += 2;
00208                                  }
00209                               else
00210                                  throw InvalidDecoding;
00211                            }    
00212                                        
00213                         d = b;
00214                         ++pIn;
00215                         ++produced;
00216                         }
00217 
00218                 // measure from the array base pointer: indexing input[0]
00219                 // would trip the bounds check when input is empty
00220                 consumed = pIn - cast(char*) input;
00221                 return output [0..produced];
00222         }
00223 
00224         /***********************************************************************
00225 
00226                 Convenience form of encode(): allocates an output buffer large
00227                 enough for the worst case of three bytes per wchar, and returns
00228                 the encoded slice of it. Throws an exception where an input
00229                 wchar is greater than 0xd7ff.
00230 
00231         ***********************************************************************/
00232 
00233         static final char[] encode (wchar[] input)
00234         {
00235                 uint   x;
00236                 char[] tmp = new char [input.length * 3 + 1];
00237 
00238                 return encode (input, tmp, x);
00239         }
00240 
00241         /***********************************************************************
00242 
00243                 Convenience form of decode(): allocates an output buffer with
00244                 room for one wchar per input byte, and returns the decoded
00245                 slice of it.
00246         
00247         ***********************************************************************/
00248 
00249         static final wchar[] decode (char[] input)
00250         {
00251                 uint    x;
00252                 wchar[] tmp = new wchar [input.length+1];
00253 
00254                 return decode (input, tmp, x);
00255         }
00256 }

Generated on Sun Nov 7 19:06:54 2004 for Mango by doxygen 1.3.6