Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members | Related Pages

Utf8.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file Utf8.d
00004         
00005         Copyright (c) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026         4. Derivative works are permitted, but they must carry this notice
00027            in full and credit the original source.
00028 
00029 
00030                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00031 
00032 
00033         @version        Initial version, October 2004      
00034         @author         Kris
00035 
00036 
00037 *******************************************************************************/
00038 
00039 module mango.io.Utf8;
00040 
00041 private import mango.io.Exception;
00042 
00043 /*******************************************************************************
00044 
00045         Fast UTF-8 to Unicode transcoder. These are really sensitive to
00046         small changes on 32bit x86 devices, because the register set of
00047         those devices is so small. Beware of subtle changes which might
00048         extend the execution-period by as much as 200% ...
00049 
00050         These routines were tuned on an Intel P3; other devices may work
00051         more efficiently with a slightly different approach, though this
00052         is likely to be reasonably optimal on AMD x86 CPUs also. These
00053         algorithms could benefit significantly from those extra AMD64 
00054         registers.
00055 
00056         Note that foreach can produce noticeably more efficient code than 
00057         equivalent for() loops, with either indices or pointers. The 0.98
00058         compiler version exhibited some rather erratic behavior over the
00059         course of testing: in particular, elapsed time of method execution
00060         is noticeably dependent upon its physical location within the file
00061         (or, more specifically, the enclosing class). Yes, it sure sounds 
00062         crazy that if you switch the order of encode() with decode() that 
00063         they will consistently execute slower than as currently arranged.        
00064 
00065         Finally, please note that these are between 5 and 30 times faster 
00066         than equivalent functions in the std.utf Phobos module (dependent
00067         upon the mix of char values). Those functions (strangely) often
00068         allocate memory on a character basis, so will become significantly 
00069         slower where there's heap-contention by multiple threads.
00070         
00071 *******************************************************************************/
00072 
00073 class Utf8
00074 {
00075         // Both exceptions are created once by the static constructor
00076         // below, and that same instance is reused for every throw.
00077         private static IOException InvalidEncoding;
00078         private static IOException InvalidDecoding;
00079 
00080         /***********************************************************************
00081 
00082                 Create the shared exception instances.
00083 
00084         ***********************************************************************/
00085 
00086         private static this ()
00087         {
00088                 InvalidEncoding = new IOException ("invalid input while encoding");
00089                 InvalidDecoding = new IOException ("invalid input while decoding");
00090         }
00091 
00092         /***********************************************************************
00093 
00094                 Encode wchar values as UTF-8, up to a maximum of 3 bytes per
00095                 character (four, five & six byte variations are not supported).
00096                 Values below 0x80 become one byte, values below 0x800 become
00097                 two bytes, and values below 0xd800 become three bytes.
00098 
00099                 Encoding stops quietly at the first character which does not
00100                 fit in the remaining space of 'output'. On return, 'consumed'
00101                 holds the number of input elements that were fully encoded;
00102                 it is left untouched when an exception is thrown.
00103 
00104                 Returns the slice of 'output' that was populated.
00105 
00106                 Throws InvalidEncoding (an IOException) where the input wchar
00107                 is greater than 0xd7ff.
00108 
00109         ***********************************************************************/
00110 
00111         static final char[] encode (wchar[] input, char[] output, inout uint consumed)
00112         {
00113                 char*   pStart;
00114                 char*   pOut;
00115                 uint    eaten;
00116                 int     space;
00117 
00118                 // remember the buffer start through the array-to-pointer
00119                 // conversion; &output[0] would index element zero, which
00120                 // is out of bounds when 'output' is empty
00121                 pStart = output;
00122                 pOut = pStart;
00123                 space = output.length;
00124                 foreach (wchar b; input)
00125                         {
00126                         if (b < 0x80)
00127                            {
00128                            if (space--)
00129                                *pOut++ = b;
00130                            else
00131                               break;
00132                            }
00133                         else
00134                            // a two-byte sequence carries 11 bits of payload,
00135                            // so it covers 0x80 through 0x7ff only
00136                            if (b < 0x800)
00137                               {
00138                               if ((space -= 2) < 0)
00139                                    break;
00140 
00141                               pOut[0] = 0xc0 | ((b >> 6) & 0x1f);
00142                               pOut[1] = 0x80 | (b & 0x3f);
00143                               pOut += 2;
00144                               }
00145                            else
00146                               if (b < 0xd800)
00147                                  {
00148                                  if ((space -= 3) < 0)
00149                                       break;
00150 
00151                                  pOut[0] = 0xe0 | ((b >> 12) & 0x0f);
00152                                  pOut[1] = 0x80 | ((b >> 6)  & 0x3f);
00153                                  pOut[2] = 0x80 | (b & 0x3f);
00154                                  pOut += 3;
00155                                  }
00156                               else
00157                                  throw InvalidEncoding;
00158                         ++eaten;
00159                         }
00160 
00161                 consumed = eaten;
00162                 return output [0..(pOut - pStart)];
00163         }
00164 
00165         /***********************************************************************
00166 
00167                 Decode UTF-8 produced by the above encode() method, that is
00168                 sequences of one, two or three bytes. This does not validate:
00169                 continuation bytes are not checked, and any lead byte from
00170                 0x80 through 0xdf is handled as the start of a two-byte
00171                 sequence.
00172 
00173                 Decoding stops when 'output' is full or 'input' is exhausted.
00174                 A sequence cut short by the end of 'input' is not decoded. On
00175                 return, 'consumed' holds the number of input bytes that were
00176                 decoded; it is left untouched when an exception is thrown.
00177 
00178                 Returns the slice of 'output' that was populated.
00179 
00180                 Throws InvalidDecoding (an IOException) where a lead byte is
00181                 0xf0 or greater.
00182 
00183         ***********************************************************************/
00184 
00185         static final wchar[] decode (char[] input, wchar[] output, inout uint consumed)
00186         {
00187                 uint    produced;
00188                 uint    available;
00189                 char*   pStart = input;
00190                 char*   pIn = pStart;
00191 
00192                 available = input.length;
00193                 foreach (inout wchar d; output)
00194                         {
00195                         if (! available--)
00196                               break;
00197 
00198                         wchar b = cast(wchar) *pIn;
00199                         if (b & 0x80)
00200                            {
00201                            if (b < 0xe0)
00202                               {
00203                               // two-byte sequence: one more byte is needed
00204                               if (! available--)
00205                                     break;
00206 
00207                               b &= 0x1f;
00208                               b = (b << 6) | (*++pIn & 0x3f);
00209                               }
00210                            else
00211                               if (b < 0xf0)
00212                                  {
00213                                  // three-byte sequence: two more are needed
00214                                  if (available < 2)
00215                                      break;
00216 
00217                                  b &= 0x0f;
00218                                  b = (b << 6) | (pIn[1] & 0x3f);
00219                                  b = (b << 6) | (pIn[2] & 0x3f);
00220                                  available -= 2;
00221                                  pIn += 2;
00222                                  }
00223                               else
00224                                  throw InvalidDecoding;
00225                            }
00226 
00227                         d = b;
00228                         ++pIn;
00229                         ++produced;
00230                         }
00231 
00232                 // measured from the saved start pointer rather than from
00233                 // &input[0], which is out of bounds when 'input' is empty
00234                 consumed = pIn - pStart;
00235                 return output [0..produced];
00236         }
00237 
00238         /***********************************************************************
00239 
00240                 Convenience form of encode() which allocates its own output
00241                 buffer, sized at three bytes per input element plus one, and
00242                 returns the populated slice of it.
00243 
00244                 Throws InvalidEncoding (an IOException) where the input wchar
00245                 is greater than 0xd7ff.
00246 
00247         ***********************************************************************/
00248 
00249         static final char[] encode (wchar[] input)
00250         {
00251                 uint   x;
00252                 char[] tmp = new char [input.length * 3 + 1];
00253 
00254                 return encode (input, tmp, x);
00255         }
00256 
00257         /***********************************************************************
00258 
00259                 Convenience form of decode() which allocates its own output
00260                 buffer, sized at one wchar per input byte plus one, and
00261                 returns the populated slice of it.
00262 
00263                 Throws InvalidDecoding (an IOException) where a lead byte is
00264                 0xf0 or greater.
00265 
00266         ***********************************************************************/
00267 
00268         static final wchar[] decode (char[] input)
00269         {
00270                 uint    x;
00271                 wchar[] tmp = new wchar [input.length+1];
00272 
00273                 return decode (input, tmp, x);
00274         }
00275 }

Generated on Tue Jan 25 21:18:25 2005 for Mango by doxygen 1.3.6