Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members | Related Pages

Utf8.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file Utf8.d
00004         
00005         Copyright (C) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026 
00027                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00028 
00029 
00030         @version        Initial version, August 2004      
00031         @author         Kris
00032 
00033 
00034 *******************************************************************************/
00035 
00036 module mango.xcode.Utf8;
00037 
00038 private import mango.xcode.Transcoder;
00039 
00040 /*******************************************************************************
00041 
00042         Fast UTF-8 to Unicode transcoder. These are really sensitive to
00043         small changes on 32bit x86 devices, because the register set of
00044         those devices is so small. Beware of subtle changes which might
00045         extend the execution-period by as much as 200% ...
00046 
00047         These routines were tuned on an Intel P3; other devices may work
00048         more efficiently with a slightly different approach, though this
00049         is likely to be reasonably optimal on AMD x86 CPUs also. These
00050         algorithms could benefit significantly from those extra AMD64 
00051         registers.
00052 
00053         Note that foreach can produce noticeably more efficient code than 
00054         equivalent for() loops, with either indices or pointers. The 0.98
00055         compiler version exhibited some rather erratic behavior over the
00056         course of testing: in particular, elapsed time of method execution
00057         is noticeably dependent upon its physical location within the file
00058         (or, more specifically, the enclosing class). Yes, it sure sounds 
00059         crazy that if you switch the order of encode() with decode() that 
00060         they will consistently execute slower than as currently arranged.        
00061 
00062         Finally, please note that these are between 5 and 30 times faster 
00063         than equivalent functions in the std.utf Phobos module (dependent
00064         upon the mix of char values). Those functions (strangely) often
00065         allocate memory on a character basis, so will become significantly 
00066         slower where there's heap-contention by multiple threads.
00067         
00068 *******************************************************************************/
00069 
class TranscoderUtf8 : Transcoder
{
        /***********************************************************************

                Returns the name of the encoding handled by this transcoder.

        ***********************************************************************/

        char[] toString ()
        {
                return "utf-8";
        }

        /***********************************************************************

                Encode dchar values as UTF-8, using at most 4 bytes per
                value (five & six byte variations are not supported).

                The byte count is selected by the standard UTF-8 ranges:

                        below 0x80      1 byte
                        below 0x800     2 bytes
                        below 0x10000   3 bytes
                        below 0x110000  4 bytes

                Encoding stops, without error, at the first value that does
                not fit in the remaining output space. On return, consumed
                holds the number of input values that were fully encoded,
                and the result is the filled leading slice of output.

                An input value of 0x110000 or above is reported via fault()
                and null is returned; consumed is not updated in that case.
                NOTE(review): fault() is defined outside this file and
                presumably throws -- confirm against Transcoder.

                Values in the surrogate range (0xd800 - 0xdfff) are not
                rejected here.

        ***********************************************************************/

        char[] encode (dchar[] input, char[] output, out uint consumed)
        {
                // Implicit array-to-pointer conversion. Unlike &output[0],
                // this is not a bounds-checked index, so a zero-length
                // output array is handled without faulting.
                char*   pStart = output;
                char*   pOut = pStart;
                int     space = output.length;
                uint    eaten = 0;

                foreach (dchar b; input)
                        {
                        if (b < 0x80)
                           {
                           // single byte: 0xxxxxxx
                           if (space--)
                               *pOut++ = cast(char) b;
                           else
                              break;
                           }
                        else if (b < 0x800)
                           {
                           // two bytes: 110xxxxx 10xxxxxx (11 payload bits)
                           if ((space -= 2) < 0)
                                break;

                           pOut[0] = cast(char) (0xc0 | ((b >> 6) & 0x1f));
                           pOut[1] = cast(char) (0x80 | (b & 0x3f));
                           pOut += 2;
                           }
                        else if (b < 0x10000)
                           {
                           // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
                           // (16 payload bits)
                           if ((space -= 3) < 0)
                                break;

                           pOut[0] = cast(char) (0xe0 | ((b >> 12) & 0x0f));
                           pOut[1] = cast(char) (0x80 | ((b >> 6)  & 0x3f));
                           pOut[2] = cast(char) (0x80 | (b & 0x3f));
                           pOut += 3;
                           }
                        else if (b < 0x110000)
                           {
                           // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                           // (21 payload bits)
                           if ((space -= 4) < 0)
                                break;

                           pOut[0] = cast(char) (0xf0 | ((b >> 18) & 0x07));
                           pOut[1] = cast(char) (0x80 | ((b >> 12) & 0x3f));
                           pOut[2] = cast(char) (0x80 | ((b >> 6)  & 0x3f));
                           pOut[3] = cast(char) (0x80 | (b & 0x3f));
                           pOut += 4;
                           }
                        else
                           {
                           fault ("invalid input while encoding");
                           return null;
                           }

                        ++eaten;
                        }

                consumed = eaten;
                return output [0 .. (pOut - pStart)];
        }


        /***********************************************************************

                Decode UTF-8 produced by the above encode() method. This
                executes notably faster than the validating version.

                No validation is performed: lead bytes are classified only
                by magnitude (below 0xe0 is a two byte form, below 0xf0 is
                a three byte form, anything else is a four byte form), and
                trailing bytes are not checked for the 10xxxxxx pattern.

                Decoding stops when output is full, when input is exhausted,
                or when the final sequence in input is incomplete. On return,
                consumed holds the number of input bytes that were decoded;
                an incomplete trailing sequence is left unconsumed. The
                result is the filled leading slice of output.

        ***********************************************************************/

        dchar[] decode (char[] input, dchar[] output, out uint consumed)
        {
                // Implicit array-to-pointer conversion. Unlike &input[0],
                // this is not a bounds-checked index, so a zero-length
                // input array is handled without faulting.
                char*   pStart = input;
                char*   pIn = pStart;
                uint    available = input.length;
                uint    produced = 0;

                foreach (inout dchar d; output)
                        {
                        if (available == 0)
                            break;

                        // from here on, available counts the bytes that
                        // follow the lead byte
                        --available;

                        dchar b = cast(dchar) *pIn;
                        if (b & 0x80)
                           {
                           if (b < 0xe0)
                              {
                              // two byte sequence
                              if (available < 1)
                                  break;

                              b &= 0x1f;
                              b = (b << 6) | (pIn[1] & 0x3f);
                              available -= 1;
                              pIn += 1;
                              }
                           else if (b < 0xf0)
                              {
                              // three byte sequence
                              if (available < 2)
                                  break;

                              b &= 0x0f;
                              b = (b << 6) | (pIn[1] & 0x3f);
                              b = (b << 6) | (pIn[2] & 0x3f);
                              available -= 2;
                              pIn += 2;
                              }
                           else
                              {
                              // four byte sequence
                              if (available < 3)
                                  break;

                              b &= 0x07;
                              b = (b << 6) | (pIn[1] & 0x3f);
                              b = (b << 6) | (pIn[2] & 0x3f);
                              b = (b << 6) | (pIn[3] & 0x3f);
                              available -= 3;
                              pIn += 3;
                              }
                           }

                        d = b;
                        ++pIn;          // step past the last byte used
                        ++produced;
                        }

                consumed = pIn - pStart;
                return output [0 .. produced];
        }
}
00217 
00218 
00219 
00220 /*******************************************************************************
00221 
00222         Fast UTF-8 to Unicode transcoder. This adds some validation to 
00223         the decoding process.
00224         
00225 *******************************************************************************/
00226 
class TranscoderUtf8Checked : TranscoderUtf8
{
        /***********************************************************************

                Check the incoming stream for validity while decoding. This
                does not perform the full range of validations, but catches
                a large percentage of content errors due to the block-nature
                of these methods. That is; those cases where an error is not
                spotted will typically lead to an error within the following
                segment.

                Checks performed, each reported through fault():

                        - a lead byte in the range 0x80 - 0xbf (a stray
                          continuation byte) is rejected
                        - every trailing byte must match 10xxxxxx
                        - the decoded value must not be a surrogate
                          (0xd800 - 0xdfff), must not be 0xfffe or 0xffff,
                          and must not exceed 0x10ffff

                Errors not explicitly looked for are those relating to the
                five & six byte utf-8 combinations (lead bytes of 0xf8 and
                above are treated as four byte leads), and overlong forms.

                NOTE(review): fault() is defined outside this file and
                presumably throws -- confirm against Transcoder.

                Decoding stops when output is full, when input is exhausted,
                or when the final sequence in input is incomplete. On return,
                consumed holds the number of input bytes that were decoded;
                an incomplete trailing sequence is left unconsumed. The
                result is the filled leading slice of output.

                Currently executes at about 2x of the unchecked version.

        ***********************************************************************/

        dchar[] decode (char[] input, dchar[] output, out uint consumed)
        {
                // Implicit array-to-pointer conversion. Unlike &input[0],
                // this is not a bounds-checked index, so a zero-length
                // input array is handled without faulting.
                char*   pStart = input;
                char*   pIn = pStart;
                uint    available = input.length;
                uint    produced = 0;

                foreach (inout dchar d; output)
                        {
                        if (! available)
                              break;

                        dchar b = cast(dchar) *pIn;
                        if (b & 0x80)
                           {
                           // 10xxxxxx can only appear as a trailing byte,
                           // never at the start of a sequence
                           if (b < 0xc0)
                               fault ("invalid input sequence");

                           // number of trailing bytes implied by the lead
                           uint len = b < 0xe0 ? 1 : (b < 0xf0 ? 2 : 3);

                           // the whole sequence (lead + len trailing bytes)
                           // must be present; otherwise leave it unconsumed
                           if (available <= len)
                               break;

                           available -= len;

                           // keep the payload bits of the lead byte:
                           // 5 bits for len 1, 4 for len 2, 3 for len 3
                           b &= ((1 << (6 - len)) - 1);

                           do {
                              char c = *++pIn;
                              if ((c & 0xc0) != 0x80)
                                   fault ("invalid input sequence");

                              b = (b << 6) | (c & 0x3f);
                              } while (--len);

                           if (!(b < 0xd800 ||
                                (b > 0xdfff && b <= 0x10ffff &&
                                 b != 0xfffe && b != 0xffff)))
                                 fault ("invalid unicode character");
                           }

                        --available;    // account for the lead byte
                        d = b;
                        ++pIn;          // step past the last byte used
                        ++produced;
                        }

                consumed = pIn - pStart;
                return output [0 .. produced];
        }
}

Generated on Sun Oct 24 22:31:17 2004 for Mango by doxygen 1.3.6