/*******************************************************************************

        @file Utf8.d
        
        Copyright (C) 2004 Kris Bell
        
        This software is provided 'as-is', without any express or implied
        warranty. In no event will the authors be held liable for damages
        of any kind arising from the use of this software.
        
        Permission is hereby granted to anyone to use this software for any 
        purpose, including commercial applications, and to alter it and/or 
        redistribute it freely, subject to the following restrictions:
        
        1. The origin of this software must not be misrepresented; you must 
           not claim that you wrote the original software. If you use this 
           software in a product, an acknowledgment within documentation of 
           said product would be appreciated but is not required.

        2. Altered source versions must be plainly marked as such, and must 
           not be misrepresented as being the original software.

        3. This notice may not be removed or altered from any distribution
           of the source.


                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


        @version        Initial version, October 2004
        @author         Kris


*******************************************************************************/

module mango.io.Utf8;

private import mango.io.Exception;

/*******************************************************************************

        Fast UTF-8 to Unicode transcoder. These are really sensitive to
        small changes on 32bit x86 devices, because the register set of
        those devices is so small. Beware of subtle changes which might
        extend the execution-period by as much as 200% ...

        These routines were tuned on an Intel P3; other devices may work
        more efficiently with a slightly different approach, though this
        is likely to be reasonably optimal on AMD x86 CPUs also. These
        algorithms could benefit significantly from those extra AMD64
        registers.

        Note that foreach can produce noticeably more efficient code than
        equivalent for() loops, with either indices or pointers. The 0.98
        compiler version exhibited some rather erratic behavior over the
        course of testing: in particular, elapsed time of method execution
        is noticeably dependent upon its physical location within the file
        (or, more specifically, the enclosing class). Yes, it sure sounds
        crazy that if you switch the order of encode() with decode() that
        they will consistently execute slower than as currently arranged.

        Finally, please note that these are between 5 and 30 times faster
        than equivalent functions in the std.utf Phobos module (dependent
        upon the mix of char values). Those functions (strangely) often
        allocate memory on a character basis, so will become significantly
        slower where there's heap-contention by multiple threads.

        Only the one, two and three byte UTF-8 forms are handled, i.e.
        code points U+0000 through U+D7FF. The decoder does not validate
        its input: it trusts continuation bytes, and is intended for data
        produced by encode().

        NOTE: this is an altered version of the original source. The
        two-byte encoding range was corrected to end at U+07FF, and the
        result/consumed calculations no longer index element zero of a
        possibly empty array.

*******************************************************************************/

class Utf8
{
        // Exception instances are created once, up front, and the same
        // objects are thrown each time an error is detected.
        private static IOException InvalidEncoding;
        private static IOException InvalidDecoding;

        /***********************************************************************

                Construct the shared exception instances.

        ***********************************************************************/

        private static this ()
        {
                InvalidEncoding = new IOException ("invalid input while encoding");
                InvalidDecoding = new IOException ("invalid input while decoding");
        }

        /***********************************************************************

                Encode UTF-8 up to a maximum of 3 bytes long (four, five & six
                byte variations are not supported). Throws an exception where
                the input wchar is greater than 0xd7ff.

                Encoding stops, without error, at the first input element
                whose encoded form does not fit in the remaining output
                space.

                @param input    the wchar values to encode
                @param output   destination buffer for the encoded bytes
                @param consumed set to the number of input elements which
                                were fully encoded into output

                @return the leading slice of output holding the encoded bytes

        ***********************************************************************/

        static final char[] encode (wchar[] input, char[] output, inout uint consumed)
        {
                char*   pOut;
                uint    eaten;
                int     space;

                pOut = output;
                space = output.length;
                foreach (wchar b; input)
                {
                        if (b < 0x80)
                        {
                                // U+0000..U+007F => 0xxxxxxx
                                if (space--)
                                        *pOut++ = b;
                                else
                                        break;
                        }
                        else
                        if (b < 0x0800)
                        {
                                // U+0080..U+07FF => 110xxxxx 10xxxxxx
                                if ((space -= 2) < 0)
                                        break;

                                pOut[0] = 0xc0 | ((b >> 6) & 0x1f);
                                pOut[1] = 0x80 | (b & 0x3f);
                                pOut += 2;
                        }
                        else
                        if (b < 0xd800)
                        {
                                // U+0800..U+D7FF => 1110xxxx 10xxxxxx 10xxxxxx
                                if ((space -= 3) < 0)
                                        break;

                                pOut[0] = 0xe0 | ((b >> 12) & 0x0f);
                                pOut[1] = 0x80 | ((b >> 6) & 0x3f);
                                pOut[2] = 0x80 | (b & 0x3f);
                                pOut += 3;
                        }
                        else
                                throw InvalidEncoding;

                        ++eaten;
                }

                consumed = eaten;

                // measure from the start of the buffer via a pointer cast,
                // rather than &output[0], so an empty output is legal
                return output [0 .. (pOut - cast(char*) output)];
        }


        /***********************************************************************

                Decode UTF-8 produced by the above encode() method. This
                executes notably faster than the validating version.

                Decoding stops, without error, when the output is full or
                when the input ends (including part way through a multi-byte
                sequence, in which case that partial sequence is left
                unconsumed). Throws an exception where a lead byte of 0xf0
                or above is encountered.

                @param input    the UTF-8 bytes to decode
                @param output   destination buffer for the decoded wchars
                @param consumed set to the number of input bytes which were
                                decoded into output

                @return the leading slice of output holding the decoded
                        wchars

        ***********************************************************************/

        static final wchar[] decode (char[] input, wchar[] output, inout uint consumed)
        {
                uint    produced;
                uint    available;
                char*   pIn = input;

                available = input.length;
                foreach (inout wchar d; output)
                {
                        if (available == 0)
                                break;
                        --available;

                        wchar b = cast(wchar) *pIn;
                        if (b & 0x80)
                        {
                                if (b < 0xe0)
                                {
                                        // two byte form: one trailing byte needed
                                        if (available == 0)
                                                break;
                                        --available;

                                        b &= 0x1f;
                                        b = (b << 6) | (*++pIn & 0x3f);
                                }
                                else
                                if (b < 0xf0)
                                {
                                        // three byte form: two trailing bytes needed
                                        if (available < 2)
                                                break;

                                        b &= 0x0f;
                                        b = (b << 6) | (pIn[1] & 0x3f);
                                        b = (b << 6) | (pIn[2] & 0x3f);
                                        available -= 2;
                                        pIn += 2;
                                }
                                else
                                        throw InvalidDecoding;
                        }

                        d = b;
                        ++pIn;
                        ++produced;
                }

                // measure from the start of the buffer via a pointer cast,
                // rather than &input[0], so an empty input is legal
                consumed = pIn - cast(char*) input;
                return output [0..produced];
        }

        /***********************************************************************

                Encode UTF-8 up to a maximum of 3 bytes long (four, five & six
                byte variations are not supported). Throws an exception where
                the input wchar is greater than 0xd7ff.

                This variation allocates an output buffer large enough for
                the worst case of three bytes per input element, and returns
                the encoded slice of it.

        ***********************************************************************/

        static final char[] encode (wchar[] input)
        {
                uint    x;
                char[]  tmp = new char [input.length * 3 + 1];

                return encode (input, tmp, x);
        }

        /***********************************************************************

                Decode UTF-8 produced by the above encode() method. This
                executes notably faster than the validating version.

                This variation allocates an output buffer large enough for
                the worst case of one wchar per input byte, and returns the
                decoded slice of it.

        ***********************************************************************/

        static final wchar[] decode (char[] input)
        {
                uint    x;
                wchar[] tmp = new wchar [input.length + 1];

                return decode (input, tmp, x);
        }
}