00001 /******************************************************************************* 00002 00003 @file Utf8.d 00004 00005 Copyright (c) 2004 Kris Bell 00006 00007 This software is provided 'as-is', without any express or implied 00008 warranty. In no event will the authors be held liable for damages 00009 of any kind arising from the use of this software. 00010 00011 Permission is hereby granted to anyone to use this software for any 00012 purpose, including commercial applications, and to alter it and/or 00013 redistribute it freely, subject to the following restrictions: 00014 00015 1. The origin of this software must not be misrepresented; you must 00016 not claim that you wrote the original software. If you use this 00017 software in a product, an acknowledgment within documentation of 00018 said product would be appreciated but is not required. 00019 00020 2. Altered source versions must be plainly marked as such, and must 00021 not be misrepresented as being the original software. 00022 00023 3. This notice may not be removed or altered from any distribution 00024 of the source. 00025 00026 4. Derivative works are permitted, but they must carry this notice 00027 in full and credit the original source. 00028 00029 00030 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00031 00032 00033 @version Initial version, October 2004 00034 @author Kris 00035 00036 00037 *******************************************************************************/ 00038 00039 module mango.io.Utf8; 00040 00041 private import mango.io.Exception; 00042 00043 /******************************************************************************* 00044 00045 Fast UTF-8 to Unicode transcoder. These are really sensitive to 00046 small changes on 32bit x86 devices, because the register set of 00047 those devices is so small. Beware of subtle changes which might 00048 extend the execution-period by as much as 200% ... 

        These routines were tuned on an Intel P3; other devices may work
        more efficiently with a slightly different approach, though this
        is likely to be reasonably optimal on AMD x86 CPUs also. These
        algorithms could benefit significantly from those extra AMD64
        registers.

        Note that foreach can produce noticeably more efficient code than
        equivalent for() loops, with either indices or pointers. The 0.98
        compiler version exhibited some rather erratic behavior over the
        course of testing: in particular, elapsed time of method execution
        is noticeably dependent upon its physical location within the file
        (or, more specifically, the enclosing class). Yes, it sure sounds
        crazy, but if you switch the order of encode() with decode() they
        will consistently execute slower than as currently arranged.

        Finally, please note that these are between 5 and 30 times faster
        than equivalent functions in the std.utf Phobos module (dependent
        upon the mix of char values). Those functions (strangely) often
        allocate memory on a character basis, so will become significantly
        slower where there's heap-contention by multiple threads.
*******************************************************************************/

class Utf8
{
        private static IOException InvalidEncoding;
        private static IOException InvalidDecoding;

        /***********************************************************************

                Create the two shared exception instances that encode()
                and decode() throw when they reject their input.

        ***********************************************************************/

        private static this ()
        {
                InvalidEncoding = new IOException ("invalid input while encoding");
                InvalidDecoding = new IOException ("invalid input while decoding");
        }

        /***********************************************************************

                Encode wchars as UTF-8 into the caller-supplied buffer, up
                to a maximum of 3 bytes per character (four, five & six
                byte variations are not supported).

                Values below 0x80 produce one byte, values below 0x800
                produce two bytes, and values below 0xd800 produce three
                bytes. Encoding stops quietly as soon as the next sequence
                would not fit in the remaining output space.

                @param input    the wchars to encode
                @param output   destination buffer for the UTF-8 bytes
                @param consumed set to the number of input wchars that were
                                fully encoded; left untouched when an
                                exception is thrown

                @return the leading slice of output that was written

                Throws an exception where the input wchar is greater
                than 0xd7ff.

        ***********************************************************************/

        static final char[] encode (wchar[] input, char[] output, inout uint consumed)
        {
                // remember where the buffer starts, so the number of bytes
                // written can be computed without indexing output[0]; that
                // index is out of bounds when output is empty
                char* pStart = output;
                char* pOut   = pStart;
                uint  eaten  = 0;
                int   space  = output.length;

                foreach (wchar b; input)
                {
                        if (b < 0x80)
                        {
                                // single byte: 0xxxxxxx
                                if (space--)
                                    *pOut++ = b;
                                else
                                   break;
                        }
                        else
                           if (b < 0x800)
                           {
                                   // two bytes: 110xxxxx 10xxxxxx. Only 11
                                   // payload bits fit, hence the 0x800 limit
                                   if ((space -= 2) < 0)
                                        break;

                                   pOut[0] = 0xc0 | ((b >> 6) & 0x1f);
                                   pOut[1] = 0x80 | (b & 0x3f);
                                   pOut += 2;
                           }
                           else
                              if (b < 0xd800)
                              {
                                      // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
                                      if ((space -= 3) < 0)
                                           break;

                                      pOut[0] = 0xe0 | ((b >> 12) & 0x0f);
                                      pOut[1] = 0x80 | ((b >> 6) & 0x3f);
                                      pOut[2] = 0x80 | (b & 0x3f);
                                      pOut += 3;
                              }
                              else
                                 throw InvalidEncoding;

                        // count only those wchars that were emitted in full
                        ++eaten;
                }

                consumed = eaten;
                return output [0 .. (pOut - pStart)];
        }


        /***********************************************************************

                Decode UTF-8 produced by the above encode() method. This
                executes notably faster than the validating version.

                No validation of continuation bytes is performed: any
                lead byte in the range 0x80..0xdf is treated as the start
                of a two byte sequence, and 0xe0..0xef as the start of a
                three byte sequence. Decoding stops quietly when either
                the output is full or the input ends (a trailing partial
                sequence is left unconsumed).

                @param input    the UTF-8 bytes to decode
                @param output   destination buffer for the decoded wchars
                @param consumed set to the number of input bytes that were
                                decoded; left untouched when an exception
                                is thrown

                @return the leading slice of output that was written

                Throws an exception where a lead byte is 0xf0 or greater.

        ***********************************************************************/

        static final wchar[] decode (char[] input, wchar[] output, inout uint consumed)
        {
                // remember where the input starts, so the number of bytes
                // consumed can be computed without indexing input[0]; that
                // index is out of bounds when input is empty
                char* pStart    = input;
                char* pIn       = pStart;
                uint  produced  = 0;
                uint  available = input.length;

                foreach (inout wchar d; output)
                {
                        // the lead byte is accounted for here
                        if (! available--)
                              break;

                        wchar b = cast(wchar) *pIn;
                        if (b & 0x80)
                        {
                                if (b < 0xe0)
                                {
                                        // two byte sequence: one more byte needed
                                        if (! available--)
                                              break;

                                        b &= 0x1f;
                                        b = (b << 6) | (*++pIn & 0x3f);
                                }
                                else
                                   if (b < 0xf0)
                                   {
                                           // three byte sequence: two more bytes needed
                                           if (available < 2)
                                               break;

                                           b &= 0x0f;
                                           b = (b << 6) | (pIn[1] & 0x3f);
                                           b = (b << 6) | (pIn[2] & 0x3f);
                                           available -= 2;
                                           pIn += 2;
                                   }
                                   else
                                      throw InvalidDecoding;
                        }

                        d = b;
                        ++pIn;
                        ++produced;
                }

                consumed = pIn - pStart;
                return output [0 .. produced];
        }

        /***********************************************************************

                Encode UTF-8 up to a maximum of 3 bytes long (four, five & six
                byte variations are not supported), into a freshly allocated
                buffer.

                @param input    the wchars to encode

                @return the encoded UTF-8, as a slice of the new buffer

                Throws an exception where the input wchar is greater
                than 0xd7ff.

        ***********************************************************************/

        static final char[] encode (wchar[] input)
        {
                uint x;

                // at most three bytes are produced per wchar; the extra
                // element keeps the buffer non-empty even for empty input
                char[] tmp = new char [input.length * 3 + 1];

                return encode (input, tmp, x);
        }

        /***********************************************************************

                Decode UTF-8 produced by the above encode() method, into a
                freshly allocated buffer. This executes notably faster than
                the validating version.

                @param input    the UTF-8 bytes to decode

                @return the decoded wchars, as a slice of the new buffer

                Throws an exception where a lead byte is 0xf0 or greater.

        ***********************************************************************/

        static final wchar[] decode (char[] input)
        {
                uint x;

                // each wchar consumes at least one input byte; the extra
                // element keeps the buffer non-empty even for empty input
                wchar[] tmp = new wchar [input.length + 1];

                return decode (input, tmp, x);
        }
}