00001 /******************************************************************************* 00002 00003 @file UnicodeBom.d 00004 00005 Copyright (c) 2004 Kris Bell 00006 00007 This software is provided 'as-is', without any express or implied 00008 warranty. In no event will the authors be held liable for damages 00009 of any kind arising from the use of this software. 00010 00011 Permission is hereby granted to anyone to use this software for any 00012 purpose, including commercial applications, and to alter it and/or 00013 redistribute it freely, subject to the following restrictions: 00014 00015 1. The origin of this software must not be misrepresented; you must 00016 not claim that you wrote the original software. If you use this 00017 software in a product, an acknowledgment within documentation of 00018 said product would be appreciated but is not required. 00019 00020 2. Altered source versions must be plainly marked as such, and must 00021 not be misrepresented as being the original software. 00022 00023 3. This notice may not be removed or altered from any distribution 00024 of the source. 00025 00026 4. Derivative works are permitted, but they must carry this notice 00027 in full and credit the original source. 00028 00029 00030 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00031 00032 00033 @version Initial version; December 2005 00034 00035 @author Kris 00036 00037 *******************************************************************************/ 00038 00039 module mango.convert.UnicodeBom; 00040 00041 private import mango.convert.Type; 00042 00043 private import mango.sys.ByteSwap; 00044 00045 public import mango.convert.Unicode; 00046 00047 /******************************************************************************* 00048 00049 Convert unicode content 00050 00051 Unicode is an encoding of textual material. The purpose of this module 00052 is to interface external-encoding with a programmer-defined internal- 00053 encoding. 
This internal encoding is declared via the template argument
        T, whilst the external encoding is either specified or derived.

        Three internal encodings are supported: char, wchar, and dchar. The
        methods herein operate upon arrays of this type. That is, decode()
        returns an array of the type, while encode() expects an array of said
        type.

        Supported external encodings are as follows (from Unicode.d):

                Unicode.Unknown
                Unicode.UTF_8
                Unicode.UTF_8N
                Unicode.UTF_16
                Unicode.UTF_16BE
                Unicode.UTF_16LE
                Unicode.UTF_32
                Unicode.UTF_32BE
                Unicode.UTF_32LE

        These can be divided into non-explicit and explicit encodings:

                Unicode.Unknown
                Unicode.UTF_8
                Unicode.UTF_16
                Unicode.UTF_32


                Unicode.UTF_8N
                Unicode.UTF_16BE
                Unicode.UTF_16LE
                Unicode.UTF_32BE
                Unicode.UTF_32LE

        The former group of non-explicit encodings may be used to 'discover'
        an unknown encoding, by examining the first few bytes of the content
        for a signature. This signature is optional, but is often written such
        that the content is self-describing. When an encoding is unknown, using
        one of the non-explicit encodings will cause the decode() method to look
        for a signature and adjust itself accordingly. It is possible that a
        ZWNBSP character might be confused with the signature; today's unicode
        content is supposed to use the WORD-JOINER character instead.

        The group of explicit encodings is for use when the content encoding
        is known. These *must* be used when converting back to external encoding,
        since written content must be in a known format. It should be noted that,
        during a decode() operation, the existence of a signature is in conflict
        with these explicit varieties.
00101 00102 00103 See 00104 $(LINK http://www.utf-8.com/) 00105 $(LINK http://www.hackcraft.net/xmlUnicode/) 00106 $(LINK http://www.unicode.org/faq/utf_bom.html/) 00107 $(LINK http://www.azillionmonkeys.com/qed/unicode.html/) 00108 $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/) 00109 00110 *******************************************************************************/ 00111 00112 class UnicodeBomTemplate(T) 00113 { 00114 static if (!is (T == char) && !is (T == wchar) && !is (T == dchar)) 00115 pragma (msg, "Template type must be char, wchar, or dchar"); 00116 00117 00118 private int encoding; // the current encoding 00119 private Info* settings; // pointer to encoding configuration 00120 00121 private Unicode.From!(T) from; 00122 private Unicode.Into!(T) into; 00123 00124 private struct Info 00125 { 00126 int type; // type of element (char/wchar/dchar) 00127 int encoding; // Unicode.xx encoding 00128 char[] bom; // pattern to match for signature 00129 bool test, // should we test for this encoding? 00130 endian, // this encoding have endian concerns? 00131 bigEndian; // is this a big-endian encoding? 00132 int fallback; // can this encoding be defaulted? 
00133 }; 00134 00135 private const Info[] lookup = 00136 [ 00137 {Type.Utf8, Unicode.Unknown, null, true, false, false, Unicode.UTF_8N}, 00138 {Type.Utf8, Unicode.UTF_8, null, true, false, false, Unicode.UTF_8N}, 00139 {Type.Utf8, Unicode.UTF_8N, x"efbbbf", false}, 00140 {Type.Utf16, Unicode.UTF_16, null, true, false, false, Unicode.UTF_16BE}, 00141 {Type.Utf16, Unicode.UTF_16BE, x"feff", false, true, true}, 00142 {Type.Utf16, Unicode.UTF_16LE, x"fffe", false, true}, 00143 {Type.Utf32, Unicode.UTF_32, null, true, false, false, Unicode.UTF_32BE}, 00144 {Type.Utf32, Unicode.UTF_32BE, x"0000feff", false, true, true}, 00145 {Type.Utf32, Unicode.UTF_32LE, x"fffe0000", false, true}, 00146 ]; 00147 00148 00149 /*********************************************************************** 00150 00151 Construct a instance using the given external encoding ~ one 00152 of the Unicode.xx types 00153 00154 ***********************************************************************/ 00155 00156 this (int encoding) 00157 { 00158 setup (encoding); 00159 } 00160 00161 /*********************************************************************** 00162 00163 Return the current encoding. This is either the originally 00164 specified encoding, or a derived one obtained by inspecting 00165 the content for a BOM. The latter is performed as part of 00166 the decode() method 00167 00168 ***********************************************************************/ 00169 00170 final int getEncoding () 00171 { 00172 return encoding; 00173 } 00174 00175 /*********************************************************************** 00176 00177 Return the signature (BOM) of the current encoding 00178 00179 ***********************************************************************/ 00180 00181 final void[] getSignature () 00182 { 00183 return settings.bom; 00184 } 00185 00186 /*********************************************************************** 00187 00188 Convert the provided content. 
The content is inspected 00189 for a BOM signature, which is stripped. An exception is 00190 thrown if a signature is present when, according to the 00191 encoding type, it should not be. Conversely, An exception 00192 is thrown if there is no known signature where the current 00193 encoding expects one to be present 00194 00195 ***********************************************************************/ 00196 00197 final T[] decode (void[] content) 00198 { 00199 // look for a BOM 00200 auto info = test (content); 00201 00202 // are we expecting a BOM? 00203 if (lookup[encoding].test) 00204 if (info) 00205 { 00206 // yep ~ and we got one 00207 setup (info.encoding); 00208 00209 // strip BOM from content 00210 content = content [info.bom.length .. length]; 00211 } 00212 else 00213 // can this encoding be defaulted? 00214 if (settings.fallback) 00215 setup (settings.fallback); 00216 else 00217 Unicode.error ("UnicodeBom.decode :: unknown or missing BOM"); 00218 else 00219 if (info) 00220 // found a BOM when using an explicit encoding 00221 Unicode.error ("UnicodeBom.decode :: explicit encoding does not permit BOM"); 00222 00223 // convert it to internal representation 00224 return cast(T[]) into.convert (swapBytes(content), settings.type); 00225 } 00226 00227 /*********************************************************************** 00228 00229 Perform encoding of content. 
Note that the encoding must be 00230 of the explicit variety by the time we get here 00231 00232 ***********************************************************************/ 00233 00234 final void[] encode (T[] content) 00235 { 00236 if (settings.test) 00237 Unicode.error ("UnicodeBom.encode :: cannot write to a non-specific encoding"); 00238 00239 // convert it to external representation, and write 00240 return swapBytes (from.convert (content, settings.type)); 00241 } 00242 00243 /*********************************************************************** 00244 00245 Scan the BOM signatures looking for a match. We scan in 00246 reverse order to get the longest match first 00247 00248 ***********************************************************************/ 00249 00250 private final Info* test (void[] content) 00251 { 00252 for (Info* info=lookup.ptr+lookup.length; --info >= lookup;) 00253 if (info.bom) 00254 { 00255 int len = info.bom.length; 00256 if (len <= content.length) 00257 if (content[0..len] == info.bom[0..len]) 00258 return info; 00259 } 00260 return null; 00261 } 00262 00263 /*********************************************************************** 00264 00265 Swap bytes around, as required by the encoding 00266 00267 ***********************************************************************/ 00268 00269 private final void[] swapBytes (void[] content) 00270 { 00271 bool endian = settings.endian; 00272 bool swap = settings.bigEndian; 00273 00274 version (BigEndian) 00275 swap = !swap; 00276 00277 if (endian && swap) 00278 { 00279 if (settings.type == Type.Utf16) 00280 ByteSwap.swap16 (content, content.length); 00281 else 00282 ByteSwap.swap32 (content, content.length); 00283 } 00284 return content; 00285 } 00286 00287 /*********************************************************************** 00288 00289 Configure this instance with unicode converters 00290 00291 ***********************************************************************/ 00292 00293 private final void 
setup (int encoding) 00294 { 00295 assert (Unicode.isValid (encoding)); 00296 00297 this.settings = &lookup[encoding]; 00298 this.encoding = encoding; 00299 } 00300 } 00301