/*******************************************************************************

        @file UnicodeFile.d

        Copyright (c) 2004 Kris Bell

        This software is provided 'as-is', without any express or implied
        warranty. In no event will the authors be held liable for damages
        of any kind arising from the use of this software.

        Permission is hereby granted to anyone to use this software for any
        purpose, including commercial applications, and to alter it and/or
        redistribute it freely, subject to the following restrictions:

        1. The origin of this software must not be misrepresented; you must
           not claim that you wrote the original software. If you use this
           software in a product, an acknowledgment within documentation of
           said product would be appreciated but is not required.

        2. Altered source versions must be plainly marked as such, and must
           not be misrepresented as being the original software.

        3. This notice may not be removed or altered from any distribution
           of the source.

        4. Derivative works are permitted, but they must carry this notice
           in full and credit the original source.


                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


        @version        Initial version; December 2005

        @author         Kris

        @note           Altered from the original: convert() and write() no
                        longer byte-swap caller-owned arrays in place, and
                        setup() rejects out-of-range encodings at runtime.

*******************************************************************************/

module mango.io.UnicodeFile;

public  import  mango.io.FilePath;

private import  mango.io.FileStyle,
                mango.io.FileProxy,
                mango.io.Exception,
                mango.io.FileConduit;

private import  mango.sys.ByteSwap;

public  import  mango.convert.Type,
                mango.convert.Unicode;

/*******************************************************************************

        Read and write unicode files

        For our purposes, unicode files are an encoding of textual content.
        The goal of this module is to interface that external-encoding with
        a programmer-defined internal-encoding. This internal encoding is
        declared via the template argument T, whilst the external encoding
        is either specified or derived via methods herein.

        Three internal encodings are supported: char, wchar, and dchar. The
        methods within operate upon arrays of this type. For example, read()
        returns an array of the type, whilst write() and append() expect an
        array of said type.

        Supported external encodings are as follows (from Unicode.d):

                Unicode.Unknown
                Unicode.UTF_8
                Unicode.UTF_8N
                Unicode.UTF_16
                Unicode.UTF_16BE
                Unicode.UTF_16LE
                Unicode.UTF_32
                Unicode.UTF_32BE
                Unicode.UTF_32LE

        These can be divided into non-explicit and explicit encodings.

        Non-explicit:

                Unicode.Unknown
                Unicode.UTF_8
                Unicode.UTF_16
                Unicode.UTF_32

        Explicit:

                Unicode.UTF_8N
                Unicode.UTF_16BE
                Unicode.UTF_16LE
                Unicode.UTF_32BE
                Unicode.UTF_32LE

        The former group of non-explicit encodings may be used to 'discover'
        an unknown encoding, by examining the first few bytes of the file
        content for a signature. This signature is optional for all files,
        but is often written such that the content is self-describing. When
        the encoding is unknown, using one of the non-explicit encodings will
        cause the read() method to look for a signature and adjust itself
        accordingly. It is possible that a ZWNBSP character might be confused
        with the signature; today's files are supposed to use the WORD-JOINER
        character instead.

        The group of explicit encodings are for use when the file encoding is
        known. These *must* be used when writing or appending, since written
        content must be in a known format. It should be noted that, during a
        read operation, the presence of a signature is in conflict with these
        explicit varieties.

        Note that discovery is sticky: once read() or convert() has found a
        signature (or applied a fallback), the instance switches itself to
        the corresponding explicit encoding. Content converted afterwards is
        therefore expected to arrive without a signature, and a signature
        found at that point raises an "unexpected BOM" exception.

        Method read() returns the current content of the file, whilst write()
        sets the file content, and file length, to the provided array. Method
        append() adds content to the tail of the file. When appending, it is
        your responsibility to ensure the existing and current encodings are
        correctly matched.

        Methods to inspect the file system, check the status of a file or
        directory, and other facilities are made available via the FileProxy
        superclass.

        Note that the convert() method can be used to convert an arbitrary
        array of content ~ said content can come from somewhere other than
        a file (a socket, for example).


        See
        $(LINK http://www.utf-8.com/)
        $(LINK http://www.hackcraft.net/xmlUnicode/)
        $(LINK http://www.unicode.org/faq/utf_bom.html/)
        $(LINK http://www.azillionmonkeys.com/qed/unicode.html/)
        $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)

*******************************************************************************/

class UnicodeFile(T) : FileProxy
{
        private int     encoding;       // the current encoding
        private Info*   settings;       // pointer to encoding configuration

        private Unicode from,           // used by write(): internal T[] to external form
                        into;           // used by convert(): external form to internal T[]

        private struct  Info
                        {
                        int     type;           // type of element (char/wchar/dchar)
                        int     encoding;       // Unicode.xx encoding
                        char[]  bom;            // pattern to match for signature
                        bool    test,           // should we test for this encoding?
                                endian,         // this encoding have endian concerns?
                                bigEndian;      // is this a big-endian encoding?
                        int     fallback;       // can this encoding be defaulted?
                        };

        // indexed directly by the Unicode.xx encoding value (see setup())
        private const Info[] lookup =
        [
        {Type.Utf8,  Unicode.Unknown,  null,        true},
        {Type.Utf8,  Unicode.UTF_8,    null,        true,  false, false, Unicode.UTF_8N},
        {Type.Utf8,  Unicode.UTF_8N,   x"efbbbf",   false},
        {Type.Utf16, Unicode.UTF_16,   null,        true,  false, false, Unicode.UTF_16BE},
        {Type.Utf16, Unicode.UTF_16BE, x"feff",     false, true, true},
        {Type.Utf16, Unicode.UTF_16LE, x"fffe",     false, true},
        {Type.Utf32, Unicode.UTF_32,   null,        true,  false, false, Unicode.UTF_32BE},
        {Type.Utf32, Unicode.UTF_32BE, x"0000feff", false, true, true},
        {Type.Utf32, Unicode.UTF_32LE, x"fffe0000", false, true},
        ];


        /***********************************************************************

                Construct a UnicodeFile from a text string. The provided
                encoding represents the external file encoding, and should
                be one of the Unicode.xx types

                Throws an IOException if the encoding is out of range.

        ***********************************************************************/

        this (char[] path, int encoding)
        {
                super (path);
                setup (encoding);
        }

        /***********************************************************************

                Construct a UnicodeFile from the provided FilePath. The given
                encoding represents the external file encoding, and should
                be one of the Unicode.xx types

                Throws an IOException if the encoding is out of range.

        ***********************************************************************/

        this (FilePath path, int encoding)
        {
                super (path);
                setup (encoding);
        }

        /***********************************************************************

                Return the current encoding. This is either the originally
                specified encoding, or a derived one obtained by inspecting
                the file content for a BOM. The latter is performed as part
                of the read() method.

        ***********************************************************************/

        int getEncoding ()
        {
                return encoding;
        }

        /***********************************************************************

                Return the content of the file. The content is inspected
                for a BOM signature, which is stripped. An exception is
                thrown if a signature is present when, according to the
                encoding type, it should not be. Conversely, an exception
                is thrown if there is no known signature where the current
                encoding expects one to be present.

                An IOException is also thrown when fewer bytes than the
                reported file length could be read.

        ***********************************************************************/

        T[] read ()
        {
                auto FileConduit conduit = new FileConduit (this);

                // allocate enough space for the entire file
                auto content = new ubyte [conduit.length];

                // read the content
                if (conduit.read (content) != content.length)
                    throw new IOException ("unexpected eof");

                // the buffer is private to this call, so it may be
                // byte-swapped in place without copying it first
                return decodeContent (content, true);
        }

        /***********************************************************************

                Set the file content and length to reflect the given array.
                The content will be encoded accordingly. When bom is true,
                the signature of the current encoding is written ahead of
                the content.

                Throws an IOException if the current encoding is one of
                the non-explicit varieties.

        ***********************************************************************/

        UnicodeFile write (T[] content, bool bom = false)
        {
                return write (content, FileStyle.ReadWriteCreate, bom);
        }

        /***********************************************************************

                Append content to the file; the content will be encoded
                accordingly. No signature is written.

                Note that it is your responsibility to ensure the
                existing and current encodings are correctly matched.

        ***********************************************************************/

        UnicodeFile append (T[] content)
        {
                return write (content, FileStyle.WriteAppending, false);
        }

        /***********************************************************************

                Convert the provided content. The content is inspected
                for a BOM signature, which is stripped. An exception is
                thrown if a signature is present when, according to the
                encoding type, it should not be. Conversely, an exception
                is thrown if there is no known signature where the current
                encoding expects one to be present.

                The provided array is left untouched: where the encoding
                requires a byte-swap, that is applied to a private copy.

        ***********************************************************************/

        T[] convert (void[] content)
        {
                return decodeContent (content, false);
        }

        /***********************************************************************

                Shared implementation of read() and convert(). Argument
                'owned' states whether the content storage belongs to us,
                and may therefore be byte-swapped in place.

        ***********************************************************************/

        private T[] decodeContent (void[] content, bool owned)
        {
                // look for a BOM
                auto info = test (content);

                // are we expecting a BOM?
                if (lookup[encoding].test)
                   {
                   if (info)
                      {
                      // yep ~ and we got one
                      setup (info.encoding);

                      // strip BOM from content
                      content = content [info.bom.length .. length];
                      }
                   else
                      // can this encoding be defaulted?
                      if (settings.fallback)
                          setup (settings.fallback);
                      else
                         throw new IOException ("unknown BOM");
                   }
                else
                   if (info)
                       // found a BOM when using an explicit encoding
                       throw new IOException ("unexpected BOM");

                // swapBytes() operates in place, so take a copy first
                // when the storage belongs to the caller
                if (! owned && needsSwap())
                      content = (cast(ubyte[]) content).dup;

                // convert it to internal representation
                return cast(T[]) into.convert (swapBytes(content), settings.type);
        }

        /***********************************************************************

                Internal method to perform writing of content. Note that
                the encoding must be of the explicit variety by the time
                we get here.

        ***********************************************************************/

        private UnicodeFile write (T[] content, FileStyle style, bool bom)
        {
                if (settings.test)
                    throw new IOException ("cannot write a non-specific encoding");

                auto FileConduit conduit = new FileConduit (this, style);

                // the signature is emitted verbatim; it is never swapped
                if (bom)
                    conduit.flush (settings.bom);

                // convert it to external representation
                void[] external = from.convert (content, settings.type);

                // NOTE(review): the converter presumably hands back the
                // provided array as-is when no transcoding is needed ~
                // verify against Unicode.d. Swapping that in place would
                // corrupt the caller's content, so copy it in that case
                if (needsSwap() && external.ptr is cast(void*) content.ptr)
                    external = (cast(ubyte[]) external).dup;

                // adjust the byte order, and write
                conduit.flush (swapBytes (external));
                return this;
        }

        /***********************************************************************

                Scan the BOM signatures looking for a match. We scan in
                reverse order to get the longest match first. Returns
                null where no signature matches.

        ***********************************************************************/

        private Info* test (void[] content)
        {
                for (Info* info=lookup.ptr+lookup.length; --info >= lookup;)
                     if (info.bom)
                        {
                        int len = info.bom.length;
                        if (len <= content.length)
                            if (content[0..len] == info.bom[0..len])
                                return info;
                        }
                return null;
        }

        /***********************************************************************

                Does the current encoding have a byte order which differs
                from that of the host?

        ***********************************************************************/

        private bool needsSwap ()
        {
                bool swap = settings.bigEndian;

                // on a big-endian host it is the little-endian
                // encodings which need adjusting
                version (BigEndian)
                         swap = !swap;

                return settings.endian && swap;
        }

        /***********************************************************************

                Swap bytes around, as required by the encoding. The swap
                is applied to the provided array in place, and that same
                array is returned.

                NOTE(review): content.length is passed along unchecked ~
                confirm how ByteSwap treats a length which is not a
                multiple of the element size (a truncated file).

        ***********************************************************************/

        private void[] swapBytes (void[] content)
        {
                if (needsSwap())
                   {
                   if (settings.type == Type.Utf16)
                       ByteSwap.swap16 (content, content.length);
                   else
                      ByteSwap.swap32 (content, content.length);
                   }
                return content;
        }

        /***********************************************************************

                Configure this instance with unicode converters, and
                select the lookup entry for the given encoding.

                Throws an IOException if the encoding does not index
                the lookup table.

        ***********************************************************************/

        private void setup (int encoding)
        {
                assert (Unicode.isValid (encoding));

                // asserts vanish in release builds, so make sure we
                // never index the lookup table with a stray value
                if (cast(uint) encoding >= lookup.length)
                    throw new IOException ("invalid unicode encoding");

                this.from = new Unicode.From!(T);
                this.into = new Unicode.Into!(T);

                this.settings = &lookup[encoding];
                this.encoding = encoding;
        }
}


// convenience aliases

alias UnicodeFile!(char)  UnicodeFile8;
alias UnicodeFile!(wchar) UnicodeFile16;
alias UnicodeFile!(dchar) UnicodeFile32;