00001 /******************************************************************************* 00002 00003 @file UnicodeFile.d 00004 00005 Copyright (c) 2004 Kris Bell 00006 00007 This software is provided 'as-is', without any express or implied 00008 warranty. In no event will the authors be held liable for damages 00009 of any kind arising from the use of this software. 00010 00011 Permission is hereby granted to anyone to use this software for any 00012 purpose, including commercial applications, and to alter it and/or 00013 redistribute it freely, subject to the following restrictions: 00014 00015 1. The origin of this software must not be misrepresented; you must 00016 not claim that you wrote the original software. If you use this 00017 software in a product, an acknowledgment within documentation of 00018 said product would be appreciated but is not required. 00019 00020 2. Altered source versions must be plainly marked as such, and must 00021 not be misrepresented as being the original software. 00022 00023 3. This notice may not be removed or altered from any distribution 00024 of the source. 00025 00026 4. Derivative works are permitted, but they must carry this notice 00027 in full and credit the original source. 
00028 00029 00030 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00031 00032 00033 @version Initial version; December 2005 00034 00035 @author Kris 00036 00037 *******************************************************************************/ 00038 00039 module mango.io.UnicodeFile; 00040 00041 public import mango.io.FilePath; 00042 public import mango.convert.Unicode; 00043 00044 private import mango.io.FileProxy, 00045 mango.io.Exception, 00046 mango.io.FileConduit; 00047 00048 private import mango.sys.ByteSwap; 00049 00050 private import mango.convert.UnicodeBom; 00051 00052 /******************************************************************************* 00053 00054 Read and write unicode files 00055 00056 For our purposes, unicode files are an encoding of textual material. 00057 The goal of this module is to interface that external-encoding with 00058 a programmer-defined internal-encoding. This internal encoding is 00059 declared via the template argument T, whilst the external encoding 00060 is either specified or derived. 00061 00062 Three internal encodings are supported: char, wchar, and dchar. The 00063 methods herein operate upon arrays of this type. For example, read() 00064 returns an array of the type, whilst write() and append() expect an 00065 array of said type. 
00066 00067 Supported external encodings are as follows (from Unicode.d): 00068 00069 Unicode.Unknown 00070 Unicode.UTF_8 00071 Unicode.UTF_8N 00072 Unicode.UTF_16 00073 Unicode.UTF_16BE 00074 Unicode.UTF_16LE 00075 Unicode.UTF_32 00076 Unicode.UTF_32BE 00077 Unicode.UTF_32LE 00078 00079 These can be divided into non-explicit and explicit encodings: 00080 00081 Unicode.Unknown 00082 Unicode.UTF_8 00083 Unicode.UTF_16 00084 Unicode.UTF_32 00085 00086 00087 Unicode.UTF_8N 00088 Unicode.UTF_16BE 00089 Unicode.UTF_16LE 00090 Unicode.UTF_32BE 00091 Unicode.UTF_32LE 00092 00093 The former group of non-explicit encodings may be used to 'discover' 00094 an unknown encoding, by examining the first few bytes of the file 00095 content for a signature. This signature is optional for all files, 00096 but is often written such that the content is self-describing. When 00097 the encoding is unknown, using one of the non-explicit encodings will 00098 cause the read() method to look for a signature and adjust itself 00099 accordingly. It is possible that a ZWNBSP character might be confused 00100 with the signature; today's files are supposed to use the WORD-JOINER 00101 character instead. 00102 00103 The group of explicit encodings is for use when the file encoding is 00104 known. These *must* be used when writing or appending, since written 00105 content must be in a known format. It should be noted that, during a 00106 read operation, the presence of a signature is in conflict with these 00107 explicit varieties. 00108 00109 Method read() returns the current content of the file, whilst write() 00110 sets the file content, and file length, to the provided array. Method 00111 append() adds content to the tail of the file. When appending, it is 00112 your responsibility to ensure the existing and current encodings are 00113 correctly matched.
Methods to inspect the file system, check the status of a file or
directory, and other facilities are made available via the FileProxy
superclass.


See
        $(LINK http://www.utf-8.com/)
        $(LINK http://www.hackcraft.net/xmlUnicode/)
        $(LINK http://www.unicode.org/faq/utf_bom.html/)
        $(LINK http://www.azillionmonkeys.com/qed/unicode.html/)
        $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)

*******************************************************************************/

class UnicodeFileTemplate(T) : FileProxy
{
        // converter between the external (file) encoding and arrays of T
        private UnicodeBomTemplate!(T) unicode;

        /***********************************************************************

                Construct a UnicodeFile from the provided FilePath. The given
                encoding represents the external file encoding, and should
                be one of the Unicode.xx types

        ***********************************************************************/

        this (FilePath path, int encoding)
        {
                super (path);
                unicode = new UnicodeBomTemplate!(T)(encoding);
        }

        /***********************************************************************

                Construct a UnicodeFile from a text string. The provided
                encoding represents the external file encoding, and should
                be one of the Unicode.xx types

        ***********************************************************************/

        this (char[] path, int encoding)
        {
                this (new FilePath(path), encoding);
        }

        /***********************************************************************

                Return the current encoding. This is either the originally
                specified encoding, or a derived one obtained by inspecting
                the file content for a BOM. The latter is performed as part
                of the read() method.

        ***********************************************************************/

        int getEncoding ()
        {
                return unicode.getEncoding();
        }

        /***********************************************************************

                Return the content of the file. The content is inspected
                for a BOM signature, which is stripped. An exception is
                thrown if a signature is present when, according to the
                encoding type, it should not be. Conversely, an exception
                is thrown if there is no known signature where the current
                encoding expects one to be present.

                An IOException ("unexpected eof") is thrown when fewer
                bytes are read than the conduit reported as its length.
                The conduit is closed before returning, whether or not
                an exception is raised.

        ***********************************************************************/

        T[] read ()
        {
                auto conduit = new FileConduit (this);

                try {
                    // allocate enough space for the entire file; the cast
                    // makes the integer conversion of the length explicit
                    auto content = new ubyte [cast(size_t) conduit.length];

                    // read the content. An empty file has nothing to read,
                    // so the read is skipped rather than relying on how the
                    // conduit reports a zero-byte request.
                    // NOTE(review): FileConduit.read's behaviour for an
                    // empty buffer is not visible here -- confirm
                    if (content.length > 0)
                        if (conduit.read (content) != content.length)
                            throw new IOException ("unexpected eof");

                    return unicode.decode (content);
                    } finally {
                              conduit.close();
                              }
        }

        /***********************************************************************

                Set the file content and length to reflect the given array.
                The content will be encoded accordingly.

                When bom is true, the signature reported by the encoder
                is written ahead of the encoded content; by default no
                signature is written.

                Returns this instance.

        ***********************************************************************/

        UnicodeFileTemplate write (T[] content, bool bom = false)
        {
                return write (content, FileStyle.ReadWriteCreate, bom);
        }

        /***********************************************************************

                Append content to the file; the content will be encoded
                accordingly. No signature is written when appending.

                Note that it is your responsibility to ensure the
                existing and current encodings are correctly matched.

                Returns this instance.

        ***********************************************************************/

        UnicodeFileTemplate append (T[] content)
        {
                return write (content, FileStyle.WriteAppending, false);
        }

        /***********************************************************************

                Internal method to perform writing of content. Note that
                the encoding must be of the explicit variety by the time
                we get here.

                The content is encoded first, then the file is opened
                with the given style, the optional signature and the
                encoded content are flushed, and the conduit is closed.

        ***********************************************************************/

        private final UnicodeFileTemplate write (T[] content, FileStyle.Bits style, bool bom)
        {
                // convert to external representation
                void[] converted = unicode.encode (content);

                // open file after conversion ~ in case of exceptions
                // NOTE(review): 'auto' ahead of an explicit type appears to
                // be the scoped storage class of this D dialect -- confirm
                // before porting to a newer compiler
                auto FileConduit conduit = new FileConduit (this, style);

                try {
                    // the signature, when requested, precedes the content
                    if (bom)
                        conduit.flush (unicode.getSignature);

                    // and write
                    conduit.flush (converted);
                    } finally {
                              conduit.close();
                              }
                return this;
        }
}


// convenience aliases

alias UnicodeFileTemplate!(char)  UnicodeFile;
alias UnicodeFileTemplate!(wchar) UnicodeFile16;
alias UnicodeFileTemplate!(dchar) UnicodeFile32;