Main Page | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Class Members | File Members | Related Pages

UnicodeBom.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file UnicodeBom.d
00004         
00005         Copyright (c) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026         4. Derivative works are permitted, but they must carry this notice
00027            in full and credit the original source.
00028 
00029 
00030                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00031 
00032 
00033         @version        Initial version; December 2005      
00034 
00035         @author         Kris
00036 
00037 *******************************************************************************/
00038 
00039 module mango.convert.UnicodeBom;
00040 
00041 private import  mango.convert.Type;
00042 
00043 private import  mango.sys.ByteSwap;
00044 
00045 public  import  mango.convert.Unicode;
00046 
00047 /*******************************************************************************
00048 
00049         Convert unicode content
00050 
00051         Unicode is an encoding of textual material. The purpose of this module 
00052         is to interface external-encoding with a programmer-defined internal-
00053         encoding. This internal encoding is declared via the template argument 
00054         T, whilst the external encoding is either specified or derived.
00055 
00056         Three internal encodings are supported: char, wchar, and dchar. The
00057         methods herein operate upon arrays of this type. That is, decode()
00058         returns an array of the type, while encode() expects an array of said 
00059         type.
00060 
00061         Supported external encodings are as follows (from Unicode.d):
00062 
00063                 Unicode.Unknown 
00064                 Unicode.UTF_8
00065                 Unicode.UTF_8N
00066                 Unicode.UTF_16
00067                 Unicode.UTF_16BE
00068                 Unicode.UTF_16LE 
00069                 Unicode.UTF_32 
00070                 Unicode.UTF_32BE
00071                 Unicode.UTF_32LE 
00072 
00073         These can be divided into non-explicit (first list) and explicit (second list) encodings:
00074 
00075                 Unicode.Unknown 
00076                 Unicode.UTF_8
00077                 Unicode.UTF_16
00078                 Unicode.UTF_32 
00079 
00080 
00081                 Unicode.UTF_8N
00082                 Unicode.UTF_16BE
00083                 Unicode.UTF_16LE 
00084                 Unicode.UTF_32BE
00085                 Unicode.UTF_32LE 
00086         
00087         The former group of non-explicit encodings may be used to 'discover'
00088         an unknown encoding, by examining the first few bytes of the content
00089         for a signature. This signature is optional, but is often written such 
00090         that the content is self-describing. When an encoding is unknown, using 
00091         one of the non-explicit encodings will cause the decode() method to look 
00092         for a signature and adjust itself accordingly. It is possible that a 
00093         ZWNBSP character might be confused with the signature; today's unicode 
00094         content is supposed to use the WORD-JOINER character instead.
00095        
00096         The group of explicit encodings are for use when the content encoding 
00097         is known. These *must* be used when converting back to external encoding, 
00098         since written content must be in a known format. It should be noted that, 
00099         during a decode() operation, the existence of a signature is in conflict 
00100         with these explicit varieties.
00101 
00102 
00103         See 
00104         $(LINK http://www.utf-8.com/)
00105         $(LINK http://www.hackcraft.net/xmlUnicode/)
00106         $(LINK http://www.unicode.org/faq/utf_bom.html)
00107         $(LINK http://www.azillionmonkeys.com/qed/unicode.html)
00108         $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)
00109 
00110 *******************************************************************************/
00111 
00112 class UnicodeBomTemplate(T) 
00113 {
00114         static if (!is (T == char) && !is (T == wchar) && !is (T == dchar)) 
00115                     pragma (msg, "Template type must be char, wchar, or dchar");
00116 
00117 
00118         private int     encoding;       // the current encoding (index into lookup)
00119         private Info*   settings;       // pointer to the lookup entry for the current encoding
00120 
00121         private Unicode.From!(T) from;
00122         private Unicode.Into!(T) into;
00123 
00124         private struct  Info
00125                 {
00126                 int     type;           // type of element (char/wchar/dchar)
00127                 int     encoding;       // Unicode.xx encoding
00128                 char[]  bom;            // pattern to match for signature
00129                 bool    test,           // should decode() look for a BOM to discover the encoding?
00130                         endian,         // does this encoding have endian concerns?
00131                         bigEndian;      // is this a big-endian encoding?
00132                 int     fallback;       // encoding to default to when no BOM is found (zero = none)
00133                 };
00134 
00135         private const Info[] lookup =
00136         [
00137         {Type.Utf8,  Unicode.Unknown,  null,        true, false, false, Unicode.UTF_8N},
00138         {Type.Utf8,  Unicode.UTF_8,    null,        true, false, false, Unicode.UTF_8N},
00139         {Type.Utf8,  Unicode.UTF_8N,   x"efbbbf",   false},
00140         {Type.Utf16, Unicode.UTF_16,   null,        true, false, false, Unicode.UTF_16BE},
00141         {Type.Utf16, Unicode.UTF_16BE, x"feff",     false, true, true},
00142         {Type.Utf16, Unicode.UTF_16LE, x"fffe",     false, true},
00143         {Type.Utf32, Unicode.UTF_32,   null,        true, false, false, Unicode.UTF_32BE},
00144         {Type.Utf32, Unicode.UTF_32BE, x"0000feff", false, true, true},
00145         {Type.Utf32, Unicode.UTF_32LE, x"fffe0000", false, true},
00146         ];
00147 
00148 
00149         /***********************************************************************
00150         
00151                 Construct an instance using the given external encoding ~ one 
00152                 of the Unicode.xx types 
00153 
00154         ***********************************************************************/
00155                                   
00156         this (int encoding)
00157         {
00158                 setup (encoding);
00159         }
00160         
00161         /***********************************************************************
00162 
00163                 Return the current encoding. This is either the originally
00164                 specified encoding, or a derived one obtained by inspecting
00165                 the content for a BOM. The latter is performed as part of 
00166                 the decode() method
00167 
00168         ***********************************************************************/
00169 
00170         final int getEncoding ()
00171         {
00172                 return encoding;
00173         }
00174         
00175         /***********************************************************************
00176 
00177                 Return the signature (BOM) of the current encoding
00178 
00179         ***********************************************************************/
00180 
00181         final void[] getSignature ()
00182         {
00183                 return settings.bom;
00184         }
00185 
00186         /***********************************************************************
00187 
00188                 Convert the provided content. The content is inspected 
00189                 for a BOM signature, which is stripped. An exception is
00190                 thrown if a signature is present when, according to the
00191                 encoding type, it should not be. Conversely, an exception
00192                 is thrown if there is no known signature where the current
00193                 encoding expects one and provides no fallback encoding
00194 
00195         ***********************************************************************/
00196 
00197         final T[] decode (void[] content)
00198         {
00199                 // look for a BOM
00200                 auto info = test (content);
00201 
00202                 // are we expecting a BOM?
00203                 if (lookup[encoding].test)
00204                     if (info)
00205                        {
00206                        // yep ~ and we got one
00207                        setup (info.encoding);
00208 
00209                        // strip BOM from content
00210                        content = content [info.bom.length .. length];
00211                        }
00212                     else
00213                        // can this encoding be defaulted?
00214                        if (settings.fallback)
00215                            setup (settings.fallback);
00216                        else
00217                           Unicode.error ("UnicodeBom.decode :: unknown or missing BOM");
00218                 else
00219                    if (info)
00220                        // found a BOM when using an explicit encoding
00221                        Unicode.error ("UnicodeBom.decode :: explicit encoding does not permit BOM");   
00222                 
00223                 // convert it to internal representation
00224                 return cast(T[]) into.convert (swapBytes(content), settings.type);
00225         }
00226 
00227         /***********************************************************************
00228 
00229                 Perform encoding of content. Note that the encoding must be 
00230                 of the explicit variety by the time we get here
00231 
00232         ***********************************************************************/
00233 
00234         final void[] encode (T[] content)
00235         {
00236                 if (settings.test)
00237                     Unicode.error ("UnicodeBom.encode :: cannot write to a non-specific encoding");
00238 
00239                 // convert it to external representation, and return it
00240                return swapBytes (from.convert (content, settings.type));
00241         }
00242 
00243         /***********************************************************************
00244 
00245                 Scan the BOM signatures looking for a match. We scan in 
00246                 reverse order to get the longest match first
00247 
00248         ***********************************************************************/
00249 
00250         private final Info* test (void[] content)
00251         {
00252                 for (Info* info=lookup.ptr+lookup.length; --info >= lookup;)
00253                      if (info.bom)
00254                         {
00255                         int len = info.bom.length;
00256                         if (len <= content.length)
00257                             if (content[0..len] == info.bom[0..len])
00258                                 return info;
00259                         }
00260                 return null;
00261         }
00262         
00263         /***********************************************************************
00264 
00265                 Swap bytes around, as required by the encoding. Returns the
00266                 same array it was given (presumably swapped in place ~ TODO confirm)
00267 
00268         ***********************************************************************/
00269 
00270         private final void[] swapBytes (void[] content)
00271         {
00272                 bool endian = settings.endian;
00273                 bool swap   = settings.bigEndian;
00274 
00275                 version (BigEndian)
00276                          swap = !swap;  // big-endian host: swap little-endian content instead
00277 
00278                 if (endian && swap)
00279                    {
00280                    if (settings.type == Type.Utf16)
00281                        ByteSwap.swap16 (content, content.length);
00282                    else
00283                        ByteSwap.swap32 (content, content.length);
00284                    }
00285                 return content;
00286         }
00287 
00288         /***********************************************************************
00289 
00290                 Configure this instance with unicode converters
00291 
00292         ***********************************************************************/
00293 
00294         private final void setup (int encoding)
00295         {
00296                 assert (Unicode.isValid (encoding));
00297 
00298                 this.settings = &lookup[encoding];
00299                 this.encoding = encoding;
00300         }
00301 }
00301 

Generated on Sat Dec 24 17:28:34 2005 for Mango by  doxygen 1.4.0