Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members | Related Pages

UConverter.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file UConverter.d
00004         
00005         Copyright (C) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026 
00027                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00028 
00029 
00030         @version        Initial version, October 2004      
00031         @author         Kris
00032 
00033         Note that this package and documentation is built around the ICU 
00034         project (http://oss.software.ibm.com/icu/). Below is the license 
00035         statement as specified by that software:
00036 
00037 
00038                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00039 
00040 
00041         ICU License - ICU 1.8.1 and later
00042 
00043         COPYRIGHT AND PERMISSION NOTICE
00044 
00045         Copyright (c) 1995-2003 International Business Machines Corporation and 
00046         others.
00047 
00048         All rights reserved.
00049 
00050         Permission is hereby granted, free of charge, to any person obtaining a
00051         copy of this software and associated documentation files (the
00052         "Software"), to deal in the Software without restriction, including
00053         without limitation the rights to use, copy, modify, merge, publish,
00054         distribute, and/or sell copies of the Software, and to permit persons
00055         to whom the Software is furnished to do so, provided that the above
00056         copyright notice(s) and this permission notice appear in all copies of
00057         the Software and that both the above copyright notice(s) and this
00058         permission notice appear in supporting documentation.
00059 
00060         THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
00061         OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00062         MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
00063         OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
00064         HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
00065         INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
00066         FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
00067         NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
00068         WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
00069 
00070         Except as contained in this notice, the name of a copyright holder
00071         shall not be used in advertising or otherwise to promote the sale, use
00072         or other dealings in this Software without prior written authorization
00073         of the copyright holder.
00074 
00075         ----------------------------------------------------------------------
00076 
00077         All trademarks and registered trademarks mentioned herein are the 
00078         property of their respective owners.
00079 
00080 *******************************************************************************/
00081 
00082 module mango.icu.UConverter;
00083 
00084 private import mango.icu.ICU;
00085 
00086 /*******************************************************************************
00087 
00088         This API is used to convert codepage or character encoded data to 
00089         and from UTF-16. You can open a converter with ucnv_open(). With 
00090         that converter, you can get its properties, set options, convert 
00091         your data and close the converter.
00092 
00093         Since many software programs recognize different converter names 
00094         for different types of converters, there are other functions in 
00095         this API to iterate over the converter aliases. 
00096 
00097         See <A HREF="http://oss.software.ibm.com/icu/apiref/ucnv_8h.html">
00098         this page</A> for full details.
00099 
00100 *******************************************************************************/
00101 
00102 class UConverter : ICU
00103 {
00104         private Handle converter;
00105 
00106         /***********************************************************************
00107         
00108                 Creates a UConverter object with the names specified as a 
00109                 string. 
00110                 
00111                 The actual name will be resolved with the alias file using 
00112                 a case-insensitive string comparison that ignores delimiters 
00113                 '-', '_', and ' ' (dash, underscore, and space). E.g., the 
00114                 names "UTF8", "utf-8", and "Utf 8" are all equivalent. If null
00115                 is passed for the converter name, it will create one with the 
00116                 getDefaultName() return value.
00117 
00118                 A converter name may contain options like a locale specification 
00119                 to control the specific behavior of the converter instantiated. 
00120                 The meaning of the options depends on the particular converter: 
00121                 if an option is not defined for or recognized, it is ignored.
00122 
00123                 Options are appended to the converter name string, with an 
00124                 OptionSepChar between the name and the first option and also 
00125                 between adjacent options.
00126 
00127                 The conversion behavior and names can vary between platforms, 
00128                 and ICU may convert some characters differently from other 
00129                 platforms. Details on this topic are in the User's Guide.
00130                 
00131         ***********************************************************************/
00132 
00133         this (char[] name)
00134         {
00135                 Error status;   // ICU status slot, handed by reference to ucnv_open
00136 
00137                 this.converter = ucnv_open (toString (name), status);
00138                 if (isError (status))   // report which converter name could not be opened
00139                     exception ("failed to create converter for '"~name~"'");
00140         }
00141 
00142         /***********************************************************************
00143 
00144                 Deletes the unicode converter and releases resources 
00145                 associated with just this instance. Does not free up 
00146                 shared converter tables.        
00147 
00148         ***********************************************************************/
00149 
00150         ~this ()
00151         {
00152                 if (converter)                  // only close when a handle is actually held
00153                     ucnv_close (converter);
00154                 converter = null;               // clear the handle so the guard above skips any repeat close
00155         }
00156 
00157         /***********************************************************************
00158 
00159                 Do a fuzzy compare of two converter/alias names. The 
00160                 comparison is case-insensitive. It also ignores the 
00161                 characters '-', '_', and ' ' (dash, underscore, and space). 
00162                 Thus the strings "UTF-8", "utf_8", and "Utf 8" are exactly 
00163                 equivalent
00164         
00165         ***********************************************************************/
00166 
00167         static final int compareNames (char[] a, char[] b)
00168         {       // static: uses no instance state; both names go through toString() before reaching ICU
00169                 return ucnv_compareNames (toString(a), toString(b));  // ICU's int result is returned unchanged (presumably 0 on a match - confirm in ICU docs)
00170         }
00171 
00172         /***********************************************************************
00173         
00174                 Resets the state of this converter to the default state.
00175 
00176                 This is used in the case of an error, to restart a 
00177                 conversion from a known default state. It will also 
00178                 empty the internal output buffers.
00179 
00180         ***********************************************************************/
00181 
00182         void reset ()
00183         {
00184                 ucnv_reset (converter);         // forwards this instance's handle; no status is returned or checked
00185         }
00186 
00187         /***********************************************************************
00188         
00189                 Resets the to-Unicode part of this converter state to the 
00190                 default state.
00191 
00192                 This is used in the case of an error to restart a conversion 
00193                 to Unicode to a known default state. It will also empty the 
00194                 internal output buffers used for the conversion to Unicode 
00195                 codepoints. 
00196 
00197         ***********************************************************************/
00198 
00199         void resetDecoder ()
00200         {
00201                 ucnv_resetToUnicode (converter);        // "decoder" here means the to-Unicode direction; no status is checked
00202         }
00203 
00204         /***********************************************************************
00205         
00206                 Resets the from-Unicode part of this converter state to the 
00207                 default state.
00208 
00209                 This is used in the case of an error to restart a conversion
00210                 from Unicode to a known default state. It will also empty the 
00211                 internal output buffers used for the conversion from Unicode 
00212                 codepoints. 
00213 
00214         ***********************************************************************/
00215 
00216         void resetEncoder ()
00217         {
00218                 ucnv_resetFromUnicode (converter);      // "encoder" here means the from-Unicode direction; no status is checked
00219         }
00220 
00221         /***********************************************************************
00222         
00223                 Returns the maximum number of bytes that are output per 
00224                 UChar in conversion from Unicode using this converter.
00225 
00226                 The returned number can be used to calculate the size of 
00227                 a target buffer for conversion from Unicode.
00228 
00229                 This number may not be the same as the maximum number of 
00230                 bytes per "conversion unit". In other words, it may not 
00231                 be the intuitively expected number of bytes per character 
00232                 that would be published for a charset, and may not fulfill 
00233                 any other purpose than the allocation of an output buffer 
00234                 of guaranteed sufficient size for a given input length and 
00235                 converter.
00236 
00237                 Examples for special cases that are taken into account:
00238 
00239                 * Supplementary code points may convert to more bytes than 
00240                   BMP code points. This function returns bytes per UChar 
00241                   (UTF-16 code unit), not per Unicode code point, for efficient 
00242                   buffer allocation.
00243                 * State-shifting output (SI/SO, escapes, etc.) from stateful 
00244                   converters.
00245                 * When m input UChars are converted to n output bytes, then 
00246                   the maximum m/n is taken into account.
00247 
00248                 The number returned here does not take into account:
00249 
00250                 * callbacks which output more than one charset character 
00251                   sequence per call, like escape callbacks
00252                 * initial and final non-character bytes that are output by 
00253                   some converters (automatic BOMs, initial escape sequence, 
00254                   final SI, etc.)
00255 
00256                 Examples for returned values:
00257 
00258                 * SBCS charsets: 1
00259                 * Shift-JIS: 2
00260                 * UTF-16: 2 (2 per BMP, 4 per surrogate _pair_, BOM not counted)
00261                 * UTF-8: 3 (3 per BMP, 4 per surrogate _pair_)
00262                 * EBCDIC_STATEFUL (EBCDIC mixed SBCS/DBCS): 3 (SO + DBCS)
00263                 * ISO-2022: 3 (always outputs UTF-8)
00264                 * ISO-2022-JP: 6 (4-byte escape sequences + DBCS)
00265                 * ISO-2022-CN: 8 (4-byte designator sequences + 2-byte SS2/SS3 
00266                   + DBCS)
00267 
00268         ***********************************************************************/
00269 
00270         ubyte getMaxCharSize ()
00271         {
00272                 return ucnv_getMaxCharSize (converter);  // ICU's value for this handle is returned unchanged
00273         }
00274 
00275         /***********************************************************************
00276 
00277                 Returns the minimum byte length for characters in this 
00278                 codepage. This is usually either 1 or 2.         
00279 
00280         ***********************************************************************/
00281 
00282         ubyte getMinCharSize ()
00283         {
00284                 return ucnv_getMinCharSize (converter);  // ICU's value for this handle is returned unchanged
00285         }
00286 
00287         /***********************************************************************
00288 
00289                 Gets the internal, canonical name of the converter (zero-
00290                 terminated). 
00291 
00292         ***********************************************************************/
00293 
00294         char[] getName ()
00295         {
00296                 Error e;        // ICU status, handed by reference to ucnv_getName
00297 
00298                 char* name = ucnv_getName (converter, e);       // raw C string; not touched until the status is checked
00299                 testError (e, "failed to get converter name");
00300                 return toArray (name);  // convert only after the check, matching detectSignature()
00301         }
00302 
00303         /***********************************************************************
00304 
00305                 Determines if the converter contains ambiguous mappings of 
00306                 the same character or not
00307 
00308         ***********************************************************************/
00309 
00310         bool isAmbiguous ()
00311         {
00312                 return cast(bool) ucnv_isAmbiguous (converter);  // the Win32 binding declares a ubyte result; the cast turns it into a D bool
00313         }
00314 
00315         /***********************************************************************
00316 
00317                 Detects Unicode signature byte sequences at the start 
00318                 of the byte stream and returns the charset name of the 
00319                 indicated Unicode charset. An exception is thrown when 
00320                 no Unicode signature is recognized. 
00321                 
00322                 A caller can create a Converter using the charset name. 
00323                 The first code unit (UChar) from the start of the stream 
00324                 will be U+FEFF (the Unicode BOM/signature character) and 
00325                 can usually be ignored.
00326 
00327         ***********************************************************************/
00328 
00329         static final char[] detectSignature (void[] input)
00330         {
00331                 Error   status;
00332                 uint    signatureLength;        // written by ICU (inout); not used by this wrapper
00333 
00334                 // ask ICU which Unicode charset, if any, the leading bytes announce
00335                 char*   charset = ucnv_detectUnicodeSignature (input, input.length, signatureLength, status);
00336                 if (charset == null || isError (status))
00337                     exception ("failed to detect signature");
00338                 return toArray (charset);
00339         }
00340 
00341         /***********************************************************************
00342 
00343                 Converts an array of unicode characters to an array of 
00344                 codepage characters.
00345 
00346                 This function is optimized for converting a continuous 
00347                 stream of data in buffer-sized chunks, where the entire 
00348                 source and target does not fit in available buffers.
00349 
00350                 The source pointer is an in/out parameter. It starts out 
00351                 pointing where the conversion is to begin, and ends up 
00352                 pointing after the last UChar consumed.
00353 
00354                 Target similarly starts out pointing at the first available 
00355                 byte in the output buffer, and ends up pointing after the 
00356                 last byte written to the output.
00357 
00358                 The converter always attempts to consume the entire source 
00359                 buffer, unless (1.) the target buffer is full, or (2.) a 
00360                 failing error is returned from the current callback function. 
00361                 When a successful error status has been returned, it means 
00362                 that all of the source buffer has been consumed. At that 
00363                 point, the caller should reset the source and sourceLimit 
00364                 pointers to point to the next chunk.
00365 
00366                 At the end of the stream (flush==true), the input is completely 
00367                 consumed when *source==sourceLimit and no error code is set. 
00368                 The converter object is then automatically reset by this 
00369                 function. (This means that a converter need not be reset 
00370                 explicitly between data streams if it finishes the previous 
00371                 stream without errors.)
00372 
00373                 This is a stateful conversion. Additionally, even when all 
00374                 source data has been consumed, some data may be in the 
00375                 converters' internal state. Call this function repeatedly, 
00376                 updating the target pointers with the next empty chunk of 
00377                 target in case of a U_BUFFER_OVERFLOW_ERROR, and updating 
00378                 the source pointers with the next chunk of source when a 
00379                 successful error status is returned, until there are no more 
00380                 chunks of source data.
00381 
00382                 Parameters:
00383 
00384                     converter       the Unicode converter
00385                     target          I/O parameter. Input : Points to the 
00386                                     beginning of the buffer to copy codepage 
00387                                     characters to. Output : points to after 
00388                                     the last codepage character copied to 
00389                                     target.
00390                     targetLimit     the pointer just after last of the 
00391                                     target buffer
00392                     source          I/O parameter, pointer to pointer to 
00393                                     the source Unicode character buffer.
00394                     sourceLimit     the pointer just after the last of 
00395                                     the source buffer
00396                     offsets         if NULL is passed, nothing will happen
00397                                     to it, otherwise it needs to have the 
00398                                     same number of allocated cells as target. 
00399                                     Will fill in offsets from target to source 
00400                                     pointer e.g: offsets[3] is equal to 6, it 
00401                                     means that the target[3] was a result of 
00402                                     transcoding source[6] For output data 
00403                                     carried across calls, and other data 
00404                                     without a specific source character 
00405                                     (such as from escape sequences or 
00406                                     callbacks) -1 will be placed for offsets.
00407                     flush           set to TRUE if the current source buffer 
00408                                     is the last available chunk of the source,
00409                                     FALSE otherwise. Note that if a failing 
00410                                     status is returned, this function may 
00411                                     have to be called multiple times with 
00412                                     flush set to TRUE until the source buffer 
00413                                     is consumed.
00414 
00415         ***********************************************************************/
00416 
00417         void encode (wchar** src, wchar* srcLimit, char** dst, char* dstLimit, int* offsets, bool flush)
00418         {
00419                 Error e;        // ICU status, handed by reference to ucnv_fromUnicode
00420 
00421                 ucnv_fromUnicode (converter, dst, dstLimit, src, srcLimit, offsets, flush, e);  // note: target arguments come first in the ICU call
00422                 testError (e, "failed to encode output");
00423         }
00424 
00425         /***********************************************************************
00426 
00427                 Encode the Unicode string into a codepage string.
00428 
00429                 This function is a more convenient but less powerful version 
00430                 of encode(). It is only useful for whole strings, not 
00431                 for streaming conversion. The maximum output buffer capacity 
00432                 required (barring output from callbacks) should be calculated
00433                 using getMaxCharSize().
00434 
00435         ***********************************************************************/
00436 
00437         uint encode (wchar[] src, char[] dst)
00438         {
00439                 Error e;        // ICU status, handed by reference to ucnv_fromUChars
00440                 uint  len;      // value returned by ucnv_fromUChars (presumably the encoded byte count - confirm in ICU docs)
00441 
00442                 len = ucnv_fromUChars (converter, dst, dst.length, src, src.length, e);  // destination and its capacity come first
00443                 testError (e, "failed to encode output");
00444                 return len;
00445         }
00446 
00447         /***********************************************************************
00448 
00449                 Converts a buffer of codepage bytes into an array of unicode 
00450                 UChars characters.
00451 
00452                 This function is optimized for converting a continuous stream 
00453                 of data in buffer-sized chunks, where the entire source and 
00454                 target does not fit in available buffers.
00455 
00456                 The source pointer is an in/out parameter. It starts out pointing 
00457                 where the conversion is to begin, and ends up pointing after the 
00458                 last byte of source consumed.
00459 
00460                 Target similarly starts out pointing at the first available UChar 
00461                 in the output buffer, and ends up pointing after the last UChar 
00462                 written to the output. It does NOT necessarily keep UChar sequences 
00463                 together.
00464 
00465                 The converter always attempts to consume the entire source buffer, 
00466                 unless (1.) the target buffer is full, or (2.) a failing error is 
00467                 returned from the current callback function. When a successful 
00468                 error status has been returned, it means that all of the source 
00469                 buffer has been consumed. At that point, the caller should reset 
00470                 the source and sourceLimit pointers to point to the next chunk.
00471 
00472                 At the end of the stream (flush==true), the input is completely 
00473                 consumed when *source==sourceLimit and no error code is set. The 
00474                 converter object is then automatically reset by this function. 
00475                 (This means that a converter need not be reset explicitly between 
00476                 data streams if it finishes the previous stream without errors.)
00477 
00478                 This is a stateful conversion. Additionally, even when all source 
00479                 data has been consumed, some data may be in the converters' internal 
00480                 state. Call this function repeatedly, updating the target pointers 
00481                 with the next empty chunk of target in case of a BufferOverflow, and 
00482                 updating the source pointers with the next chunk of source when a 
00483                 successful error status is returned, until there are no more chunks 
00484                 of source data.
00485 
00486                 Parameters:
00487                     converter       the Unicode converter
00488                     target          I/O parameter. Input : Points to the beginning 
00489                                     of the buffer to copy UChars into. Output : 
00490                                     points to after the last UChar copied.
00491                     targetLimit     the pointer just after the end of the target 
00492                                     buffer
00493                     source          I/O parameter, pointer to pointer to the source 
00494                                     codepage buffer.
00495                     sourceLimit     the pointer to the byte after the end of the 
00496                                     source buffer
00497                     offsets         if NULL is passed, nothing will happen to 
00498                                     it, otherwise it needs to have the same 
00499                                     number of allocated cells as target. Will 
00500                                     fill in offsets from target to source pointer
00501                                     e.g: offsets[3] is equal to 6, it means that 
00502                                     the target[3] was a result of transcoding 
00503                                     source[6] For output data carried across 
00504                                     calls, and other data without a specific 
00505                                     source character (such as from escape 
00506                                     sequences or callbacks) -1 will be placed 
00507                                     for offsets.
00508                     flush           set to true if the current source buffer 
00509                                     is the last available chunk of the source, 
00510                                     false otherwise. Note that if a failing 
00511                                     status is returned, this function may have 
00512                                     to be called multiple times with flush set 
00513                                     to true until the source buffer is consumed.
00514 
00515         ***********************************************************************/
00516 
00517         void decode (char** src, char* srcLimit, wchar** dst, wchar* dstLimit, int* offsets, bool flush)
00518         {
00519                 Error status;   // ICU status, handed by reference to ucnv_toUnicode
00520 
00521                 ucnv_toUnicode (this.converter, dst, dstLimit, src, srcLimit, offsets, flush, status);  // target arguments come first in the ICU call
00522                 testError (status, "failed to decode input");
00523         }
00524 
00525         /***********************************************************************
00526 
00527                 Decode the codepage string into a Unicode string.
00528 
00529                 This function is a more convenient but less powerful version 
00530                 of decode(). It is only useful for whole strings, not for 
00531                 streaming conversion. The maximum output buffer capacity 
00532                 required (barring output from callbacks) will be 2*src.length 
00533                 (each char may be converted into a surrogate pair)
00534 
00535         ***********************************************************************/
00536 
00537         uint decode (char[] src, wchar[] dst)
00538         {
00539                 Error status;   // ICU status, handed by reference to ucnv_toUChars
00540 
00541                 // destination buffer and its capacity are passed ahead of the source
00542                 uint produced = ucnv_toUChars (this.converter, dst, dst.length, src, src.length, status);
00543                 testError (status, "failed to decode input");
00544                 return produced;
00545         }
00546 
00547 
00548         /***********************************************************************
00549         
00550                 Bind the ICU functions from a shared library. This is
00551                 complicated by the issues regarding D and DLLs on the
00552                 Windows platform
00553 
00554         ***********************************************************************/
00555 
00556         version (Win32)
00557         {
00558                 private static void*    library;        // whatever FunctionLoader.bind returned; handed back to unbind
00559                 private static char[]   libraryName = "icuuc30.dll";     
00560 
00561                 /***************************************************************
00562                         C-linkage function pointers, one per ICU entry point used
00563                 ***************************************************************/
00564 
00565                 private static extern (C) 
00566                 {
00567                         int    function (char*, char*) ucnv_compareNames;
00568                         Handle function (char*, inout Error) ucnv_open;
00569                         char*  function (void*, uint, inout uint, inout Error) ucnv_detectUnicodeSignature;
00570                         void   function (Handle) ucnv_close;
00571                         void   function (Handle) ucnv_reset;
00572                         void   function (Handle) ucnv_resetToUnicode;    // ICU declares this void; it was int here
00573                         void   function (Handle) ucnv_resetFromUnicode;  // ICU declares this void; it was int here
00574                         ubyte  function (Handle) ucnv_getMaxCharSize;
00575                         ubyte  function (Handle) ucnv_getMinCharSize;
00576                         char*  function (Handle, inout Error) ucnv_getName;
00577                         uint   function (Handle, wchar*, uint, char*, uint, inout Error) ucnv_toUChars;
00578                         uint   function (Handle, char*, uint, wchar*, uint, inout Error) ucnv_fromUChars;
00579                         void   function (Handle, char**, char*, wchar**, wchar*, int*, ubyte, inout Error) ucnv_fromUnicode;
00580                         void   function (Handle, wchar**, wchar*, char**, char*, int*, ubyte, inout Error)  ucnv_toUnicode;
00581                         ubyte  function (Handle) ucnv_isAmbiguous;
00582                 }
00583 
00584                 /***************************************************************
00585                         Pairs each pointer above with the symbol name to bind to it
00586                 ***************************************************************/
00587 
00588                 static  FunctionLoader.Bind[] targets = 
00589                         [
00590                         {cast(void**) &ucnv_open,                   "ucnv_open"}, 
00591                         {cast(void**) &ucnv_close,                  "ucnv_close"},
00592                         {cast(void**) &ucnv_reset,                  "ucnv_reset"},
00593                         {cast(void**) &ucnv_resetToUnicode,         "ucnv_resetToUnicode"},
00594                         {cast(void**) &ucnv_resetFromUnicode,       "ucnv_resetFromUnicode"},
00595                         {cast(void**) &ucnv_compareNames,           "ucnv_compareNames"},
00596                         {cast(void**) &ucnv_getMaxCharSize,         "ucnv_getMaxCharSize"},
00597                         {cast(void**) &ucnv_getMinCharSize,         "ucnv_getMinCharSize"},
00598                         {cast(void**) &ucnv_getName,                "ucnv_getName"},
00599                         {cast(void**) &ucnv_detectUnicodeSignature, "ucnv_detectUnicodeSignature"},
00600                         {cast(void**) &ucnv_toUChars,               "ucnv_toUChars"},
00601                         {cast(void**) &ucnv_fromUChars,             "ucnv_fromUChars"},
00602                         {cast(void**) &ucnv_toUnicode,              "ucnv_toUnicode"},
00603                         {cast(void**) &ucnv_fromUnicode,            "ucnv_fromUnicode"},
00604                         {cast(void**) &ucnv_isAmbiguous,            "ucnv_isAmbiguous"},
00605                         ];
00606 
00607                 /***************************************************************
00608                         Module start-up: bind every target from libraryName
00609                 ***************************************************************/
00610 
00611                 static this ()
00612                 {
00613                         library = FunctionLoader.bind (libraryName, targets);
00614                 }
00615 
00616                 /***************************************************************
00617                         Module shutdown: hand the library back to FunctionLoader
00618                 ***************************************************************/
00619 
00620                 static ~this ()
00621                 {
00622                         FunctionLoader.unbind (library);
00623                 }
00624         }
00625 }
00626 
00627 

Generated on Sun Nov 7 19:06:53 2004 for Mango by doxygen 1.3.6