/*******************************************************************************

        @file UConverter.d

        Copyright (C) 2004 Kris Bell

        This software is provided 'as-is', without any express or implied
        warranty. In no event will the authors be held liable for damages
        of any kind arising from the use of this software.

        Permission is hereby granted to anyone to use this software for any
        purpose, including commercial applications, and to alter it and/or
        redistribute it freely, subject to the following restrictions:

        1. The origin of this software must not be misrepresented; you must
           not claim that you wrote the original software. If you use this
           software in a product, an acknowledgment within documentation of
           said product would be appreciated but is not required.

        2. Altered source versions must be plainly marked as such, and must
           not be misrepresented as being the original software.

        3. This notice may not be removed or altered from any distribution
           of the source.


                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


        @version        Initial version, October 2004
        @author         Kris

        Note that this package and documentation is built around the ICU
        project (http://oss.software.ibm.com/icu/). Below is the license
        statement as specified by that software:


                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


        ICU License - ICU 1.8.1 and later

        COPYRIGHT AND PERMISSION NOTICE

        Copyright (c) 1995-2003 International Business Machines Corporation and
        others.

        All rights reserved.

        Permission is hereby granted, free of charge, to any person obtaining a
        copy of this software and associated documentation files (the
        "Software"), to deal in the Software without restriction, including
        without limitation the rights to use, copy, modify, merge, publish,
        distribute, and/or sell copies of the Software, and to permit persons
        to whom the Software is furnished to do so, provided that the above
        copyright notice(s) and this permission notice appear in all copies of
        the Software and that both the above copyright notice(s) and this
        permission notice appear in supporting documentation.

        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
        OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
        MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
        OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
        HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
        INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
        FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
        NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
        WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

        Except as contained in this notice, the name of a copyright holder
        shall not be used in advertising or otherwise to promote the sale, use
        or other dealings in this Software without prior written authorization
        of the copyright holder.

        ----------------------------------------------------------------------

        All trademarks and registered trademarks mentioned herein are the
        property of their respective owners.
*******************************************************************************/

module mango.icu.UConverter;

private import mango.icu.ICU;

/*******************************************************************************

        This API is used to convert codepage or character encoded data to
        and from UTF-16. You can open a converter with ucnv_open(). With
        that converter, you can get its properties, set options, convert
        your data and close the converter.

        Since many software programs recognize different converter names
        for different types of converters, there are other functions in
        this API to iterate over the converter aliases.

        See <A HREF="http://oss.software.ibm.com/icu/apiref/ucnv_8h.html">
        this page</A> for full details.

*******************************************************************************/

class UConverter : ICU
{
        // handle obtained from ucnv_open(); closed and nulled by the destructor
        private Handle converter;

        /***********************************************************************

                Creates a UConverter object with the names specified as a
                string.

                The actual name will be resolved with the alias file using
                a case-insensitive string comparison that ignores delimiters
                '-', '_', and ' ' (dash, underscore, and space). E.g., the
                names "UTF8", "utf-8", and "Utf 8" are all equivalent. If null
                is passed for the converter name, it will create one with the
                getDefaultName() return value.

                A converter name may contain options like a locale specification
                to control the specific behavior of the converter instantiated.
                The meaning of the options depends on the particular converter:
                if an option is not defined for or recognized, it is ignored.

                Options are appended to the converter name string, with an
                OptionSepChar between the name and the first option and also
                between adjacent options.

                The conversion behavior and names can vary between platforms,
                and ICU may convert some characters differently from other
                platforms. Details on this topic are in the User's Guide.

                exception() is invoked when ICU reports an error while
                opening the converter.

        ***********************************************************************/

        this (char[] name)
        {
                Error e;

                converter = ucnv_open (toString (name), e);
                if (isError (e))
                    exception ("failed to create converter for '"~name~"'");
        }

        /***********************************************************************

                Deletes the unicode converter and releases resources
                associated with just this instance. Does not free up
                shared converter tables.

        ***********************************************************************/

        ~this ()
        {
                // the handle may be null if construction failed
                if (converter)
                    ucnv_close (converter);
                converter = null;
        }

        /***********************************************************************

                Do a fuzzy compare of two converter/alias names. The
                comparison is case-insensitive. It also ignores the
                characters '-', '_', and ' ' (dash, underscore, and space).
                Thus the strings "UTF-8", "utf_8", and "Utf 8" are exactly
                equivalent.

                Returns the value produced by ucnv_compareNames().

        ***********************************************************************/

        static final int compareNames (char[] a, char[] b)
        {
                return ucnv_compareNames (toString(a), toString(b));
        }

        /***********************************************************************

                Resets the state of this converter to the default state.

                This is used in the case of an error, to restart a
                conversion from a known default state. It will also
                empty the internal output buffers.

        ***********************************************************************/

        void reset ()
        {
                ucnv_reset (converter);
        }

        /***********************************************************************

                Resets the to-Unicode part of this converter state to the
                default state.

                This is used in the case of an error to restart a conversion
                to Unicode to a known default state. It will also empty the
                internal output buffers used for the conversion to Unicode
                codepoints.

        ***********************************************************************/

        void resetDecoder ()
        {
                ucnv_resetToUnicode (converter);
        }

        /***********************************************************************

                Resets the from-Unicode part of this converter state to the
                default state.

                This is used in the case of an error to restart a conversion
                from Unicode to a known default state. It will also empty the
                internal output buffers used for the conversion from Unicode
                codepoints.

        ***********************************************************************/

        void resetEncoder ()
        {
                ucnv_resetFromUnicode (converter);
        }

        /***********************************************************************

                Returns the maximum number of bytes that are output per
                UChar in conversion from Unicode using this converter.

                The returned number can be used to calculate the size of
                a target buffer for conversion from Unicode.

                This number may not be the same as the maximum number of
                bytes per "conversion unit". In other words, it may not
                be the intuitively expected number of bytes per character
                that would be published for a charset, and may not fulfill
                any other purpose than the allocation of an output buffer
                of guaranteed sufficient size for a given input length and
                converter.

                Examples for special cases that are taken into account:

                * Supplementary code points may convert to more bytes than
                  BMP code points. This function returns bytes per UChar
                  (UTF-16 code unit), not per Unicode code point, for efficient
                  buffer allocation.
                * State-shifting output (SI/SO, escapes, etc.) from stateful
                  converters.
                * When m input UChars are converted to n output bytes, then
                  the maximum m/n is taken into account.

                The number returned here does not take into account:

                * callbacks which output more than one charset character
                  sequence per call, like escape callbacks
                * initial and final non-character bytes that are output by
                  some converters (automatic BOMs, initial escape sequence,
                  final SI, etc.)

                Examples for returned values:

                * SBCS charsets: 1
                * Shift-JIS: 2
                * UTF-16: 2 (2 per BMP, 4 per surrogate _pair_, BOM not counted)
                * UTF-8: 3 (3 per BMP, 4 per surrogate _pair_)
                * EBCDIC_STATEFUL (EBCDIC mixed SBCS/DBCS): 3 (SO + DBCS)
                * ISO-2022: 3 (always outputs UTF-8)
                * ISO-2022-JP: 6 (4-byte escape sequences + DBCS)
                * ISO-2022-CN: 8 (4-byte designator sequences + 2-byte SS2/SS3
                  + DBCS)

        ***********************************************************************/

        ubyte getMaxCharSize ()
        {
                return ucnv_getMaxCharSize (converter);
        }

        /***********************************************************************

                Returns the minimum byte length for characters in this
                codepage. This is usually either 1 or 2.

        ***********************************************************************/

        ubyte getMinCharSize ()
        {
                return ucnv_getMinCharSize (converter);
        }

        /***********************************************************************

                Gets the internal, canonical name of the converter (zero-
                terminated).

                The C string returned by ICU is wrapped with toArray();
                testError() is applied to the ICU status afterwards.

        ***********************************************************************/

        char[] getName ()
        {
                Error e;

                char[] name = toArray (ucnv_getName (converter, e));
                testError (e, "failed to get converter name");
                return name;
        }

        /***********************************************************************

                Determines if the converter contains ambiguous mappings of
                the same character or not

        ***********************************************************************/

        bool isAmbiguous ()
        {
                return cast(bool) ucnv_isAmbiguous (converter);
        }

        /***********************************************************************

                Detects Unicode signature byte sequences at the start
                of the byte stream and returns the charset name of the
                indicated Unicode charset. An exception is thrown when
                no Unicode signature is recognized.

                A caller can create a Converter using the charset name.
                The first code unit (UChar) from the start of the stream
                will be U+FEFF (the Unicode BOM/signature character) and
                can usually be ignored.

        ***********************************************************************/

        static final char[] detectSignature (void[] input)
        {
                Error e;
                uint  len;      // receives the signature length; not used here
                char* name;

                name = ucnv_detectUnicodeSignature (input, input.length, len, e);
                if (name == null || isError (e))
                    exception ("failed to detect signature");
                return toArray (name);
        }

        /***********************************************************************

                Converts an array of unicode characters to an array of
                codepage characters.

                This function is optimized for converting a continuous
                stream of data in buffer-sized chunks, where the entire
                source and target does not fit in available buffers.

                The source pointer is an in/out parameter. It starts out
                pointing where the conversion is to begin, and ends up
                pointing after the last UChar consumed.

                Target similarly starts out pointing at the first available
                byte in the output buffer, and ends up pointing after the
                last byte written to the output.

                The converter always attempts to consume the entire source
                buffer, unless (1.) the target buffer is full, or (2.) a
                failing error is returned from the current callback function.
                When a successful error status has been returned, it means
                that all of the source buffer has been consumed. At that
                point, the caller should reset the source and sourceLimit
                pointers to point to the next chunk.

                At the end of the stream (flush==true), the input is completely
                consumed when *source==sourceLimit and no error code is set.
                The converter object is then automatically reset by this
                function. (This means that a converter need not be reset
                explicitly between data streams if it finishes the previous
                stream without errors.)

                This is a stateful conversion. Additionally, even when all
                source data has been consumed, some data may be in the
                converters' internal state. Call this function repeatedly,
                updating the target pointers with the next empty chunk of
                target in case of a U_BUFFER_OVERFLOW_ERROR, and updating
                the source pointers with the next chunk of source when a
                successful error status is returned, until there are no more
                chunks of source data.

                Parameters:

                    src         I/O parameter, pointer to pointer to
                                the source Unicode character buffer.
                    srcLimit    the pointer just after the last of
                                the source buffer
                    dst         I/O parameter. Input : Points to the
                                beginning of the buffer to copy codepage
                                characters to. Output : points to after
                                the last codepage character copied to
                                the target.
                    dstLimit    the pointer just after last of the
                                target buffer
                    offsets     if NULL is passed, nothing will happen
                                to it, otherwise it needs to have the
                                same number of allocated cells as target.
                                Will fill in offsets from target to source
                                pointer e.g: offsets[3] is equal to 6, it
                                means that the target[3] was a result of
                                transcoding source[6] For output data
                                carried across calls, and other data
                                without a specific source character
                                (such as from escape sequences or
                                callbacks) -1 will be placed for offsets.
                    flush       set to TRUE if the current source buffer
                                is the last available chunk of the source,
                                FALSE otherwise. Note that if a failing
                                status is returned, this function may
                                have to be called multiple times with
                                flush set to TRUE until the source buffer
                                is consumed.

        ***********************************************************************/

        void encode (wchar** src, wchar* srcLimit, char** dst, char* dstLimit, int* offsets, bool flush)
        {
                Error e;

                // ICU takes the target pair first, then the source pair
                ucnv_fromUnicode (converter, dst, dstLimit, src, srcLimit, offsets, flush, e);
                testError (e, "failed to encode output");
        }

        /***********************************************************************

                Encode the Unicode string into a codepage string.

                This function is a more convenient but less powerful version
                of encode(). It is only useful for whole strings, not
                for streaming conversion. The maximum output buffer capacity
                required (barring output from callbacks) should be calculated
                using getMaxCharSize().

                Returns the length reported by ucnv_fromUChars().

        ***********************************************************************/

        uint encode (wchar[] src, char[] dst)
        {
                Error e;
                uint  len;

                len = ucnv_fromUChars (converter, dst, dst.length, src, src.length, e);
                testError (e, "failed to encode output");
                return len;
        }

        /***********************************************************************

                Converts a buffer of codepage bytes into an array of unicode
                UChars characters.

                This function is optimized for converting a continuous stream
                of data in buffer-sized chunks, where the entire source and
                target does not fit in available buffers.

                The source pointer is an in/out parameter. It starts out pointing
                where the conversion is to begin, and ends up pointing after the
                last byte of source consumed.

                Target similarly starts out pointing at the first available UChar
                in the output buffer, and ends up pointing after the last UChar
                written to the output. It does NOT necessarily keep UChar sequences
                together.

                The converter always attempts to consume the entire source buffer,
                unless (1.) the target buffer is full, or (2.) a failing error is
                returned from the current callback function. When a successful
                error status has been returned, it means that all of the source
                buffer has been consumed. At that point, the caller should reset
                the source and sourceLimit pointers to point to the next chunk.

                At the end of the stream (flush==true), the input is completely
                consumed when *source==sourceLimit and no error code is set. The
                converter object is then automatically reset by this function.
                (This means that a converter need not be reset explicitly between
                data streams if it finishes the previous stream without errors.)

                This is a stateful conversion. Additionally, even when all source
                data has been consumed, some data may be in the converters' internal
                state. Call this function repeatedly, updating the target pointers
                with the next empty chunk of target in case of a BufferOverflow, and
                updating the source pointers with the next chunk of source when a
                successful error status is returned, until there are no more chunks
                of source data.

                Parameters:

                    src         I/O parameter, pointer to pointer to the source
                                codepage buffer.
                    srcLimit    the pointer to the byte after the end of the
                                source buffer
                    dst         I/O parameter. Input : Points to the beginning
                                of the buffer to copy UChars into. Output :
                                points to after the last UChar copied.
                    dstLimit    the pointer just after the end of the target
                                buffer
                    offsets     if NULL is passed, nothing will happen to
                                it, otherwise it needs to have the same
                                number of allocated cells as target. Will
                                fill in offsets from target to source pointer
                                e.g: offsets[3] is equal to 6, it means that
                                the target[3] was a result of transcoding
                                source[6] For output data carried across
                                calls, and other data without a specific
                                source character (such as from escape
                                sequences or callbacks) -1 will be placed
                                for offsets.
                    flush       set to true if the current source buffer
                                is the last available chunk of the source,
                                false otherwise. Note that if a failing
                                status is returned, this function may have
                                to be called multiple times with flush set
                                to true until the source buffer is consumed.

        ***********************************************************************/

        void decode (char** src, char* srcLimit, wchar** dst, wchar* dstLimit, int* offsets, bool flush)
        {
                Error e;

                // ICU takes the target pair first, then the source pair
                ucnv_toUnicode (converter, dst, dstLimit, src, srcLimit, offsets, flush, e);
                testError (e, "failed to decode input");
        }

        /***********************************************************************

                Decode the codepage string into a Unicode string.

                This function is a more convenient but less powerful version
                of decode(). It is only useful for whole strings, not for
                streaming conversion. The maximum output buffer capacity
                required (barring output from callbacks) will be 2*src.length
                (each char may be converted into a surrogate pair)

                Returns the length reported by ucnv_toUChars().

        ***********************************************************************/

        uint decode (char[] src, wchar[] dst)
        {
                Error e;
                uint  len;

                len = ucnv_toUChars (converter, dst, dst.length, src, src.length, e);
                testError (e, "failed to decode input");
                return len;
        }


        /***********************************************************************

                Bind the ICU functions from a shared library. This is
                complicated by the issues regarding D and DLLs on the
                Windows platform

        ***********************************************************************/

        version (Win32)
        {
                // value returned by FunctionLoader.bind(); handed back to
                // FunctionLoader.unbind() in the static destructor
                private static void* library;
                private static char[] libraryName = "icuuc30.dll";

                /***************************************************************

                        C-linkage function pointers for the ICU converter
                        entry points used by this class. Each one is listed
                        in the targets table below, which is passed to
                        FunctionLoader.bind().

                        ucnv_resetToUnicode and ucnv_resetFromUnicode are
                        declared void, matching the ICU C API.

                ***************************************************************/

                private static extern (C)
                {
                        int    function (char*, char*) ucnv_compareNames;
                        Handle function (char*, inout Error) ucnv_open;
                        char*  function (void*, uint, inout uint, inout Error) ucnv_detectUnicodeSignature;
                        void   function (Handle) ucnv_close;
                        void   function (Handle) ucnv_reset;
                        void   function (Handle) ucnv_resetToUnicode;
                        void   function (Handle) ucnv_resetFromUnicode;
                        ubyte  function (Handle) ucnv_getMaxCharSize;
                        ubyte  function (Handle) ucnv_getMinCharSize;
                        char*  function (Handle, inout Error) ucnv_getName;
                        uint   function (Handle, wchar*, uint, char*, uint, inout Error) ucnv_toUChars;
                        uint   function (Handle, char*, uint, wchar*, uint, inout Error) ucnv_fromUChars;
                        void   function (Handle, char**, char*, wchar**, wchar*, int*, ubyte, inout Error) ucnv_fromUnicode;
                        void   function (Handle, wchar**, wchar*, char**, char*, int*, ubyte, inout Error) ucnv_toUnicode;
                        ubyte  function (Handle) ucnv_isAmbiguous;
                }

                /***************************************************************

                        Table pairing the address of each function pointer
                        above with the exported symbol name it is bound to.

                ***************************************************************/

                static FunctionLoader.Bind[] targets =
                        [
                        {cast(void**) &ucnv_open,                   "ucnv_open"},
                        {cast(void**) &ucnv_close,                  "ucnv_close"},
                        {cast(void**) &ucnv_reset,                  "ucnv_reset"},
                        {cast(void**) &ucnv_resetToUnicode,         "ucnv_resetToUnicode"},
                        {cast(void**) &ucnv_resetFromUnicode,       "ucnv_resetFromUnicode"},
                        {cast(void**) &ucnv_compareNames,           "ucnv_compareNames"},
                        {cast(void**) &ucnv_getMaxCharSize,         "ucnv_getMaxCharSize"},
                        {cast(void**) &ucnv_getMinCharSize,         "ucnv_getMinCharSize"},
                        {cast(void**) &ucnv_getName,                "ucnv_getName"},
                        {cast(void**) &ucnv_detectUnicodeSignature, "ucnv_detectUnicodeSignature"},
                        {cast(void**) &ucnv_toUChars,               "ucnv_toUChars"},
                        {cast(void**) &ucnv_fromUChars,             "ucnv_fromUChars"},
                        {cast(void**) &ucnv_toUnicode,              "ucnv_toUnicode"},
                        {cast(void**) &ucnv_fromUnicode,            "ucnv_fromUnicode"},
                        {cast(void**) &ucnv_isAmbiguous,            "ucnv_isAmbiguous"},
                        ];

                /***************************************************************

                        Module-level setup: hands the library name and the
                        targets table to FunctionLoader.bind() and keeps
                        the returned value for the static destructor.

                ***************************************************************/

                static this ()
                {
                        library = FunctionLoader.bind (libraryName, targets);
                }

                /***************************************************************

                        Module-level teardown: passes the value saved by
                        the static constructor to FunctionLoader.unbind().

                ***************************************************************/

                static ~this ()
                {
                        FunctionLoader.unbind (library);
                }
        }
}
