/*******************************************************************************

        @file UMango.d

        Copyright (c) 2004 Kris Bell

        This software is provided 'as-is', without any express or implied
        warranty. In no event will the authors be held liable for damages
        of any kind arising from the use of this software.

        Permission is hereby granted to anyone to use this software for any
        purpose, including commercial applications, and to alter it and/or
        redistribute it freely, subject to the following restrictions:

        1. The origin of this software must not be misrepresented; you must
           not claim that you wrote the original software. If you use this
           software in a product, an acknowledgment within documentation of
           said product would be appreciated but is not required.

        2. Altered source versions must be plainly marked as such, and must
           not be misrepresented as being the original software.

        3. This notice may not be removed or altered from any distribution
           of the source.

        4. Derivative works are permitted, but they must carry this notice
           in full and credit the original source.


                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


        @version        Initial version, October 2004
        @author         Kris

*******************************************************************************/

// NOTE: altered source version -- reformatted and restyled, behaviour unchanged.

module mango.icu.UMango;

public  import  mango.icu.UConverter;

private import  mango.convert.Type;

/*******************************************************************************

        Everything below is compiled unless version 'Isolated' is set.
        These classes are the 'glue' that binds the Mango.io package
        to the unicode converters provided by ICU.

*******************************************************************************/

version (Isolated){}
else
{
        private import  mango.io.model.IReader;
        private import  mango.io.model.IWriter;

        /***********************************************************************

                Abstract base for the String decoders, which tie the ICU
                converters to Mango.io and add a few utilities such as
                input streaming.

                A decoder always tries to fill the destination array it
                is given, but may stop early when (a) the configured
                read 'limit' on the input has been reached or (b) a
                partial surrogate-pair would otherwise be left at the
                output tail. Each decoder returns the number of output
                elements actually converted.

        ***********************************************************************/

        class StringDecoder : AbstractDecoder, IReadable
        {
                private UConverter      cvt;
                private bool            done;
                private IBuffer         bound;
                private uint            limit = uint.max;

                /***************************************************************

                        Convert directly into the provided destination.
                        Implementations try to fill 'dst', subject to
                        the configured input 'limit', and return the
                        number of elements converted. The result is
                        smaller than the destination capacity when the
                        'limit' was reached or when a partial surrogate
                        would have landed at the tail.

                ***************************************************************/

                abstract uint read (IBuffer b, wchar[] dst);

                /***************************************************************

                        BufferDecoder handler signature. Decoders are
                        meant to be usable as default handlers within
                        the reader constructs; IReader.setDecoder() is
                        the documented way to install one.

                ***************************************************************/

                abstract uint decoder (void* p, uint capacity, uint type);

                /***************************************************************

                        Return the type of this decoder

                ***************************************************************/

                abstract uint type ();

                /***************************************************************

                        Set the input limit: decoding halts once that
                        many bytes have been consumed from the input.
                        Decoding may halt sooner if the destination
                        becomes full. toGo() returns the current value
                        of the limit.

                ***************************************************************/

                void setLimit (uint limit)
                {
                        this.limit = limit;
                }

                /***************************************************************

                        Replace the converter used by this decoder. The
                        'in' contract requires a non-null converter.

                ***************************************************************/

                void setConverter (UConverter cvt)
                in {
                   assert (cvt);
                   }
                body
                {
                        this.cvt = cvt;
                }

                /***************************************************************

                        Re-arm the decoder: install a new input limit
                        and reset the converter. The limit defaults to
                        uint.max, which is treated as 'unlimited' so
                        that decoding runs until the destination fills.

                ***************************************************************/

                void reset (uint limit = uint.max)
                {
                        // limit first, then converter state
                        setLimit (limit);
                        cvt.reset ();
                }

                /***************************************************************

                        Return the number of bytes yet to be read (the
                        current value of 'limit').

                ***************************************************************/

                protected uint toGo ()
                {
                        return limit;
                }

                /***************************************************************

                        Intentionally empty hook: subclasses may do
                        something useful when applied to an IReader.
                        UString is cited as an example of such usage.

                ***************************************************************/

                protected void read (IReader r)
                {
                }

                /***************************************************************

                        Remember the IBuffer that decoder() will pull
                        its input from. Per the original notes this is
                        invoked when the decoder is installed as the
                        default handler of an IReader, after which it
                        serves the get() requests for the subclass type.

                        Any 'limit' that has been set is still honoured,
                        which helps when converting an unknown number
                        of elements (a la HTTP).

                ***************************************************************/

                final void bind (IBuffer buffer)
                {
                        bound = buffer;
                }

                /***************************************************************

                        Drive the delegate with IBuffer content until
                        it flags completion through 'done', or until the
                        input 'limit' has dropped to zero. Completion is
                        typically signalled once the destination is
                        full.

                ***************************************************************/

                private final void decode (IBuffer buffer, uint delegate (void[]) dg)
                {
                        done = false;

                        while (!done && limit != 0)
                              {
                              // NOTE(review): get (1, false) presumably makes
                              // sure some input is available before the
                              // delegate runs -- confirm against IBuffer
                              buffer.get (1, false);
                              buffer.read (dg);
                              }
                }
        }


        /***********************************************************************

                Decode a byte stream into UTF16 wchars. This decoder can:

                - act as the default wchar handler when attached to an
                  IReader (see IReader.setDecoder).

                - fill a caller-provided destination array with
                  converted wchars.

                - do either of the above with a 'limit' on the number
                  of input bytes converted, in addition to the limit
                  imposed by the destination capacity. That is handy
                  for streaming conversions where the raw byte count
                  is known but the number of wchar elements is not.

        ***********************************************************************/

        class StringDecoder16 : StringDecoder
        {
                /***************************************************************

                        Construct a decoder around the given UConverter,
                        with an optional 'limit' on the number of input
                        bytes to be converted.

                ***************************************************************/

                this (UConverter cvt, uint limit = uint.max)
                {
                        // the converter must be in place before the
                        // reset, since reset() also resets the converter
                        this.cvt = cvt;
                        super.reset (limit);
                }

                /***************************************************************

                        Construct a decoder for the named converter
                        specification, with an optional 'limit' on the
                        number of input bytes to be converted.

                ***************************************************************/

                this (char[] type, uint limit = uint.max)
                {
                        this (new UConverter (type), limit);
                }

                /***************************************************************

                        Return the type of this decoder (Type.Utf16)

                ***************************************************************/

                uint type ()
                {
                        return Type.Utf16;
                }

                /***************************************************************

                        BufferDecoder handler: decode from the bound
                        buffer into the raw memory at 'p', where
                        'capacity' is a byte count. Returns the number
                        of bytes produced.

                ***************************************************************/

                protected uint decoder (void* p, uint capacity, uint type)
                {
                        // slicing a typed pointer looks ugly, yet the
                        // original author notes it is a lot cheaper than
                        // the conversion generated by an array[] cast
                        wchar[] target = (cast(wchar*) p) [0 .. capacity / wchar.sizeof];

                        // read() counts wchars; the caller wants bytes
                        return read (bound, target) * wchar.sizeof;
                }

                /***************************************************************

                        Convert directly into the provided destination.
                        The converter tries to fill 'dst', subject to
                        the configured input 'limit', and the number of
                        elements converted is returned. That number is
                        smaller than the destination capacity when the
                        'limit' was reached or when a partial surrogate
                        would have been placed at the tail.

                ***************************************************************/

                final uint read (IBuffer buffer, wchar[] dst)
                {
                        uint produced;

                        // invoked by the buffer with its readable content;
                        // returns the number of input bytes consumed
                        uint consume (void[] src)
                        {
                                UAdjust adj;

                                // never feed the converter beyond 'limit'
                                uint len = src.length;
                                if (limit < len)
                                    len = limit;

                                // Convert, and note whether the converter
                                // says we are finished.
                                //
                                // Known issue: some conversion types (utf7
                                // for example) have byte combinations that
                                // look ambiguous, and the converter may
                                // cache them until later input settles the
                                // matter. Should that happen at the very
                                // end of an input stream, the conversion
                                // can stall waiting for more input. There
                                // appears to be no way to ask whether
                                // content has been cached, so little can
                                // be done about it at this time. The issue
                                // does not arise while 'limit' is active.
                                done = cvt.decode (src[0 .. len], dst[produced .. dst.length], adj, len == 0);

                                // account for the output; a full destination
                                // always ends the conversion
                                produced += adj.output;
                                if (produced >= dst.length)
                                    done = true;

                                // uint.max means 'unlimited': leave it alone
                                if (limit != uint.max)
                                    limit -= adj.input;

                                // tell the buffer how much was consumed
                                return adj.input;
                        }

                        decode (buffer, &consume);
                        return produced;
                }
        }



        /***********************************************************************

                Abstract base for the String encoders: holds the bound
                output buffer and the shared write loop used by the
                concrete encoders.

        ***********************************************************************/

        class StringEncoder : AbstractEncoder
        {
                private bool    more;
                private IBuffer bound;

                /***************************************************************

                        Reset the underlying conversion state

                ***************************************************************/

                abstract void reset ();

                /***************************************************************

                        Return the type of this encoder

                ***************************************************************/

                abstract uint type ();

                /***************************************************************

                        Handler signature: encode 'count' bytes of
                        content located at 'p' into the bound buffer.

                ***************************************************************/

                abstract uint encoder (void* p, uint count, uint type);

                /***************************************************************

                        Remember the IBuffer that encoder() will write
                        into. Per the original notes this is invoked
                        when the encoder is installed as the default
                        handler of an IWriter, after which it serves
                        the put() requests for the subclass type.

                ***************************************************************/

                void bind (IBuffer buffer)
                {
                        bound = buffer;
                }

                /***************************************************************

                        Hand the delegate to the buffer repeatedly for
                        as long as the 'more' flag remains set, asking
                        the buffer for additional room between
                        attempts. The delegate is expected to update
                        'more'.

                ***************************************************************/

                private final void encode (IBuffer b, uint delegate (void[]) dg)
                {
                        more = true;

                        for (;;)
                            {
                            b.write (dg);
                            if (! more)
                                  break;

                            // 1024 ought to be some 'realistic' number;
                            // it is there to handle the case of a
                            // GrowBuffer
                            b.makeRoom (1024);
                            }
                }
        }


        /***********************************************************************

                Encoder for char[] content, transcoding from a source
                encoding (utf8 by default) to that of a UConverter.

        ***********************************************************************/

        class StringEncoder8 : StringEncoder
        {
                private ITranscoder     xcode;

                /***************************************************************

                        Construct an encoder targeting the given
                        UConverter; the encoding of the source content
                        is named by 'source'.

                        The source-encoding defaults to utf8.

                ***************************************************************/

                this (UConverter cvt, char[] source = "utf8")
                {
                        xcode = (new UConverter(source)).createTranscoder (cvt);
                }

                /***************************************************************

                        Construct an encoder producing the given output
                        'type'; the encoding of the source content is
                        named by 'source'.

                        The source-encoding defaults to utf8.

                ***************************************************************/

                this (char[] type, char[] source = "utf8")
                {
                        this (new UConverter(type), source);
                }

                /***************************************************************

                        Transcode the content of 'c' into buffer 'b'.

                ***************************************************************/

                void encode (IBuffer b, char[] c)
                {
                        // convert what remains of 'c' into the span handed
                        // over by the buffer; returns the bytes produced
                        uint emit (void[] dst)
                        {
                                UAdjust adj;

                                more = xcode.convert (c, dst, adj, c.length == 0);

                                // drop the input consumed by this pass
                                c = c[adj.input .. c.length];
                                return adj.output;
                        }

                        super.encode (b, &emit);
                }

                /***************************************************************

                        Handler: treat the 'count' bytes at 'p' as
                        char content and encode them into the bound
                        buffer. Always returns 0.

                ***************************************************************/

                protected uint encoder (void* p, uint count, uint type)
                {
                        char[] content = (cast(char*) p) [0 .. count / char.sizeof];

                        encode (bound, content);
                        return 0;
                }

                /***************************************************************

                        Return the type of this encoder (Type.Utf8)

                ***************************************************************/

                uint type ()
                {
                        return Type.Utf8;
                }

                /***************************************************************

                        Reset the transcoder

                ***************************************************************/

                void reset ()
                {
                        xcode.reset();
                }
        }


        /***********************************************************************

                Encoder for wchar[] content, encoding through a
                UConverter.

        ***********************************************************************/

        class StringEncoder16 : StringEncoder
        {
                private UConverter      cvt;

                /***************************************************************

                        Construct an encoder around the given
                        UConverter.

                ***************************************************************/

                this (UConverter cvt)
                {
                        this.cvt = cvt;
                }

                /***************************************************************

                        Construct an encoder producing the given output
                        'type'.

                        The source-encoding is assumed to be utf16.

                ***************************************************************/

                this (char[] type)
                {
                        this (new UConverter(type));
                }

                /***************************************************************

                        Encode the content of 'w' into buffer 'b'.

                ***************************************************************/

                void encode (IBuffer b, wchar[] w)
                {
                        // encode what remains of 'w' into the span handed
                        // over by the buffer; returns the bytes produced
                        uint emit (void[] dst)
                        {
                                UAdjust adj;

                                more = cvt.encode (w, dst, adj, w.length == 0);

                                // drop the input consumed by this pass
                                w = w[adj.input .. w.length];
                                return adj.output;
                        }

                        super.encode (b, &emit);
                }

                /***************************************************************

                        Handler: treat the 'count' bytes at 'p' as
                        wchar content and encode them into the bound
                        buffer. Always returns 0.

                ***************************************************************/

                protected uint encoder (void* p, uint count, uint type)
                {
                        wchar[] content = (cast(wchar*) p) [0 .. count / wchar.sizeof];

                        encode (bound, content);
                        return 0;
                }

                /***************************************************************

                        Return the type of this encoder (Type.Utf16)

                ***************************************************************/

                uint type ()
                {
                        return Type.Utf16;
                }

                /***************************************************************

                        Reset the converter

                ***************************************************************/

                void reset ()
                {
                        cvt.reset();
                }
        }


        /***********************************************************************

                Encoder for dchar[] content, transcoding from utf32 to
                the encoding of a UConverter.

        ***********************************************************************/

        class StringEncoder32 : StringEncoder
        {
                private ITranscoder     xcode;

                /***************************************************************

                        Construct an encoder targeting the given
                        UConverter, with utf32 as the source encoding.

                ***************************************************************/

                this (UConverter cvt)
                {
                        xcode = (new UConverter("utf32")).createTranscoder (cvt);
                }

                /***************************************************************

                        Construct an encoder producing the given output
                        'type'.

                        The source-encoding is assumed to be utf32.

                ***************************************************************/

                this (char[] type)
                {
                        this (new UConverter(type));
                }

                /***************************************************************

                        Transcode the content of 'd' into buffer 'b'.

                ***************************************************************/

                void encode (IBuffer b, dchar[] d)
                {
                        // convert what remains of 'd' into the span handed
                        // over by the buffer; returns the bytes produced
                        uint emit (void[] dst)
                        {
                                UAdjust adj;

                                more = xcode.convert (d, dst, adj, d.length == 0);

                                // drop the input consumed by this pass
                                d = d[adj.input .. d.length];
                                return adj.output;
                        }

                        super.encode (b, &emit);
                }

                /***************************************************************

                        Handler: treat the 'count' bytes at 'p' as
                        dchar content and encode them into the bound
                        buffer. Always returns 0.

                ***************************************************************/

                protected uint encoder (void* p, uint count, uint type)
                {
                        dchar[] content = (cast(dchar*) p) [0 .. count / dchar.sizeof];

                        encode (bound, content);
                        return 0;
                }

                /***************************************************************

                        Return the type of this encoder (Type.Utf32)

                ***************************************************************/

                uint type ()
                {
                        return Type.Utf32;
                }

                /***************************************************************

                        Reset the transcoder

                ***************************************************************/

                void reset ()
                {
                        xcode.reset();
                }
        }
}