/*******************************************************************************

        @file UMango.d

        Copyright (c) 2004 Kris Bell

        This software is provided 'as-is', without any express or implied
        warranty. In no event will the authors be held liable for damages
        of any kind arising from the use of this software.

        Permission is hereby granted to anyone to use this software for any
        purpose, including commercial applications, and to alter it and/or
        redistribute it freely, subject to the following restrictions:

        1. The origin of this software must not be misrepresented; you must
           not claim that you wrote the original software. If you use this
           software in a product, an acknowledgment within documentation of
           said product would be appreciated but is not required.

        2. Altered source versions must be plainly marked as such, and must
           not be misrepresented as being the original software.

        3. This notice may not be removed or altered from any distribution
           of the source.

        4. Derivative works are permitted, but they must carry this notice
           in full and credit the original source.


                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


        @version        Initial version, October 2004
        @author         Kris

*******************************************************************************/

module mango.icu.UMango;

public import mango.icu.UConverter;

/*******************************************************************************

        Include these classes when compiled with the Mango.io package.
        They represent the 'glue' to bind said package to the unicode
        converters provided by ICU.

*******************************************************************************/

version (Mango)
{
        private import mango.io.model.IReader;
        private import mango.io.model.IWriter;

        /***********************************************************************

                Abstract base class for String decoders. These decoders
                bind the ICU functionality to the Mango.io package, and
                provide some utility functions such as input streaming.

                These decoder classes will always attempt to fill their
                destination (provided) output array, but may terminate
                early if (a) a defined read 'limit' on the input stream
                has been reached or (b) a partial surrogate-pair would
                be left at the output tail. Each decoder returns a count
                of how many output elements were actually converted.

        ***********************************************************************/

        class StringDecoder : IReadable, IDecoder
        {
                private UConverter      cvt;                    // converter doing the actual work
                private bool            done;                   // set by the conversion delegate to stop decode()
                private IBuffer         bound;                  // buffer of the IReader we were bound to
                private uint            limit = uint.max;       // input bytes still permitted; uint.max == unlimited

                /***************************************************************

                        Decoders can be used to convert directly into a
                        provided destination. The converter will try to
                        fill the destination, up to the configured input
                        'limit', and returns the number of elements thus
                        converted. This returned value will be less than
                        the destination capacity when either the 'limit'
                        was reached, or when a partial surrogate would
                        be placed at the tail.

                ***************************************************************/

                abstract uint read (IBuffer b, wchar[] dst);

                /***************************************************************

                        Signature for BufferDecoder handlers. These
                        decoders are intended to be usable as the
                        default handlers within the reader constructs.
                        Use IReader.setDecoder() to set a decoder as
                        the default handler.

                ***************************************************************/

                abstract uint decoder (void* p, uint capacity);

                /***************************************************************

                        Return the type of this decoder

                ***************************************************************/

                abstract ConverterType type ();

                /***************************************************************

                        Set the limit for this decoder. This will cause
                        the decoder to halt after reading the specified
                        number of bytes from its input. The decoder may
                        also halt before that point if the destination
                        becomes full. Method toGo() reports how many of
                        those bytes are still to be read (note that it
                        is declared protected).

                ***************************************************************/

                void setLimit (uint limit)
                {
                        this.limit = limit;
                }

                /***************************************************************

                        Change the converter used for this decoder. The
                        converter must not be null.

                ***************************************************************/

                void setConverter (UConverter cvt)
                in {
                   assert (cvt);
                   }
                body
                {
                        this.cvt = cvt;
                }

                /***************************************************************

                        Reset the converter and the input limit. The latter
                        defaults to being unlimited, causing the decoder to
                        read until the destination is full.

                ***************************************************************/

                void reset (uint limit = uint.max)
                {
                        setLimit (limit);
                        cvt.reset ();
                }

                /***************************************************************

                        Return the number of bytes yet to be read. This
                        is the current value of the input 'limit', which
                        stays at uint.max while no limit is active.

                ***************************************************************/

                protected uint toGo ()
                {
                        return limit;
                }

                /***************************************************************

                        Placeholder for subclasses to do something useful
                        when applied to an IReader. See UString for an
                        example of such usage.

                ***************************************************************/

                protected void read (IReader r)
                {
                }

                /***************************************************************

                        Bind this StringDecoder to the specified IReader.
                        This is invoked by an IReader to install it as the
                        default handler, and thus be used by all subsequent
                        IReader.get() requests for the subclass type.

                        Note that the byte limit will be respected if 'limit'
                        has been set, which can be useful when converting an
                        unknown number of elements (a la HTTP).

                ***************************************************************/

                final BufferDecoder bind (IReader reader)
                {
                        // remember the reader's buffer; decoder() pulls from it
                        bound = reader.getBuffer ();
                        return &decoder;
                }

                /***************************************************************

                        Decode IBuffer input until the delegate indicates
                        it is finished. Typically, that occurs when either
                        the destination is full, or the input 'limit' has
                        been reached.

                        The delegate is handed the readable content of the
                        buffer, and returns the number of bytes it consumed.
                        It signals completion by setting 'done'.

                ***************************************************************/

                private final void decode (IBuffer buffer, int delegate (void[]) dg)
                {
                        done = false;
                        while (limit && !done)
                              {
                              // ask the buffer for at least one byte (without
                              // consuming it), then let the delegate convert
                              // whatever is readable
                              buffer.get (1, false);
                              buffer.read (dg);
                              }
                }
        }


        /***********************************************************************

                Decode a byte stream into UTF16 wchars. This decoder can:

                - be used as the default wchar handler when attached to
                  an IReader (see IReader.setDecoder).

                - be used directly to fill a provided destination array
                  with converted wchars.

                - be used in either of the prior two cases with a 'limit'
                  placed upon the number of input bytes converted (in
                  addition to the destination capacity limit). This can
                  be useful when the number of raw bytes is known, but
                  the number of wchar elements is not, and can be handy
                  for streaming conversions.

        ***********************************************************************/

        class StringDecoder16 : StringDecoder
        {
                /***************************************************************

                        Construct a decoder with the given UConverter, and
                        an optional 'limit' to the number of input bytes to
                        be converted. The converter must not be null.

                ***************************************************************/

                this (UConverter cvt, uint limit = uint.max)
                {
                        // go through setConverter() so that its non-null
                        // contract is applied to construction as well
                        setConverter (cvt);
                        super.reset (limit);
                }

                /***************************************************************

                        Construct a decoder of the given specification, and
                        an optional 'limit' to the number of input bytes to
                        be converted.

                ***************************************************************/

                this (char[] type, uint limit = uint.max)
                {
                        this (new UConverter (type), limit);
                }

                /***************************************************************

                        Return the type of this decoder

                ***************************************************************/

                ConverterType type ()
                {
                        return ConverterType.WChar;
                }

                /***************************************************************

                        Signature for BufferDecoder handlers. These
                        decoders are intended to be usable as the
                        default handlers within the reader constructs.
                        Use IReader.setDecoder() to set a decoder as
                        the default handler.

                        'capacity' is expressed in bytes, as is the
                        returned count; both are scaled to and from
                        wchar elements around the call to read(). The
                        decoder must have been bound via bind() first.

                ***************************************************************/

                protected uint decoder (void* p, uint capacity)
                {
                        // this ugly conversion/casting back and forth is
                        // a lot more efficient than the intrinsic array
                        // conversion generated via an array[] cast
                        return read (bound, (cast(wchar*) p)[0..capacity / wchar.sizeof]) * wchar.sizeof;
                }

                /***************************************************************

                        Decoders can be used to convert directly into a
                        provided destination. The converter will try to
                        fill the destination, up to the configured input
                        'limit', and returns the number of elements thus
                        converted. This returned value will be less than
                        the destination capacity when either the 'limit'
                        was reached, or when a partial surrogate would
                        have been placed at the tail.

                        An empty destination returns zero immediately,
                        without touching the input buffer.

                ***************************************************************/

                final uint read (IBuffer buffer, wchar[] dst)
                {
                        uint produced;

                        // nothing can be stored in an empty destination, so
                        // do not demand (or consume) any input on its behalf
                        if (dst.length == 0)
                            return 0;

                        // invoked by the buffer with its readable content;
                        // returns the number of input bytes consumed
                        int convert (void[] x)
                        {
                                UAdjust adj;
                                uint    len = x.length;

                                // have we read enough from the source?
                                if (len > limit)
                                    len = limit;

                                // do the conversion; test for overflow.
                                // There's an issue here with certain
                                // conversion types (e.g. utf7) where byte
                                // combinations appear ambiguous. It is
                                // possible that the converter will cache
                                // such combinations until it determines
                                // the result from subsequent input data.
                                // However, if such a condition occurs at
                                // the tail end of an input stream, the
                                // conversion may stall whilst waiting on
                                // more input. There does not appear to
                                // be a means of identifying whether or
                                // not content has been cached, so there
                                // is little one can do at this time ...
                                // Note that this issue does not exist
                                // when 'limit' is active
                                done = cvt.decode (x[0..len], dst[produced..length], adj, len == 0);

                                // adjust output. Note that we always clip
                                // the bytes read to match the output size
                                if ((produced += adj.output) >= dst.length)
                                     done = true;

                                // are we limiting input?
                                if (limit != uint.max)
                                    limit -= adj.input;

                                // say how much we consumed
                                return adj.input;
                        }

                        decode (buffer, &convert);
                        return produced;
                }
        }



        /***********************************************************************

                Abstract base class for String encoders. An encoder can
                be bound to an IWriter, whereupon it converts content
                into the buffer of that writer.

        ***********************************************************************/

        class StringEncoder : IEncoder
        {
                private bool            more;           // set by the conversion delegate to keep encode() looping
                private IBuffer         bound;          // buffer of the IWriter we were bound to

                /***************************************************************

                        Reset the underlying converter

                ***************************************************************/

                abstract void reset ();

                /***************************************************************

                        Return the type of this encoder

                ***************************************************************/

                abstract ConverterType type ();

                /***************************************************************

                        Signature for BufferEncoder handlers; this is
                        what bind() hands back to the IWriter.

                ***************************************************************/

                abstract void encoder (void* p, uint count);

                /***************************************************************

                        Bind this StringEncoder to the specified IWriter.
                        This is invoked by an IWriter to install it as the
                        default handler, and thus be used by all subsequent
                        IWriter.put() requests for the subclass type.

                ***************************************************************/

                final BufferEncoder bind (IWriter w)
                {
                        // remember the writer's buffer; encoder() fills it
                        bound = w.getBuffer ();
                        return &encoder;
                }

                /***************************************************************

                        Push converted content into the buffer for as
                        long as the delegate leaves 'more' set. The
                        buffer is flushed whenever it reports that it
                        is not writable.

                ***************************************************************/

                private final void encode (IBuffer b, int delegate (void[]) dg)
                {
                        more = true;
                        while (more)
                              {
                              // make room before handing the buffer over
                              if (! b.writable)
                                    b.flush ();
                              b.write (dg);
                              }
                }
        }


        /***********************************************************************

                Encode char[] content into the encoding of a given
                UConverter, by way of an ITranscoder.

        ***********************************************************************/

        class StringEncoder8 : StringEncoder
        {
                private ITranscoder     xcode;

                /***************************************************************

                        Construct an encoder for the given UConverter,
                        where the source-content encoding is specified
                        by 'source'.

                        The default source-encoding is assumed to be utf8.

                ***************************************************************/

                this (UConverter cvt, char[] source = "utf8")
                {
                        xcode = (new UConverter(source)).createTranscoder (cvt);
                }

                /***************************************************************

                        Construct an encoder of the given output 'type',
                        where the source-content encoding is specified
                        by 'source'.

                        The default source-encoding is assumed to be utf8.

                ***************************************************************/

                this (char[] type, char[] source = "utf8")
                {
                        this (new UConverter(type), source);
                }

                /***************************************************************

                        Transcode 'c' into the provided buffer. Each
                        pass converts into the space offered by the
                        buffer, drops the consumed part of 'c', and
                        reports how much output was written.

                ***************************************************************/

                void encode (IBuffer b, char[] c)
                {
                        int write (void[] x)
                        {
                                UAdjust adj;

                                // the result decides whether the base class
                                // loops again (presumably: output space ran
                                // out -- confirm against ITranscoder)
                                more = xcode.convert (c, x, adj, c.length == 0);
                                c = c[adj.input..length];
                                return adj.output;
                        }

                        super.encode (b, &write);
                }

                /***************************************************************

                        BufferEncoder handler: encode 'count' bytes of
                        char content at 'p' into the bound buffer.

                ***************************************************************/

                protected void encoder (void* p, uint count)
                {
                        encode (bound, (cast(char*) p)[0..count/char.sizeof]);
                }

                /***************************************************************

                        Return the type of this encoder

                ***************************************************************/

                ConverterType type ()
                {
                        return ConverterType.Char;
                }

                /***************************************************************

                        Reset the transcoder

                ***************************************************************/

                void reset ()
                {
                        xcode.reset();
                }
        }


        /***********************************************************************

                Encode wchar[] content into the encoding of a given
                UConverter.

        ***********************************************************************/

        class StringEncoder16 : StringEncoder
        {
                private UConverter      cvt;

                /***************************************************************

                        Construct an encoder for the given UConverter,
                        which must not be null.

                ***************************************************************/

                this (UConverter cvt)
                {
                        // same non-null requirement as
                        // StringDecoder.setConverter()
                        assert (cvt);
                        this.cvt = cvt;
                }

                /***************************************************************

                        Construct an encoder of the given output 'type'.

                        The source-encoding is assumed to be utf16.

                ***************************************************************/

                this (char[] type)
                {
                        this (new UConverter(type));
                }

                /***************************************************************

                        Encode 'w' into the provided buffer. Each pass
                        converts into the space offered by the buffer,
                        drops the consumed part of 'w', and reports how
                        much output was written.

                ***************************************************************/

                void encode (IBuffer b, wchar[] w)
                {
                        int write (void[] x)
                        {
                                UAdjust adj;

                                // the result decides whether the base class
                                // loops again (presumably: output space ran
                                // out -- confirm against UConverter)
                                more = cvt.encode (w, x, adj, w.length == 0);
                                w = w[adj.input..length];
                                return adj.output;
                        }

                        super.encode (b, &write);
                }

                /***************************************************************

                        BufferEncoder handler: encode 'count' bytes of
                        wchar content at 'p' into the bound buffer.

                ***************************************************************/

                protected void encoder (void* p, uint count)
                {
                        encode (bound, (cast(wchar*) p)[0..count/wchar.sizeof]);
                }

                /***************************************************************

                        Return the type of this encoder

                ***************************************************************/

                ConverterType type ()
                {
                        return ConverterType.WChar;
                }

                /***************************************************************

                        Reset the converter

                ***************************************************************/

                void reset ()
                {
                        cvt.reset();
                }
        }


        /***********************************************************************

                Encode dchar[] content into the encoding of a given
                UConverter, by way of an ITranscoder.

        ***********************************************************************/

        class StringEncoder32 : StringEncoder
        {
                private ITranscoder     xcode;

                /***************************************************************

                        Construct an encoder for the given UConverter.
                        The source-encoding is utf32.

                ***************************************************************/

                this (UConverter cvt)
                {
                        xcode = (new UConverter("utf32")).createTranscoder (cvt);
                }

                /***************************************************************

                        Construct an encoder of the given output 'type'.

                        The source-encoding is assumed to be utf32.

                ***************************************************************/

                this (char[] type)
                {
                        this (new UConverter(type));
                }

                /***************************************************************

                        Transcode 'd' into the provided buffer. Each
                        pass converts into the space offered by the
                        buffer, drops the consumed part of 'd', and
                        reports how much output was written.

                ***************************************************************/

                void encode (IBuffer b, dchar[] d)
                {
                        int write (void[] x)
                        {
                                UAdjust adj;

                                more = xcode.convert (d, x, adj, d.length == 0);

                                // NOTE(review): adj.input is applied here as
                                // a count of dchar elements. The very same
                                // convert() also accepts char[] input (see
                                // StringEncoder8), so it may well report the
                                // consumed input in bytes; if so, this slice
                                // ought to use adj.input / dchar.sizeof --
                                // confirm against ITranscoder.convert()
                                d = d[adj.input..length];
                                return adj.output;
                        }

                        super.encode (b, &write);
                }

                /***************************************************************

                        BufferEncoder handler: encode 'count' bytes of
                        dchar content at 'p' into the bound buffer.

                ***************************************************************/

                protected void encoder (void* p, uint count)
                {
                        encode (bound, (cast(dchar*) p)[0..count/dchar.sizeof]);
                }

                /***************************************************************

                        Return the type of this encoder

                ***************************************************************/

                ConverterType type ()
                {
                        return ConverterType.DChar;
                }

                /***************************************************************

                        Reset the transcoder

                ***************************************************************/

                void reset ()
                {
                        xcode.reset();
                }
        }
}