00001 /******************************************************************************* 00002 00003 @file UString.d 00004 00005 Copyright (c) 2004 Kris Bell 00006 00007 This software is provided 'as-is', without any express or implied 00008 warranty. In no event will the authors be held liable for damages 00009 of any kind arising from the use of this software. 00010 00011 Permission is hereby granted to anyone to use this software for any 00012 purpose, including commercial applications, and to alter it and/or 00013 redistribute it freely, subject to the following restrictions: 00014 00015 1. The origin of this software must not be misrepresented; you must 00016 not claim that you wrote the original software. If you use this 00017 software in a product, an acknowledgment within documentation of 00018 said product would be appreciated but is not required. 00019 00020 2. Altered source versions must be plainly marked as such, and must 00021 not be misrepresented as being the original software. 00022 00023 3. This notice may not be removed or altered from any distribution 00024 of the source. 00025 00026 4. Derivative works are permitted, but they must carry this notice 00027 in full and credit the original source. 00028 00029 00030 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00031 00032 00033 @version Initial version, October 2004 00034 @author Kris 00035 00036 Note that this package and documentation is built around the ICU 00037 project (http://oss.software.ibm.com/icu/). Below is the license 00038 statement as specified by that software: 00039 00040 00041 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00042 00043 00044 ICU License - ICU 1.8.1 and later 00045 00046 COPYRIGHT AND PERMISSION NOTICE 00047 00048 Copyright (c) 1995-2003 International Business Machines Corporation and 00049 others. 00050 00051 All rights reserved. 
00052 00053 Permission is hereby granted, free of charge, to any person obtaining a 00054 copy of this software and associated documentation files (the 00055 "Software"), to deal in the Software without restriction, including 00056 without limitation the rights to use, copy, modify, merge, publish, 00057 distribute, and/or sell copies of the Software, and to permit persons 00058 to whom the Software is furnished to do so, provided that the above 00059 copyright notice(s) and this permission notice appear in all copies of 00060 the Software and that both the above copyright notice(s) and this 00061 permission notice appear in supporting documentation. 00062 00063 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 00064 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00065 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 00066 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 00067 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL 00068 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING 00069 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, 00070 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION 00071 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 00072 00073 Except as contained in this notice, the name of a copyright holder 00074 shall not be used in advertising or otherwise to promote the sale, use 00075 or other dealings in this Software without prior written authorization 00076 of the copyright holder. 00077 00078 ---------------------------------------------------------------------- 00079 00080 All trademarks and registered trademarks mentioned herein are the 00081 property of their respective owners. 
00082 00083 *******************************************************************************/ 00084 00085 module mango.icu.UString; 00086 00087 private import mango.icu.ICU, 00088 mango.icu.UChar, 00089 mango.icu.ULocale; 00090 00091 /******************************************************************************* 00092 00093 *******************************************************************************/ 00094 00095 private extern (C) void memmove (void* dst, void* src, uint bytes); 00096 00097 /******************************************************************************* 00098 00099 Bind to the IReadable and IWritable interfaces if we're building 00100 along with the mango.io package 00101 00102 *******************************************************************************/ 00103 00104 version (Mango) 00105 { 00106 private import mango.icu.UMango; 00107 00108 private import mango.io.model.IReader, 00109 mango.io.model.IWriter; 00110 00111 private interface ITextOther : IWritable {} 00112 private interface IStringOther : IReadable {} 00113 } 00114 else 00115 { 00116 private interface ITextOther {} 00117 private interface IStringOther {} 00118 } 00119 00120 00121 /******************************************************************************* 00122 00123 UString is a string class that stores Unicode characters directly 00124 and provides similar functionality as the Java String class. 00125 00126 In ICU, a Unicode string consists of 16-bit Unicode code units. 00127 A Unicode character may be stored with either one code unit — 00128 which is the most common case — or with a matched pair of 00129 special code units ("surrogates"). The data type for code units 00130 is UChar. 00131 00132 For single-character handling, a Unicode character code point is 00133 a value in the range 0..0x10ffff. ICU uses the UChar32 type for 00134 code points. 00135 00136 Indexes and offsets into and lengths of strings always count code 00137 units, not code points. 
This is the same as with multi-byte char*
        strings in traditional string handling. Operations on partial
        strings typically do not test for code point boundaries. If necessary,
        the user needs to take care of such boundaries by testing for the code
        unit values or by using functions like getChar32Start()
        and getChar32Limit()

        UString methods are more lenient with regard to input parameter values
        than other ICU APIs. In particular:

        - If indexes are out of bounds for a UString object (< 0 or > length)
          then they are "pinned" to the nearest boundary.

        - If primitive string pointer values (e.g., const wchar* or char*) for
          input strings are null, then those input string parameters are treated
          as if they pointed to an empty string. However, this is not the case
          for char* parameters for charset names or other IDs.

*******************************************************************************/

class UString : UText, IStringOther
{
        // friendlier names for the append and indexed-assignment operators
        alias opCat             append;
        alias opIndexAssign     setCharAt;

        /***********************************************************************

                Construct an empty, mutable UString whose backing buffer
                is pre-sized to 'space' code units.

        ***********************************************************************/

        this (uint space = 0)
        {
                mutable = true;
                content.length = space;
        }

        /***********************************************************************

                Construct a UString over the given array by delegating
                to setTo(): with 'mutable' true the array is duplicated,
                otherwise it is aliased.

        ***********************************************************************/

        this (wchar[] content, bool mutable = true)
        {
                setTo (content, mutable);
        }

        /***********************************************************************

                Construct a UString from the valid content of a UText.
                Unlike the other constructors, 'mutable' defaults to
                false here, so the text is aliased rather than copied.

        ***********************************************************************/

        this (UText other, bool mutable = false)
        {
                this (other.get, mutable);
        }

        /***********************************************************************

                Construct a UString from the valid content of another
                UString. The default ('mutable' true) takes a private
                copy; pass false to alias the other string's storage.

        ***********************************************************************/

        this (UString other, bool mutable = true)
        {
                this (other.get, mutable);
        }

        /***********************************************************************

                Support for reading content via the IO system

        ***********************************************************************/

        version (Mango)
        {
                /***************************************************************

                        Internal adapter to handle loading and conversion
                        of UString content. Once constructed, this may be
                        used as the target for an IReader. Alternatively,
                        invoke the load() method with an IBuffer of choice.
***************************************************************/

                class UStringDecoder : StringDecoder16
                {
                        // the string being populated
                        private UString s;

                        // bind a decoder (converter + byte budget) to a UString
                        this (UConverter c, uint bytes, UString s)
                        {
                                super (c, bytes);
                                this.s = s;
                        }

                        // IReadable hook: decode from the reader's buffer
                        protected void read (IReader r)
                        {
                                load (r.getBuffer);
                        }

                        // Decode from 'b' into the target's buffer until
                        // toGo reports nothing remaining, then publish the
                        // number of code units produced as the new length.
                        //
                        // NOTE(review): expand() consults s.len, which is
                        // only updated after the loop. If the buffer fills
                        // while toGo is still non-zero the buffer may not
                        // grow -- confirm against StringDecoder16.read().
                        package void load (IBuffer b)
                        {
                                uint filled = super.read (b, s.content);

                                for (; toGo; )
                                    {
                                    s.expand (toGo);
                                    filled += super.read (b, s.content[filled..length]);
                                    }

                                s.len = filled;
                        }
                }

                /***************************************************************

                        Construct a UString with 'contentLength' units of
                        space, then fill it from 'buffer' through a
                        UStringDecoder using the given converter.

                ***************************************************************/

                this (IBuffer buffer, uint contentLength, UConverter cvt)
                {
                        this (contentLength);

                        UStringDecoder decoder = new UStringDecoder (cvt, contentLength, this);
                        decoder.load (buffer);
                }

                /***************************************************************

                        Populate this UString from an IReader using the
                        reader's default wchar handling (which can be
                        altered by binding a StringDecoder to the IReader
                        in use; see UMango for details).

                        The array handed back by the reader may be a slice
                        of an IO buffer. Resizing a D array does not
                        detect a slice that starts at offset 0 of its
                        pool, which is where IBuffer slices may come
                        from, so the allocator is asked directly whether
                        the content may be mutated. UText does not need
                        this since it is a read-only construct.

                ***************************************************************/

                void read (IReader r)
                {
                        r.get (content);
                        mutable = r.getAllocator.isMutable (content);
                        len = content.length;
                }

                /***************************************************************

                        Return a streaming decoder that can be used to
                        populate this UString with a specified number of
                        input bytes.

                        This differs from the above read() method in the
                        way content is read: in the above case, exactly
                        the specified number of wchar elements will be
                        converted from the input, whereas in this case
                        a variable number of wchar elements are converted
                        until 'bytes' have been read from the input. This
                        is useful in those cases where the original number
                        of elements has been lost, and only the resultant
                        converted byte-count remains (a la HTTP).

                        The returned StringDecoder is one-shot only. You may
                        reuse it (both the converter and the byte count) via
                        its reset() method.

                        One applies the resultant converter directly with an
                        IReader like so:

                        @code
                        UString s = ...;
                        IReader r = ...;

                        // r >> s.createDecoder(cvt, bytes);
                        r.get (s.createDecoder(cvt, bytes));
                        @endcode

                        which will read the specified number of bytes from
                        the input and convert them to an appropriate number
                        of wchars within the UString.
***************************************************************/

                StringDecoder createDecoder (UConverter c, uint bytes)
                {
                        return new UStringDecoder (c, bytes, this);
                }
        }

        /***********************************************************************

                Append the valid content of another text to this UString

        ***********************************************************************/

        UString opCat (UText other)
        {
                return opCat (other.get);
        }

        /***********************************************************************

                Append a section of another text to this UString. The
                range is first pinned by the other text's pinIndices().

        ***********************************************************************/

        UString opCat (UText other, uint start, uint len=uint.max)
        {
                other.pinIndices (start, len);

                wchar[] section = other.content [start..start+len];
                return opCat (section);
        }

        /***********************************************************************

                Append one code unit to this UString

        ***********************************************************************/

        UString opCat (wchar chr)
        {
                return opCat (&chr, 1);
        }

        /***********************************************************************

                Append an array of code units to this UString

        ***********************************************************************/

        UString opCat (wchar[] chars)
        {
                return opCat (chars, chars.length);
        }

        /***********************************************************************

                Append UTF-8 text, converting it to UTF-16 on the way in
                via u_strFromUTF8 and the shared format() helper.

        ***********************************************************************/

        UString opCat (char[] chars)
        {
                // formatter callback: convert into the space offered by
                // format() and report the number of units u_strFromUTF8
                // wrote into its length out-parameter
                uint utf8ToUtf16 (wchar* dst, uint len, inout Error e)
                {
                        uint produced;

                        u_strFromUTF8 (dst, len, &produced, chars, chars.length, e);
                        return produced;
                }

                // reserve one unit per input byte up front
                expand (chars.length);
                return format (&utf8ToUtf16, "failed to append UTF char[]");
        }

        /***********************************************************************

                Fill a (pinned) section of this UString with one character,
                taking a private copy of the content first if it is
                currently not mutable.

        ***********************************************************************/

        UString setTo (wchar chr, uint start=0, uint len=uint.max)
        {
                pinIndices (start, len);

                if (! mutable)
                      realloc ();

                uint end = start + len;
                content [start..end] = chr;
                return this;
        }

        /***********************************************************************

                Replace the content with the provided array. When
                'mutable' is true the array is duplicated; otherwise
                it is aliased as-is.

        ***********************************************************************/

        UString setTo (wchar[] chars, bool mutable = true)
        {
                len = chars.length;
                this.mutable = mutable;

                if (mutable)
                    content = chars.dup;
                else
                   content = chars;

                return this;
        }

        /***********************************************************************

                Replace the content with the valid content of another
                text; 'mutable' has the same meaning as for the array
                variant of setTo().

        ***********************************************************************/

        UString setTo (UText other, bool mutable = true)
        {
                return setTo (other.get, mutable);
        }

        /***********************************************************************

                Replace the content of this UString. If the new content
                is immutable (read-only) then you might consider setting the
                'mutable' parameter to false.
Doing so will avoid allocating
                heap-space for the content until it is modified via one of
                these methods.

        ***********************************************************************/

        UString setTo (UText other, uint start, uint len, bool mutable = true)
        {
                other.pinIndices (start, len);
                return setTo (other.content [start..start+len], mutable);
        }

        /***********************************************************************

                Replace the character at the specified location. The
                in-contract raises an exception when 'index' is not
                below the current length; immutable content is copied
                before being written.

        ***********************************************************************/

        final UString opIndexAssign (wchar chr, uint index)
        in {
                if (index >= len)
                    exception ("index of out bounds");
           }
        body
        {
                if (! mutable)
                      realloc ();
                content [index] = chr;
                return this;
        }

        /***********************************************************************

                Remove a piece of this UString. The range is pinned
                first; any text following the removed range is moved
                down to close the gap and the length shrinks by the
                number of units removed.

        ***********************************************************************/

        UString remove (uint start, uint length=uint.max)
        {
                pinIndices (start, length);

                if (length)
                   {
                   if (start >= len)
                       truncate (start);
                   else
                      {
                      if (! mutable)
                            realloc ();

                      uint i = start + length;

                      // Only move when a tail actually follows the removed
                      // range. When the range reaches the end of the text
                      // there is nothing to move, and forming &content[i]
                      // with i == content.length would index one past the
                      // end of the array.
                      if (i < len)
                          memmove (&content[start], &content[i], (len-i) * wchar.sizeof);
                      len -= length;
                      }
                   }
                return this;
        }

        /***********************************************************************

                Truncate the length of this UString.
***********************************************************************/

        UString truncate (uint length=0)
        {
                // only ever shrinks; a larger value is ignored
                if (length <= len)
                    len = length;
                return this;
        }

        /***********************************************************************

                Insert 'count' leading pad characters in this UString

        ***********************************************************************/

        UString padLeading (uint count, wchar padChar = 0x0020)
        {
                expand (count);

                // Shift the existing text right by 'count' units. Skipped
                // for empty text: nothing needs to move, and &content[count]
                // could otherwise index one past the end of the buffer.
                if (len)
                    memmove (&content[count], content, len * wchar.sizeof);

                len += count;
                return setTo (padChar, 0, count);
        }

        /***********************************************************************

                Append 'length' trailing pad characters to this UString.

        ***********************************************************************/

        UString padTrailing (uint length, wchar padChar = 0x0020)
        {
                expand (length);
                len += length;
                return setTo (padChar, len-length, length);
        }

        /***********************************************************************

                Ensure the buffer is privately owned and has room for
                'count' additional code units beyond the current length.

                Content that is not mutable is always reallocated, even
                when the aliased array happens to have spare room (for
                example after truncate()); otherwise the callers of this
                method would write straight into the aliased array.

        ***********************************************************************/

        package final void expand (uint count)
        {
                if (! mutable || (len + count) > content.length)
                      realloc (count);
        }

        /***********************************************************************

                Allocate memory due to a change in the content. We handle
                the distinction between mutable and immutable here: a
                mutable buffer is resized in place, whereas immutable
                (aliased) content is copied into a fresh buffer which
                then becomes mutable. The new size is the old buffer
                length plus 'count', rounded up to a multiple of 64.

        ***********************************************************************/

        private final void realloc (uint count = 0)
        {
                uint size = (content.length + count + 63) & ~63;

                if (mutable)
                    content.length = size;
                else
                   {
                   mutable = true;
                   wchar[] x = content;
                   content = new wchar [size];

                   // copy the valid text only: 'len' can be smaller than
                   // the aliased array, and a D slice copy requires both
                   // sides to have the same length
                   if (len)
                       content[0..len] = x[0..len];
                   }
        }

        /***********************************************************************

                Internal method to support UString appending: copies
                'count' code units from 'chars' onto the end of the text.

        ***********************************************************************/

        private final UString opCat (wchar* chars, uint count)
        {
                expand (count);
                content[len..len+count] = chars[0..count];
                len += count;
                return this;
        }

        /***********************************************************************

                Internal method to support formatting into this UString.
                This is used by many of the ICU wrappers to append content
                into a UString.

                The formatter is handed the free space following the
                current text. While it reports a BufferOverflow the
                buffer is expanded by the length it returned and the
                call is retried. Any other error raises an exception
                carrying 'msg'; on success the returned length is added
                to the text length.

        ***********************************************************************/

        typedef uint delegate (wchar* dst, uint len, inout Error e) Formatter;

        package final UString format (Formatter format, char[] msg)
        {
                Error   e;
                uint    length;

                // take a private copy first, as the other mutators do, so
                // the formatter never writes into aliased storage
                if (! mutable)
                      realloc ();

                while (true)
                      {
                      e = e.OK;

                      // pointer arithmetic rather than &content[len]: when
                      // the buffer is exactly full, indexing at 'len' is out
                      // of bounds, whereas a zero-capacity destination lets
                      // the formatter report the space it requires
                      length = format (cast(wchar*) content + len, content.length - len, e);
                      if (e == e.BufferOverflow)
                          expand (length);
                      else
                         break;
                      }

                if (isError (e))
                    exception (msg);

                len += length;
                return this;
        }
}


/*******************************************************************************

        Immutable (read-only) text -- use UString for mutable strings.
*******************************************************************************/

class UText : ICU, ITextOther
{
        alias opIndex   charAt;

        // the core of the UText and UString attributes. The name 'len'
        // is used rather than the more obvious 'length' since there is
        // a collision with the silly array[length] syntactic sugar ...
        // Note that 'content' may be longer than 'len'; only the first
        // 'len' code units are valid text.
        package uint    len;
        package wchar[] content;

        // this should probably be in UString only, but there seems to
        // be a compiler bug where it doesn't get initialised correctly,
        // and it's perhaps useful to have here for when a UString is
        // passed as a UText argument.
        private bool    mutable;

        // toFolded() argument
        public enum     CaseOption
                        {
                        Default  = 0,
                        SpecialI = 1
                        }

        /***********************************************************************

                Hidden constructor

        ***********************************************************************/

        private this ()
        {
        }

        /***********************************************************************

                Construct read-only wrapper around the given content.
                The array is aliased, not copied.

        ***********************************************************************/

        this (wchar[] content)
        {
                this.content = content;
                this.len = content.length;
        }

        /***********************************************************************

                Support for writing via the Mango IO subsystem

        ***********************************************************************/

        version (Mango)
        {
                void write (IWriter w)
                {
                        w.opShlw (get);
                }
        }

        /***********************************************************************

                Return the valid content from this UText, that is the
                first 'len' code units of the backing array.

        ***********************************************************************/

        final package wchar[] get ()
        {
                return content [0..len];
        }

        /***********************************************************************

                Is this UText equal to another? Objects that are not
                UText instances never compare equal.

        ***********************************************************************/

        final override int opEquals (Object o)
        {
                UText other = cast(UText) o;

                if (other)
                    return (other is this || compare (other) == 0);
                return 0;
        }

        /***********************************************************************

                Compare this UText to another. An object that is not
                a UText yields 1.

        ***********************************************************************/

        final override int opCmp (Object o)
        {
                UText other = cast(UText) o;

                if (other is this)
                    return 0;

                if (other)
                    return compare (other);
                return 1;
        }

        /***********************************************************************

                Hash the valid content of this UText

        ***********************************************************************/

        final override uint toHash ()
        {
                // getHash wants the address of an array, so give the
                // slice a name rather than taking the address of a
                // temporary
                wchar[] valid = content [0..len];
                return typeid(wchar[]).getHash (&valid);
        }

        /***********************************************************************

                Clone this UText into a UString. Only the valid text is
                cloned; any spare capacity past 'len' in the backing
                array is not part of the copy.

        ***********************************************************************/

        final UString copy ()
        {
                return new UString (get);
        }

        /***********************************************************************

                Clone a (pinned) section of this UText into a UString

        ***********************************************************************/

        final UString extract (uint start, uint len=uint.max)
        {
                pinIndices (start, len);
                return new UString (content[start..start+len]);
        }

/***********************************************************************

                Count unicode code points in the length UChar code units of
                the string. A code point may occupy either one or two UChar
                code units. Counting code points involves reading all code
                units.

        ***********************************************************************/

        final uint codePoints (uint start=0, uint length=uint.max)
        {
                pinIndices (start, length);

                // pointer arithmetic rather than &content[start]: for an
                // empty text, or when 'start' sits at the very end of the
                // buffer, indexing at 'start' would be out of bounds
                return u_countChar32 (cast(wchar*) content + start, length);
        }

        /***********************************************************************

                Return an indication whether or not there are surrogate pairs
                within the string: true when the number of code points in
                the (pinned) range differs from its number of code units.

        ***********************************************************************/

        final bool hasSurrogates (uint start=0, uint length=uint.max)
        {
                // pin here as well, so that 'length' holds the real number
                // of code units before it is compared with the count
                pinIndices (start, length);
                return codePoints (start, length) != length;
        }

        /***********************************************************************

                Return the character at the specified position. The
                in-contract raises an exception when 'index' is not
                below the current length.

        ***********************************************************************/

        final wchar opIndex (uint index)
        in {
                if (index >= len)
                    exception ("index of out bounds");
           }
        body
        {
                return content [index];
        }

        /***********************************************************************

                Return the length of the valid content

        ***********************************************************************/

        final uint length ()
        {
                return len;
        }

        /***********************************************************************

                The comparison can be done in code unit order or in code
                point order.
They differ only in UTF-16 when comparing
                supplementary code points (U+10000..U+10ffff) to BMP code
                points near the end of the BMP (i.e., U+e000..U+ffff).

                In code unit order, high BMP code points sort after
                supplementary code points because they are stored as
                pairs of surrogates which are at U+d800..U+dfff.

        ***********************************************************************/

        final int compare (UText other, bool codePointOrder=false)
        {
                return compare (other.get, codePointOrder);
        }

        /***********************************************************************

                The comparison can be done in code unit order or in code
                point order. They differ only in UTF-16 when comparing
                supplementary code points (U+10000..U+10ffff) to BMP code
                points near the end of the BMP (i.e., U+e000..U+ffff).

                In code unit order, high BMP code points sort after
                supplementary code points because they are stored as
                pairs of surrogates which are at U+d800..U+dfff.

        ***********************************************************************/

        final int compare (wchar[] other, bool codePointOrder=false)
        {
                return u_strCompare (content, len, other, other.length, codePointOrder);
        }

        /***********************************************************************

                Compare the valid content of this UText with that of
                another using the case-folding comparison selected by
                'option'.

                Only the other text's valid content takes part: its
                backing array may be longer than its length, and any
                code units past that length are not text.

        ***********************************************************************/

        final int compareFolded (UText other, CaseOption option = CaseOption.Default)
        {
                return compareFolded (other.get, option);
        }

        /***********************************************************************

                Compare the valid content of this UText with the given
                array. The work is delegated to the three-argument
                compareFolded overload, which lies outside this section
                (presumably a case-insensitive compare -- confirm there).

        ***********************************************************************/

        final int compareFolded (wchar[] other, CaseOption option = CaseOption.Default)
        {
                return compareFolded (get, other, option);
        }

        /***********************************************************************

                Does this UText start with specified string?

        ***********************************************************************/

        final bool startsWith (UText other)
        {
                return startsWith (other.get);
        }

        /***********************************************************************

                Does this UText start with specified string? The leading
                units are matched via compareFolded rather than compare.

        ***********************************************************************/

        final bool startsWith (wchar[] chars)
        {
                if (len >= chars.length)
                    return compareFolded (content[0..chars.length], chars) == 0;
                return false;
        }

        /***********************************************************************

                Does this UText end with specified string?
***********************************************************************/

        final bool endsWith (UText other)
        {
                return endsWith (other.get);
        }

        /***********************************************************************

                Does this UText end with specified string? The trailing
                units are matched via compareFolded rather than compare.

        ***********************************************************************/

        final bool endsWith (wchar[] chars)
        {
                if (len >= chars.length)
                    return compareFolded (content[len-chars.length..len], chars) == 0;
                return false;
        }

        /***********************************************************************

                Find the first occurrence of a BMP code point in a string.
                A surrogate code point is found only if its match in the
                text is not part of a surrogate pair.

                The search begins at the (pinned) 'start' offset. Returns
                the offset of the match from the beginning of the text,
                or uint.max when there is no match.

        ***********************************************************************/

        final uint indexOf (wchar c, uint start=0)
        {
                pinIndex (start);

                // pointer arithmetic rather than &content[start]: for an
                // empty text, or when 'start' sits at the very end of the
                // buffer, indexing at 'start' would be out of bounds
                wchar* s = u_memchr (cast(wchar*) content + start, c, len-start);
                if (s)
                    return s - cast(wchar*) content;
                return uint.max;
        }

        /***********************************************************************

                Find the first occurrence of a substring in a string.

                The substring is found at code point boundaries. That means
                that if the substring begins with a trail surrogate or ends
                with a lead surrogate, then it is found only if these
                surrogates stand alone in the text. Otherwise, the substring
                edge units would be matched against halves of surrogate pairs.
***********************************************************************/

        final uint indexOf (UText other, uint start=0)
        {
                return indexOf (other.get, start);
        }

        /***********************************************************************

                Find the first occurrence of a substring in a string.

                The search begins at offset 'start', which is pinned to
                the length of the text. Returns the offset of the match
                relative to the beginning of the text, or uint.max where
                the substring is not found.

                The substring is found at code point boundaries. That means
                that if the substring begins with a trail surrogate or ends
                with a lead surrogate, then it is found only if these
                surrogates stand alone in the text. Otherwise, the substring
                edge units would be matched against halves of surrogate pairs.

        ***********************************************************************/

        final uint indexOf (wchar[] chars, uint start=0)
        {
                pinIndex (start);

                // use pointer arithmetic instead of &content[start]: the
                // latter indexes one past the last element when start has
                // been pinned to the end of the content array (including
                // any search upon an empty text), which fails the array
                // bounds check
                wchar* base = cast(wchar*) content;
                wchar* s = u_strFindFirst (base + start, len-start, chars, chars.length);
                if (s)
                    return s - base;
                return uint.max;
        }

        /***********************************************************************

                Find the last occurrence of a BMP code point in a string.
                A surrogate code point is found only if its match in the
                text is not part of a surrogate pair.

                Only the first 'start' code units of the text are
                searched; 'start' is pinned to the length of the text,
                so the default of uint.max covers the whole text.
                Returns the offset of the match, or uint.max where the
                code point is not found.

        ***********************************************************************/

        final uint lastIndexOf (wchar c, uint start=uint.max)
        {
                pinIndex (start);

                wchar* s = u_memrchr (content, c, start);
                if (s)
                    return s - cast(wchar*) content;
                return uint.max;
        }

        /***********************************************************************

                Find the last occurrence of a substring in a string.

                The substring is found at code point boundaries. That means
                that if the substring begins with a trail surrogate or ends
                with a lead surrogate, then it is found only if these
                surrogates stand alone in the text. Otherwise, the substring
                edge units would be matched against halves of surrogate pairs.
***********************************************************************/

        final uint lastIndexOf (UText other, uint start=uint.max)
        {
                // same search, expressed upon the content of the other text
                wchar[] pattern = other.get;
                return lastIndexOf (pattern, start);
        }

        /***********************************************************************

                Locate the final occurrence of a substring.

                Only the first 'start' code units of the text are handed
                to the search; 'start' is pinned to the length of the
                text, so the default of uint.max covers the whole text.
                Returns the offset of the match, or uint.max where the
                substring is not found.

                Matches are made at code point boundaries: a substring
                which begins with a trail surrogate, or ends with a lead
                surrogate, is found only where those surrogates stand
                alone in the text. Otherwise the edge units of the
                substring would be matched against halves of surrogate
                pairs.

        ***********************************************************************/

        final uint lastIndexOf (wchar[] chars, uint start=uint.max)
        {
                pinIndex (start);

                wchar* match = u_strFindLast (content, start, chars, chars.length);
                if (match is null)
                    return uint.max;

                return match - cast(wchar*) content;
        }

        /***********************************************************************

                Lowercase the characters into a separate UString, using
                ULocale.Default as the locale.

                Casing is locale-dependent and context-sensitive. The
                result may be longer or shorter than the original.

                Note that the return value refers to the provided
                destination UString.

        ***********************************************************************/

        final UString toLower (UString dst)
        {
                return toLower (dst, ULocale.Default);
        }

        /***********************************************************************

                Lowercase the characters into a separate UString.

                Casing is locale-dependent and context-sensitive. The
                result may be longer or shorter than the original.

                Note that the return value refers to the provided destination
                UString.
***********************************************************************/

        final UString toLower (UString dst, inout ULocale locale)
        {
                // conversion callback handed to dst.format(); it writes the
                // lowercased form of this text into the buffer it is given
                uint convert (wchar* target, uint capacity, inout Error status)
                {
                        return u_strToLower (target, capacity, content, len, toString(locale.name), status);
                }

                // request room for the text plus some slack up front, since
                // the converted text may be longer than the original
                dst.expand (len + 32);
                return dst.format (&convert, "toLower() failed");
        }

        /***********************************************************************

                Uppercase the characters into a separate UString, using
                ULocale.Default as the locale.

                Casing is locale-dependent and context-sensitive. The
                result may be longer or shorter than the original.

                Note that the return value refers to the provided
                destination UString.

        ***********************************************************************/

        final UString toUpper (UString dst)
        {
                return toUpper (dst, ULocale.Default);
        }

        /***********************************************************************

                Uppercase the characters into a separate UString, using
                the casing rules of the given locale.

                Casing is locale-dependent and context-sensitive. The
                result may be longer or shorter than the original.

                Note that the return value refers to the provided
                destination UString.

        ***********************************************************************/

        final UString toUpper (UString dst, inout ULocale locale)
        {
                // conversion callback handed to dst.format(); it writes the
                // uppercased form of this text into the buffer it is given
                uint convert (wchar* target, uint capacity, inout Error status)
                {
                        return u_strToUpper (target, capacity, content, len, toString(locale.name), status);
                }

                // request room for the text plus some slack up front, since
                // the converted text may be longer than the original
                dst.expand (len + 32);
                return dst.format (&convert, "toUpper() failed");
        }

        /***********************************************************************

                Case-fold the characters into a separate UString.
01155 01156 Case-folding is locale-independent and not context-sensitive, 01157 but there is an option for whether to include or exclude 01158 mappings for dotted I and dotless i that are marked with 'I' 01159 in CaseFolding.txt. The result may be longer or shorter than 01160 the original. 01161 01162 Note that the return value refers to the provided destination 01163 UString. 01164 01165 ***********************************************************************/ 01166 01167 final UString toFolded (UString dst, CaseOption option = CaseOption.Default) 01168 { 01169 uint fold (wchar* dst, uint length, inout Error e) 01170 { 01171 return u_strFoldCase (dst, length, content, len, option, e); 01172 } 01173 01174 dst.expand (len + 32); 01175 return dst.format (&fold, "toFolded() failed"); 01176 } 01177 01178 /*********************************************************************** 01179 01180 Converts a sequence of wchar (UTF-16) to UTF-8 bytes. If 01181 the output array is not provided, an array of appropriate 01182 size will be allocated and returned. Where the output is 01183 provided, it must be large enough to hold potentially four 01184 bytes per character for surrogate-pairs or three bytes per 01185 character for BMP only. Consider using UConverter where 01186 streaming conversions are required. 01187 01188 Returns an array slice representing the valid UTF8 content. 01189 01190 ***********************************************************************/ 01191 01192 final char[] toUtf8 (char[] dst = null) 01193 { 01194 uint x; 01195 Error e; 01196 01197 if (! cast(char*) dst) 01198 dst = new char[len * 4]; 01199 01200 u_strToUTF8 (dst, dst.length, &x, content, len, e); 01201 testError (e, "failed to convert to UTF8"); 01202 return dst [0..x]; 01203 } 01204 01205 /*********************************************************************** 01206 01207 Remove leading and trailing whitespace from this UText. 01208 Note that we slice the content to remove leading space. 
01209 01210 ***********************************************************************/ 01211 01212 UText trim () 01213 { 01214 wchar c; 01215 uint i = len; 01216 01217 // cut off trailing white space 01218 while (i && ((c = charAt(i-1)) == 0x20 || UChar.isWhiteSpace (c))) 01219 --i; 01220 len = i; 01221 01222 // now remove leading whitespace 01223 for (i=0; i < len && ((c = charAt(i)) == 0x20 || UChar.isWhiteSpace (c)); ++i) {} 01224 if (i) 01225 { 01226 len -= i; 01227 content = content[i..length-i]; 01228 } 01229 01230 return this; 01231 } 01232 01233 /*********************************************************************** 01234 01235 Unescape a string of characters and write the resulting 01236 Unicode characters to the destination buffer. The following 01237 escape sequences are recognized: 01238 01239 uhhhh 4 hex digits; h in [0-9A-Fa-f] 01240 Uhhhhhhhh 8 hex digits 01241 xhh 1-2 hex digits 01242 x{h...} 1-8 hex digits 01243 ooo 1-3 octal digits; o in [0-7] 01244 cX control-X; X is masked with 0x1F 01245 01246 as well as the standard ANSI C escapes: 01247 01248 a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A, 01249 v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B, 01250 \\" =U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C 01251 01252 Anything else following a backslash is generically escaped. 01253 For example, "[a\\-z]" returns "[a-z]". 01254 01255 If an escape sequence is ill-formed, this method returns an 01256 empty string. An example of an ill-formed sequence is "\\u" 01257 followed by fewer than 4 hex digits. 01258 01259 ***********************************************************************/ 01260 01261 final UString unEscape () 01262 { 01263 UString result = new UString (len); 01264 for (uint i=0; i < len;) 01265 { 01266 dchar c = charAt(i++); 01267 if (c == 0x005C) 01268 { 01269 // bump index ... 01270 c = u_unescapeAt (&_charAt, &i, len, cast(void*) this); 01271 01272 // error? 
01273 if (c == 0xFFFFFFFF) 01274 { 01275 result.truncate (); // return empty string 01276 break; // invalid escape sequence 01277 } 01278 } 01279 result.append (c); 01280 } 01281 return result; 01282 } 01283 01284 /*********************************************************************** 01285 01286 Is this code point a surrogate (U+d800..U+dfff)? 01287 01288 ***********************************************************************/ 01289 01290 final static bool isSurrogate (wchar c) 01291 { 01292 return (c & 0xfffff800) == 0xd800; 01293 } 01294 01295 /*********************************************************************** 01296 01297 Is this code unit a lead surrogate (U+d800..U+dbff)? 01298 01299 ***********************************************************************/ 01300 01301 final static bool isLeading (wchar c) 01302 { 01303 return (c & 0xfffffc00) == 0xd800; 01304 } 01305 01306 /*********************************************************************** 01307 01308 Is this code unit a trail surrogate (U+dc00..U+dfff)? 01309 01310 ***********************************************************************/ 01311 01312 final static bool isTrailing (wchar c) 01313 { 01314 return (c & 0xfffffc00) == 0xdc00; 01315 } 01316 01317 /*********************************************************************** 01318 01319 Adjust a random-access offset to a code point boundary 01320 at the start of a code point. If the offset points to 01321 the trail surrogate of a surrogate pair, then the offset 01322 is decremented. Otherwise, it is not modified. 
01323 01324 ***********************************************************************/ 01325 01326 final uint getCharStart (uint i) 01327 in { 01328 if (i >= len) 01329 exception ("index of out bounds"); 01330 } 01331 body 01332 { 01333 if (isTrailing (content[i]) && i && isLeading (content[i-1])) 01334 --i; 01335 return i; 01336 } 01337 01338 /*********************************************************************** 01339 01340 Adjust a random-access offset to a code point boundary 01341 after a code point. If the offset is behind the lead 01342 surrogate of a surrogate pair, then the offset is 01343 incremented. Otherwise, it is not modified. 01344 01345 ***********************************************************************/ 01346 01347 final uint getCharLimit (uint i) 01348 in { 01349 if (i >= len) 01350 exception ("index of out bounds"); 01351 } 01352 body 01353 { 01354 if (i && isLeading(content[i-1]) && isTrailing (content[i])) 01355 ++i; 01356 return i; 01357 } 01358 01359 /*********************************************************************** 01360 01361 Callback for C unescapeAt() function 01362 01363 ***********************************************************************/ 01364 01365 extern (C) 01366 { 01367 typedef wchar function (uint offset, void* context) CharAt; 01368 01369 private static wchar _charAt (uint offset, void* context) 01370 { 01371 return (cast(UString) context).charAt (offset); 01372 } 01373 } 01374 01375 /*********************************************************************** 01376 01377 Pin the given index to a valid position. 01378 01379 ***********************************************************************/ 01380 01381 final private void pinIndex (inout uint x) 01382 { 01383 if (x > len) 01384 x = len; 01385 } 01386 01387 /*********************************************************************** 01388 01389 Pin the given index and length to a valid position. 
01390 01391 ***********************************************************************/ 01392 01393 final private void pinIndices (inout uint start, inout uint length) 01394 { 01395 if (start > len) 01396 start = len; 01397 01398 if (length > (len - start)) 01399 length = len - start; 01400 } 01401 01402 /*********************************************************************** 01403 01404 Helper for comparison methods 01405 01406 ***********************************************************************/ 01407 01408 final private int compareFolded (wchar[] s1, wchar[] s2, CaseOption option = CaseOption.Default) 01409 { 01410 Error e; 01411 01412 int x = u_strCaseCompare (s1, s1.length, s2, s2.length, option, e); 01413 testError (e, "compareFolded failed"); 01414 return x; 01415 } 01416 01417 01418 /*********************************************************************** 01419 01420 Bind the ICU functions from a shared library. This is 01421 complicated by the issues regarding D and DLLs on the 01422 Windows platform 01423 01424 ***********************************************************************/ 01425 01426 private static void* library; 01427 01428 /*********************************************************************** 01429 01430 ***********************************************************************/ 01431 01432 private static extern (C) 01433 { 01434 wchar* function (wchar*, uint, wchar*, uint) u_strFindFirst; 01435 wchar* function (wchar*, uint, wchar*, uint) u_strFindLast; 01436 wchar* function (wchar*, wchar, uint) u_memchr; 01437 wchar* function (wchar*, wchar, uint) u_memrchr; 01438 int function (wchar*, uint, wchar*, uint, bool) u_strCompare; 01439 int function (wchar*, uint, wchar*, uint, uint, inout Error) u_strCaseCompare; 01440 dchar function (CharAt, uint*, uint, void*) u_unescapeAt; 01441 uint function (wchar*, uint) u_countChar32; 01442 uint function (wchar*, uint, wchar*, uint, char*, inout Error) u_strToUpper; 01443 uint function (wchar*, uint, 
wchar*, uint, char*, inout Error) u_strToLower; 01444 uint function (wchar*, uint, wchar*, uint, uint, inout Error) u_strFoldCase; 01445 wchar* function (wchar*, uint, uint*, char*, uint, inout Error) u_strFromUTF8; 01446 char* function (char*, uint, uint*, wchar*, uint, inout Error) u_strToUTF8; 01447 } 01448 01449 /*********************************************************************** 01450 01451 ***********************************************************************/ 01452 01453 static FunctionLoader.Bind[] targets = 01454 [ 01455 {cast(void**) &u_strFindFirst, "u_strFindFirst"}, 01456 {cast(void**) &u_strFindLast, "u_strFindLast"}, 01457 {cast(void**) &u_memchr, "u_memchr"}, 01458 {cast(void**) &u_memrchr, "u_memrchr"}, 01459 {cast(void**) &u_strCompare, "u_strCompare"}, 01460 {cast(void**) &u_strCaseCompare, "u_strCaseCompare"}, 01461 {cast(void**) &u_unescapeAt, "u_unescapeAt"}, 01462 {cast(void**) &u_countChar32, "u_countChar32"}, 01463 {cast(void**) &u_strToUpper, "u_strToUpper"}, 01464 {cast(void**) &u_strToLower, "u_strToLower"}, 01465 {cast(void**) &u_strFoldCase, "u_strFoldCase"}, 01466 {cast(void**) &u_strFromUTF8, "u_strFromUTF8"}, 01467 {cast(void**) &u_strToUTF8, "u_strToUTF8"}, 01468 ]; 01469 01470 /*********************************************************************** 01471 01472 ***********************************************************************/ 01473 01474 static this () 01475 { 01476 library = FunctionLoader.bind (icuuc, targets); 01477 //test (); 01478 } 01479 01480 /*********************************************************************** 01481 01482 ***********************************************************************/ 01483 01484 static ~this () 01485 { 01486 FunctionLoader.unbind (library); 01487 } 01488 01489 /*********************************************************************** 01490 01491 ***********************************************************************/ 01492 01493 private static void test() 01494 { 01495 UString s = new 
UString (r"aaaqw \uabcd eaaa"); 01496 char[] x = "dssfsdff"; 01497 s ~ x ~ x; 01498 wchar c = s[3]; 01499 s[3] = 'Q'; 01500 int y = s.indexOf ("qwe"); 01501 s.unEscape (); 01502 s.toUpper (new UString); 01503 s.padLeading(2).padTrailing(2).trim(); 01504 } 01505 }