00001 /******************************************************************************* 00002 00003 @file UString.d 00004 00005 Copyright (c) 2004 Kris Bell 00006 00007 This software is provided 'as-is', without any express or implied 00008 warranty. In no event will the authors be held liable for damages 00009 of any kind arising from the use of this software. 00010 00011 Permission is hereby granted to anyone to use this software for any 00012 purpose, including commercial applications, and to alter it and/or 00013 redistribute it freely, subject to the following restrictions: 00014 00015 1. The origin of this software must not be misrepresented; you must 00016 not claim that you wrote the original software. If you use this 00017 software in a product, an acknowledgment within documentation of 00018 said product would be appreciated but is not required. 00019 00020 2. Altered source versions must be plainly marked as such, and must 00021 not be misrepresented as being the original software. 00022 00023 3. This notice may not be removed or altered from any distribution 00024 of the source. 00025 00026 4. Derivative works are permitted, but they must carry this notice 00027 in full and credit the original source. 00028 00029 00030 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00031 00032 00033 @version Initial version, October 2004 00034 @author Kris 00035 00036 Note that this package and documentation is built around the ICU 00037 project (http://oss.software.ibm.com/icu/). Below is the license 00038 statement as specified by that software: 00039 00040 00041 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00042 00043 00044 ICU License - ICU 1.8.1 and later 00045 00046 COPYRIGHT AND PERMISSION NOTICE 00047 00048 Copyright (c) 1995-2003 International Business Machines Corporation and 00049 others. 00050 00051 All rights reserved. 
00052 00053 Permission is hereby granted, free of charge, to any person obtaining a 00054 copy of this software and associated documentation files (the 00055 "Software"), to deal in the Software without restriction, including 00056 without limitation the rights to use, copy, modify, merge, publish, 00057 distribute, and/or sell copies of the Software, and to permit persons 00058 to whom the Software is furnished to do so, provided that the above 00059 copyright notice(s) and this permission notice appear in all copies of 00060 the Software and that both the above copyright notice(s) and this 00061 permission notice appear in supporting documentation. 00062 00063 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 00064 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00065 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 00066 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 00067 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL 00068 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING 00069 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, 00070 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION 00071 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 00072 00073 Except as contained in this notice, the name of a copyright holder 00074 shall not be used in advertising or otherwise to promote the sale, use 00075 or other dealings in this Software without prior written authorization 00076 of the copyright holder. 00077 00078 ---------------------------------------------------------------------- 00079 00080 All trademarks and registered trademarks mentioned herein are the 00081 property of their respective owners. 
00082 00083 *******************************************************************************/ 00084 00085 module mango.icu.UString; 00086 00087 private import mango.icu.ICU, 00088 mango.icu.UChar, 00089 mango.icu.ULocale; 00090 00091 /******************************************************************************* 00092 00093 *******************************************************************************/ 00094 00095 private extern (C) void memmove (void* dst, void* src, uint bytes); 00096 00097 /******************************************************************************* 00098 00099 Bind to the IReadable and IWritable interfaces if we're building 00100 along with the mango.io package 00101 00102 *******************************************************************************/ 00103 00104 version (Isolated) 00105 { 00106 private interface ITextOther {} 00107 private interface IStringOther {} 00108 } 00109 else 00110 { 00111 private import mango.icu.UMango; 00112 00113 private import mango.io.model.IReader, 00114 mango.io.model.IWriter; 00115 00116 private interface ITextOther : IWritable {} 00117 private interface IStringOther : IReadable {} 00118 } 00119 00120 00121 /******************************************************************************* 00122 00123 UString is a string class that stores Unicode characters directly 00124 and provides similar functionality as the Java String class. 00125 00126 In ICU, a Unicode string consists of 16-bit Unicode code units. 00127 A Unicode character may be stored with either one code unit — 00128 which is the most common case — or with a matched pair of 00129 special code units ("surrogates"). The data type for code units 00130 is UChar. 00131 00132 For single-character handling, a Unicode character code point is 00133 a value in the range 0..0x10ffff. ICU uses the UChar32 type for 00134 code points. 00135 00136 Indexes and offsets into and lengths of strings always count code 00137 units, not code points. 
This is the same as with multi-byte char*
        strings in traditional string handling. Operations on partial
        strings typically do not test for code point boundaries. If necessary,
        the user needs to take care of such boundaries by testing for the code
        unit values or by using functions like getChar32Start()
        and getChar32Limit().

        UString methods are more lenient with regard to input parameter values
        than other ICU APIs. In particular:

        - If indexes are out of bounds for a UString object (< 0 or > length)
          then they are "pinned" to the nearest boundary.

        - If primitive string pointer values (e.g., const wchar* or char*) for
          input strings are null, then those input string parameters are treated
          as if they pointed to an empty string. However, this is not the case
          for char* parameters for charset names or other IDs.

        Implementation notes: 'content' is the backing array and 'len' is
        the number of valid code units within it (len <= content.length).
        When 'mutable' is false the backing array is aliased from the
        caller and must be copied (see realloc) before it is written to.

*******************************************************************************/

class UString : UText, IStringOther
{
        alias opCat             append;
        alias opIndexAssign     setCharAt;

        /***********************************************************************

                Create an empty UString with the specified available space.
                The space is reserved capacity only; the length stays zero.

        ***********************************************************************/

        this (uint space = 0)
        {
                content.length = space;
                mutable = true;
        }

        /***********************************************************************

                Create a UString upon the provided content. If said content
                is immutable (read-only) then you might consider setting the
                'mutable' parameter to false. Doing so will avoid allocating
                heap-space for the content until it is modified.

        ***********************************************************************/

        this (wchar[] content, bool mutable = true)
        {
                setTo (content, mutable);
        }

        /***********************************************************************

                Create a UString via the content of a UText. Note that the
                default is to assume the content is immutable (read-only).

        ***********************************************************************/

        this (UText other, bool mutable = false)
        {
                this (other.get, mutable);
        }

        /***********************************************************************

                Create a UString via the content of a UString. If said content
                is immutable (read-only) then you might consider setting the
                'mutable' parameter to false. Doing so will avoid allocating
                heap-space for the content until it is modified via UString
                methods.

        ***********************************************************************/

        this (UString other, bool mutable = true)
        {
                this (other.get, mutable);
        }

        /***********************************************************************

                Support for reading content via the IO system

        ***********************************************************************/

        version (Isolated){}
        else
        {
                /***************************************************************

                        Internal adapter to handle loading and conversion
                        of UString content. Once constructed, this may be
                        used as the target for an IReader. Alternatively,
                        invoke the load() method with an IBuffer of choice.

                ***************************************************************/

                class UStringDecoder : StringDecoder16
                {
                        // the UString being populated
                        private UString s;

                        // construct a decoder on the given UString
                        this (UConverter c, uint bytes, UString s)
                        {
                                super (c, bytes);
                                this.s = s;
                        }

                        // IReadable adapter to perform the conversion
                        protected void read (IReader r)
                        {
                                load (r.getBuffer);
                        }

                        // Read from the provided buffer until we either
                        // have all the content, or an eof condition throws
                        // an exception. The decoded text replaces whatever
                        // the target UString held before.
                        package void load (IBuffer b)
                        {
                                // never decode into an aliased (read-only)
                                // array: expand() takes a private copy first
                                s.expand (0);

                                uint produced = super.read (b, s.content);
                                while (toGo)
                                      {
                                      // expand() measures free space from
                                      // s.len, so publish what has been
                                      // decoded before asking for more room;
                                      // otherwise a full buffer never grows
                                      s.len = produced;
                                      s.expand (toGo);
                                      produced += super.read (b, s.content[produced..s.content.length]);
                                      }
                                s.len = produced;
                        }
                }

                /***************************************************************

                        Another constructor for loading known content length
                        into a UString.

                ***************************************************************/

                this (IBuffer buffer, uint contentLength, UConverter cvt)
                {
                        this (contentLength);
                        UStringDecoder sd = new UStringDecoder (cvt, contentLength, this);
                        sd.load (buffer);
                }

                /***************************************************************

                        Read as many bytes from the input as is necessary
                        to produce the expected number of wchar elements.
                        This uses the default wchar handler, which can be
                        altered by binding a StringDecoder to the IReader
                        in use (see UMango for details).

                        We're mutable, so ensure we don't mess with the
                        IO buffers. Interestingly, changing the length
                        of a D array will account for slice assignments
                        (it checks the pointer to see if it's a starting
                        point in the pool). Unfortunately, that doesn't
                        catch the case where a slice starts at offset 0,
                        which is where IBuffer slices may come from.

                        To be safe, we ask the allocator in use whether
                        the content it provided can be mutated or not.
                        Note that this is not necessary for UText, since
                        that is a read-only construct.

                ***************************************************************/

                void read (IReader r)
                {
                        r.get (content);
                        len = content.length;
                        mutable = r.getAllocator.isMutable (content);
                }

                /***************************************************************

                        Return a streaming decoder that can be used to
                        populate this UString with a specified number of
                        input bytes.

                        This differs from the above read() method in the
                        way content is read: in the above case, exactly
                        the specified number of wchar elements will be
                        converted from the input, whereas in this case
                        a variable number of wchar elements are converted
                        until 'bytes' have been read from the input. This
                        is useful in those cases where the original number
                        of elements has been lost, and only the resultant
                        converted byte-count remains (a la HTTP).

                        The returned StringDecoder is one-shot only. You may
                        reuse it (both the converter and the byte count) via
                        its reset() method.

                        One applies the resultant converter directly with an
                        IReader like so:

                        @code
                        UString s = ...;
                        IReader r = ...;

                        // r >> s.createDecoder(cvt, bytes);
                        r.get (s.createDecoder(cvt, bytes));
                        @endcode

                        which will read the specified number of bytes from
                        the input and convert them to an appropriate number
                        of wchars within the UString.

                ***************************************************************/

                StringDecoder createDecoder (UConverter c, uint bytes)
                {
                        return new UStringDecoder (c, bytes, this);
                }
        }

        /***********************************************************************

                Append text to this UString

        ***********************************************************************/

        UString opCat (UText other)
        {
                return opCat (other.get);
        }

        /***********************************************************************

                Append partial text to this UString. The indices are
                pinned against 'other' before slicing.

        ***********************************************************************/

        UString opCat (UText other, uint start, uint len=uint.max)
        {
                other.pinIndices (start, len);
                return opCat (other.content [start..start+len]);
        }

        /***********************************************************************

                Append a single character to this UString

        ***********************************************************************/

        UString opCat (wchar chr)
        {
                return opCat (&chr, 1);
        }

        /***********************************************************************

                Append text to this UString

        ***********************************************************************/

        UString opCat (wchar[] chars)
        {
                return opCat (chars, chars.length);
        }

        /***********************************************************************

                Converts a sequence of UTF-8 bytes to UChars (UTF-16)
                and appends the result to this UString.

        ***********************************************************************/

        UString opCat (char[] chars)
        {
                // adapter handed to format(): converts into the free tail
                // of the buffer and reports the number of UChars produced
                uint fmt (wchar* dst, uint len, inout Error e)
                {
                        uint x;

                        u_strFromUTF8 (dst, len, &x, chars, chars.length, e);
                        return x;
                }

                // initial guess at the space needed; format() retries
                // with a larger buffer should the converter overflow
                expand (chars.length);
                return format (&fmt, "failed to append UTF char[]");
        }

        /***********************************************************************

                Set a section of this UString to the specified character.
                The section is pinned to the valid content first.

        ***********************************************************************/

        UString setTo (wchar chr, uint start=0, uint len=uint.max)
        {
                pinIndices (start, len);
                if (! mutable)
                      realloc ();
                content [start..start+len] = chr;
                return this;
        }

        /***********************************************************************

                Set the content to the provided array. Parameter 'mutable'
                specifies whether the given array is likely to change. If
                not, the array is aliased until such time this UString is
                altered.

        ***********************************************************************/

        UString setTo (wchar[] chars, bool mutable = true)
        {
                len = chars.length;
                if ((this.mutable = mutable) == true)
                     content = chars.dup;
                else
                   content = chars;
                return this;
        }

        /***********************************************************************

                Replace the content of this UString. If the new content
                is immutable (read-only) then you might consider setting the
                'mutable' parameter to false. Doing so will avoid allocating
                heap-space for the content until it is modified via one of
                these methods.

        ***********************************************************************/

        UString setTo (UText other, bool mutable = true)
        {
                return setTo (other.get, mutable);
        }

        /***********************************************************************

                Replace the content of this UString with a section of
                another text. If the new content is immutable (read-only)
                then you might consider setting the 'mutable' parameter to
                false. Doing so will avoid allocating heap-space for the
                content until it is modified via one of these methods.

        ***********************************************************************/

        UString setTo (UText other, uint start, uint len, bool mutable = true)
        {
                other.pinIndices (start, len);
                return setTo (other.content [start..start+len], mutable);
        }

        /***********************************************************************

                Replace the character at the specified location. The
                index is validated by the in-contract only.

        ***********************************************************************/

        final UString opIndexAssign (wchar chr, uint index)
        in {
                if (index >= len)
                    exception ("index out of bounds");
           }
        body
        {
                if (! mutable)
                      realloc ();
                content [index] = chr;
                return this;
        }

        /***********************************************************************

                Remove a piece of this UString. The range is pinned to
                the valid content before anything is removed.

        ***********************************************************************/

        UString remove (uint start, uint length=uint.max)
        {
                pinIndices (start, length);
                if (length)
                   {
                   // first code unit following the removed span
                   uint i = start + length;

                   if (i >= len)
                       // the span reaches the end of the text: there is
                       // no tail to shift down, and indexing content[i]
                       // would be out of bounds
                       truncate (start);
                   else
                      {
                      if (! mutable)
                            realloc ();

                      memmove (&content[start], &content[i], (len-i) * wchar.sizeof);
                      len -= length;
                      }
                   }
                return this;
        }

        /***********************************************************************

                Truncate the length of this UString. A length beyond
                the current one is ignored.

        ***********************************************************************/

        UString truncate (uint length=0)
        {
                if (length <= len)
                    len = length;
                return this;
        }

        /***********************************************************************

                Insert leading spaces in this UString

        ***********************************************************************/

        UString padLeading (uint count, wchar padChar = 0x0020)
        {
                expand (count);

                // take the pointer only after expand(), which may have
                // moved the buffer; pointer arithmetic avoids indexing
                // one past the end when len is zero
                wchar* p = cast(wchar*) content;
                memmove (p + count, p, len * wchar.sizeof);
                len += count;
                return setTo (padChar, 0, count);
        }

        /***********************************************************************

                Append some trailing spaces to this UString.

        ***********************************************************************/

        UString padTrailing (uint length, wchar padChar = 0x0020)
        {
                expand (length);
                len += length;
                return setTo (padChar, len-length, length);
        }

        /***********************************************************************

                Check for available space within the buffer, and expand
                as necessary. Aliased (immutable) content is always
                replaced with a private copy, since every caller is about
                to write into the buffer.

        ***********************************************************************/

        package final void expand (uint count)
        {
                if (! mutable || (len + count) > content.length)
                      realloc (count);
        }

        /***********************************************************************

                Allocate memory due to a change in the content. We handle
                the distinction between mutable and immutable here: a
                mutable buffer is resized in place, whereas an aliased
                one is left untouched and its valid part is copied into a
                fresh array. Capacity is rounded up to a multiple of 64.

        ***********************************************************************/

        private final void realloc (uint count = 0)
        {
                uint size = (content.length + count + 63) & ~63;

                if (mutable)
                    content.length = size;
                else
                   {
                   mutable = true;
                   wchar[] x = content;
                   content = new wchar [size];

                   // copy the valid part only: the aliased array may be
                   // longer than len (e.g. after truncate) and a slice
                   // copy requires both sides to be of equal length
                   if (len)
                       content[0..len] = x[0..len];
                   }
        }

        /***********************************************************************

                Internal method to support UString appending

        ***********************************************************************/

        private final UString opCat (wchar* chars, uint count)
        {
                expand (count);
                content[len..len+count] = chars[0..count];
                len += count;
                return this;
        }

        /***********************************************************************

                Internal method to support formatting into this UString.
                This is used by many of the ICU wrappers to append content
                into a UString.

                The formatter is handed the free tail of the buffer and
                returns the number of code units it produced (or needs).
                When it reports a buffer overflow the buffer is grown by
                the returned amount and the call is repeated. Any other
                error raises an exception carrying 'msg'.

        ***********************************************************************/

        typedef uint delegate (wchar* dst, uint len, inout Error e) Formatter;

        package final UString format (Formatter format, char[] msg)
        {
                Error   e;
                uint    length;

                // the formatter writes into the buffer, so never hand it
                // an aliased (read-only) array
                if (! mutable)
                      realloc ();

                while (true)
                      {
                      e = e.OK;

                      // pointer arithmetic rather than &content[len]: the
                      // latter is out of bounds when the buffer is full,
                      // which is exactly the case the retry handles
                      length = format (cast(wchar*) content + len, content.length - len, e);
                      if (e == e.BufferOverflow)
                          expand (length);
                      else
                         break;
                      }

                if (isError (e))
                    exception (msg);

                len += length;
                return this;
        }
}


/*******************************************************************************

        Immutable (read-only) text -- use UString for mutable strings.
00652 00653 *******************************************************************************/ 00654 00655 class UText : ICU, ITextOther 00656 { 00657 alias opIndex charAt; 00658 00659 // the core of the UText and UString attributes. The name 'len' 00660 // is used rather than the more obvious 'length' since there is 00661 // a collision with the silly array[length] syntactic sugar ... 00662 package uint len; 00663 package wchar[] content; 00664 00665 // this should probably be in UString only, but there seems to 00666 // be a compiler bug where it doesn't get initialised correctly, 00667 // and it's perhaps useful to have here for when a UString is 00668 // passed as a UText argument. 00669 private bool mutable; 00670 00671 // toFolded() argument 00672 public enum CaseOption 00673 { 00674 Default = 0, 00675 SpecialI = 1 00676 } 00677 00678 /*********************************************************************** 00679 00680 Hidden constructor 00681 00682 ***********************************************************************/ 00683 00684 private this () 00685 { 00686 } 00687 00688 /*********************************************************************** 00689 00690 Construct read-only wrapper around the given content 00691 00692 ***********************************************************************/ 00693 00694 this (wchar[] content) 00695 { 00696 this.content = content; 00697 this.len = content.length; 00698 } 00699 00700 /*********************************************************************** 00701 00702 Support for writing via the Mango IO subsystem 00703 00704 ***********************************************************************/ 00705 00706 version (Isolated){} 00707 else 00708 { 00709 void write (IWriter w) 00710 { 00711 w.put (get); 00712 } 00713 } 00714 00715 /*********************************************************************** 00716 00717 Return the valid content from this UText 00718 00719 
***********************************************************************/ 00720 00721 final package wchar[] get () 00722 { 00723 return content [0..len]; 00724 } 00725 00726 /*********************************************************************** 00727 00728 Is this UText equal to another? 00729 00730 ***********************************************************************/ 00731 00732 final override int opEquals (Object o) 00733 { 00734 UText other = cast(UText) o; 00735 00736 if (other) 00737 return (other is this || compare (other) == 0); 00738 return 0; 00739 } 00740 00741 /*********************************************************************** 00742 00743 Compare this UText to another. 00744 00745 ***********************************************************************/ 00746 00747 final override int opCmp (Object o) 00748 { 00749 UText other = cast(UText) o; 00750 00751 if (other is this) 00752 return 0; 00753 else 00754 if (other) 00755 return compare (other); 00756 return 1; 00757 } 00758 00759 /*********************************************************************** 00760 00761 Hash this UText 00762 00763 ***********************************************************************/ 00764 00765 final override uint toHash () 00766 { 00767 return typeid(wchar[]).getHash (&content[0..len]); 00768 } 00769 00770 /*********************************************************************** 00771 00772 Clone this UText into a UString 00773 00774 ***********************************************************************/ 00775 00776 final UString copy () 00777 { 00778 return new UString (content); 00779 } 00780 00781 /*********************************************************************** 00782 00783 Clone a section of this UText into a UString 00784 00785 ***********************************************************************/ 00786 00787 final UString extract (uint start, uint len=uint.max) 00788 { 00789 pinIndices (start, len); 00790 return new UString 
(content[start..start+len]); 00791 } 00792 00793 /*********************************************************************** 00794 00795 Count unicode code points in the length UChar code units of 00796 the string. A code point may occupy either one or two UChar 00797 code units. Counting code points involves reading all code 00798 units. 00799 00800 ***********************************************************************/ 00801 00802 final uint codePoints (uint start=0, uint length=uint.max) 00803 { 00804 pinIndices (start, length); 00805 return u_countChar32 (&content[start], length); 00806 } 00807 00808 /*********************************************************************** 00809 00810 Return an indication whether or not there are surrogate pairs 00811 within the string. 00812 00813 ***********************************************************************/ 00814 00815 final bool hasSurrogates (uint start=0, uint length=uint.max) 00816 { 00817 pinIndices (start, length); 00818 return codePoints (start, length) != length; 00819 } 00820 00821 /*********************************************************************** 00822 00823 Return the character at the specified position. 00824 00825 ***********************************************************************/ 00826 00827 final wchar opIndex (uint index) 00828 in { 00829 if (index >= len) 00830 exception ("index of out bounds"); 00831 } 00832 body 00833 { 00834 return content [index]; 00835 } 00836 00837 /*********************************************************************** 00838 00839 Return the length of the valid content 00840 00841 ***********************************************************************/ 00842 00843 final uint length () 00844 { 00845 return len; 00846 } 00847 00848 /*********************************************************************** 00849 00850 The comparison can be done in code unit order or in code 00851 point order. 
They differ only in UTF-16 when comparing 00852 supplementary code points (U+10000..U+10ffff) to BMP code 00853 points near the end of the BMP (i.e., U+e000..U+ffff). 00854 00855 In code unit order, high BMP code points sort after 00856 supplementary code points because they are stored as 00857 pairs of surrogates which are at U+d800..U+dfff. 00858 00859 ***********************************************************************/ 00860 00861 final int compare (UText other, bool codePointOrder=false) 00862 { 00863 return compare (other.get, codePointOrder); 00864 } 00865 00866 /*********************************************************************** 00867 00868 The comparison can be done in code unit order or in code 00869 point order. They differ only in UTF-16 when comparing 00870 supplementary code points (U+10000..U+10ffff) to BMP code 00871 points near the end of the BMP (i.e., U+e000..U+ffff). 00872 00873 In code unit order, high BMP code points sort after 00874 supplementary code points because they are stored as 00875 pairs of surrogates which are at U+d800..U+dfff. 00876 00877 ***********************************************************************/ 00878 00879 final int compare (wchar[] other, bool codePointOrder=false) 00880 { 00881 return u_strCompare (content, len, other, other.length, codePointOrder); 00882 } 00883 00884 /*********************************************************************** 00885 00886 The comparison can be done in UTF-16 code unit order or 00887 in code point order. They differ only when comparing 00888 supplementary code points (U+10000..U+10ffff) to BMP code 00889 points near the end of the BMP (i.e., U+e000..U+ffff). 00890 00891 In code unit order, high BMP code points sort after 00892 supplementary code points because they are stored as 00893 pairs of surrogates which are at U+d800..U+dfff. 
00894 00895 ***********************************************************************/ 00896 00897 final int compareFolded (UText other, CaseOption option = CaseOption.Default) 00898 { 00899 return compareFolded (other.content, option); 00900 } 00901 00902 /*********************************************************************** 00903 00904 The comparison can be done in UTF-16 code unit order or 00905 in code point order. They differ only when comparing 00906 supplementary code points (U+10000..U+10ffff) to BMP code 00907 points near the end of the BMP (i.e., U+e000..U+ffff). 00908 00909 In code unit order, high BMP code points sort after 00910 supplementary code points because they are stored as 00911 pairs of surrogates which are at U+d800..U+dfff. 00912 00913 ***********************************************************************/ 00914 00915 final int compareFolded (wchar[] other, CaseOption option = CaseOption.Default) 00916 { 00917 return compareFolded (get, other, option); 00918 } 00919 00920 /*********************************************************************** 00921 00922 Does this UText start with specified string? 00923 00924 ***********************************************************************/ 00925 00926 final bool startsWith (UText other) 00927 { 00928 return startsWith (other.get); 00929 } 00930 00931 /*********************************************************************** 00932 00933 Does this UText start with specified string? 00934 00935 ***********************************************************************/ 00936 00937 final bool startsWith (wchar[] chars) 00938 { 00939 if (len >= chars.length) 00940 return compareFolded (content[0..chars.length], chars) == 0; 00941 return false; 00942 } 00943 00944 /*********************************************************************** 00945 00946 Does this UText end with specified string? 
00947 00948 ***********************************************************************/ 00949 00950 final bool endsWith (UText other) 00951 { 00952 return endsWith (other.get); 00953 } 00954 00955 /*********************************************************************** 00956 00957 Does this UText end with specified string? 00958 00959 ***********************************************************************/ 00960 00961 final bool endsWith (wchar[] chars) 00962 { 00963 if (len >= chars.length) 00964 return compareFolded (content[len-chars.length..len], chars) == 0; 00965 return false; 00966 } 00967 00968 /*********************************************************************** 00969 00970 Find the first occurrence of a BMP code point in a string. 00971 A surrogate code point is found only if its match in the 00972 text is not part of a surrogate pair. 00973 00974 ***********************************************************************/ 00975 00976 final uint indexOf (wchar c, uint start=0) 00977 { 00978 pinIndex (start); 00979 wchar* s = u_memchr (&content[start], c, len-start); 00980 if (s) 00981 return s - cast(wchar*) content; 00982 return uint.max; 00983 } 00984 00985 /*********************************************************************** 00986 00987 Find the first occurrence of a substring in a string. 00988 00989 The substring is found at code point boundaries. That means 00990 that if the substring begins with a trail surrogate or ends 00991 with a lead surrogate, then it is found only if these 00992 surrogates stand alone in the text. Otherwise, the substring 00993 edge units would be matched against halves of surrogate pairs. 
***********************************************************************/

        final uint indexOf (UText other, uint start=0)
        {
                return indexOf (other.get, start);
        }

        /***********************************************************************

                Find the first occurrence of a substring in a string.

                The substring is found at code point boundaries. That means
                that if the substring begins with a trail surrogate or ends
                with a lead surrogate, then it is found only if these
                surrogates stand alone in the text. Otherwise, the substring
                edge units would be matched against halves of surrogate pairs.

                The search begins at 'start', which is pinned to the
                text length. Returns the index of the match, or
                uint.max where there is none.

        ***********************************************************************/

        final uint indexOf (wchar[] chars, uint start=0)
        {
                pinIndex (start);

                // use pointer arithmetic rather than &content[start]: the
                // pinned start may equal the array length (an empty text,
                // for instance), where indexing fails the bounds check
                wchar* base = cast(wchar*) content;
                wchar* hit  = u_strFindFirst (base + start, len - start, chars, chars.length);

                if (hit)
                    return hit - base;
                return uint.max;
        }

        /***********************************************************************

                Find the last occurrence of a BMP code point in a string.
                A surrogate code point is found only if its match in the
                text is not part of a surrogate pair.

                Only the first 'start' code units are searched; 'start'
                is pinned to the text length. Returns the index of the
                match, or uint.max where there is none.

        ***********************************************************************/

        final uint lastIndexOf (wchar c, uint start=uint.max)
        {
                pinIndex (start);

                wchar* base = cast(wchar*) content;
                wchar* hit  = u_memrchr (base, c, start);

                if (hit)
                    return hit - base;
                return uint.max;
        }

        /***********************************************************************

                Find the last occurrence of a substring in a string.

                The substring is found at code point boundaries. That means
                that if the substring begins with a trail surrogate or ends
                with a lead surrogate, then it is found only if these
                surrogates stand alone in the text. Otherwise, the substring
                edge units would be matched against halves of surrogate pairs.

***********************************************************************/

        final uint lastIndexOf (UText other, uint start=uint.max)
        {
                return lastIndexOf (other.get, start);
        }

        /***********************************************************************

                Find the last occurrence of a substring in a string.

                The substring is found at code point boundaries. That means
                that if the substring begins with a trail surrogate or ends
                with a lead surrogate, then it is found only if these
                surrogates stand alone in the text. Otherwise, the substring
                edge units would be matched against halves of surrogate pairs.

                Only the first 'start' code units are searched; 'start'
                is pinned to the text length. Returns the index of the
                match, or uint.max where there is none.

        ***********************************************************************/

        final uint lastIndexOf (wchar[] chars, uint start=uint.max)
        {
                pinIndex (start);

                wchar* hit = u_strFindLast (content, start, chars, chars.length);
                if (hit is null)
                    return uint.max;

                return hit - cast(wchar*) content;
        }

        /***********************************************************************

                Lowercase the characters into a separate UString, using
                ULocale.Default.

                Casing is locale-dependent and context-sensitive. The
                result may be longer or shorter than the original.

                Note that the return value refers to the provided destination
                UString.

        ***********************************************************************/

        final UString toLower (UString dst)
        {
                return toLower (dst, ULocale.Default);
        }

        /***********************************************************************

                Lowercase the characters into a separate UString.

                Casing is locale-dependent and context-sensitive. The
                result may be longer or shorter than the original.

                Note that the return value refers to the provided destination
                UString.

***********************************************************************/

        final UString toLower (UString dst, inout ULocale locale)
        {
                // conversion callback handed to dst.format(); it reads the
                // text and the locale from the enclosing scope
                uint convert (wchar* target, uint capacity, inout Error status)
                {
                        return u_strToLower (target, capacity, content, len, toString(locale.name), status);
                }

                // make room for a result somewhat longer than the source
                dst.expand (len + 32);
                return dst.format (&convert, "toLower() failed");
        }

        /***********************************************************************

                Uppercase the characters into a separate UString, using
                ULocale.Default.

                Casing is locale-dependent and context-sensitive. The
                result may be longer or shorter than the original.

                Note that the return value refers to the provided destination
                UString.

        ***********************************************************************/

        final UString toUpper (UString dst)
        {
                return toUpper (dst, ULocale.Default);
        }

        /***********************************************************************

                Uppercase the characters into a separate UString.

                Casing is locale-dependent and context-sensitive. The
                result may be longer or shorter than the original.

                Note that the return value refers to the provided destination
                UString.

        ***********************************************************************/

        final UString toUpper (UString dst, inout ULocale locale)
        {
                // conversion callback handed to dst.format(); it reads the
                // text and the locale from the enclosing scope
                uint convert (wchar* target, uint capacity, inout Error status)
                {
                        return u_strToUpper (target, capacity, content, len, toString(locale.name), status);
                }

                // make room for a result somewhat longer than the source
                dst.expand (len + 32);
                return dst.format (&convert, "toUpper() failed");
        }

        /***********************************************************************

                Case-fold the characters into a separate UString.

Case-folding is locale-independent and not context-sensitive,
                but there is an option for whether to include or exclude
                mappings for dotted I and dotless i that are marked with 'I'
                in CaseFolding.txt. The result may be longer or shorter than
                the original.

                Note that the return value refers to the provided destination
                UString.

        ***********************************************************************/

        final UString toFolded (UString dst, CaseOption option = CaseOption.Default)
        {
                // conversion callback handed to dst.format(); it reads the
                // text and the folding option from the enclosing scope
                uint convert (wchar* target, uint capacity, inout Error status)
                {
                        return u_strFoldCase (target, capacity, content, len, option, status);
                }

                // make room for a result somewhat longer than the source
                dst.expand (len + 32);
                return dst.format (&convert, "toFolded() failed");
        }

        /***********************************************************************

                Converts a sequence of wchar (UTF-16) to UTF-8 bytes. If
                the output array is not provided, an array of appropriate
                size will be allocated and returned. Where the output is
                provided, it must be large enough to hold potentially four
                bytes per character for surrogate-pairs or three bytes per
                character for BMP only. Consider using UConverter where
                streaming conversions are required.

                Returns an array slice representing the valid UTF8 content.
                A conversion failure is reported through testError().

        ***********************************************************************/

        final char[] toUtf8 (char[] dst = null)
        {
                Error status;
                uint  produced;

                // no storage supplied: allocate four bytes per code unit
                char* supplied = cast(char*) dst;
                if (! supplied)
                    dst = new char[len * 4];

                u_strToUTF8 (dst, dst.length, &produced, content, len, status);
                testError (status, "failed to convert to UTF8");

                return dst [0..produced];
        }

        /***********************************************************************

                Remove leading and trailing whitespace from this UText.
                Note that we slice the content to remove leading space.

***********************************************************************/

        UText trim ()
        {
                wchar c;
                uint  end = len;

                // cut off trailing white space by lowering the length
                while (end && ((c = charAt(end-1)) == 0x20 || UChar.isWhiteSpace (c)))
                       --end;
                len = end;

                // count the leading white space
                uint lead = 0;
                while (lead < len && ((c = charAt(lead)) == 0x20 || UChar.isWhiteSpace (c)))
                       ++lead;

                if (lead)
                   {
                   len -= lead;

                   // drop the leading units only, keeping everything behind
                   // them. The previous upper bound (array length minus the
                   // lead) also removed units from the end of the array,
                   // which could leave content shorter than len
                   content = content[lead .. content.length];
                   }

                return this;
        }

        /***********************************************************************

                Unescape a string of characters and write the resulting
                Unicode characters to the destination buffer. The following
                escape sequences are recognized:

                  \\uhhhh       4 hex digits; h in [0-9A-Fa-f]
                  \\Uhhhhhhhh   8 hex digits
                  \\xhh         1-2 hex digits
                  \\x{h...}     1-8 hex digits
                  \\ooo         1-3 octal digits; o in [0-7]
                  \\cX          control-X; X is masked with 0x1F

                as well as the standard ANSI C escapes:

                  \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
                  \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
                  \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C

                Anything else following a backslash is generically escaped.
                For example, "[a\\-z]" returns "[a-z]".

                If an escape sequence is ill-formed, this method returns an
                empty string. An example of an ill-formed sequence is "\\u"
                followed by fewer than 4 hex digits.

                The result is a newly created UString; this text is
                not modified.

        ***********************************************************************/

        final UString unEscape ()
        {
                UString result = new UString (len);
                uint    i = 0;

                while (i < len)
                      {
                      dchar c = charAt (i++);

                      // backslash: let ICU decode the escape, which also
                      // advances the index past the sequence
                      if (c == 0x005C)
                         {
                         c = u_unescapeAt (&_charAt, &i, len, cast(void*) this);

                         // invalid escape sequence: return an empty string
                         if (c == 0xFFFFFFFF)
                            {
                            result.truncate ();
                            break;
                            }
                         }

                      result.append (c);
                      }

                return result;
        }

        /***********************************************************************

                Is this code point a surrogate (U+d800..U+dfff)?

        ***********************************************************************/

        final static bool isSurrogate (wchar c)
        {
                return (c & 0xfffff800) == 0xd800;
        }

        /***********************************************************************

                Is this code unit a lead surrogate (U+d800..U+dbff)?

        ***********************************************************************/

        final static bool isLeading (wchar c)
        {
                return (c & 0xfffffc00) == 0xd800;
        }

        /***********************************************************************

                Is this code unit a trail surrogate (U+dc00..U+dfff)?

        ***********************************************************************/

        final static bool isTrailing (wchar c)
        {
                return (c & 0xfffffc00) == 0xdc00;
        }

        /***********************************************************************

                Adjust a random-access offset to a code point boundary
                at the start of a code point. If the offset points to
                the trail surrogate of a surrogate pair, then the offset
                is decremented. Otherwise, it is not modified.

***********************************************************************/

        final uint getCharStart (uint i)
        in {
           // the offset must address an existing code unit
           if (i >= len)
               exception ("index out of bounds");
           }
        body
        {
                if (isTrailing (content[i]) && i && isLeading (content[i-1]))
                    --i;
                return i;
        }

        /***********************************************************************

                Adjust a random-access offset to a code point boundary
                after a code point. If the offset is behind the lead
                surrogate of a surrogate pair, then the offset is
                incremented. Otherwise, it is not modified.

                The contract requires the offset to be less than the
                text length.

        ***********************************************************************/

        final uint getCharLimit (uint i)
        in {
           // the offset must address an existing code unit
           if (i >= len)
               exception ("index out of bounds");
           }
        body
        {
                if (i && isLeading(content[i-1]) && isTrailing (content[i]))
                    ++i;
                return i;
        }

        /***********************************************************************

                Callback for C unescapeAt() function: returns the code
                unit at 'offset' of the text passed as 'context'.

                NOTE(review): unEscape() passes 'this' as the context,
                and the cast back assumes charAt() can be reached
                through a UString reference - confirm for instances
                that are plain UText.

        ***********************************************************************/

        extern (C)
        {
                typedef wchar function (uint offset, void* context) CharAt;

                private static wchar _charAt (uint offset, void* context)
                {
                        return (cast(UString) context).charAt (offset);
                }
        }

        /***********************************************************************

                Pin the given index to a valid position: values beyond
                the text length are replaced by the length itself.

        ***********************************************************************/

        final private void pinIndex (inout uint x)
        {
                if (x > len)
                    x = len;
        }

        /***********************************************************************

                Pin the given index and length to a valid position.

***********************************************************************/

        final private void pinIndices (inout uint start, inout uint length)
        {
                // the start may not lie beyond the end of the text ...
                if (start > len)
                    start = len;

                // ... and the span may not reach beyond it either
                uint available = len - start;
                if (length > available)
                    length = available;
        }

        /***********************************************************************

                Helper for comparison methods: runs u_strCaseCompare()
                over the two arrays with the given option and reports
                a failure through testError().

        ***********************************************************************/

        final private int compareFolded (wchar[] s1, wchar[] s2, CaseOption option = CaseOption.Default)
        {
                Error status;

                int outcome = u_strCaseCompare (s1, s1.length, s2, s2.length, option, status);
                testError (status, "compareFolded failed");

                return outcome;
        }


        /***********************************************************************

                Bind the ICU functions from a shared library. This is
                complicated by the issues regarding D and DLLs on the
                Windows platform

        ***********************************************************************/

        private static void* library;

        /***********************************************************************

                Function pointers for the ICU routines used by this
                class; they are filled in by the static constructor.

        ***********************************************************************/

        private static extern (C)
        {
                wchar* function (wchar*, uint, wchar*, uint) u_strFindFirst;
                wchar* function (wchar*, uint, wchar*, uint) u_strFindLast;
                wchar* function (wchar*, wchar, uint) u_memchr;
                wchar* function (wchar*, wchar, uint) u_memrchr;
                int    function (wchar*, uint, wchar*, uint, bool) u_strCompare;
                int    function (wchar*, uint, wchar*, uint, uint, inout Error) u_strCaseCompare;
                dchar  function (CharAt, uint*, uint, void*) u_unescapeAt;
                uint   function (wchar*, uint) u_countChar32;
                uint   function (wchar*, uint, wchar*, uint, char*, inout Error) u_strToUpper;
                uint   function (wchar*, uint, wchar*, uint, char*, inout Error) u_strToLower;
                uint   function (wchar*, uint, wchar*, uint, uint, inout Error) u_strFoldCase;
                wchar* function (wchar*, uint, uint*, char*, uint, inout Error) u_strFromUTF8;
                char*  function (char*, uint, uint*, wchar*, uint, inout Error) u_strToUTF8;
        }

        /***********************************************************************

                Pairs each function pointer above with the name of the
                symbol it is bound to.

        ***********************************************************************/

        static  FunctionLoader.Bind[] targets =
                [
                {cast(void**) &u_strFindFirst,   "u_strFindFirst"},
                {cast(void**) &u_strFindLast,    "u_strFindLast"},
                {cast(void**) &u_memchr,         "u_memchr"},
                {cast(void**) &u_memrchr,        "u_memrchr"},
                {cast(void**) &u_strCompare,     "u_strCompare"},
                {cast(void**) &u_strCaseCompare, "u_strCaseCompare"},
                {cast(void**) &u_unescapeAt,     "u_unescapeAt"},
                {cast(void**) &u_countChar32,    "u_countChar32"},
                {cast(void**) &u_strToUpper,     "u_strToUpper"},
                {cast(void**) &u_strToLower,     "u_strToLower"},
                {cast(void**) &u_strFoldCase,    "u_strFoldCase"},
                {cast(void**) &u_strFromUTF8,    "u_strFromUTF8"},
                {cast(void**) &u_strToUTF8,      "u_strToUTF8"},
                ];

        /***********************************************************************

                Bind the targets from the icuuc library and keep the
                handle returned by the loader.

        ***********************************************************************/

        static this ()
        {
                library = FunctionLoader.bind (icuuc, targets);
                //test ();
        }

        /***********************************************************************

                Hand the library handle back to the loader.

        ***********************************************************************/

        static ~this ()
        {
                FunctionLoader.unbind (library);
        }

        /***********************************************************************

                Ad-hoc exercise of a few UString operations; only
                referenced from a commented-out call in the static
                constructor.

        ***********************************************************************/

        private static void test()
        {
                UString s = new UString (r"aaaqw \uabcd eaaa");
                char[]  tail = "dssfsdff";

                s ~ tail ~ tail;

                wchar unit = s[3];
                s[3] = 'Q';

                int where = s.indexOf ("qwe");

                s.unEscape ();
                s.toUpper (new UString);
                s.padLeading(2).padTrailing(2).trim();
        }
}