00001 /******************************************************************************* 00002 00003 @file UString.d 00004 00005 Copyright (C) 2004 Kris Bell 00006 00007 This software is provided 'as-is', without any express or implied 00008 warranty. In no event will the authors be held liable for damages 00009 of any kind arising from the use of this software. 00010 00011 Permission is hereby granted to anyone to use this software for any 00012 purpose, including commercial applications, and to alter it and/or 00013 redistribute it freely, subject to the following restrictions: 00014 00015 1. The origin of this software must not be misrepresented; you must 00016 not claim that you wrote the original software. If you use this 00017 software in a product, an acknowledgment within documentation of 00018 said product would be appreciated but is not required. 00019 00020 2. Altered source versions must be plainly marked as such, and must 00021 not be misrepresented as being the original software. 00022 00023 3. This notice may not be removed or altered from any distribution 00024 of the source. 00025 00026 00027 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00028 00029 00030 @version Initial version, October 2004 00031 @author Kris 00032 00033 Note that this package and documentation is built around the ICU 00034 project (http://oss.software.ibm.com/icu/). Below is the license 00035 statement as specified by that software: 00036 00037 00038 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00039 00040 00041 ICU License - ICU 1.8.1 and later 00042 00043 COPYRIGHT AND PERMISSION NOTICE 00044 00045 Copyright (c) 1995-2003 International Business Machines Corporation and 00046 others. 00047 00048 All rights reserved. 
00049 00050 Permission is hereby granted, free of charge, to any person obtaining a 00051 copy of this software and associated documentation files (the 00052 "Software"), to deal in the Software without restriction, including 00053 without limitation the rights to use, copy, modify, merge, publish, 00054 distribute, and/or sell copies of the Software, and to permit persons 00055 to whom the Software is furnished to do so, provided that the above 00056 copyright notice(s) and this permission notice appear in all copies of 00057 the Software and that both the above copyright notice(s) and this 00058 permission notice appear in supporting documentation. 00059 00060 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 00061 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00062 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 00063 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 00064 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL 00065 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING 00066 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, 00067 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION 00068 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 00069 00070 Except as contained in this notice, the name of a copyright holder 00071 shall not be used in advertising or otherwise to promote the sale, use 00072 or other dealings in this Software without prior written authorization 00073 of the copyright holder. 00074 00075 ---------------------------------------------------------------------- 00076 00077 All trademarks and registered trademarks mentioned herein are the 00078 property of their respective owners. 
00079 00080 *******************************************************************************/ 00081 00082 module mango.icu.UString; 00083 00084 private import mango.icu.ICU, 00085 mango.icu.UChar, 00086 mango.icu.ULocale; 00087 00088 /******************************************************************************* 00089 00090 *******************************************************************************/ 00091 00092 private extern (C) void memmove (void* dst, void* src, uint bytes); 00093 00094 /******************************************************************************* 00095 00096 Bind to the IReadable and IWritable interfaces if we're building 00097 along with the mango.io package 00098 00099 *******************************************************************************/ 00100 00101 version (Mango) 00102 { 00103 private import mango.io.model.IReader, 00104 mango.io.model.IWriter; 00105 } 00106 else 00107 { 00108 interface IReadable {}; 00109 interface IWritable {}; 00110 } 00111 00112 00113 /******************************************************************************* 00114 00115 UString is a string class that stores Unicode characters directly 00116 and provides similar functionality as the Java String class. 00117 00118 In ICU, a Unicode string consists of 16-bit Unicode code units. 00119 A Unicode character may be stored with either one code unit — 00120 which is the most common case — or with a matched pair of 00121 special code units ("surrogates"). The data type for code units 00122 is UChar. 00123 00124 For single-character handling, a Unicode character code point is 00125 a value in the range 0..0x10ffff. ICU uses the UChar32 type for 00126 code points. 00127 00128 Indexes and offsets into and lengths of strings always count code 00129 units, not code points. This is the same as with multi-byte char* 00130 strings in traditional string handling. Operations on partial 00131 strings typically do not test for code point boundaries. 
If necessary,
        the user needs to take care of such boundaries by testing for the code
        unit values or by using functions like getChar32Start()
        and getChar32Limit()

        UString methods are more lenient with regard to input parameter values
        than other ICU APIs. In particular:

        - If indexes are out of bounds for a UString object (<0 or > length)
          then they are "pinned" to the nearest boundary.

        - If primitive string pointer values (e.g., const wchar* or char*) for
          input strings are null, then those input string parameters are treated
          as if they pointed to an empty string. However, this is not the case
          for char* parameters for charset names or other IDs.

*******************************************************************************/

class UString : UText
{
        /***********************************************************************

                Create an empty UString with the specified available space

        ***********************************************************************/

        this (uint space = 0)
        {
                content.length = space;
                mutable = true;
        }

        /***********************************************************************

                Create a UString upon the provided content. If said content
                is immutable (read-only) then you might consider setting the
                'mutable' parameter to false. Doing so will avoid allocating
                heap-space for the content until it is modified via one of
                these methods.

        ***********************************************************************/

        this (wchar[] content, bool mutable = true)
        {
                setTo (content, mutable);
        }

        /***********************************************************************

                Create a UString via the content of a UText. Note that the
                default is to assume the content is immutable (read-only).

        ***********************************************************************/

        this (UText other, bool mutable = false)
        {
                this (other.get, mutable);
        }

        /***********************************************************************

                Create a UString via the content of a UString. If said content
                is immutable (read-only) then you might consider setting the
                'mutable' parameter to false. Doing so will avoid allocating
                heap-space for the content until it is modified via one of
                these methods.

        ***********************************************************************/

        this (UString other, bool mutable = true)
        {
                this (other.get, mutable);
        }

        /***********************************************************************

                Support for reading content via the IO system

        ***********************************************************************/

        version (Mango)
        {
                void read (IReader r)
                {
                        super.read (r);

                        // The content just read may be a slice of an IO
                        // buffer. Changing the length of a D array does not
                        // reliably detect a slice that starts at offset 0,
                        // which is where IO buffer slices may come from.
                        // Rather than copying eagerly, flag such content as
                        // immutable: every modifying method of this class
                        // copies the content to the heap before writing.
                        if (r.getAllocator.isReadOnly (content))
                            mutable = false;
                }
        }

        /***********************************************************************

                Append text to this UString

        ***********************************************************************/

        UString append (UText other)
        {
                return append (other.get);
        }

        /***********************************************************************

                Append partial text to this UString. The indices are pinned
                to the bounds of 'other' before use.

        ***********************************************************************/

        UString append (UText other, uint start, uint len=uint.max)
        {
                other.pinIndices (start, len);
                return append (other.content [start..start+len]);
        }

        /***********************************************************************

                Append a single character to this UString

        ***********************************************************************/

        UString append (wchar chr)
        {
                return append (&chr, 1);
        }

        /***********************************************************************

                Append text to this UString

        ***********************************************************************/

        UString append (wchar[] chars)
        {
                return append (chars, chars.length);
        }

        /***********************************************************************

                Set a section of this UString to the specified character.
                Aliased (immutable) content is copied before being written.

        ***********************************************************************/

        UString setTo (wchar chr, uint start=0, uint len=uint.max)
        {
                pinIndices (start, len);
                if (! mutable)
                      realloc ();
                content [start..start+len] = chr;
                return this;
        }

        /***********************************************************************

                Set the content to the provided array. Parameter 'mutable'
                specifies whether the given array is likely to change. If
                not, the array is aliased until such time this UString is
                altered.

        ***********************************************************************/

        UString setTo (wchar[] chars, bool mutable = true)
        {
                len = chars.length;
                if ((this.mutable = mutable) == true)
                     content = chars.dup;
                else
                   content = chars;
                return this;
        }

        /***********************************************************************

                Replace the content of this UString. If the new content
                is immutable (read-only) then you might consider setting the
                'mutable' parameter to false. Doing so will avoid allocating
                heap-space for the content until it is modified via one of
                these methods.

        ***********************************************************************/

        UString setTo (UText other, bool mutable = true)
        {
                return setTo (other.get, mutable);
        }

        /***********************************************************************

                Replace the content of this UString with a section of
                another. If the new content is immutable (read-only) then
                you might consider setting the 'mutable' parameter to false.
                Doing so will avoid allocating heap-space for the content
                until it is modified via one of these methods.

        ***********************************************************************/

        UString setTo (UText other, uint start, uint len, bool mutable = true)
        {
                other.pinIndices (start, len);
                return setTo (other.content [start..start+len], mutable);
        }

        /***********************************************************************

                Replace the character at the specified location.

        ***********************************************************************/

        UString setCharAt (uint index, wchar chr)
        {
                pinIndex (index);
                if (! mutable)
                      realloc ();
                content [index] = chr;
                return this;
        }

        /***********************************************************************

                Remove leading and trailing whitespace from this UString

        ***********************************************************************/

        UString trim ()
        {
                wchar   c;
                uint    i = len;

                // cut off trailing white space; only the length changes,
                // so no copy of aliased content is required here
                while (i && ((c = charAt(i-1)) == 0x20 || UChar.isWhiteSpace (c)))
                       --i;
                len = i;

                // now remove leading whitespace
                for (i=0; i < len && ((c = charAt(i)) == 0x20 || UChar.isWhiteSpace (c)); ++i) {}
                return remove (0, i);
        }

        /***********************************************************************

                Remove a piece of this UString. The indices are pinned to
                the current bounds before use.

        ***********************************************************************/

        UString remove (uint start, uint length=uint.max)
        {
                pinIndices (start, length);
                if (length)
                   {
                   if (start >= len)
                       truncate (start);
                   else
                      {
                      if (! mutable)
                            realloc ();

                      // slide the tail down over the removed section
                      uint i = start + length;
                      memmove (cast(wchar*)content+start, cast(wchar*)content+i, (len-i) * wchar.sizeof);
                      len -= length;
                      }
                   }
                return this;
        }

        /***********************************************************************

                Truncate the length of this UString. A length greater than
                the current one is ignored.

        ***********************************************************************/

        UString truncate (uint length=0)
        {
                if (length <= len)
                    len = length;
                return this;
        }

        /***********************************************************************

                Insert leading pad characters (spaces by default) in this
                UString

        ***********************************************************************/

        UString padLeading (uint length, wchar padChar = 0x0020)
        {
                // expand() guarantees a private buffer with enough room
                expand (length);
                memmove (cast(wchar*) content + length, content, len * wchar.sizeof);
                len += length;
                return setTo (padChar, 0, length);
        }

        /***********************************************************************

                Append some trailing pad characters (spaces by default) to
                this UString.

        ***********************************************************************/

        UString padTrailing (uint length, wchar padChar = 0x0020)
        {
                expand (length);
                len += length;
                return setTo (padChar, len-length, length);
        }

        /***********************************************************************

                Ensure the buffer can take 'count' additional code units
                and is safe to write to. The buffer is reallocated when
                there is too little room, and also when the content is
                still aliased (immutable): callers write into the buffer
                straight after this call, and must never scribble on
                memory owned by someone else.

        ***********************************************************************/

        package final void expand (uint count)
        {
                if (! mutable || (len + count) > content.length)
                      realloc (count);
        }

        /***********************************************************************

                Allocate memory due to a change in the content. We handle
                the distinction between mutable and immutable here: mutable
                content is resized in place, whereas immutable content is
                copied into a fresh heap array and this UString becomes
                mutable. The capacity is rounded up to a multiple of 64.

        ***********************************************************************/

        private final void realloc (uint count = 0)
        {
                uint size = (content.length + count + 63) & ~63;

                if (mutable)
                    content.length = size;
                else
                   {
                   mutable = true;
                   wchar[] x = content;
                   content = new wchar [size];

                   // copy the valid part only: the old array may be longer
                   // than 'len' (e.g. after truncate), and a D array copy
                   // requires both sides to have the same length
                   if (len)
                       content[0..len] = x[0..len];
                   }
        }

        /***********************************************************************

                Internal method to support UString appending

        ***********************************************************************/

        private final UString append (wchar* chars, uint count)
        {
                expand (count);
                content[len..len+count] = chars[0..count];
                len += count;
                return this;
        }

        /***********************************************************************

                Internal method to support formatting into this UString.
                This is used by many of the ICU wrappers to append content
                into a UString.

                The formatter is handed the free space following the valid
                content. Whenever it reports a buffer overflow, the buffer
                is expanded by the length it returned and the call is
                retried. Any other error raises an exception with 'msg'.

        ***********************************************************************/

        typedef uint delegate (wchar* dst, uint len, inout Error e) Formatter;

        package final void format (Formatter format, char[] msg)
        {
                Error   e;
                uint    length;

                // never hand an aliased (read-only) buffer to the formatter
                if (! mutable)
                      realloc ();

                while (true)
                      {
                      e = e.OK;
                      length = format (cast(wchar*)content + len, content.length - len, e);
                      if (e == e.BufferOverflow)
                          expand (length);
                      else
                         break;
                      }

                if (isError (e))
                    exception (msg);

                len += length;
        }
}


/*******************************************************************************

        Immutable (read-only) text -- use UString for mutable strings.
*******************************************************************************/

class UText : ICU, IReadable, IWritable
{
        // the core of the UText and UString attributes. The name 'len'
        // is used rather than the more obvious 'length' since there is
        // a collision with the silly array[length] syntactic sugar ...
        //
        // 'len' is the number of valid code units; 'content' is the
        // backing array, which may be longer than 'len' (get() returns
        // content[0..len]).
        package uint    len;
        package wchar[] content;

        // this should probably be in UString only, but there seems to
        // be a compiler bug where it doesn't get initialised correctly,
        // and it's perhaps useful to have here for when a UString is
        // passed as a UText argument.
        private bool    mutable;

        // toFolded() argument
        public enum     CaseOption
                        {
                        Default  = 0,
                        SpecialI = 1
                        }

        /***********************************************************************

                Hidden constructor: leaves content empty and len at zero

        ***********************************************************************/

        private this ()
        {
        }

        /***********************************************************************

                Construct read-only wrapper around the given content. The
                array is aliased, not copied.

        ***********************************************************************/

        this (wchar[] content)
        {
                this.content = content;
                this.len = content.length;
        }

        /***********************************************************************

                Support for reading and writing via the Mango IO subsystem

        ***********************************************************************/

        version (Mango)
        {
                // replace the content with whatever the reader supplies
                void read (IReader r)
                {
                        r.get (content);
                        len = content.length;
                }

                // emit the valid content only
                void write (IWriter w)
                {
                        w.putw (get);
                }
        }

        /***********************************************************************

                Return the valid content from this UText
***********************************************************************/

        final package wchar[] get ()
        {
                return content [0..len];
        }

        /***********************************************************************

                Is this UText equal to another? Returns non-zero when the
                valid content of both is equal, and zero otherwise (which
                includes a null or non-UText argument). A UText is always
                equal to itself.

        ***********************************************************************/

        final override int opEquals (Object o)
        {
                UText other = cast(UText) o;

                // identity implies equality
                if (other is this)
                    return 1;

                if (other is null)
                    return 0;

                return typeid(wchar[]).equals (&content[0..len], &other.content[0..other.len]);
        }

        /***********************************************************************

                Compare this UText to another. Returns zero when compared
                against itself, and 1 for a null or non-UText argument;
                otherwise the valid content of both is compared.

        ***********************************************************************/

        final override int opCmp (Object o)
        {
                UText other = cast(UText) o;

                // identity implies equal ordering
                if (other is this)
                    return 0;

                if (other is null)
                    return 1;

                return typeid(wchar[]).compare (&content[0..len], &other.content[0..other.len]);
        }

        /***********************************************************************

                Hash this UText, based upon the valid content only

        ***********************************************************************/

        final override uint toHash ()
        {
                return typeid(wchar[]).getHash (&content[0..len]);
        }

        /***********************************************************************

                Clone this UText into a UString. Only the valid content is
                cloned; unused buffer capacity is not part of the copy.

        ***********************************************************************/

        final UString copy ()
        {
                return new UString (get);
        }

        /***********************************************************************

                Clone a section of this UText into a UString. The indices
                are pinned to the current bounds before use.

        ***********************************************************************/

        final UString extract (uint start, uint len=uint.max)
        {
                pinIndices (start, len);
                return new UString (content[start..start+len]);
        }

        /***********************************************************************

                Count unicode code points in the length UChar code units of
                the string. A code point may occupy either one or two UChar
                code units. Counting code points involves reading all code
                units.

        ***********************************************************************/

        final uint codePoints (uint start=0, uint length=uint.max)
        {
                pinIndices (start, length);
                return u_countChar32 (cast(wchar*) content+start, length);
        }

        /***********************************************************************

                Return an indication whether or not there are surrogate pairs
                within the string: true when the number of code points
                differs from the number of code units in the given range.

        ***********************************************************************/

        final bool hasSurrogates (uint start=0, uint length=uint.max)
        {
                pinIndices (start, length);
                return codePoints (start, length) != length;
        }

        /***********************************************************************

                Return the character at the specified position. The index
                is not pinned.

        ***********************************************************************/

        final wchar charAt (uint index)
        {
                return content [index];
        }

        /***********************************************************************

                Return the length of the valid content

        ***********************************************************************/

        final uint length ()
        {
                return len;
        }

        /***********************************************************************

                The comparison can be done in code unit order or in code
                point order.
They differ only in UTF-16 when comparing
                supplementary code points (U+10000..U+10ffff) to BMP code
                points near the end of the BMP (i.e., U+e000..U+ffff).

                In code unit order, high BMP code points sort after
                supplementary code points because they are stored as
                pairs of surrogates which are at U+d800..U+dfff.

        ***********************************************************************/

        final uint compare (UString other, bool codePointOrder=false)
        {
                // compare against the valid content of the other string
                wchar[] rhs = other.get;

                return compare (rhs, codePointOrder);
        }

        /***********************************************************************

                Compare the valid content of this UText against the given
                array via u_strCompare.

                The comparison can be done in code unit order or in code
                point order. They differ only in UTF-16 when comparing
                supplementary code points (U+10000..U+10ffff) to BMP code
                points near the end of the BMP (i.e., U+e000..U+ffff).

                In code unit order, high BMP code points sort after
                supplementary code points because they are stored as
                pairs of surrogates which are at U+d800..U+dfff.

        ***********************************************************************/

        final uint compare (wchar[] other, bool codePointOrder=false)
        {
                return u_strCompare (content, len,
                                     other, other.length,
                                     codePointOrder);
        }

        /***********************************************************************

                Compare this UText against a UString, applying the given
                case-folding option. This hands off to the wchar[] flavour
                of compareFolded.
***********************************************************************/

        final uint compareFolded (UString other, CaseOption option = CaseOption.Default)
        {
                // use the valid content only: the backing array of a
                // UString may hold unused capacity beyond its length
                return compareFolded (other.get, option);
        }

        /***********************************************************************

                Compare the valid content of this UText against the given
                array, applying the given case-folding option. The work is
                delegated to the three-argument compareFolded.

        ***********************************************************************/

        final uint compareFolded (wchar[] other, CaseOption option = CaseOption.Default)
        {
                return compareFolded (get, other, option);
        }

        /***********************************************************************

                Does this UText start with specified string?

        ***********************************************************************/

        final bool startsWith (UText other)
        {
                return startsWith (other.get);
        }

        /***********************************************************************

                Does this UText start with specified string? The leading
                section of this text is matched using compareFolded; a
                string longer than this text never matches.

        ***********************************************************************/

        final bool startsWith (wchar[] chars)
        {
                if (len >= chars.length)
                    return compareFolded (content[0..chars.length], chars) == 0;
                return false;
        }

        /***********************************************************************

                Does this UText end with specified string?
***********************************************************************/

        final bool endsWith (UText other)
        {
                wchar[] tail = other.get;

                return endsWith (tail);
        }

        /***********************************************************************

                Does this UText end with specified string? The trailing
                section of this text is matched using compareFolded; a
                string longer than this text never matches.

        ***********************************************************************/

        final bool endsWith (wchar[] chars)
        {
                // too long to be a suffix
                if (len < chars.length)
                    return false;

                return compareFolded (content[len-chars.length..len], chars) == 0;
        }

        /***********************************************************************

                Find the first occurrence of a BMP code point in a string.
                A surrogate code point is found only if its match in the
                text is not part of a surrogate pair.

                The start index is pinned to the current bounds. Returns
                the index of the match, or -1 (as a uint) when not found.

        ***********************************************************************/

        final uint indexOf (wchar c, uint start=0)
        {
                pinIndex (start);

                wchar* hit = u_memchr (cast(wchar*)content+start, c, len-start);
                if (hit is null)
                    return -1;

                return hit - cast(wchar*) content;
        }

        /***********************************************************************

                Find the first occurrence of a substring in a string.

                The substring is found at code point boundaries. That means
                that if the substring begins with a trail surrogate or ends
                with a lead surrogate, then it is found only if these
                surrogates stand alone in the text. Otherwise, the substring
                edge units would be matched against halves of surrogate pairs.
***********************************************************************/

        final uint indexOf (UText other, uint start=0)
        {
                wchar[] pattern = other.get;

                return indexOf (pattern, start);
        }

        /***********************************************************************

                Find the first occurrence of a substring in a string.

                The substring is found at code point boundaries. That means
                that if the substring begins with a trail surrogate or ends
                with a lead surrogate, then it is found only if these
                surrogates stand alone in the text. Otherwise, the substring
                edge units would be matched against halves of surrogate pairs.

                The start index is pinned to the current bounds. Returns
                the index of the match, or -1 (as a uint) when not found.

        ***********************************************************************/

        final uint indexOf (wchar[] chars, uint start=0)
        {
                pinIndex (start);

                wchar* hit = u_strFindFirst (cast(wchar*)content+start, len-start, chars, chars.length);
                if (hit is null)
                    return -1;

                return hit - cast(wchar*) content;
        }

        /***********************************************************************

                Find the last occurrence of a BMP code point in a string.
                A surrogate code point is found only if its match in the
                text is not part of a surrogate pair.

                Only the code units before the (pinned) start index are
                searched. Returns the index of the match, or -1 (as a
                uint) when not found.

        ***********************************************************************/

        final uint lastIndexOf (wchar c, uint start=uint.max)
        {
                pinIndex (start);

                wchar* hit = u_memrchr (content, c, start);
                if (hit is null)
                    return -1;

                return hit - cast(wchar*) content;
        }

        /***********************************************************************

                Find the last occurrence of a substring in a string.

                The substring is found at code point boundaries. That means
                that if the substring begins with a trail surrogate or ends
                with a lead surrogate, then it is found only if these
                surrogates stand alone in the text. Otherwise, the substring
                edge units would be matched against halves of surrogate pairs.
***********************************************************************/

        final uint lastIndexOf (UText other, uint start=uint.max)
        {
                wchar[] pattern = other.get;

                return lastIndexOf (pattern, start);
        }

        /***********************************************************************

                Find the last occurrence of a substring in a string.

                The substring is found at code point boundaries. That means
                that if the substring begins with a trail surrogate or ends
                with a lead surrogate, then it is found only if these
                surrogates stand alone in the text. Otherwise, the substring
                edge units would be matched against halves of surrogate pairs.

                Only the code units before the (pinned) start index are
                searched. Returns the index of the match, or -1 (as a
                uint) when not found.

        ***********************************************************************/

        final uint lastIndexOf (wchar[] chars, uint start=uint.max)
        {
                pinIndex (start);

                wchar* hit = u_strFindLast (content, start, chars, chars.length);
                if (hit is null)
                    return -1;

                return hit - cast(wchar*) content;
        }

        /***********************************************************************

                Lowercase the characters into a separate UString, using
                the default locale.

                Casing is locale-dependent and context-sensitive. The
                result may be longer or shorter than the original.

        ***********************************************************************/

        final UString toLower ()
        {
                return toLower (ULocale.Default);
        }

        /***********************************************************************

                Lowercase the characters into a separate UString.

                Casing is locale-dependent and context-sensitive. The
                result may be longer or shorter than the original.
***********************************************************************/

        final UString toLower (inout ULocale locale)
        {
                // adapter handed to caseConvert(); it lowercases the first
                // len units of content into the supplied destination
                uint lowerCase (wchar* target, uint capacity, inout Error status)
                {
                        return u_strToLower (target, capacity, content, len, toString(locale.name), status);
                }

                return caseConvert (&lowerCase);
        }

        /***********************************************************************

                Uppercase the characters into a separate UString, using
                ULocale.Default as the locale.

                Casing is locale-dependent and context-sensitive. The
                result may be longer or shorter than the original.

        ***********************************************************************/

        final UString toUpper ()
        {
                return toUpper (ULocale.Default);
        }

        /***********************************************************************

                Uppercase the characters into a separate UString.

                Casing is locale-dependent and context-sensitive. The
                result may be longer or shorter than the original.

        ***********************************************************************/

        final UString toUpper (inout ULocale locale)
        {
                // adapter handed to caseConvert(); it uppercases the first
                // len units of content into the supplied destination
                uint upperCase (wchar* target, uint capacity, inout Error status)
                {
                        return u_strToUpper (target, capacity, content, len, toString(locale.name), status);
                }

                return caseConvert (&upperCase);
        }

        /***********************************************************************

                Case-fold the characters into a separate UString.

                Case-folding is locale-independent and not context-sensitive,
                but there is an option for whether to include or exclude
                mappings for dotted I and dotless i that are marked with 'I'
                in CaseFolding.txt. The result may be longer or shorter than
                the original.
***********************************************************************/

        final UString toFolded (CaseOption option = CaseOption.Default)
        {
                // adapter handed to caseConvert(); it case-folds the first
                // len units of content into the supplied destination
                uint foldCase (wchar* target, uint capacity, inout Error status)
                {
                        return u_strFoldCase (target, capacity, content, len, option, status);
                }

                return caseConvert (&foldCase);
        }

        /***********************************************************************

                Unescape a string of characters and write the resulting
                Unicode characters to a new UString. The following
                escape sequences are recognized:

                  \uhhhh       4 hex digits; h in [0-9A-Fa-f]
                  \Uhhhhhhhh   8 hex digits
                  \xhh         1-2 hex digits
                  \x{h...}     1-8 hex digits
                  \ooo         1-3 octal digits; o in [0-7]
                  \cX          control-X; X is masked with 0x1F

                as well as the standard ANSI C escapes:

                  \a => U+0007, \b => U+0008, \t => U+0009, \n => U+000A,
                  \v => U+000B, \f => U+000C, \r => U+000D, \e => U+001B,
                  \" => U+0022, \' => U+0027, \? => U+003F, \\ => U+005C

                Anything else following a backslash is generically escaped.
                For example, "[a\-z]" returns "[a-z]".

                If an escape sequence is ill-formed, this method returns an
                empty string. An example of an ill-formed sequence is "\u"
                followed by fewer than 4 hex digits.

        ***********************************************************************/

        final UString unEscape ()
        {
                UString result = new UString (len);

                uint i = 0;
                while (i < len)
                      {
                      dchar c = charAt (i++);

                      // 0x005C is the backslash introducing an escape
                      if (c == 0x005C)
                         {
                         // decode the escape; i is advanced through the pointer
                         c = u_unescapeAt (&_charAt, &i, len, cast(void*) this);

                         // 0xFFFFFFFF flags an invalid escape sequence:
                         // discard what was gathered and return empty
                         if (c == 0xFFFFFFFF)
                            {
                            result.truncate ();
                            return result;
                            }
                         }

                      result.append (c);
                      }

                return result;
        }

        /***********************************************************************

                Callback for C unescapeAt() function. The context pointer
                is the instance that unEscape() passed along; the callback
                returns the character at the requested offset.

        ***********************************************************************/

        extern (C)
        {
                typedef wchar function (uint offset, void* context) CharAt;

                private static wchar _charAt (uint offset, void* context)
                {
                        UString text = cast(UString) context;
                        return text.charAt (offset);
                }
        }

        /***********************************************************************

                Pin the given index to a valid position: anything beyond
                the current length is replaced by the length itself.

        ***********************************************************************/

        final private void pinIndex (inout uint x)
        {
                x = (x > len) ? len : x;
        }

        /***********************************************************************

                Pin the given index and length to a valid position.
***********************************************************************/

        final private void pinIndices (inout uint start, inout uint length)
        {
                // clamp the start first, so the subtraction below cannot wrap
                if (start > len)
                    start = len;

                // clamp the length to whatever remains after start
                uint remaining = len - start;
                if (length > remaining)
                    length = remaining;
        }

        /***********************************************************************

                Helper for comparison methods

                Compares the two arrays via u_strCaseCompare using the
                given case option, and raises an error through testError
                if ICU reports a failure.

                The result is the signed value produced by
                u_strCaseCompare, so it is returned as an int; handing it
                back as an unsigned value would turn a negative result
                into a large positive number.

        ***********************************************************************/

        final private int compareFolded (wchar[] s1, wchar[] s2, CaseOption option = CaseOption.Default)
        {
                Error e;

                int result = u_strCaseCompare (s1, s1.length, s2, s2.length, option, e);
                testError (e, "compareFolded failed");
                return result;
        }

        /***********************************************************************

                Helper for conversion methods

                Allocates a destination UString somewhat larger than the
                current length (len + 32) and lets it run the supplied
                converter through its format() method.

        ***********************************************************************/

        final private UString caseConvert (UString.Formatter cvt)
        {
                UString converted = new UString (len + 32);
                converted.format (cvt, "case converter failed");
                return converted;
        }

        /***********************************************************************

                Bind the ICU functions from a shared library.
This is complicated by the issues regarding D and DLLs on the
                Windows platform

        ***********************************************************************/

        version (Win32)
                {
                // handle returned by FunctionLoader.bind, released again
                // in the static destructor
                private static void* library;

                // name of the shared library the functions are bound from
                private static char[] libraryName = "icuuc30.dll";

                /***************************************************************

                        C function pointers, filled in via the targets
                        table below

                ***************************************************************/

                private static extern (C)
                {
                        wchar* function (wchar*, uint, wchar*, uint)                            u_strFindFirst;
                        wchar* function (wchar*, uint, wchar*, uint)                            u_strFindLast;
                        wchar* function (wchar*, wchar, uint)                                   u_memchr;
                        wchar* function (wchar*, wchar, uint)                                   u_memrchr;
                        int    function (wchar*, uint, wchar*, uint, bool)                      u_strCompare;
                        int    function (wchar*, uint, wchar*, uint, uint, inout Error)         u_strCaseCompare;
                        dchar  function (CharAt, uint*, uint, void*)                            u_unescapeAt;
                        uint   function (wchar*, uint)                                          u_countChar32;
                        uint   function (wchar*, uint, wchar*, uint, char*, inout Error)        u_strToUpper;
                        uint   function (wchar*, uint, wchar*, uint, char*, inout Error)        u_strToLower;
                        uint   function (wchar*, uint, wchar*, uint, uint, inout Error)         u_strFoldCase;
                }

                /***************************************************************

                        Binding table: pairs the address of each function
                        pointer above with the symbol name to look up

                ***************************************************************/

                static FunctionLoader.Bind[] targets =
                        [
                        {cast(void**) &u_strFindFirst,   "u_strFindFirst"},
                        {cast(void**) &u_strFindLast,    "u_strFindLast"},
                        {cast(void**) &u_memchr,         "u_memchr"},
                        {cast(void**) &u_memrchr,        "u_memrchr"},
                        {cast(void**) &u_strCompare,     "u_strCompare"},
                        {cast(void**) &u_strCaseCompare, "u_strCaseCompare"},
                        {cast(void**) &u_unescapeAt,     "u_unescapeAt"},
                        {cast(void**) &u_countChar32,    "u_countChar32"},
                        {cast(void**) &u_strToUpper,     "u_strToUpper"},
                        {cast(void**) &u_strToLower,     "u_strToLower"},
                        {cast(void**) &u_strFoldCase,    "u_strFoldCase"},
                        ];

                /***************************************************************

                        Static constructor: bind the table against the
                        named library and remember the returned handle

                ***************************************************************/

                static this ()
                {
                        library = FunctionLoader.bind (libraryName, targets);
                        //test ();
                }

                /***************************************************************

                        Static destructor: hand the library handle back
                        to FunctionLoader

                ***************************************************************/

                static ~this ()
                {
                        FunctionLoader.unbind (library);
                }
                }

        /***********************************************************************

                Smoke test exercising construction, searching, unescaping,
                case conversion and padding/trimming. Results are not
                checked; the call in the static constructor is commented
                out.

        ***********************************************************************/

        private static void test()
        {
                UString sample = new UString (r"aaaqw \uabcd eaaa", false);
                UString copy   = new UString (sample);
                UString empty  = new UString (new UText(""));

                int found = sample.indexOf ("qwe");
                sample.unEscape ();
                sample.toUpper ();
                sample.padLeading(2).padTrailing(2).trim();
        }
}