Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members | Related Pages

UString.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file UString.d
00004         
00005         Copyright (c) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026         4. Derivative works are permitted, but they must carry this notice
00027            in full and credit the original source.
00028 
00029 
00030                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00031 
00032 
00033         @version        Initial version, October 2004      
00034         @author         Kris
00035 
00036         Note that this package and documentation is built around the ICU 
00037         project (http://oss.software.ibm.com/icu/). Below is the license 
00038         statement as specified by that software:
00039 
00040 
00041                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00042 
00043 
00044         ICU License - ICU 1.8.1 and later
00045 
00046         COPYRIGHT AND PERMISSION NOTICE
00047 
00048         Copyright (c) 1995-2003 International Business Machines Corporation and 
00049         others.
00050 
00051         All rights reserved.
00052 
00053         Permission is hereby granted, free of charge, to any person obtaining a
00054         copy of this software and associated documentation files (the
00055         "Software"), to deal in the Software without restriction, including
00056         without limitation the rights to use, copy, modify, merge, publish,
00057         distribute, and/or sell copies of the Software, and to permit persons
00058         to whom the Software is furnished to do so, provided that the above
00059         copyright notice(s) and this permission notice appear in all copies of
00060         the Software and that both the above copyright notice(s) and this
00061         permission notice appear in supporting documentation.
00062 
00063         THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
00064         OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00065         MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
00066         OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
00067         HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
00068         INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
00069         FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
00070         NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
00071         WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
00072 
00073         Except as contained in this notice, the name of a copyright holder
00074         shall not be used in advertising or otherwise to promote the sale, use
00075         or other dealings in this Software without prior written authorization
00076         of the copyright holder.
00077 
00078         ----------------------------------------------------------------------
00079 
00080         All trademarks and registered trademarks mentioned herein are the 
00081         property of their respective owners.
00082 
00083 *******************************************************************************/
00084 
00085 module mango.icu.UString;
00086 
00087 private import  mango.icu.ICU,
00088                 mango.icu.UChar,
00089                 mango.icu.ULocale;
00090 
00091 /*******************************************************************************
00092         C runtime memmove: copies 'bytes' bytes from src to dst (may overlap)
00093 *******************************************************************************/
00094 
00095 private extern (C) void* memmove (void* dst, void* src, size_t bytes);
00096 
00097 /*******************************************************************************
00098 
00099         Bind to the IReadable and IWritable interfaces if we're building 
00100         along with the mango.io package
00101 
00102 *******************************************************************************/
00103 
00104 version (Isolated)       // this branch pulls in no mango.io imports
00105         {
00106         private interface ITextOther   {}       // empty stand-in, implemented by UText
00107         private interface IStringOther {}       // empty stand-in, implemented by UString
00108         }
00109      else
00110         {
00111         private import  mango.icu.UMango;
00112 
00113         private import  mango.io.model.IReader,
00114                         mango.io.model.IWriter;
00115 
00116         private interface ITextOther   : IWritable {}   // makes UText an IWritable
00117         private interface IStringOther : IReadable {}   // makes UString an IReadable
00118         }
00119           
00120 
00121 /*******************************************************************************
00122 
00123         UString is a string class that stores Unicode characters directly 
00124         and provides functionality similar to that of the Java String class.
00125 
00126         In ICU, a Unicode string consists of 16-bit Unicode code units. 
00127         A Unicode character may be stored with either one code unit — 
00128         which is the most common case — or with a matched pair of 
00129         special code units ("surrogates"). The data type for code units 
00130         is UChar.
00131 
00132         For single-character handling, a Unicode character code point is 
00133         a value in the range 0..0x10ffff. ICU uses the UChar32 type for 
00134         code points.
00135 
00136         Indexes and offsets into and lengths of strings always count code 
00137         units, not code points. This is the same as with multi-byte char* 
00138         strings in traditional string handling. Operations on partial 
00139         strings typically do not test for code point boundaries. If necessary, 
00140         the user needs to take care of such boundaries by testing for the code 
00141         unit values or by using functions like getChar32Start() 
00142         and getChar32Limit()
00143 
00144         UString methods are more lenient with regard to input parameter values 
00145         than other ICU APIs. In particular:
00146 
00147         - If indexes are out of bounds for a UString object (< 0 or > length) 
00148           then they are "pinned" to the nearest boundary.
00149 
00150         - If primitive string pointer values (e.g., const wchar* or char*) for 
00151           input strings are null, then those input string parameters are treated 
00152           as if they pointed to an empty string. However, this is not the case 
00153           for char* parameters for charset names or other IDs.
00154         
00155 *******************************************************************************/
00156 
00157 class UString : UText, IStringOther
00158 {
00159         alias opCat             append;
00160         alias opIndexAssign     setCharAt;
00161 
00162         /***********************************************************************
00163         
00164                 Create an empty UString with the specified available space
00165 
00166         ***********************************************************************/
00167 
00168         this (uint space = 0)
00169         {
00170                 this.mutable = true;            // the buffer sized below is ours to modify
00171                 this.content.length = space;    // reserve 'space' code units; len is untouched
00172         }
00173 
00174         /***********************************************************************
00175         
00176                 Create a UString upon the provided content. If said content
00177                 is immutable (read-only) then you might consider setting the
00178                 'mutable' parameter to false. Doing so will avoid allocating
00179                 heap-space for the content until it is modified.
00180 
00181         ***********************************************************************/
00182 
00183         this (wchar[] content, bool mutable = true)
00184         {
00185                 this.setTo (content, mutable);  // setTo() dups when mutable, aliases otherwise
00186         }
00187 
00188         /***********************************************************************
00189         
00190                 Create a UString via the content of a UText. Note that the
00191                 default is to assume the content is immutable (read-only).
00192                 
00193         ***********************************************************************/
00194         
00195         this (UText other, bool mutable = false)
00196         {
00197                 this (other.get (), mutable);   // hand the valid slice to the wchar[] constructor
00198         }
00199 
00200         /***********************************************************************
00201         
00202                 Create a UString via the content of a UString. If said content
00203                 is immutable (read-only) then you might consider setting the
00204                 'mutable' parameter to false. Doing so will avoid allocating
00205                 heap-space for the content until it is modified via UString 
00206                 methods.
00207 
00208         ***********************************************************************/
00209         
00210         this (UString other, bool mutable = true)
00211         {
00212                 this (other.get (), mutable);   // by default the valid slice is copied
00213         }
00214 
00215         /***********************************************************************
00216         
00217                 Support for reading content via the IO system
00218 
00219         ***********************************************************************/
00220 
00221         version (Isolated){}
00222         else
00223         {
00224                 /***************************************************************
00225         
00226                         Internal adapter to handle loading and conversion
00227                         of UString content. Once constructed, this may be 
00228                         used as the target for an IReader. Alternatively, 
00229                         invoke the load() method with an IBuffer of choice.
00230 
00231                 ***************************************************************/
00232         
00233                 class UStringDecoder : StringDecoder16
00234                 {
00235                         private UString s;      // target whose content load() replaces
00236 
00237                         // construct a decoder on the given UString
00238                         this (UConverter c, uint bytes, UString s)
00239                         {
00240                                 super (c, bytes);
00241                                 this.s = s;
00242                         }
00243 
00244                         // IReadable adapter to perform the conversion
00245                         protected void read (IReader r)
00246                         {
00247                                 load (r.getBuffer);
00248                         }
00249 
00250                         // read from the provided buffer until we either have all the
00251                         // content, or an eof condition throws an exception. s.len is
00252                         // kept current so that expand() sizes against the real fill.
00253                         package void load (IBuffer b)
00254                         {
00255                                 if (! s.mutable) s.setTo (s.get, true); // never decode into aliased content
00256                                 s.len = super.read (b, s.content);
00257                                 while (toGo)
00258                                       {
00259                                       s.expand (toGo);
00260                                       s.len += super.read (b, s.content[s.len..length]);
00261                                       }
00262                         }
00263                 }
00264 
00265                 /***************************************************************
00266         
00267                         Another constructor for loading known content length
00268                         into a UString.
00269 
00270                 ***************************************************************/
00271         
00272                 this (IBuffer buffer, uint contentLength, UConverter cvt)
00273                 {
00274                         this (contentLength);   // initial space: one wchar per input byte
00275                         UStringDecoder decoder = new UStringDecoder (cvt, contentLength, this);
00276                         decoder.load (buffer);
00277                 }
00278 
00279                 /***************************************************************
00280                 
00281                         Read as many bytes from the input as is necessary
00282                         to produce the expected number of wchar elements.
00283                         This uses the default wchar handler, which can be
00284                         altered by binding a StringDecoder to the IReader
00285                         in use (see UMango for details).
00286 
00287                         We're mutable, so ensure we don't mess with the
00288                         IO buffers. Interestingly, changing the length 
00289                         of a D array will account for slice assignments 
00290                         (it checks the pointer to see if it's a starting
00291                          point in the pool). Unfortunately, that doesn't
00292                         catch the case where a slice starts at offset 0,
00293                         which is where IBuffer slices may come from. 
00294                         
00295                         To be safe, we ask the allocator in use whether 
00296                         the content it provided can be mutated or not.
00297                         Note that this is not necessary for UText, since 
00298                         that is a read-only construct.
00299 
00300                 ***************************************************************/
00301 
00302                 void read (IReader r)
00303                 {
00304                         r.get (this.content);
00305                         this.len     = this.content.length;     // everything read is valid text
00306                         this.mutable = r.getAllocator ().isMutable (this.content);
00307                 }
00308 
00309                 /***************************************************************
00310                 
00311                         Return a streaming decoder that can be used to 
00312                         populate this UString with a specified number of 
00313                         input bytes. 
00314 
00315                         This differs from the above read() method in the
00316                         way content is read: in the above case, exactly
00317                         the specified number of wchar elements will be
00318                         converted from the input, whereas in this case 
00319                         a variable number of wchar elements are converted
00320                         until 'bytes' have been read from the input. This 
00321                         is useful in those cases where the original number 
00322                         of elements has been lost, and only the resultant 
00323                         converted byte-count remains (a la HTTP).
00324 
00325                         The returned StringDecoder is one-shot only. You may
00326                         reuse it (both the converter and the byte count) via
00327                         its reset() method. 
00328 
00329                         One applies the resultant converter directly with an 
00330                         IReader like so:
00331 
00332                         @code
00333                         UString s = ...;
00334                         IReader r = ...;
00335 
00336                         // r >> s.createDecoder(cvt, bytes);
00337                         r.get (s.createDecoder(cvt, bytes));
00338                         @endcode
00339 
00340                         which will read the specified number of bytes from
00341                         the input and convert them to an appropriate number
00342                         of wchars within the UString. 
00343 
00344                 ***************************************************************/
00345 
00346                 StringDecoder createDecoder (UConverter c, uint bytes)
00347                 {       // every call builds a new decoder that targets this UString
00348                         return new UStringDecoder (c, bytes, this);
00349                 }
00350         }
00351 
00352         /***********************************************************************
00353                 
00354                 Append text to this UString
00355 
00356         ***********************************************************************/
00357 
00358         UString opCat (UText other)
00359         {
00360                 return this.opCat (other.get ());       // append only the valid [0..len] slice
00361         }
00362 
00363         /***********************************************************************
00364         
00365                 Append partial text to this UString
00366 
00367         ***********************************************************************/
00368 
00369         UString opCat (UText other, uint start, uint len=uint.max)
00370         {
00371                 other.pinIndices (start, len);  // presumably clamps both to other's bounds
00372                 return this.opCat (other.content [start .. start + len]);
00373         }
00374 
00375         /***********************************************************************
00376         
00377                 Append a single character to this UString
00378 
00379         ***********************************************************************/
00380 
00381         UString opCat (wchar chr)
00382         {
00383                 return this.opCat (&chr, 1);    // treat the parameter as a one-element run
00384         }
00385 
00386         /***********************************************************************
00387         
00388                 Append text to this UString
00389 
00390         ***********************************************************************/
00391 
00392         UString opCat (wchar[] chars)
00393         {
00394                 return this.opCat (chars, chars.length);        // pointer + count overload
00395         }
00396 
00397         /***********************************************************************
00398                 
00399                 Converts a sequence of UTF-8 bytes to UChars (UTF-16)
00400 
00401         ***********************************************************************/
00402 
00403         UString opCat (char[] chars)
00404         {
00405                 // adapter handed to format(): converts into the spare buffer space
00406                 uint convert (wchar* dst, uint room, inout Error status)
00407                 {
00408                         uint written;
00409                         u_strFromUTF8 (dst, room, &written, chars, chars.length, status);
00410                         return written;
00411                 }
00412 
00413                 expand (chars.length);  // room for one UTF-16 unit per UTF-8 byte
00414                 return format (&convert, "failed to append UTF char[]");
00415         }
00416 
00417         /***********************************************************************
00418                 
00419                 Set a section of this UString to the specified character
00420 
00421         ***********************************************************************/
00422 
00423         UString setTo (wchar chr, uint start=0, uint len=uint.max)
00424         {
00425                 this.pinIndices (start, len);   // presumably clamps the range to this text
00426                 if (mutable == false)
00427                     this.realloc ();            // detach from aliased content before writing
00428                 this.content [start .. start + len] = chr;
00429                 return this;
00430         }
00431 
00432         /***********************************************************************
00433    
00434                 Set the content to the provided array. Parameter 'mutable'
00435                 specifies whether the given array is likely to change. If 
00436                 not, the array is aliased until such time this UString is
00437                 altered.
00438                      
00439         ***********************************************************************/
00440 
00441         UString setTo (wchar[] chars, bool mutable = true)
00442         {
00443                 // mutable content is copied now; read-only content is merely
00444                 // aliased, and only copied once something needs to modify it
00445                 this.mutable = mutable;
00446                 this.len     = chars.length;
00447                 this.content = mutable ? chars.dup : chars;
00448                 return this;
00449         }
00450 
00451         /***********************************************************************
00452         
00453                 Replace the content of this UString. If the new content
00454                 is immutable (read-only) then you might consider setting the
00455                 'mutable' parameter to false. Doing so will avoid allocating
00456                 heap-space for the content until it is modified via one of
00457                 these methods.
00458 
00459         ***********************************************************************/
00460 
00461         UString setTo (UText other, bool mutable = true)
00462         {
00463                 return this.setTo (other.get (), mutable);      // only the valid slice is taken
00464         }
00465 
00466         /***********************************************************************
00467         
00468                 Replace the content of this UString. If the new content
00469                 is immutable (read-only) then you might consider setting the
00470                 'mutable' parameter to false. Doing so will avoid allocating
00471                 heap-space for the content until it is modified via one of
00472                 these methods.
00473 
00474         ***********************************************************************/
00475 
00476         UString setTo (UText other, uint start, uint len, bool mutable = true)
00477         {
00478                 other.pinIndices (start, len);  // presumably clamps both to other's bounds
00479                 return this.setTo (other.content [start .. start + len], mutable);
00480         }
00481 
00482         /***********************************************************************
00483         
00484                 Replace the character at the specified location.
00485 
00486         ***********************************************************************/
00487 
00488         final UString opIndexAssign (wchar chr, uint index)
00489         in {    // precondition: index must address an existing code unit
00490                 if (index >= len)
00491                     exception ("index out of bounds"); 
00492            }
00493         body
00494         {
00495                 if (! mutable)
00496                       realloc ();       // copy aliased content before the first write
00497                 content [index] = chr;
00498                 return this;
00499         }
00500 
00501         /***********************************************************************
00502         
00503                 Remove a piece of this UString.
00504 
00505         ***********************************************************************/
00506 
00507         UString remove (uint start, uint length=uint.max)
00508         {
00509                 pinIndices (start, length);     // presumably clamps the range to [0..len]
00510                 uint i = start + length;        // first code unit after the removed range
00511 
00512                 if (length)
00513                     if (i >= len)
00514                         truncate (start);       // range reaches the end: nothing to shift
00515                     else
00516                        {
00517                        if (! mutable)
00518                              realloc ();        // own the buffer before moving data in it
00519                        memmove (&content[start], &content[i], (len-i) * wchar.sizeof);
00520                        len -= length;
00521                        }
00522                 return this;
00523         }
00524 
00525         /***********************************************************************
00526         
00527                 Truncate the length of this UString.
00528 
00529         ***********************************************************************/
00530 
00531         UString truncate (uint length=0)
00532         {
00533                 if (length < len)       // equal or longer requests change nothing
00534                     len = length;
00535                 return this;
00536         }
00537 
00538         /***********************************************************************
00539         
00540                 Insert leading pad characters (spaces by default) in this UString
00541 
00542         ***********************************************************************/
00543 
00544         UString padLeading (uint count, wchar padChar = 0x0020)
00545         {
00546                 if (mutable) expand (count); else realloc (count); // own a buffer with room
00547                 if (len) memmove (&content[count], &content[0], len * wchar.sizeof);
00548                 len += count;   // an empty text skips the shift: &content[count] may not exist
00549                 return setTo (padChar, 0, count);
00550         }
00551 
00552         /***********************************************************************
00553         
00554                 Append trailing pad characters (spaces by default) to this UString.
00555 
00556         ***********************************************************************/
00557 
00558         UString padTrailing (uint length, wchar padChar = 0x0020)
00559         {
00560                 this.expand (length);           // make room beyond the current end
00561                 this.len += length;             // claim it, then fill it in below
00562                 return this.setTo (padChar, this.len - length, length);
00563         }
00564 
00565         /***********************************************************************
00566         
00567                 Check for available space within the buffer, and expand 
00568                 as necessary.
00569 
00570         ***********************************************************************/
00571 
00572         package final void expand (uint count)
00573         {
00574                 // aliased content is always copied first, even if it has spare room
00575                 if (! mutable || (len + count) > content.length)
00576                      realloc (count);
00577         }
00578 
00579         /***********************************************************************
00580         
00581                 Allocate memory due to a change in the content. We handle 
00582                 the distinction between mutable and immutable here.
00583 
00584         ***********************************************************************/
00585 
00586         private final void realloc (uint count = 0)
00587         {
00588                 uint size = (content.length + count + 63) & ~63;  // round up to 64 units
00589                 if (mutable)
00590                     content.length = size;
00591                 else
00592                    {
00593                    mutable = true;
00594                    wchar[] x = content;
00595                    content = new wchar [size];
00596                    if (len)
00597                        content[0..len] = x[0..len];     // x may be longer than len
00598                    }
00599         }
00600 
00601         /***********************************************************************
00602         
00603                 Internal method to support UString appending
00604 
00605         ***********************************************************************/
00606 
00607         private final UString opCat (wchar* chars, uint count)
00608         {
00609                 this.expand (count);            // guarantee room for 'count' more units
00610                 this.content [len .. len + count] = chars [0 .. count];
00611                 this.len += count;
00612                 return this;
00613         }
00614 
00615         /***********************************************************************
00616         
00617                 Internal method to support formatting into this UString. 
00618                 This is used by many of the ICU wrappers to append content
00619                 into a UString.
00620 
00621         ***********************************************************************/
00622 
00623         typedef uint delegate (wchar* dst, uint len, inout Error e) Formatter;
00624 
00625         package final UString format (Formatter format, char[] msg)
00626         {
00627                 Error   e;
00628                 uint    length;
00629 
00630                 if (! mutable && len < content.length)
00631                       setTo (get, true);      // never format into aliased read-only content
00632                 while (true)
00633                       {
00634                       e = e.OK;               // (below: &content[len] is out of bounds when full)
00635                       length = format ((cast(wchar*) content) + len, content.length - len, e);
00636                       if (e == e.BufferOverflow)
00637                           expand (length);    // presumably the size the formatter asked for
00638                       else
00639                          break;
00640                       }
00641                 if (isError (e))
00642                     exception (msg);
00643                 len += length;
00644                 return this;
00645         }
00646 }
00647 
00648 
00649 /*******************************************************************************
00650 
00651         Immutable (read-only) text -- use UString for mutable strings.
00652 
00653 *******************************************************************************/
00654 
00655 class UText : ICU, ITextOther
00656 {
00657         alias opIndex   charAt;
00658 
00659         // the core of the UText and UString attributes. The name 'len'
00660         // is used rather than the more obvious 'length' since there is
00661         // a collision with the silly array[length] syntactic sugar ...
00662         package uint    len;
00663         package wchar[] content;
00664 
00665         // this should probably be in UString only, but there seems to 
00666         // be a compiler bug where it doesn't get initialised correctly,
00667         // and it's perhaps useful to have here for when a UString is
00668         // passed as a UText argument.
00669         private bool    mutable;
00670 
00671         // toFolded() argument
00672         public enum     CaseOption 
00673                         {
00674                         Default  = 0,   // presumably ICU's U_FOLD_CASE_DEFAULT -- confirm
00675                         SpecialI = 1    // presumably U_FOLD_CASE_EXCLUDE_SPECIAL_I -- confirm
00676                         }
00677 
00678         /***********************************************************************
00679         
00680                 Hidden constructor
00681 
00682         ***********************************************************************/
00683 
00684         private this ()
00685         {       // intentionally empty: no field is assigned here
00686         }
00687 
00688         /***********************************************************************
00689         
00690                 Construct read-only wrapper around the given content
00691 
00692         ***********************************************************************/
00693 
00694         this (wchar[] content)
00695         {
00696                 this.len     = content.length;  // the whole array counts as valid text
00697                 this.content = content;         // aliased, not copied
00698         }
00699 
00700         /***********************************************************************
00701         
00702                 Support for writing via the Mango IO subsystem
00703 
00704         ***********************************************************************/
00705 
00706         version (Isolated){}
00707         else
00708         {
00709                 void write (IWriter w)
00710                 {
00711                         w.putw (this.get ());   // emit only the valid [0..len] portion
00712                 }
00713         }
00714 
00715         /***********************************************************************
00716         
00717                 Return the valid content from this UText
00718 
00719         ***********************************************************************/
00720 
00721         final package wchar[] get ()
00722         {
00723                 return content [0..len];        // a slice of the backing array (no copy), limited to len units
00724         }
00725 
00726         /***********************************************************************
00727         
00728                 Is this UText equal to another?
00729 
00730         ***********************************************************************/
00731 
00732         final override int opEquals (Object o)
00733         {
00734                 UText that = cast(UText) o;     // tested for null below before any use
00735 
00736                 if (that is null)
00737                     return 0;                   // nothing to compare against
00738                 return (that is this || compare (that) == 0);  // identity short-circuits the ICU call
00739         }
00740 
00741         /***********************************************************************
00742         
00743                 Compare this UText to another.
00744 
00745         ***********************************************************************/
00746 
00747         final override int opCmp (Object o)
00748         {
00749                 UText that = cast(UText) o;
00750 
00751                 if (that is null)
00752                     return 1;           // no UText to compare with: this sorts after it
00753                 if (that is this)
00754                     return 0;           // same object, no need to call into ICU
00755 
00756                 return compare (that);  // code unit order (compare's default)
00757         }
00758 
00759         /***********************************************************************
00760         
00761                 Hash this UText
00762 
00763         ***********************************************************************/
00764 
00765         final override uint toHash ()
00766         {
00767                 return typeid(wchar[]).getHash (&content[0..len]);      // hash only the valid slice, using the wchar[] TypeInfo
00768         }
00769 
00770         /***********************************************************************
00771         
00772                 Clone this UText into a UString
00773 
00774         ***********************************************************************/
00775 
00776         final UString copy ()
00777         {
00778                 return new UString (get);       // clone only the valid [0..len] slice, never units beyond len
00779         }
00780 
00781         /***********************************************************************
00782         
00783                 Clone a section of this UText into a UString
00784 
00785         ***********************************************************************/
00786 
00787         final UString extract (uint start, uint len=uint.max)
00788         {
00789                 pinIndices (start, len);        // clamps both arguments in place; 'len' here is the parameter, not the member
00790                 return new UString (content[start..start+len]);        // the default len therefore means "to the end"
00791         }
00792 
00793         /***********************************************************************
00794         
00795                 Count unicode code points in the length UChar code units of 
00796                 the string. A code point may occupy either one or two UChar 
00797                 code units. Counting code points involves reading all code 
00798                 units.
00799 
00800         ***********************************************************************/
00801 
00802         final uint codePoints (uint start=0, uint length=uint.max)
00803         {
00804                 pinIndices (start, length);     // clamp the range to the valid content
00805                 return u_countChar32 (cast(wchar*) content + start, length);   // pointer arithmetic: start may equal content.length (e.g. empty text)
00806         }
00807 
00808         /***********************************************************************
00809         
00810                 Return an indication whether or not there are surrogate pairs
00811                 within the string.
00812 
00813         ***********************************************************************/
00814 
00815         final bool hasSurrogates (uint start=0, uint length=uint.max)
00816         {
00817                 pinIndices (start, length);     // clamp first so 'length' is the real code unit count
00818                 return length != codePoints (start, length);    // true when the code point count differs from the unit count
00819         }
00820 
00821         /***********************************************************************
00822         
00823                 Return the character at the specified position.
00824 
00825         ***********************************************************************/
00826 
00827         final wchar opIndex (uint index)
00828         in {
00829                 if (index >= len)       // only positions inside the valid content are accepted
00830                     exception ("index out of bounds"); 
00831            }
00832         body
00833         {
00834                 return content [index]; // a single UTF-16 code unit, possibly half of a surrogate pair
00835         }
00836 
00837         /***********************************************************************
00838         
00839                 Return the length of the valid content
00840 
00841         ***********************************************************************/
00842 
00843         final uint length ()
00844         {
00845                 return len;     // the len member, i.e. the extent that get() slices to
00846         }
00847 
00848         /***********************************************************************
00849         
00850                 The comparison can be done in code unit order or in code 
00851                 point order. They differ only in UTF-16 when comparing 
00852                 supplementary code points (U+10000..U+10ffff) to BMP code 
00853                 points near the end of the BMP (i.e., U+e000..U+ffff). 
00854 
00855                 In code unit order, high BMP code points sort after 
00856                 supplementary code points because they are stored as 
00857                 pairs of surrogates which are at U+d800..U+dfff.
00858 
00859         ***********************************************************************/
00860 
00861         final int compare (UText other, bool codePointOrder=false)
00862         {
00863                 return compare (other.get, codePointOrder);     // forward the other text's valid slice to the wchar[] overload
00864         }
00865 
00866         /***********************************************************************
00867         
00868                 The comparison can be done in code unit order or in code 
00869                 point order. They differ only in UTF-16 when comparing 
00870                 supplementary code points (U+10000..U+10ffff) to BMP code 
00871                 points near the end of the BMP (i.e., U+e000..U+ffff). 
00872 
00873                 In code unit order, high BMP code points sort after 
00874                 supplementary code points because they are stored as 
00875                 pairs of surrogates which are at U+d800..U+dfff.
00876 
00877         ***********************************************************************/
00878 
00879         final int compare (wchar[] other, bool codePointOrder=false)
00880         {
00881                 return u_strCompare (content, len, other, other.length, codePointOrder);        // explicit lengths are passed for both sides
00882         }
00883 
00884         /***********************************************************************
00885         
00886                 The comparison can be done in UTF-16 code unit order or 
00887                 in code point order. They differ only when comparing 
00888                 supplementary code points (U+10000..U+10ffff) to BMP code 
00889                 points near the end of the BMP (i.e., U+e000..U+ffff). 
00890 
00891                 In code unit order, high BMP code points sort after 
00892                 supplementary code points because they are stored as
00893                 pairs of surrogates which are at U+d800..U+dfff.
00894 
00895         ***********************************************************************/
00896 
00897         final int compareFolded (UText other, CaseOption option = CaseOption.Default)
00898         {
00899                 return compareFolded (other.get, option);       // compare against the valid slice only, as compare (UText) does
00900         }
00901 
00902         /***********************************************************************
00903         
00904                 The comparison can be done in UTF-16 code unit order or 
00905                 in code point order. They differ only when comparing 
00906                 supplementary code points (U+10000..U+10ffff) to BMP code 
00907                 points near the end of the BMP (i.e., U+e000..U+ffff). 
00908 
00909                 In code unit order, high BMP code points sort after 
00910                 supplementary code points because they are stored as
00911                 pairs of surrogates which are at U+d800..U+dfff.
00912 
00913         ***********************************************************************/
00914 
00915         final int compareFolded (wchar[] other, CaseOption option = CaseOption.Default)
00916         {
00917                 return compareFolded (get, other, option);      // our valid slice versus 'other', via the private two-array helper
00918         }
00919 
00920         /***********************************************************************
00921         
00922                 Does this UText start with specified string?
00923 
00924         ***********************************************************************/
00925 
00926         final bool startsWith (UText other)
00927         {
00928                 return startsWith (other.get);  // forward the other text's valid slice to the wchar[] overload
00929         }
00930 
00931         /***********************************************************************
00932         
00933                 Does this UText start with specified string?
00934 
00935         ***********************************************************************/
00936 
00937         final bool startsWith (wchar[] chars)
00938         {
00939                 if (chars.length > len)
00940                     return false;       // a prefix cannot be longer than the text itself
00941                 return compareFolded (content[0..chars.length], chars) == 0;   // note: the match is case-folded
00942         }
00943 
00944         /***********************************************************************
00945         
00946                 Does this UText end with specified string?
00947 
00948         ***********************************************************************/
00949 
00950         final bool endsWith (UText other)
00951         {
00952                 return endsWith (other.get);    // forward the other text's valid slice to the wchar[] overload
00953         }
00954 
00955         /***********************************************************************
00956         
00957                 Does this UText end with specified string?
00958 
00959         ***********************************************************************/
00960 
00961         final bool endsWith (wchar[] chars)
00962         {
00963                 if (chars.length > len)
00964                     return false;       // a suffix cannot be longer than the text itself
00965                 return compareFolded (content[len-chars.length..len], chars) == 0;     // note: the match is case-folded
00966         }
00967 
00968         /***********************************************************************
00969         
00970                 Find the first occurrence of a BMP code point in a string.
00971                 A surrogate code point is found only if its match in the 
00972                 text is not part of a surrogate pair.
00973 
00974         ***********************************************************************/
00975 
00976         final uint indexOf (wchar c, uint start=0)
00977         {
00978                 pinIndex (start);       // a start beyond the end collapses to len (empty search range)
00979                 wchar* s = u_memchr (cast(wchar*) content + start, c, len-start);      // pointer arithmetic: start may equal content.length
00980                 if (s)
00981                     return s - cast(wchar*) content;    // offset from the start of the content
00982                 return uint.max;        // not found
00983         }
00984 
00985         /***********************************************************************
00986         
00987                 Find the first occurrence of a substring in a string. 
00988 
00989                 The substring is found at code point boundaries. That means 
00990                 that if the substring begins with a trail surrogate or ends 
00991                 with a lead surrogate, then it is found only if these 
00992                 surrogates stand alone in the text. Otherwise, the substring 
00993                 edge units would be matched against halves of surrogate pairs.
00994 
00995         ***********************************************************************/
00996 
00997         final uint indexOf (UText other, uint start=0)
00998         {
00999                 return indexOf (other.get, start);      // forward the other text's valid slice to the wchar[] overload
01000         }
01001 
01002         /***********************************************************************
01003         
01004                 Find the first occurrence of a substring in a string. 
01005 
01006                 The substring is found at code point boundaries. That means 
01007                 that if the substring begins with a trail surrogate or ends 
01008                 with a lead surrogate, then it is found only if these 
01009                 surrogates stand alone in the text. Otherwise, the substring 
01010                 edge units would be matched against halves of surrogate pairs.
01011 
01012         ***********************************************************************/
01013 
01014         final uint indexOf (wchar[] chars, uint start=0)
01015         {
01016                 pinIndex (start);       // a start beyond the end collapses to len (empty search range)
01017                 wchar* s = u_strFindFirst (cast(wchar*) content + start, len-start, chars, chars.length);      // pointer arithmetic: start may equal content.length
01018                 if (s)
01019                     return s - cast(wchar*) content;    // offset from the start of the content
01020                 return uint.max;        // not found
01021         }
01022 
01023         /***********************************************************************
01024         
01025                 Find the last occurrence of a BMP code point in a string.
01026                 A surrogate code point is found only if its match in the 
01027                 text is not part of a surrogate pair.
01028 
01029         ***********************************************************************/
01030 
01031         final uint lastIndexOf (wchar c, uint start=uint.max)
01032         {
01033                 pinIndex (start);       // the default start therefore becomes len
01034                 wchar* hit = u_memrchr (content, c, start);     // 'start' is passed as the count of leading units to search
01035                 if (hit is null)
01036                     return uint.max;    // not found
01037                 return hit - cast(wchar*) content;      // offset from the start of the content
01038         }
01039 
01040         /***********************************************************************
01041         
01042                 Find the last occurrence of a substring in a string.
01043                 The substring is found at code point boundaries; see
01044                 lastIndexOf (wchar[], uint) for the details.
01045 
01046         ***********************************************************************/
01047 
01048         final uint lastIndexOf (UText other, uint start=uint.max)
01049         {
01050                 return lastIndexOf (other.get, start);  // forward the other text's valid slice to the wchar[] overload
01051         }
01052 
01053         /***********************************************************************
01054         
01055                 Find the last occurrence of a substring in a string. 
01056 
01057                 The substring is found at code point boundaries. That means 
01058                 that if the substring begins with a trail surrogate or ends 
01059                 with a lead surrogate, then it is found only if these 
01060                 surrogates stand alone in the text. Otherwise, the substring 
01061                 edge units would be matched against halves of surrogate pairs.
01062 
01063         ***********************************************************************/
01064 
01065         final uint lastIndexOf (wchar[] chars, uint start=uint.max)
01066         {
01067                 pinIndex (start);       // the default start therefore becomes len
01068                 wchar* hit = u_strFindLast (content, start, chars, chars.length);      // 'start' is passed as the length of text to search
01069                 if (hit is null)
01070                     return uint.max;    // not found
01071                 return hit - cast(wchar*) content;      // offset from the start of the content
01072         }
01073 
01074         /***********************************************************************
01075 
01076                 Lowercase the characters into a separate UString.
01077 
01078                 Casing is locale-dependent and context-sensitive. The 
01079                 result may be longer or shorter than the original. 
01080         
01081                 Note that the return value refers to the provided destination 
01082                 UString.
01083 
01084         ***********************************************************************/
01085 
01086         final UString toLower (UString dst)
01087         {
01088                return toLower (dst, ULocale.Default);   // same conversion, using ULocale.Default
01089         }
01090 
01091         /***********************************************************************
01092 
01093                 Lowercase the characters into a separate UString.
01094 
01095                 Casing is locale-dependent and context-sensitive. The 
01096                 result may be longer or shorter than the original.
01097         
01098                 Note that the return value refers to the provided destination 
01099                 UString.
01100 
01101         ***********************************************************************/
01102 
01103         final UString toLower (UString dst, inout ULocale locale)
01104         {
01105                 uint lowerInto (wchar* target, uint capacity, inout Error status)      // callback handed to dst.format()
01106                 {
01107                         return u_strToLower (target, capacity, content, len, toString(locale.name), status);
01108                 }
01109 
01110                 dst.expand (len + 32);  // presumably reserves room up front, since the result may grow - TODO confirm
01111                 return dst.format (&lowerInto, "toLower() failed");
01112         }
01113 
01114         /***********************************************************************
01115 
01116                 Uppercase the characters into a separate UString.
01117 
01118                 Casing is locale-dependent and context-sensitive. The 
01119                 result may be longer or shorter than the original.
01120 
01121                 Note that the return value refers to the provided destination 
01122                 UString.
01123 
01124         ***********************************************************************/
01125 
01126         final UString toUpper (UString dst)
01127         {
01128                return toUpper (dst, ULocale.Default);   // same conversion, using ULocale.Default
01129         }
01130 
01131         /***********************************************************************
01132 
01133                 Uppercase the characters into a separate UString.
01134 
01135                 Casing is locale-dependent and context-sensitive. The 
01136                 result may be longer or shorter than the original.
01137 
01138                 Note that the return value refers to the provided destination 
01139                 UString.
01140 
01141         ***********************************************************************/
01142 
01143         final UString toUpper (UString dst, inout ULocale locale)
01144         {
01145                 uint upperInto (wchar* target, uint capacity, inout Error status)      // callback handed to dst.format()
01146                 {
01147                         return u_strToUpper (target, capacity, content, len, toString(locale.name), status);
01148                 }
01149 
01150                 dst.expand (len + 32);  // presumably reserves room up front, since the result may grow - TODO confirm
01151                 return dst.format (&upperInto, "toUpper() failed");
01152         }
01153 
01154         /***********************************************************************
01155         
01156                 Case-fold the characters into a separate UString.
01157 
01158                 Case-folding is locale-independent and not context-sensitive,
01159                 but there is an option for whether to include or exclude 
01160                 mappings for dotted I and dotless i that are marked with 'I' 
01161                 in CaseFolding.txt. The result may be longer or shorter than 
01162                 the original.
01163 
01164                 Note that the return value refers to the provided destination 
01165                 UString.
01166 
01167         ***********************************************************************/
01168 
01169         final UString toFolded (UString dst, CaseOption option = CaseOption.Default)
01170         {
01171                 uint foldInto (wchar* target, uint capacity, inout Error status)       // callback handed to dst.format()
01172                 {
01173                         return u_strFoldCase (target, capacity, content, len, option, status);
01174                 }
01175 
01176                 dst.expand (len + 32);  // presumably reserves room up front, since the result may grow - TODO confirm
01177                 return dst.format (&foldInto, "toFolded() failed");
01178         }
01179 
01180         /***********************************************************************
01181 
01182                 Converts a sequence of wchar (UTF-16) to UTF-8 bytes. If
01183                 the output array is not provided, an array of appropriate
01184                 size will be allocated and returned. Where the output is 
01185                 provided, it must be large enough to hold potentially four
01186                 bytes per character for surrogate-pairs or three bytes per
01187                 character for BMP only. Consider using UConverter where
01188                 streaming conversions are required.
01189 
01190                 Returns an array slice representing the valid UTF8 content.
01191 
01192         ***********************************************************************/
01193 
01194         final char[] toUtf8 (char[] dst = null)
01195         {
01196                 uint    produced;               // UTF-8 length, filled in through the pointer passed below
01197                 Error   status;
01198 
01199                 if (cast(char*) dst is null)
01200                     dst = new char[len * 4];    // no buffer supplied: allow four bytes per code unit
01201                       
01202                 u_strToUTF8 (dst, dst.length, &produced, content, len, status);
01203                 testError (status, "failed to convert to UTF8");
01204                 return dst [0..produced];       // only the converted part of the buffer
01205         }
01206 
01207         /***********************************************************************
01208         
01209                 Remove leading and trailing whitespace from this UText.
01210                 Note that we slice the content to remove leading space.
01211 
01212         ***********************************************************************/
01213 
01214         UText trim ()
01215         {
01216                 wchar   c;
01217                 uint    i = len;
01218 
01219                 // cut off trailing white space: only len shrinks, the array is untouched
01220                 while (i && ((c = charAt(i-1)) == 0x20 || UChar.isWhiteSpace (c)))
01221                        --i;
01222                 len = i;
01223 
01224                 // now remove leading whitespace: i ends up as the count of units to drop
01225                 for (i=0; i < len && ((c = charAt(i)) == 0x20 || UChar.isWhiteSpace (c)); ++i) {}
01226                 if (i)
01227                    {
01228                    len -= i;
01229                    content = content [i..content.length];      // drop only the front; the slice must still hold len units
01230                    }
01231                   
01232                 return this;    // trimmed in place; returns this UText
01233         }
01234 
01235         /***********************************************************************
01236         
01237                 Unescape a string of characters and write the resulting
01238                 Unicode characters to the destination buffer.  The following 
01239                 escape sequences are recognized:
01240                 
01241                   \\uhhhh       4 hex digits; h in [0-9A-Fa-f]
01242                   \\Uhhhhhhhh   8 hex digits
01243                   \\xhh         1-2 hex digits
01244                   \\x{h...}     1-8 hex digits
01245                   \\ooo         1-3 octal digits; o in [0-7]
01246                   \\cX          control-X; X is masked with 0x1F
01247                  
01248                 as well as the standard ANSI C escapes:
01249                  
01250                   \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
01251                   \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
01252                   \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
01253                  
01254                 Anything else following a backslash is generically escaped.  
01255                 For example, "[a\\-z]" returns "[a-z]".
01256                  
01257                 If an escape sequence is ill-formed, this method returns an 
01258                 empty string.  An example of an ill-formed sequence is "\\u" 
01259                 followed by fewer than 4 hex digits.
01260                  
01261          ***********************************************************************/
01262 
01263         final UString unEscape () 
01264         {
01265                 UString unescaped = new UString (len);
01266                 uint    pos = 0;
01267                 while (pos < len)
01268                       {
01269                       dchar ch = charAt (pos++);
01270                       if (ch == 0x005C)
01271                          {
01272                          // backslash: ICU decodes the escape, moving pos via the pointer
01273                          ch = u_unescapeAt (&_charAt, &pos, len, cast(void*) this);
01274                          if (ch == 0xFFFFFFFF)
01275                             {
01276                             // ill-formed escape: discard the output and stop
01277                             unescaped.truncate ();
01278                             return unescaped;
01279                             }
01280                          }
01281                       unescaped.append (ch);
01282                       }
01283                 return unescaped;
01284         }
01285 
01286         /***********************************************************************
01287         
01288                 Is this code point a surrogate (U+d800..U+dfff)?
01289 
01290         ***********************************************************************/
01291 
01292         final static bool isSurrogate (wchar c)
01293         {
01294                 return c >= 0xd800 && c <= 0xdfff;      // the whole surrogate block, lead or trail
01295         }
01296 
01297         /***********************************************************************
01298         
01299                 Is this code unit a lead surrogate (U+d800..U+dbff)?
01300 
01301         ***********************************************************************/
01302 
01303         final static bool isLeading (wchar c)
01304         {
01305                 return c >= 0xd800 && c <= 0xdbff;      // first unit of a surrogate pair
01306         }
01307 
01308         /***********************************************************************
01309         
01310                 Is this code unit a trail surrogate (U+dc00..U+dfff)?
01311 
01312         ***********************************************************************/
01313 
01314         final static bool isTrailing (wchar c)
01315         {
01316                 return c >= 0xdc00 && c <= 0xdfff;      // second unit of a surrogate pair
01317         }
01318 
01319         /***********************************************************************
01320         
01321                 Adjust a random-access offset to a code point boundary 
01322                 at the start of a code point. If the offset points to 
01323                 the trail surrogate of a surrogate pair, then the offset 
01324                 is decremented. Otherwise, it is not modified.
01325 
01326         ***********************************************************************/
01327 
01328         final uint getCharStart (uint i)
01329         in {
01330                 if (i >= len)           // the offset must address an existing code unit
01331                     exception ("index out of bounds"); 
01332            }
01333         body
01334         {
01335                 if (isTrailing (content[i]) && i && isLeading (content[i-1]))   // step back only from the trail half of a pair
01336                     --i;
01337                 return i;
01338         }
01339 
01340         /***********************************************************************
01341         
01342                 Adjust a random-access offset to a code point boundary 
01343                 after a code point. If the offset is behind the lead 
01344                 surrogate of a surrogate pair, then the offset is 
01345                 incremented. Otherwise, it is not modified.
01346 
01347         ***********************************************************************/
01348 
01349         final uint getCharLimit (uint i)
01350         in {
01351                 if (i > len)            // a limit is exclusive, so i == len (end of text) is legal
01352                     exception ("index out of bounds"); 
01353            }
01354         body
01355         {
01356                 if (i && i < len && isLeading(content[i-1]) && isTrailing (content[i]))        // only when i sits between the two halves of a pair
01357                     ++i;
01358                 return i;
01359         }
01360 
01361         /***********************************************************************
01362         
01363                 Callback for C unescapeAt() function
01364 
01365         ***********************************************************************/
01366 
01367         extern (C)
01368         {
01369                 typedef wchar function (uint offset, void* context) CharAt;     // callback type taken by u_unescapeAt
01370 
01371                 private static wchar _charAt (uint offset, void* context)
01372                 {
01373                         return (cast(UText) context).charAt (offset);  // unEscape() passes 'this', a UText, as the context
01374                 }
01375         }
01376 
01377         /***********************************************************************
01378         
01379                 Pin the given index to a valid position.
01380 
01381         ***********************************************************************/
01382 
01383         final private void pinIndex (inout uint x)
01384         {
01385                 // clamp in place: anything past the end collapses to len
01386                 x = (x > len) ? len : x;
01387         }
01388 
01389         /***********************************************************************
01390         
01391                 Pin the given index and length to a valid position.
01392 
01393         ***********************************************************************/
01394 
01395         final private void pinIndices (inout uint start, inout uint length)
01396         {
01397                 uint remaining;
01398                 start = (start > len) ? len : start;    // clamp the start first
01399                 remaining = len - start;                // cannot underflow: start <= len here
01400                 if (length > remaining)
01401                     length = remaining;                 // never extend past the valid content
01402         }
01403 
01404         /***********************************************************************
01405         
01406                 Helper for comparison methods
01407 
01408         ***********************************************************************/
01409 
01410         final private int compareFolded (wchar[] s1, wchar[] s2, CaseOption option = CaseOption.Default)
01411         {
01412                 Error status;
01413 
01414                 int order = u_strCaseCompare (s1, s1.length, s2, s2.length, option, status);
01415                 testError (status, "compareFolded failed");     // checked before the result is returned
01416                 return order;
01417         }
01418 
01419 
01420         /***********************************************************************
01421         
01422                 Bind the ICU functions from a shared library. This is
01423                 complicated by the issues regarding D and DLLs on the
01424                 Windows platform
01425 
01426         ***********************************************************************/
01427                 
01428         private static void* library;   // value returned by FunctionLoader.bind() in the static constructor
01429 
01430         /***********************************************************************
01431 
01432         ***********************************************************************/
01433 
01434         private static extern (C)       // C-linkage function pointers; each one is listed in 'targets' for binding
01435         {
01436                 wchar* function (wchar*, uint, wchar*, uint) u_strFindFirst;
01437                 wchar* function (wchar*, uint, wchar*, uint) u_strFindLast;
01438                 wchar* function (wchar*, wchar, uint) u_memchr;
01439                 wchar* function (wchar*, wchar, uint) u_memrchr;
01440                 int    function (wchar*, uint, wchar*, uint, bool) u_strCompare;
01441                 int    function (wchar*, uint, wchar*, uint, uint, inout Error) u_strCaseCompare;
01442                 dchar  function (CharAt, uint*, uint, void*) u_unescapeAt;      // takes the CharAt callback plus an opaque context
01443                 uint   function (wchar*, uint) u_countChar32;
01444                 uint   function (wchar*, uint, wchar*, uint, char*, inout Error) u_strToUpper;
01445                 uint   function (wchar*, uint, wchar*, uint, char*, inout Error) u_strToLower;
01446                 uint   function (wchar*, uint, wchar*, uint, uint, inout Error) u_strFoldCase;
01447                 wchar* function (wchar*, uint, uint*, char*, uint, inout Error) u_strFromUTF8;
01448                 char*  function (char*, uint, uint*, wchar*, uint, inout Error) u_strToUTF8;
01449         }
01450 
01451         /***********************************************************************
01452                 Binding table handed to FunctionLoader.bind: the address of each function pointer paired with its name
01453         ***********************************************************************/
01454 
01455         static  FunctionLoader.Bind[] targets = 
01456                 [
01457                 {cast(void**) &u_strFindFirst,      "u_strFindFirst"},
01458                 {cast(void**) &u_strFindLast,       "u_strFindLast"},
01459                 {cast(void**) &u_memchr,            "u_memchr"},
01460                 {cast(void**) &u_memrchr,           "u_memrchr"},
01461                 {cast(void**) &u_strCompare,        "u_strCompare"},
01462                 {cast(void**) &u_strCaseCompare,    "u_strCaseCompare"},
01463                 {cast(void**) &u_unescapeAt,        "u_unescapeAt"},
01464                 {cast(void**) &u_countChar32,       "u_countChar32"},
01465                 {cast(void**) &u_strToUpper,        "u_strToUpper"},
01466                 {cast(void**) &u_strToLower,        "u_strToLower"},
01467                 {cast(void**) &u_strFoldCase,       "u_strFoldCase"},
01468                 {cast(void**) &u_strFromUTF8,       "u_strFromUTF8"},
01469                 {cast(void**) &u_strToUTF8,         "u_strToUTF8"},
01470                 ];
01471 
01472         /***********************************************************************
01473                 Static constructor: passes 'icuuc' and 'targets' to FunctionLoader.bind and stores the result in 'library'
01474         ***********************************************************************/
01475 
01476         static this ()
01477         {
01478                 library = FunctionLoader.bind (icuuc, targets);
01479                 //test ();      // disabled call to the private test() routine
01480         }
01481 
01482         /***********************************************************************
01483                 Static destructor: hands the value stored in 'library' to FunctionLoader.unbind
01484         ***********************************************************************/
01485 
01486         static ~this ()
01487         {
01488                 FunctionLoader.unbind (library);
01489         }
01490 
01491         /***********************************************************************
01492                 Exercises a handful of UString operations; the values read back are not checked
01493         ***********************************************************************/
01494 
01495         private static void test()
01496         {
01497                 UString text = new UString (r"aaaqw \uabcd eaaa");
01498                 char[] extra = "dssfsdff";
01499                 text ~ extra ~ extra;
01500                 wchar ch = text[3];             // indexed read
01501                 text[3] = 'Q';                  // indexed write
01502                 int pos = text.indexOf ("qwe");
01503                 text.unEscape ();
01504                 text.toUpper (new UString);
01505                 text.padLeading(2).padTrailing(2).trim();
01506         }
01507 }

Generated on Sat Apr 9 20:11:30 2005 for Mango by doxygen 1.3.6