Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members | Related Pages

UString.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file UString.d
00004         
00005         Copyright (c) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026         4. Derivative works are permitted, but they must carry this notice
00027            in full and credit the original source.
00028 
00029 
00030                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00031 
00032 
00033         @version        Initial version, October 2004      
00034         @author         Kris
00035 
00036         Note that this package and its documentation are built around the
00037         ICU project (http://oss.software.ibm.com/icu/). Below is the license 
00038         statement as specified by that software:
00039 
00040 
00041                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00042 
00043 
00044         ICU License - ICU 1.8.1 and later
00045 
00046         COPYRIGHT AND PERMISSION NOTICE
00047 
00048         Copyright (c) 1995-2003 International Business Machines Corporation and 
00049         others.
00050 
00051         All rights reserved.
00052 
00053         Permission is hereby granted, free of charge, to any person obtaining a
00054         copy of this software and associated documentation files (the
00055         "Software"), to deal in the Software without restriction, including
00056         without limitation the rights to use, copy, modify, merge, publish,
00057         distribute, and/or sell copies of the Software, and to permit persons
00058         to whom the Software is furnished to do so, provided that the above
00059         copyright notice(s) and this permission notice appear in all copies of
00060         the Software and that both the above copyright notice(s) and this
00061         permission notice appear in supporting documentation.
00062 
00063         THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
00064         OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00065         MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
00066         OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
00067         HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
00068         INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
00069         FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
00070         NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
00071         WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
00072 
00073         Except as contained in this notice, the name of a copyright holder
00074         shall not be used in advertising or otherwise to promote the sale, use
00075         or other dealings in this Software without prior written authorization
00076         of the copyright holder.
00077 
00078         ----------------------------------------------------------------------
00079 
00080         All trademarks and registered trademarks mentioned herein are the 
00081         property of their respective owners.
00082 
00083 *******************************************************************************/
00084 
00085 module mango.icu.UString;
00086 
00087 private import  mango.icu.ICU,
00088                 mango.icu.UChar,
00089                 mango.icu.ULocale;
00090 
00091 /*******************************************************************************
00092         Binding to the C runtime memmove, used below to shift string content in place
00093 *******************************************************************************/
00094 // NOTE(review): C declares the byte count as size_t -- confirm 'uint' suits every target
00095 private extern (C) void memmove (void* dst, void* src, uint bytes);
00096 
00097 /*******************************************************************************
00098 
00099         Bind UText and UString to the mango.io IWritable and IReadable
00100         interfaces when building with version=Mango. Otherwise ITextOther
00101         and IStringOther are empty interfaces, so the classes still compile.
00102 *******************************************************************************/
00103 
00104 version (Mango)
00105         {
00106         private import  mango.icu.UMango;
00107 
00108         private import  mango.io.model.IReader,
00109                         mango.io.model.IWriter;
00110 
00111         private interface ITextOther   : IWritable {}   // implemented by UText
00112         private interface IStringOther : IReadable {}   // implemented by UString
00113         }
00114      else
00115         {
00116         private interface ITextOther   {}               // placeholder without mango.io
00117         private interface IStringOther {}               // placeholder without mango.io
00118         }
00119           
00120 
00121 /*******************************************************************************
00122 
00123         UString is a string class that stores Unicode characters directly 
00124         and provides functionality similar to that of the Java String class.
00125 
00126         In ICU, a Unicode string consists of 16-bit Unicode code units. 
00127         A Unicode character may be stored with either one code unit — 
00128         which is the most common case — or with a matched pair of 
00129         special code units ("surrogates"). The data type for code units 
00130         is UChar.
00131 
00132         For single-character handling, a Unicode character code point is 
00133         a value in the range 0..0x10ffff. ICU uses the UChar32 type for 
00134         code points.
00135 
00136         Indexes and offsets into and lengths of strings always count code 
00137         units, not code points. This is the same as with multi-byte char* 
00138         strings in traditional string handling. Operations on partial 
00139         strings typically do not test for code point boundaries. If necessary, 
00140         the user needs to take care of such boundaries by testing for the code 
00141         unit values or by using functions like getChar32Start() 
00142         and getChar32Limit().
00143 
00144         UString methods are more lenient with regard to input parameter values 
00145         than other ICU APIs. In particular:
00146 
00147         - If indexes are out of bounds for a UString object (< 0 or > length) 
00148           then they are "pinned" to the nearest boundary.
00149 
00150         - If primitive string pointer values (e.g., const wchar* or char*) for 
00151           input strings are null, then those input string parameters are treated 
00152           as if they pointed to an empty string. However, this is not the case 
00153           for char* parameters for charset names or other IDs.
00154         
00155 *******************************************************************************/
00156 
00157 class UString : UText, IStringOther
00158 {
00159         alias opCat             append;
00160         alias opIndexAssign     setCharAt;
00161 
00162         /***********************************************************************
00163 
00164                 Construct an empty, mutable UString whose buffer is pre-sized
00165                 to 'space' code units. The logical length is left at zero.
00166         ***********************************************************************/
00167 
00168         this (uint space = 0)
00169         {
00170                 this.mutable = true;
00171                 this.content.length = space;
00172         }
00173 
00174         /***********************************************************************
00175 
00176                 Create a UString upon the provided content. With 'mutable'
00177                 true (the default) the content is duplicated immediately; 
00178                 with false the array is aliased and only copied once this 
00179                 UString is modified. Pass false for read-only content to
00180                 avoid the up-front heap allocation.
00181         ***********************************************************************/
00182 
00183         this (wchar[] content, bool mutable = true)
00184         {
00185                 setTo (content, mutable);       // dup or alias, and set len
00186         }
00187 
00188         /***********************************************************************
00189 
00190                 Create a UString via the valid content of a UText. Note that
00191                 the default is to assume the content is immutable (read-only),
00192                 so by default the text is aliased rather than copied.
00193         ***********************************************************************/
00194 
00195         this (UText other, bool mutable = false)
00196         {
00197                 this (other.get, mutable);      // other.get is content[0..len]
00198         }
00199 
00200         /***********************************************************************
00201 
00202                 Create a UString via the valid content of another UString. 
00203                 By default the content is duplicated. If said content is
00204                 immutable (read-only) consider setting 'mutable' to false:
00205                 the content is then aliased, and heap-space is allocated 
00206                 only when it is modified via UString methods.
00207 
00208         ***********************************************************************/
00209 
00210         this (UString other, bool mutable = true)
00211         {
00212                 this (other.get, mutable);      // other.get is content[0..len]
00213         }
00214 
00215         /***********************************************************************
00216         
00217                 Support for reading content via the IO system
00218 
00219         ***********************************************************************/
00220 
00221         version (Mango)
00222         {
00223                 /***************************************************************
00224 
00225                         Internal adapter to handle loading and conversion
00226                         of UString content. Once constructed, this may be 
00227                         used as the target for an IReader. Alternatively, 
00228                         invoke the load() method with an IBuffer of choice.
00229                 ***************************************************************/
00230 
00231                 class UStringDecoder : StringDecoder16
00232                 {
00233                         private UString s;      // the string being populated
00234 
00235                         // construct a decoder on the given UString
00236                         this (UConverter c, uint bytes, UString s)
00237                         {
00238                                 super (c, bytes);
00239                                 this.s = s;
00240                         }
00241 
00242                         // IReadable adapter to perform the conversion
00243                         protected void read (IReader r)
00244                         {
00245                                 load (r.getBuffer);
00246                         }
00247 
00248                         // read from the provided buffer until we either have
00249                         // all the content, or an eof condition throws an 
00250                         // exception. On return s.len is the number of wchars produced.
00251                         package void load (IBuffer b)
00252                         {
00253                                 uint produced = super.read (b, s.content);
00254                                 while (toGo)    // inherited; presumably the input still to convert -- TODO confirm
00255                                       {
00256                                       s.len = produced;   // expand() measures free space from len; a stale len could leave no room
00257                                       s.expand (toGo);
00258                                       produced += super.read (b, s.content[produced..length]);
00259                                       }
00260                                 s.len = produced;
00261                         }
00262                 }
00263 
00264                 /***************************************************************
00265 
00266                         Construct a UString by decoding 'contentLength' bytes 
00267                         of 'buffer' through 'cvt'. The internal buffer is
00268                         pre-sized to contentLength code units before loading.
00269                 ***************************************************************/
00270 
00271                 this (IBuffer buffer, uint contentLength, UConverter cvt)
00272                 {
00273                         this (contentLength);
00274 
00275                         (new UStringDecoder (cvt, contentLength, this)).load (buffer);
00276                 }
00277 
00278                 /***************************************************************
00279 
00280                         Read as many bytes from the input as is necessary 
00281                         to produce the expected number of wchar elements. 
00282                         This uses the default wchar handler, which can be 
00283                         altered by binding a StringDecoder to the IReader 
00284                         in use (see UMango for details).
00285 
00286                         A UString is mutable, so we must not scribble on the
00287                         IO buffers. Changing the length of a D array accounts 
00288                         for slice assignments (it checks whether the pointer 
00289                         is a starting point in the pool), but that does not 
00290                         catch a slice starting at offset 0 -- which is where 
00291                         IBuffer slices may come from. 
00292 
00293                         To be safe, we ask the allocator in use whether the
00294                         content it provided can be mutated or not, and record
00295                         the answer in 'mutable' so that later modifications 
00296                         copy the content first when required. This is not 
00297                         necessary for UText, since that is read-only.
00298 
00299                 ***************************************************************/
00300 
00301                 void read (IReader r)
00302                 {
00303                         r.get (this.content);
00304                         this.mutable = r.getAllocator.isMutable (this.content);
00305                         this.len = this.content.length;
00306                 }
00307 
00308                 /***************************************************************
00309 
00310                         Return a streaming decoder that can be used to 
00311                         populate this UString with a specified number of 
00312                         input bytes. 
00313 
00314                         This differs from the above read() method in the
00315                         way content is read: in the above case, exactly
00316                         the specified number of wchar elements will be
00317                         converted from the input, whereas in this case 
00318                         a variable number of wchar elements are converted
00319                         until 'bytes' have been read from the input. This 
00320                         is useful in those cases where the original number 
00321                         of elements has been lost, and only the resultant 
00322                         converted byte-count remains (a la HTTP).
00323 
00324                         The returned StringDecoder is one-shot only. You may
00325                         reuse it (both the converter and the byte count) via
00326                         its reset() method. 
00327 
00328                         One applies the resultant decoder directly with an 
00329                         IReader like so:
00330 
00331                         @code
00332                         UString s = ...;
00333                         IReader r = ...;
00334 
00335                         // r >> s.createDecoder(cvt, bytes);
00336                         r.get (s.createDecoder(cvt, bytes));
00337                         @endcode
00338 
00339                         which will read the specified number of bytes from
00340                         the input and convert them to an appropriate number
00341                         of wchars within the UString. 
00342                 ***************************************************************/
00343 
00344                 StringDecoder createDecoder (UConverter c, uint bytes)
00345                 {
00346                         UStringDecoder decoder = new UStringDecoder (c, bytes, this);
00347                         return decoder;
00348                 }
00349         }
00350 
00351         /***********************************************************************
00352 
00353                 Append the valid content of another UText (or UString) to
00354                 this UString, returning this UString to permit chaining.
00355         ***********************************************************************/
00356         UString opCat (UText other)
00357         {
00358                 wchar[] text = other.get ();
00359                 return opCat (text);
00360         }
00361 
00362         /***********************************************************************
00363 
00364                 Append part of another text to this UString: 'len' code units
00365                 beginning at 'start', after both are pinned by the other text.
00366         ***********************************************************************/
00367         UString opCat (UText other, uint start, uint len=uint.max)
00368         {
00369                 other.pinIndices (start, len);
00370                 wchar[] piece = other.content [start .. start + len];
00371                 return opCat (piece);
00372         }
00373 
00374         /***********************************************************************
00375 
00376                 Append one UTF-16 code unit to this UString. The parameter is 
00377                 handed on as a one-element run via its address.
00378         ***********************************************************************/
00379         UString opCat (wchar chr)
00380         {
00381                 wchar* single = &chr;
00382                 return opCat (single, 1);
00383         }
00384 
00385         /***********************************************************************
00386 
00387                 Append an array of UTF-16 code units to this UString, and
00388                 return this UString.
00389         ***********************************************************************/
00390         UString opCat (wchar[] chars)
00391         {
00392                 uint count = chars.length;
00393                 return opCat (chars, count);
00394         }
00395 
00396         /***********************************************************************
00397 
00398                 Append UTF-8 text, converting it to UChars (UTF-16) by way of
00399                 u_strFromUTF8. Space for one code unit per input byte is 
00400                 reserved up front; format() raises an exception on failure.
00401         ***********************************************************************/
00402         UString opCat (char[] chars)
00403         {
00404                 // Formatter callback: convert into dst and report the unit count
00405                 uint utf8ToUtf16 (wchar* dst, uint capacity, inout Error status)
00406                 {
00407                         uint units;
00408                         u_strFromUTF8 (dst, capacity, &units, chars, chars.length, status);
00409                         return units;
00410                 }
00411 
00412                 expand (chars.length);
00413                 return format (&utf8ToUtf16, "failed to append UTF char[]");
00414         }
00415 
00416         /***********************************************************************
00417 
00418                 Fill 'len' code units from 'start' with the given character.
00419                 Indices are pinned first; aliased content is copied beforehand.
00420         ***********************************************************************/
00421         UString setTo (wchar chr, uint start=0, uint len=uint.max)
00422         {
00423                 pinIndices (start, len);
00424                 if (!mutable)
00425                     realloc ();
00426                 uint end = start + len;
00427                 content [start .. end] = chr;
00428                 return this;
00429         }
00430 
00431         /***********************************************************************
00432 
00433                 Replace the content with the provided array. With 'mutable'
00434                 true the array is duplicated, so this UString owns a private
00435                 copy; with false the array is aliased until such time this
00436                 UString is altered.
00437         ***********************************************************************/
00438 
00439         UString setTo (wchar[] chars, bool mutable = true)
00440         {
00441                 this.mutable = mutable;
00442                 if (mutable)
00443                     this.content = chars.dup;
00444                 else
00445                    this.content = chars;
00446                 this.len = chars.length;
00447                 return this;
00448         }
00449 
00450         /***********************************************************************
00451 
00452                 Replace the content of this UString with the valid content
00453                 of another text. When that content is read-only, passing
00454                 false for 'mutable' aliases it instead, deferring any heap 
00455                 allocation until this UString is next modified.
00456 
00457         ***********************************************************************/
00458         UString setTo (UText other, bool mutable = true)
00459         {
00460                 wchar[] text = other.get ();
00461 
00462                 return setTo (text, mutable);
00463         }
00464 
00465         /***********************************************************************
00466 
00467                 Replace the content of this UString with part of another 
00468                 text: 'len' code units beginning at 'start', both pinned by 
00469                 the other text. When that content is read-only, pass false 
00470                 for 'mutable' to alias the slice rather than copy it.
00471 
00472         ***********************************************************************/
00473 
00474         UString setTo (UText other, uint start, uint len, bool mutable = true)
00475         {
00476                 other.pinIndices (start, len);
00477                 wchar[] piece = other.content [start .. start + len];
00478                 return setTo (piece, mutable);
00479         }
00480 
00481         /***********************************************************************
00482 
00483                 Replace the character at the specified location, which must
00484                 lie within the valid content (index < len). Aliased content
00485                 is copied before the write. Also available as setCharAt().
00486         ***********************************************************************/
00487         final UString opIndexAssign (wchar chr, uint index)
00488         in {    // precondition: only checked when contracts are compiled in
00489                 if (index >= len)
00490                     exception ("index out of bounds");
00491            }
00492         body
00493         {
00494                 if (! mutable)
00495                       realloc ();
00496                 content [index] = chr;
00497                 return this;
00498         }
00499 
00500         /***********************************************************************
00501                 Remove 'length' code units beginning at 'start' (by default
00502                 everything from 'start' onward), closing the gap by moving 
00503                 the tail down. Indices are pinned; aliased content is copied.
00504         ***********************************************************************/
00505         UString remove (uint start, uint length=uint.max)
00506         {
00507                 pinIndices (start, length);
00508                 if (length)
00509                     if (start >= len)
00510                         truncate (start);
00511                     else
00512                        {
00513                        if (! mutable)
00514                              realloc ();
00515                        uint i = start + length;
00516                        // no tail to move when removing through the end; &content[i] would then index past a full buffer
00517                        if (i < len)
00518                            memmove (&content[start], &content[i], (len-i) * wchar.sizeof);
00519                        len -= length;
00520                        }
00521                 return this;
00522         }
00523 
00524         /***********************************************************************
00525 
00526                 Shorten this UString to 'length' code units (default: empty).
00527                 A length beyond the current one is ignored; the buffer is kept.
00528         ***********************************************************************/
00529         UString truncate (uint length=0)
00530         {
00531                 if (length > len)
00532                     return this;
00533                 len = length;
00534                 return this;
00535         }
00536 
00537         /***********************************************************************
00538                 Prefix this UString with 'count' copies of padChar (default: a space)
00539         ***********************************************************************/
00540         UString padLeading (uint count, wchar padChar = 0x0020)
00541         {
00542                 if (! mutable)
00543                       setTo (get, true);        // take a private copy: never shift aliased content in place
00544                 expand  (count);
00545                 if (len)                        // nothing to shift otherwise, and &content[count] may be out of bounds
00546                     memmove (&content[count], &content[0], len * wchar.sizeof);
00547                 len += count;
00548                 return setTo (padChar, 0, count);
00549         }
00550 
00551         /***********************************************************************
00552 
00553                 Append 'length' copies of padChar (default: a space) to the end
00554                 of this UString, growing the buffer first when necessary.
00555         ***********************************************************************/
00556         UString padTrailing (uint length, wchar padChar = 0x0020)
00557         {
00558                 uint tail = len;                // where the padding begins
00559                 expand (length);
00560                 len = tail + length;
00561                 return setTo (padChar, tail, length);
00562         }
00563 
00564         /***********************************************************************
00565 
00566                 Make room for 'count' further code units beyond the current 
00567                 length, growing the buffer as necessary. Aliased (immutable)
00568                 content is always replaced by a private copy, since callers
00569                 write into the buffer straight after calling this.
00570         ***********************************************************************/
00571         package final void expand (uint count)
00572         {
00573                 if (! mutable || (len + count) > content.length)
00574                       realloc (count);
00575         }
00576 
00577         /***********************************************************************
00578 
00579                 Allocate memory due to a change in the content. A mutable
00580                 buffer is resized; aliased content is replaced by a fresh 
00581                 buffer holding a copy of the valid portion, and this UString 
00582                 becomes mutable. Sizes are rounded up to a multiple of 64.
00583         ***********************************************************************/
00584         private final void realloc (uint count = 0)
00585         {
00586                 uint size = (content.length + count + 63) & ~63;
00587 
00588                 if (mutable)
00589                     content.length = size;
00590                 else
00591                    {
00592                    mutable = true;
00593                    wchar[] x = content;
00594                    content = new wchar [size];
00595                    if (len)
00596                        content[0..len] = x[0..len];     // copy lengths must match; x may be longer than len
00597                    }
00598         }
00599 
00600         /***********************************************************************
00601 
00602                 Internal workhorse behind the public append methods: copy 'count'
00603                 code units from 'chars' onto the end of the content, making room first.
00604         ***********************************************************************/
00605         private final UString opCat (wchar* chars, uint count)
00606         {
00607                 expand (count);
00608                 uint end = len + count;
00609                 content [len .. end] = chars [0 .. count];
00610                 len = end;
00611                 return this;
00612         }
00613 
00614         /***********************************************************************
00615                 Internal method to support formatting into this UString; used 
00616                 by many of the ICU wrappers to append content. The delegate is 
00617                 handed the free space after the current content and returns a
00618                 code-unit count; 'msg' is the exception text used on failure.
00619         ***********************************************************************/
00620 
00621         typedef uint delegate (wchar* dst, uint len, inout Error e) Formatter;
00622 
00623         package final UString format (Formatter format, char[] msg)
00624         {
00625                 Error   e;
00626                 uint    length;
00627 
00628                 while (true)
00629                       {
00630                       e = e.OK;
00631                       // a full (or empty) buffer has no element at [len]: pass null with zero capacity instead
00632                       wchar* dst = (len < content.length) ? &content[len] : null;
00633                       length = format (dst, content.length - len, e);
00634                       if (e == e.BufferOverflow)
00635                           expand (length);      // retry with room for the reported count
00636                       else
00637                          break;
00638                       } 
00639 
00640                 if (isError (e))
00641                     exception (msg);
00642                 len += length;
00643                 return this;
00644         }
00645 }
00646 
00647 
00648 /*******************************************************************************
00649 
00650         Immutable (read-only) text -- use UString for mutable strings.
00651 
00652 *******************************************************************************/
00653 
00654 class UText : ICU, ITextOther
00655 {
00656         alias opIndex   charAt;
00657 
00658         // the core of the UText and UString attributes. The name 'len'
00659         // is used rather than the more obvious 'length' since there is
00660         // a collision with the silly array[length] syntactic sugar ...
00661         package uint    len;
00662         package wchar[] content;
00663 
00664         // this should probably be in UString only, but there seems to 
00665         // be a compiler bug where it doesn't get initialised correctly,
00666         // and it's perhaps useful to have here for when a UString is
00667         // passed as a UText argument.
00668         private bool    mutable;
00669 
00670         // toFolded() argument
00671         public enum     CaseOption 
00672                         {
00673                         Default  = 0, 
00674                         SpecialI = 1
00675                         }
00676 
00677         /***********************************************************************
00678 
00679                 Hidden constructor: leaves 'content' and 'len' at their default
00680                 values. Being private, it is reachable only within this module.
00681         ***********************************************************************/
00682 
00683         private this ()
00684         {
00685         }
00686 
00687         /***********************************************************************
00688 
00689                 Wrap the given content as read-only text. The array is
00690                 aliased rather than copied, and all of it is considered valid.
00691         ***********************************************************************/
00692 
00693         this (wchar[] content)
00694         {
00695                 len = content.length;
00696                 this.content = content;
00697         }
00698 
00699         /***********************************************************************
00700 
00701                 Support for writing via the Mango IO subsystem: hands the valid
00702                 content (not the spare capacity) to the given IWriter.
00703         ***********************************************************************/
00704         version (Mango)
00705         {
00706                 void write (IWriter w)
00707                 {
00708                         wchar[] text = get ();
00709                         w.opShlw (text);
00710                 }
00711         }
00712 
00713         /***********************************************************************
00714         
00715                 Return the valid content from this UText
00716 
00717         ***********************************************************************/
00718 
00719         final package wchar[] get ()    // package-level accessor for the valid content
00720         {
00721                 return content [0..len];        // a slice of the buffer (not a copy); excludes anything stored past len
00722         }
00723 
00724         /***********************************************************************
00725         
00726                 Is this UText equal to another?
00727 
00728         ***********************************************************************/
00729 
00730         final override int opEquals (Object o)
00731         {
00732                 UText other = cast(UText) o;
00733 
00734                 if (other is null)              // o was null, or is not a UText
00735                     return 0;
00736                 return (other is this || compare (other) == 0);
00737         }
00738 
00739         /***********************************************************************
00740         
00741                 Compare this UText to another.
00742 
00743         ***********************************************************************/
00744 
00745         final override int opCmp (Object o)
00746         {
00747                 UText other = cast(UText) o;
00748 
00749                 if (other is null)              // null or not a UText: report "greater"
00750                     return 1;
00751                 if (other is this)              // same instance, no comparison needed
00752                     return 0;
00753 
00754                 return compare (other);
00755         }
00756 
00757         /***********************************************************************
00758         
00759                 Hash this UText
00760 
00761         ***********************************************************************/
00762 
00763         final override uint toHash ()
00764         {
00765                 return typeid(wchar[]).getHash (&content[0..len]);     // hashes only the valid slice, using the wchar[] TypeInfo
00766         }
00767 
00768         /***********************************************************************
00769         
00770                 Clone this UText into a UString
00771 
00772         ***********************************************************************/
00773 
00774         final UString copy ()           // returns a new UString built from this text's valid content
00775         {
00776                 return new UString (get);       // pass content[0..len] only: the buffer may extend past len
00777         }
00778 
00779         /***********************************************************************
00780         
00781                 Clone a section of this UText into a UString
00782 
00783         ***********************************************************************/
00784 
00785         final UString extract (uint start, uint len=uint.max)  // note: parameter 'len' shadows the member of the same name
00786         {
00787                 pinIndices (start, len);        // clamps both values against the member length
00788                 return new UString (content[start..start+len]);
00789         }
00790 
00791         /***********************************************************************
00792         
00793                 Count unicode code points in the length UChar code units of 
00794                 the string. A code point may occupy either one or two UChar 
00795                 code units. Counting code points involves reading all code 
00796                 units.
00797 
00798         ***********************************************************************/
00799 
00800         final uint codePoints (uint start=0, uint length=uint.max)     // returns u_countChar32 over the pinned range
00801         {
00802                 pinIndices (start, length);     // start may end up equal to len, with length 0
00803                 return u_countChar32 ((cast(wchar*) content) + start, length); // pointer arithmetic: indexing content[start] at the very end trips the array bounds check
00804         }
00805 
00806         /***********************************************************************
00807         
00808                 Return an indication whether or not there are surrogate pairs
00809                 within the string.
00810 
00811         ***********************************************************************/
00812 
00813         final bool hasSurrogates (uint start=0, uint length=uint.max)
00814         {
00815                 pinIndices (start, length);     // pin here so 'length' below is the actual code unit count
00816                 return codePoints (start, length) != length;   // true when code point count differs from code unit count
00817         }
00818 
00819         /***********************************************************************
00820         
00821                 Return the character at the specified position.
00822 
00823         ***********************************************************************/
00824 
00825         final wchar opIndex (uint index)
00826         in {
00827                 if (index >= len)       // checked against the valid length, not the buffer size
00828                     exception ("index of out bounds"); 
00829            }
00830         body
00831         {
00832                 return content [index];
00833         }
00834 
00835         /***********************************************************************
00836         
00837                 Return the length of the valid content
00838 
00839         ***********************************************************************/
00840 
00841         final uint length ()
00842         {
00843                 return len;     // the valid length, i.e. the upper bound of the slice returned by get()
00844         }
00845 
00846         /***********************************************************************
00847         
00848                 The comparison can be done in code unit order or in code 
00849                 point order. They differ only in UTF-16 when comparing 
00850                 supplementary code points (U+10000..U+10ffff) to BMP code 
00851                 points near the end of the BMP (i.e., U+e000..U+ffff). 
00852 
00853                 In code unit order, high BMP code points sort after 
00854                 supplementary code points because they are stored as 
00855                 pairs of surrogates which are at U+d800..U+dfff.
00856 
00857         ***********************************************************************/
00858 
00859         final int compare (UText other, bool codePointOrder=false)
00860         {
00861                 return compare (other.get, codePointOrder);     // forwards the other text's valid slice to the wchar[] overload
00862         }
00863 
00864         /***********************************************************************
00865         
00866                 The comparison can be done in code unit order or in code 
00867                 point order. They differ only in UTF-16 when comparing 
00868                 supplementary code points (U+10000..U+10ffff) to BMP code 
00869                 points near the end of the BMP (i.e., U+e000..U+ffff). 
00870 
00871                 In code unit order, high BMP code points sort after 
00872                 supplementary code points because they are stored as 
00873                 pairs of surrogates which are at U+d800..U+dfff.
00874 
00875         ***********************************************************************/
00876 
00877         final int compare (wchar[] other, bool codePointOrder=false)
00878         {
00879                 return u_strCompare (content, len, other, other.length, codePointOrder);        // result of ICU u_strCompare is returned unchanged
00880         }
00881 
00882         /***********************************************************************
00883         
00884                 The comparison can be done in UTF-16 code unit order or 
00885                 in code point order. They differ only when comparing 
00886                 supplementary code points (U+10000..U+10ffff) to BMP code 
00887                 points near the end of the BMP (i.e., U+e000..U+ffff). 
00888 
00889                 In code unit order, high BMP code points sort after 
00890                 supplementary code points because they are stored as
00891                 pairs of surrogates which are at U+d800..U+dfff.
00892 
00893         ***********************************************************************/
00894 
00895         final int compareFolded (UText other, CaseOption option = CaseOption.Default)  // case-folded comparison against another UText
00896         {
00897                 return compareFolded (other.get, option);       // valid slice only, as compare (UText) does; other.content may extend past other's length
00898         }
00899 
00900         /***********************************************************************
00901         
00902                 The comparison can be done in UTF-16 code unit order or 
00903                 in code point order. They differ only when comparing 
00904                 supplementary code points (U+10000..U+10ffff) to BMP code 
00905                 points near the end of the BMP (i.e., U+e000..U+ffff). 
00906 
00907                 In code unit order, high BMP code points sort after 
00908                 supplementary code points because they are stored as
00909                 pairs of surrogates which are at U+d800..U+dfff.
00910 
00911         ***********************************************************************/
00912 
00913         final int compareFolded (wchar[] other, CaseOption option = CaseOption.Default)
00914         {
00915                 return compareFolded (get, other, option);      // this text's valid slice vs. 'other', via the private three-argument helper
00916         }
00917 
00918         /***********************************************************************
00919         
00920                 Does this UText start with specified string?
00921 
00922         ***********************************************************************/
00923 
00924         final bool startsWith (UText other)
00925         {
00926                 return startsWith (other.get);  // delegates to the wchar[] overload with the other text's valid slice
00927         }
00928 
00929         /***********************************************************************
00930         
00931                 Does this UText start with specified string?
00932 
00933         ***********************************************************************/
00934 
00935         final bool startsWith (wchar[] chars)
00936         {
00937                 if (chars.length > len)         // candidate prefix is longer than the text
00938                     return false;
00939                 return compareFolded (content[0..chars.length], chars) == 0;    // note: the match goes through compareFolded (u_strCaseCompare)
00940         }
00941 
00942         /***********************************************************************
00943         
00944                 Does this UText end with specified string?
00945 
00946         ***********************************************************************/
00947 
00948         final bool endsWith (UText other)
00949         {
00950                 return endsWith (other.get);    // delegates to the wchar[] overload with the other text's valid slice
00951         }
00952 
00953         /***********************************************************************
00954         
00955                 Does this UText end with specified string?
00956 
00957         ***********************************************************************/
00958 
00959         final bool endsWith (wchar[] chars)
00960         {
00961                 if (chars.length > len)         // candidate suffix is longer than the text
00962                     return false;
00963                 return compareFolded (content[len-chars.length..len], chars) == 0;      // note: the match goes through compareFolded (u_strCaseCompare)
00964         }
00965 
00966         /***********************************************************************
00967         
00968                 Find the first occurrence of a BMP code point in a string.
00969                 A surrogate code point is found only if its match in the 
00970                 text is not part of a surrogate pair.
00971 
00972         ***********************************************************************/
00973 
00974         final uint indexOf (wchar c, uint start=0)     // returns the offset of the match, or uint.max when there is none
00975         {
00976                 pinIndex (start);               // start may end up equal to len
00977                 wchar* s = u_memchr ((cast(wchar*) content) + start, c, len-start);     // pointer arithmetic: indexing content[start] at the very end trips the array bounds check
00978                 if (s)
00979                     return s - cast(wchar*) content;    // offset is measured from the start of the buffer
00980                 return uint.max;
00981         }
00982 
00983         /***********************************************************************
00984         
00985                 Find the first occurrence of a substring in a string. 
00986 
00987                 The substring is found at code point boundaries. That means 
00988                 that if the substring begins with a trail surrogate or ends 
00989                 with a lead surrogate, then it is found only if these 
00990                 surrogates stand alone in the text. Otherwise, the substring 
00991                 edge units would be matched against halves of surrogate pairs.
00992 
00993         ***********************************************************************/
00994 
00995         final uint indexOf (UText other, uint start=0)
00996         {
00997                 return indexOf (other.get, start);      // delegates to the wchar[] overload with the other text's valid slice
00998         }
00999 
01000         /***********************************************************************
01001         
01002                 Find the first occurrence of a substring in a string. 
01003 
01004                 The substring is found at code point boundaries. That means 
01005                 that if the substring begins with a trail surrogate or ends 
01006                 with a lead surrogate, then it is found only if these 
01007                 surrogates stand alone in the text. Otherwise, the substring 
01008                 edge units would be matched against halves of surrogate pairs.
01009 
01010         ***********************************************************************/
01011 
01012         final uint indexOf (wchar[] chars, uint start=0)       // returns the offset of the match, or uint.max when there is none
01013         {
01014                 pinIndex (start);               // start may end up equal to len
01015                 wchar* s = u_strFindFirst ((cast(wchar*) content) + start, len-start, chars, chars.length);    // pointer arithmetic: indexing content[start] at the very end trips the array bounds check
01016                 if (s)
01017                     return s - cast(wchar*) content;    // offset is measured from the start of the buffer
01018                 return uint.max;
01019         }
01020 
01021         /***********************************************************************
01022         
01023                 Find the last occurrence of a BMP code point in a string.
01024                 A surrogate code point is found only if its match in the 
01025                 text is not part of a surrogate pair.
01026 
01027         ***********************************************************************/
01028 
01029         final uint lastIndexOf (wchar c, uint start=uint.max)
01030         {
01031                 pinIndex (start);
01032                 wchar* hit = u_memrchr (content, c, start);     // 'start' is passed as the count of units to search
01033                 if (hit is null)
01034                     return uint.max;                            // no match
01035                 return hit - cast(wchar*) content;
01036         }
01037 
01038         /***********************************************************************
01039         
01040                 Find the last occurrence of a substring in a string.
01041                 The substring is found at code point boundaries; see
01042                 lastIndexOf (wchar[], uint) for the details.
01043 
01044         ***********************************************************************/
01045 
01046         final uint lastIndexOf (UText other, uint start=uint.max)
01047         {
01048                 return lastIndexOf (other.get, start);  // delegates to the wchar[] overload with the other text's valid slice
01049         }
01050 
01051         /***********************************************************************
01052         
01053                 Find the last occurrence of a substring in a string. 
01054 
01055                 The substring is found at code point boundaries. That means 
01056                 that if the substring begins with a trail surrogate or ends 
01057                 with a lead surrogate, then it is found only if these 
01058                 surrogates stand alone in the text. Otherwise, the substring 
01059                 edge units would be matched against halves of surrogate pairs.
01060 
01061         ***********************************************************************/
01062 
01063         final uint lastIndexOf (wchar[] chars, uint start=uint.max)
01064         {
01065                 pinIndex (start);
01066                 wchar* hit = u_strFindLast (content, start, chars, chars.length);       // 'start' is passed as the length of text to search
01067                 if (hit is null)
01068                     return uint.max;                            // no match
01069                 return hit - cast(wchar*) content;
01070         }
01071 
01072         /***********************************************************************
01073 
01074                 Lowercase the characters into a separate UString.
01075 
01076                 Casing is locale-dependent and context-sensitive. The 
01077                 result may be longer or shorter than the original. 
01078         
01079                 Note that the return value refers to the provided destination 
01080                 UString.
01081 
01082         ***********************************************************************/
01083 
01084         final UString toLower (UString dst)
01085         {
01086                return toLower (dst, ULocale.Default);   // same operation using ULocale.Default
01087         }
01088 
01089         /***********************************************************************
01090 
01091                 Lowercase the characters into a separate UString.
01092 
01093                 Casing is locale-dependent and context-sensitive. The 
01094                 result may be longer or shorter than the original.
01095         
01096                 Note that the return value refers to the provided destination 
01097                 UString.
01098 
01099         ***********************************************************************/
01100 
01101         final UString toLower (UString dst, inout ULocale locale)
01102         {
01103                 uint lower (wchar* dst, uint length, inout Error e)     // nested callback; its 'dst' shadows the outer UString
01104                 {
01105                         return u_strToLower (dst, length, content, len, toString(locale.name), e);
01106                 }
01107 
01108                 dst.expand (len + 32);          // NOTE(review): presumably reserves room for a result somewhat longer than the input -- confirm in UString.expand
01109                 return dst.format (&lower, "toLower() failed");        // NOTE(review): presumably invokes 'lower' on dst's buffer and reports errors with this message -- confirm in UString.format
01110         }
01111 
01112         /***********************************************************************
01113 
01114                 Uppercase the characters into a separate UString.
01115 
01116                 Casing is locale-dependent and context-sensitive. The 
01117                 result may be longer or shorter than the original.
01118 
01119                 Note that the return value refers to the provided destination 
01120                 UString.
01121 
01122         ***********************************************************************/
01123 
01124         final UString toUpper (UString dst)
01125         {
01126                return toUpper (dst, ULocale.Default);   // same operation using ULocale.Default
01127         }
01128 
01129         /***********************************************************************
01130 
01131                 Uppercase the characters into a separate UString.
01132 
01133                 Casing is locale-dependent and context-sensitive. The 
01134                 result may be longer or shorter than the original.
01135 
01136                 Note that the return value refers to the provided destination 
01137                 UString.
01138 
01139         ***********************************************************************/
01140 
01141         final UString toUpper (UString dst, inout ULocale locale)
01142         {
01143                 uint upper (wchar* dst, uint length, inout Error e)     // nested callback; its 'dst' shadows the outer UString
01144                 {
01145                         return u_strToUpper (dst, length, content, len, toString(locale.name), e);
01146                 }
01147 
01148                 dst.expand (len + 32);          // NOTE(review): presumably reserves room for a result somewhat longer than the input -- confirm in UString.expand
01149                 return dst.format (&upper, "toUpper() failed");        // NOTE(review): presumably invokes 'upper' on dst's buffer and reports errors with this message -- confirm in UString.format
01150         }
01151 
01152         /***********************************************************************
01153         
01154                 Case-fold the characters into a separate UString.
01155 
01156                 Case-folding is locale-independent and not context-sensitive,
01157                 but there is an option for whether to include or exclude 
01158                 mappings for dotted I and dotless i that are marked with 'I' 
01159                 in CaseFolding.txt. The result may be longer or shorter than 
01160                 the original.
01161 
01162                 Note that the return value refers to the provided destination 
01163                 UString.
01164 
01165         ***********************************************************************/
01166 
01167         final UString toFolded (UString dst, CaseOption option = CaseOption.Default)
01168         {
01169                 uint fold (wchar* dst, uint length, inout Error e)      // nested callback; its 'dst' shadows the outer UString
01170                 {
01171                         return u_strFoldCase (dst, length, content, len, option, e);   // 'option' is passed straight through as the ICU options value
01172                 }
01173 
01174                 dst.expand (len + 32);          // NOTE(review): presumably reserves room for a result somewhat longer than the input -- confirm in UString.expand
01175                 return dst.format (&fold, "toFolded() failed");
01176         }
01177 
01178         /***********************************************************************
01179 
01180                 Converts a sequence of wchar (UTF-16) to UTF-8 bytes. If
01181                 the output array is not provided, an array of appropriate
01182                 size will be allocated and returned. Where the output is 
01183                 provided, it must be large enough to hold potentially four
01184                 bytes per character for surrogate-pairs or three bytes per
01185                 character for BMP only. Consider using UConverter where
01186                 streaming conversions are required.
01187 
01188                 Returns an array slice representing the valid UTF8 content.
01189 
01190         ***********************************************************************/
01191 
01192         final char[] toUtf8 (char[] dst = null)
01193         {
01194                 uint    x;      // passed by address to u_strToUTF8; used below as the end of the returned slice
01195                 Error   e;
01196 
01197                 if (! cast(char*) dst)          // no buffer supplied (null data pointer)
01198                       dst = new char[len * 4];  // allocate four bytes per code unit
01199                       
01200                 u_strToUTF8 (dst, dst.length, &x, content, len, e);
01201                 testError (e, "failed to convert to UTF8");
01202                 return dst [0..x];
01203         }
01204 
01205         /***********************************************************************
01206         
01207                 Remove leading and trailing whitespace from this UText.
01208                 Note that we slice the content to remove leading space.
01209 
01210         ***********************************************************************/
01211 
01212         UText trim ()           // modifies this instance in place (len, and content when leading space is found) and returns it
01213         {
01214                 wchar   c;
01215                 uint    i = len;
01216 
01217                 // cut off trailing white space
01218                 while (i && ((c = charAt(i-1)) == 0x20 || UChar.isWhiteSpace (c)))
01219                        --i;
01220                 len = i;
01221 
01222                 // now remove leading whitespace
01223                 for (i=0; i < len && ((c = charAt(i)) == 0x20 || UChar.isWhiteSpace (c)); ++i) {}
01224                 if (i)
01225                    {
01226                    len -= i;
01227                    content = content[i..content.length];       // drop only the i leading units; also cutting i units off the end could leave the slice shorter than len
01228                    }
01229                   
01230                 return this;
01231         }
01232 
01233         /***********************************************************************
01234         
01235                 Unescape a string of characters and write the resulting
01236                 Unicode characters to the destination buffer.  The following 
01237                 escape sequences are recognized:
01238                 
01239                   \\uhhhh       4 hex digits; h in [0-9A-Fa-f]
01240                   \\Uhhhhhhhh   8 hex digits
01241                   \\xhh         1-2 hex digits
01242                   \\x{h...}     1-8 hex digits
01243                   \\ooo         1-3 octal digits; o in [0-7]
01244                   \\cX          control-X; X is masked with 0x1F
01245                  
01246                 as well as the standard ANSI C escapes:
01247                  
01248                   \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
01249                   \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
01250                   \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
01251                  
01252                 Anything else following a backslash is generically escaped.  
01253                 For example, "[a\\-z]" returns "[a-z]".
01254                  
01255                 If an escape sequence is ill-formed, this method returns an 
01256                 empty string.  An example of an ill-formed sequence is "\\u" 
01257                 followed by fewer than 4 hex digits.
01258                  
01259          ***********************************************************************/
01260 
01261         final UString unEscape () 
01262         {
01263                 UString result = new UString (len);
01264                 for (uint i=0; i < len;)        // i is advanced inside the body, not in the loop header
01265                     {
01266                     dchar c = charAt(i++);
01267                     if (c == 0x005C)            // 0x5C is the backslash that introduces an escape
01268                        {
01269                        // bump index ...
01270                        c = u_unescapeAt (&_charAt, &i, len, cast(void*) this);  // reads code units through the _charAt callback; i is passed by address so the call can move it
01271 
01272                        // error?
01273                        if (c == 0xFFFFFFFF) 
01274                           {
01275                           result.truncate ();   // return empty string
01276                           break;                // invalid escape sequence
01277                           }
01278                        }
01279                     result.append (c);
01280                     }
01281                 return result;
01282         }
01283 
01284         /***********************************************************************
01285         
01286                 Is this code point a surrogate (U+d800..U+dfff)?
01287 
01288         ***********************************************************************/
01289 
01290         final static bool isSurrogate (wchar c)
01291         {
01292                 return c >= 0xd800 && c <= 0xdfff;      // whole surrogate block, lead and trail
01293         }
01294 
01295         /***********************************************************************
01296         
01297                 Is this code unit a lead surrogate (U+d800..U+dbff)?
01298 
01299         ***********************************************************************/
01300 
01301         final static bool isLeading (wchar c)
01302         {
01303                 return c >= 0xd800 && c <= 0xdbff;      // lead (high) surrogate range
01304         }
01305 
01306         /***********************************************************************
01307         
01308                 Is this code unit a trail surrogate (U+dc00..U+dfff)?
01309 
01310         ***********************************************************************/
01311 
01312         final static bool isTrailing (wchar c)
01313         {
01314                 return c >= 0xdc00 && c <= 0xdfff;      // trail (low) surrogate range
01315         }
01316 
01317         /***********************************************************************
01318         
01319                 Adjust a random-access offset to a code point boundary 
01320                 at the start of a code point. If the offset points to 
01321                 the trail surrogate of a surrogate pair, then the offset 
01322                 is decremented. Otherwise, it is not modified.
01323 
01324         ***********************************************************************/
01325 
01326         final uint getCharStart (uint i)
01327         in {
01328                 if (i >= len)           // i must address an existing code unit
01329                     exception ("index of out bounds"); 
01330            }
01331         body
01332         {
01333                 if (isTrailing (content[i]) && i && isLeading (content[i-1]))   // 'i &&' guards the content[i-1] read at offset 0
01334                     --i;
01335                 return i;
01336         }
01337 
01338         /***********************************************************************
01339         
01340                 Adjust a random-access offset to a code point boundary 
01341                 after a code point. If the offset is behind the lead 
01342                 surrogate of a surrogate pair, then the offset is 
01343                 incremented. Otherwise, it is not modified.
01344 
01345         ***********************************************************************/
01346 
01347         final uint getCharLimit (uint i)
01348         in {
01349                 if (i >= len)           // i == len is rejected here as well
01350                     exception ("index of out bounds"); 
01351            }
01352         body
01353         {
01354                 if (i && isLeading(content[i-1]) && isTrailing (content[i]))    // i sits between the two halves of a surrogate pair
01355                     ++i;
01356                 return i;
01357         }
01358 
01359         /***********************************************************************
01360         
01361                 Callback for C unescapeAt() function
01362 
01363         ***********************************************************************/
01364 
01365         extern (C)
01366         {
01367                 typedef wchar function (uint offset, void* context) CharAt;    // callback type expected by u_unescapeAt
01368 
01369                 private static wchar _charAt (uint offset, void* context)      // returns the code unit at 'offset' of the text passed as 'context'
01370                 {
01371                         return (cast(UText) context).charAt (offset);   // unEscape() passes 'this', a UText that need not be a UString
01372                 }
01373         }
01374 
01375         /***********************************************************************
01376         
01377                 Pin the given index to a valid position.
01378 
01379         ***********************************************************************/
01380 
01381         final private void pinIndex (inout uint x)
01382         {
01383                 // clamp in place; x == len (one past the last valid unit) is allowed
01384                 x = (x > len) ? len : x;
01385         }
01386 
01387         /***********************************************************************
01388         
01389                 Pin the given index and length to a valid position.
01390 
01391         ***********************************************************************/
01392 
01393         final private void pinIndices (inout uint start, inout uint length)
01394         {
01395                 start = (start > len) ? len : start;
01396 
01397                 // start <= len from here on, so the subtraction cannot wrap
01398                 uint room = len - start;
01399                 length = (length > room) ? room : length;
01400         }
01401 
01402         /***********************************************************************
01403         
01404                 Helper for comparison methods
01405 
01406         ***********************************************************************/
01407 
01408         final private int compareFolded (wchar[] s1, wchar[] s2, CaseOption option = CaseOption.Default)
01409         {
01410                 Error e;
01411 
01412                 int x = u_strCaseCompare (s1, s1.length, s2, s2.length, option, e);
01413                 testError (e, "compareFolded failed");  // error status is checked before the result is handed back
01414                 return x; 
01415         }
01416 
01417 
01418         /***********************************************************************
01419         
01420                 Bind the ICU functions from a shared library. This is
01421                 complicated by the issues regarding D and DLLs on the
01422                 Windows platform
01423 
01424         ***********************************************************************/
01425                 
01426         private static void* library;   // assigned the result of FunctionLoader.bind in the static constructor
01427 
01428         /***********************************************************************
01429 
01430         ***********************************************************************/
01431 
01432         private static extern (C)       // C-linkage function pointers; their addresses are listed in 'targets' for binding
01433         {
01434                 wchar* function (wchar*, uint, wchar*, uint) u_strFindFirst;
01435                 wchar* function (wchar*, uint, wchar*, uint) u_strFindLast;
01436                 wchar* function (wchar*, wchar, uint) u_memchr;
01437                 wchar* function (wchar*, wchar, uint) u_memrchr;
01438                 int    function (wchar*, uint, wchar*, uint, bool) u_strCompare;
01439                 int    function (wchar*, uint, wchar*, uint, uint, inout Error) u_strCaseCompare;
01440                 dchar  function (CharAt, uint*, uint, void*) u_unescapeAt;
01441                 uint   function (wchar*, uint) u_countChar32;
01442                 uint   function (wchar*, uint, wchar*, uint, char*, inout Error) u_strToUpper;
01443                 uint   function (wchar*, uint, wchar*, uint, char*, inout Error) u_strToLower;
01444                 uint   function (wchar*, uint, wchar*, uint, uint, inout Error) u_strFoldCase;
01445                 wchar* function (wchar*, uint, uint*, char*, uint, inout Error) u_strFromUTF8;  // bound here but not called in this part of the class
01446                 char*  function (char*, uint, uint*, wchar*, uint, inout Error) u_strToUTF8;
01447         }
01448 
01449         /***********************************************************************
01450                 Binding table handed to FunctionLoader.bind: one entry per function pointer, paired with a name string
01451         ***********************************************************************/
01452 
01453         static  FunctionLoader.Bind[] targets = 
01454                 [       // NOTE(review): ICU builds commonly export version-suffixed symbols; confirm FunctionLoader.bind copes with these plain names
01455                 {cast(void**) &u_strFindFirst,      "u_strFindFirst"},
01456                 {cast(void**) &u_strFindLast,       "u_strFindLast"},
01457                 {cast(void**) &u_memchr,            "u_memchr"},
01458                 {cast(void**) &u_memrchr,           "u_memrchr"},
01459                 {cast(void**) &u_strCompare,        "u_strCompare"},
01460                 {cast(void**) &u_strCaseCompare,    "u_strCaseCompare"},
01461                 {cast(void**) &u_unescapeAt,        "u_unescapeAt"},
01462                 {cast(void**) &u_countChar32,       "u_countChar32"},
01463                 {cast(void**) &u_strToUpper,        "u_strToUpper"},
01464                 {cast(void**) &u_strToLower,        "u_strToLower"},
01465                 {cast(void**) &u_strFoldCase,       "u_strFoldCase"},
01466                 {cast(void**) &u_strFromUTF8,       "u_strFromUTF8"},
01467                 {cast(void**) &u_strToUTF8,         "u_strToUTF8"},
01468                 ];
01469 
01470         /***********************************************************************
01471                 Static constructor: passes 'icuuc' and 'targets' to FunctionLoader.bind and keeps the result in 'library'
01472         ***********************************************************************/
01473 
01474         static this ()
01475         {
01476                 library = FunctionLoader.bind (icuuc, targets);
01477                 //test ();
01478         }
01479 
01480         /***********************************************************************
01481                 Static destructor: gives the value stored in 'library' back to FunctionLoader.unbind
01482         ***********************************************************************/
01483 
01484         static ~this ()
01485         {
01486                 FunctionLoader.unbind (library);
01487         }
01488 
01489         /***********************************************************************
01490                 Smoke test: touches the constructor, ~, indexing, indexOf, unEscape, toUpper, padLeading, padTrailing and trim; results are discarded
01491         ***********************************************************************/
01492 
01493         private static void test()
01494         {
01495                 UString text = new UString (r"aaaqw \uabcd eaaa");
01496                 char[] suffix = "dssfsdff";
01497                 text ~ suffix ~ suffix;
01498                 wchar fourth = text[3];         // read through the index operator
01499                 text[3] = 'Q';                  // write through the index operator
01500                 int found = text.indexOf ("qwe");
01501                 text.unEscape ();
01502                 text.toUpper (new UString);
01503                 text.padLeading(2).padTrailing(2).trim();
01504         }
01505 }

Generated on Tue Jan 25 21:18:25 2005 for Mango by doxygen 1.3.6