Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members | Related Pages

UBreakIterator.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file UBreakIterator.d
00004         
00005         Copyright (c) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026         4. Derivative works are permitted, but they must carry this notice
00027            in full and credit the original source.
00028 
00029 
00030                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00031 
00032 
00033         @version        Initial version, November 2004      
00034         @author         Kris
00035 
00036         Note that this package and documentation is built around the ICU 
00037         project (http://oss.software.ibm.com/icu/). Below is the license 
00038         statement as specified by that software:
00039 
00040 
00041                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00042 
00043 
00044         ICU License - ICU 1.8.1 and later
00045 
00046         COPYRIGHT AND PERMISSION NOTICE
00047 
00048         Copyright (c) 1995-2003 International Business Machines Corporation and 
00049         others.
00050 
00051         All rights reserved.
00052 
00053         Permission is hereby granted, free of charge, to any person obtaining a
00054         copy of this software and associated documentation files (the
00055         "Software"), to deal in the Software without restriction, including
00056         without limitation the rights to use, copy, modify, merge, publish,
00057         distribute, and/or sell copies of the Software, and to permit persons
00058         to whom the Software is furnished to do so, provided that the above
00059         copyright notice(s) and this permission notice appear in all copies of
00060         the Software and that both the above copyright notice(s) and this
00061         permission notice appear in supporting documentation.
00062 
00063         THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
00064         OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00065         MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
00066         OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
00067         HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
00068         INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
00069         FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
00070         NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
00071         WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
00072 
00073         Except as contained in this notice, the name of a copyright holder
00074         shall not be used in advertising or otherwise to promote the sale, use
00075         or other dealings in this Software without prior written authorization
00076         of the copyright holder.
00077 
00078         ----------------------------------------------------------------------
00079 
00080         All trademarks and registered trademarks mentioned herein are the 
00081         property of their respective owners.
00082 
00083 *******************************************************************************/
00084 
00085 module mango.icu.UBreakIterator;
00086 
00087 private import  mango.icu.ICU;
00088 
00089 public  import  mango.icu.ULocale,
00090                 mango.icu.UString;
00091 
00092 /*******************************************************************************
00093 
00094 *******************************************************************************/
00095 
00096 class UCharacterIterator : UBreakIterator
00097 {
00098         /***********************************************************************
00099                 Create a character-boundary iterator for the given locale;
00100                 locale and text (null by default) go to the base constructor.
00101         ***********************************************************************/
00102         this (inout ULocale locale, UText text = null)
00103         {
00104                 super (UBreakIterator.Type.Character, locale, text);
00105         }
00106 }
00107 
00108 
00109 /*******************************************************************************
00110 
00111 *******************************************************************************/
00112 
00113 class UWordIterator : UBreakIterator
00114 {
00115         // rule-status values reported for word breaks (see ICU enum UWordBreak)
00116         public enum Break
00117         {
00118                 None   = 0,   NoneLimit   = 100,
00119                 Number = 100, NumberLimit = 200,
00120                 Letter = 200, LetterLimit = 300,
00121                 Kana   = 300, KanaLimit   = 400,
00122                 Ideo   = 400, IdeoLimit   = 500
00123         }
00124 
00125         /***********************************************************************
00126 
00127                 Create a word-boundary iterator for the given locale. The
00128                 locale and text are forwarded unchanged to the base class.
00129 
00130         ***********************************************************************/
00131 
00132         this (inout ULocale locale, UText text = null)
00133         {
00134                 super (Type.Word, locale, text);
00135         }
00136 
00137         /***********************************************************************
00138 
00139                 Store in 'b' the status of the break rule responsible for
00140                 the break position most recently returned, as a Break value.
00141 
00142         ***********************************************************************/
00143 
00144         void getStatus (inout Break b)
00145         {
00146                 uint ruleStatus = super.getStatus ();
00147                 b = cast(Break) ruleStatus;
00148         }
00149 }
00150 
00151 
00152 /*******************************************************************************
00153 
00154 *******************************************************************************/
00155 
00156 class ULineIterator : UBreakIterator
00157 {
00158         // rule-status values reported for line breaks (see ICU enum ULineBreakTag)
00159         public enum Break
00160         {
00161                 Soft = 0,   SoftLimit = 100,
00162                 Hard = 100, HardLimit = 200
00163         }
00164 
00165         /***********************************************************************
00166                 Create a line-boundary iterator; arguments go to the base class.
00167         ***********************************************************************/
00168 
00169         this (inout ULocale locale, UText text = null)
00170         {
00171                 super (Type.Line, locale, text);
00172         }
00173 
00174         /***********************************************************************
00175 
00176                 Store in 'b' the status of the break rule responsible for
00177                 the break position most recently returned, as a Break value.
00178 
00179         ***********************************************************************/
00180 
00181         void getStatus (inout Break b)
00182         {
00183                 uint ruleStatus = super.getStatus ();
00184                 b = cast(Break) ruleStatus;
00185         }
00186 }
00187 
00188 
00189 /*******************************************************************************
00190 
00191 *******************************************************************************/
00192 
00193 class USentenceIterator : UBreakIterator
00194 {
00195         public enum     Break 
00196                         {
00197                         Term = 0, 
00198                         TermLimit = 100, 
00199                         Sep = 100, 
00200                         SepLimit = 200, // same XxxLimit naming as Term; ICU: UBRK_SENTENCE_SEP_LIMIT
00201                         Limit = 200     // original name for SepLimit, kept for existing callers
00202                         }
00203         /***********************************************************************
00204                 Create a sentence-boundary iterator; arguments go to the base class.
00205         ***********************************************************************/
00206 
00207         this (inout ULocale locale, UText text = null)
00208         {
00209                 super (Type.Sentence, locale, text);
00210         }
00211 
00212         /***********************************************************************
00213         
00214                 Return the status from the break rule that determined 
00215                 the most recently returned break position.
00216 
00217         ***********************************************************************/
00218 
00219         void getStatus (inout Break b)
00220         {
00221                 b = cast(Break) super.getStatus();
00222         }
00223 }
00224 
00225 
00226 /*******************************************************************************
00227 
00228 *******************************************************************************/
00229 
00230 class UTitleIterator : UBreakIterator
00231 {
00232         /***********************************************************************
00233                 Create a title-boundary iterator for the given locale;
00234                 locale and text (null by default) go to the base constructor.
00235         ***********************************************************************/
00236         this (inout ULocale locale, UText text = null)
00237         {
00238                 super (UBreakIterator.Type.Title, locale, text);
00239         }
00240 }
00241 
00242 
00243 /*******************************************************************************
00244 
00245 *******************************************************************************/
00246 
00247 class URuleIterator : UBreakIterator
00248 {
00249         /***********************************************************************
00250         
00251                 Open a new UBreakIterator for locating text boundaries 
00252                 using specified breaking rules. The text is optional: when
00253                 it is null, ICU receives no text and setText() supplies it.
00254         ***********************************************************************/
00255 
00256         this (UText rules, UText text = null)
00257         {
00258                 Error e;
00259                 // text defaults to null, so it must never be dereferenced unchecked
00260                 handle = ubrk_openRules (rules.get, rules.length, text ? text.get : null, text ? text.length : 0, null, e);
00261                 testError (e, "failed to open rule iterator");
00262         }
00263 }
00264 
00265 
00266 /*******************************************************************************
00267 
00268         BreakIterator defines methods for finding the location of boundaries 
00269         in text. A UBreakIterator maintains a current position and scans 
00270         over text, returning the index of characters where boundaries occur.
00271 
00272         Line boundary analysis determines where a text string can be broken 
00273         when line-wrapping. The mechanism correctly handles punctuation and 
00274         hyphenated words.
00275 
00276         Sentence boundary analysis allows selection with correct interpretation 
00277         of periods within numbers and abbreviations, and trailing punctuation 
00278         marks such as quotation marks and parentheses.
00279 
00280         Word boundary analysis is used by search and replace functions, as well 
00281         as within text editing applications that allow the user to select words 
00282         with a double click. Word selection provides correct interpretation of 
00283         punctuation marks within and following words. Characters that are not 
00284         part of a word, such as symbols or punctuation marks, have word-breaks 
00285         on both sides.
00286 
00287         Character boundary analysis allows users to interact with characters 
00288         as they expect to, for example, when moving the cursor through a text 
00289         string. Character boundary analysis provides correct navigation 
00290         through character strings, regardless of how the character is stored. 
00291         For example, an accented character might be stored as a base character 
00292         and a diacritical mark. What users consider to be a character can differ 
00293         between languages.
00294 
00295         Title boundary analysis locates all positions, typically starts of 
00296         words, that should be set to Title Case when title casing the text. 
00297 
00298         See <A HREF="http://oss.software.ibm.com/icu/apiref/ubrk_8h.html">
00299         this page</A> for full details.
00300 
00301 *******************************************************************************/
00302 
00303 private class UBreakIterator : ICU
00304 {       
00305         package Handle  handle;         // ICU iterator handle, set by the constructors
00306 
00307         // returned by next(), previous() etc. when no further boundary exists
00308         const uint Done = uint.max;
00309 
00310         /***********************************************************************
00311         
00312                 Iterator kinds passed to ubrk_open(). The implicit values
00313                 0..4 go to the C API as-is, so the order must not change.
00314         ***********************************************************************/
00315 
00316         private  enum   Type    
00317                         {  
00318                         Character, 
00319                         Word,
00320                         Line,
00321                         Sentence,
00322                         Title
00323                         }
00324 
00325 
00326         /***********************************************************************
00327         
00328                 Internal use only! Leaves 'handle' unset; a subclass using 
00329                 this constructor (URuleIterator) opens its own handle.
00330         ***********************************************************************/
00331 
00332         private this ()
00333         {
00334         }
00335 
00336         /***********************************************************************
00337         
00338                 Open a new UBreakIterator for locating text boundaries for 
00339                 a specified locale. A UBreakIterator may be used for detecting 
00340                 character, line, word, and sentence breaks in text. The text 
00341                 may be null, in which case setText() supplies it afterwards.
00342         ***********************************************************************/
00343 
00344         this (Type type, inout ULocale locale, UText text)
00345         {
00346                 Error e;
00347                 // subclasses default text to null, so never dereference it unchecked
00348                 handle = ubrk_open (type, toString(locale.name), text ? text.get : null, text ? text.length : 0, e);
00349                 testError (e, "failed to create break iterator");
00350         }
00351 
00352         /***********************************************************************
00353         
00354                 Close the underlying ICU iterator when this object is destroyed
00355 
00356         ***********************************************************************/
00357 
00358         ~this ()
00359         {
00360                 ubrk_close (handle);
00361         }
00362 
00363         /***********************************************************************
00364         
00365                 Sets an existing iterator to point to a new piece of text.
00366                 The text is dereferenced here and so must not be null.
00367         ***********************************************************************/
00368 
00369         void setText (UText text)
00370         {
00371                 Error e;
00372                 ubrk_setText (handle, text.get, text.length, e);
00373                 testError (e, "failed to set iterator text");
00374         }
00375 
00376         /***********************************************************************
00377         
00378                 Determine the most recently-returned text boundary
00379 
00380         ***********************************************************************/
00381 
00382         uint current ()
00383         {
00384                 return ubrk_current (handle);
00385         }
00386 
00387         /***********************************************************************
00388         
00389                 Determine the text boundary following the current text 
00390                 boundary, or Done if all text boundaries have been 
00391                 returned. 
00392                 
00393                 If offset is specified, determines the text boundary 
00394                 following the specified offset: the value returned 
00395                 is always greater than offset, or Done. An offset equal 
00396                 to uint.max (the default) selects the no-offset behaviour.
00397         ***********************************************************************/
00398 
00399         uint next (uint offset = uint.max)
00400         {
00401                 if (offset == uint.max)
00402                     return ubrk_next (handle);
00403                 return ubrk_following (handle, offset);
00404         }
00405 
00406         /***********************************************************************
00407         
00408                 Determine the text boundary preceding the current text 
00409                 boundary, or Done if all text boundaries have been returned.
00410 
00411                 If offset is specified, determines the text boundary preceding 
00412                 the specified offset. The value returned is always smaller than 
00413                 offset, or Done. An offset of uint.max means "not specified".
00414 
00415         ***********************************************************************/
00416 
00417         uint previous (uint offset = uint.max)
00418         {
00419                 if (offset == uint.max)
00420                     return ubrk_previous (handle);
00421                 return ubrk_preceding (handle, offset);
00422         }
00423 
00424         /***********************************************************************
00425         
00426                 Determine the index of the first character in the text 
00427                 being scanned. This is not always the same as index 0 
00428                 of the text. 
00429 
00430         ***********************************************************************/
00431 
00432         uint first ()
00433         {
00434                 return ubrk_first (handle);
00435         }
00436 
00437         /***********************************************************************
00438         
00439                 Determine the index immediately beyond the last character 
00440                 in the text being scanned. This is not the same as the index 
00441                 of the last character.
00442 
00443         ***********************************************************************/
00444 
00445         uint last ()
00446         {
00447                 return ubrk_last (handle);
00448         }
00449 
00450         /***********************************************************************
00451         
00452                 Returns true if the specified position is a boundary position.
00453                 As a side effect, leaves the iterator pointing to the first 
00454                 boundary position at or after "offset". 
00455 
00456         ***********************************************************************/
00457 
00458         bool isBoundary (uint offset)
00459         {
00460                 return ubrk_isBoundary (handle, offset) != 0;
00461         }
00462 
00463         /***********************************************************************
00464         
00465                 Return, through 's', the status from the break rule that 
00466                 determined the most recently returned break position.
00467 
00468         ***********************************************************************/
00469 
00470         void getStatus (inout uint s)
00471         {
00472                 s = getStatus ();
00473         }
00474 
00475         /***********************************************************************
00476         
00477                 Return the status from the break rule that determined 
00478                 the most recently returned break position.
00479 
00480                 The values appear in the rule source within brackets, 
00481                 {123}, for example. For rules that do not specify a status, 
00482                 a default value of 0 is returned.
00483 
00484                 For word break iterators, the possible values are defined 
00485                 in the ICU enum UWordBreak (UWordIterator.Break in this module)
00486 
00487         ***********************************************************************/
00488 
00489         private uint getStatus ()
00490         {
00491                 return ubrk_getRuleStatus (handle);
00492         }
00493 
00494 
00495         /***********************************************************************
00496         
00497                 Bind the ICU functions from a shared library. This is
00498                 complicated by the issues regarding D and DLLs on the
00499                 Windows platform
00500 
00501         ***********************************************************************/
00502 
00503         private static void* library;
00504         
00505         /***********************************************************************
00506                 Pointers to the ICU C functions, bound via 'targets' below
00507         ***********************************************************************/
00508 
00509         private static extern (C) 
00510         {
00511                 Handle function (uint, char*, wchar*, uint, inout Error) ubrk_open;
00512                 Handle function (wchar*, uint, wchar*, uint, void*, inout Error) ubrk_openRules;
00513                 void   function (Handle) ubrk_close;
00514                 void   function (Handle, wchar*, uint, inout Error) ubrk_setText;
00515                 uint   function (Handle) ubrk_current;
00516                 uint   function (Handle) ubrk_next;
00517                 uint   function (Handle) ubrk_previous;
00518                 uint   function (Handle) ubrk_first;
00519                 uint   function (Handle) ubrk_last;
00520                 uint   function (Handle, uint) ubrk_preceding;
00521                 uint   function (Handle, uint) ubrk_following;
00522                 byte   function (Handle, uint) ubrk_isBoundary;
00523                 uint   function (Handle) ubrk_getRuleStatus;
00524         }
00525 
00526         /***********************************************************************
00527                 Pairs each function pointer above with its exported symbol name
00528         ***********************************************************************/
00529 
00530         static  FunctionLoader.Bind[] targets = 
00531                 [
00532                 {cast(void**) &ubrk_open,               "ubrk_open"}, 
00533                 {cast(void**) &ubrk_close,              "ubrk_close"},
00534                 {cast(void**) &ubrk_openRules,          "ubrk_openRules"},
00535                 {cast(void**) &ubrk_setText,            "ubrk_setText"},
00536                 {cast(void**) &ubrk_current,            "ubrk_current"},
00537                 {cast(void**) &ubrk_next,               "ubrk_next"},
00538                 {cast(void**) &ubrk_previous,           "ubrk_previous"},
00539                 {cast(void**) &ubrk_first,              "ubrk_first"},
00540                 {cast(void**) &ubrk_last,               "ubrk_last"},
00541                 {cast(void**) &ubrk_preceding,          "ubrk_preceding"},
00542                 {cast(void**) &ubrk_following,          "ubrk_following"},
00543                 {cast(void**) &ubrk_isBoundary,         "ubrk_isBoundary"},
00544                 {cast(void**) &ubrk_getRuleStatus,      "ubrk_getRuleStatus"},
00545                 ];
00546 
00547          /**********************************************************************
00548                 Hand 'targets' to FunctionLoader.bind for icuuc; keep its result
00549          **********************************************************************/
00550 
00551          static this ()
00552          {
00553                 library = FunctionLoader.bind (icuuc, targets);
00554          }
00555 
00556          /**********************************************************************
00557                 Pass the stored library value back to FunctionLoader.unbind
00558          **********************************************************************/
00559                
00560          static ~this ()
00561          {
00562                FunctionLoader.unbind (library);
00563          }
00564 }

Generated on Tue Jan 25 21:18:23 2005 for Mango by doxygen 1.3.6