/*******************************************************************************

        @file UBreakIterator.d

        Copyright (c) 2004 Kris Bell

        This software is provided 'as-is', without any express or implied
        warranty. In no event will the authors be held liable for damages
        of any kind arising from the use of this software.

        Permission is hereby granted to anyone to use this software for any
        purpose, including commercial applications, and to alter it and/or
        redistribute it freely, subject to the following restrictions:

        1. The origin of this software must not be misrepresented; you must
           not claim that you wrote the original software. If you use this
           software in a product, an acknowledgment within documentation of
           said product would be appreciated but is not required.

        2. Altered source versions must be plainly marked as such, and must
           not be misrepresented as being the original software.

        3. This notice may not be removed or altered from any distribution
           of the source.

        4. Derivative works are permitted, but they must carry this notice
           in full and credit the original source.


                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


        @version        Initial version, November 2004
        @author         Kris

        Note that this package and documentation is built around the ICU
        project (http://oss.software.ibm.com/icu/). Below is the license
        statement as specified by that software:


                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


        ICU License - ICU 1.8.1 and later

        COPYRIGHT AND PERMISSION NOTICE

        Copyright (c) 1995-2003 International Business Machines Corporation and
        others.

        All rights reserved.

        Permission is hereby granted, free of charge, to any person obtaining a
        copy of this software and associated documentation files (the
        "Software"), to deal in the Software without restriction, including
        without limitation the rights to use, copy, modify, merge, publish,
        distribute, and/or sell copies of the Software, and to permit persons
        to whom the Software is furnished to do so, provided that the above
        copyright notice(s) and this permission notice appear in all copies of
        the Software and that both the above copyright notice(s) and this
        permission notice appear in supporting documentation.

        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
        OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
        MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
        OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
        HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
        INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
        FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
        NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
        WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

        Except as contained in this notice, the name of a copyright holder
        shall not be used in advertising or otherwise to promote the sale, use
        or other dealings in this Software without prior written authorization
        of the copyright holder.

        ----------------------------------------------------------------------

        All trademarks and registered trademarks mentioned herein are the
        property of their respective owners.

*******************************************************************************/

module mango.icu.UBreakIterator;

private import  mango.icu.ICU;

public  import  mango.icu.ULocale,
                mango.icu.UString;

/*******************************************************************************

        Break iterator that locates character boundaries for a locale.

*******************************************************************************/

class UCharacterIterator : UBreakIterator
{
        /***********************************************************************

                Open a character-boundary iterator for the given locale.
                The text is optional; when omitted, supply it later via
                setText().

        ***********************************************************************/

        this (inout ULocale locale, UText text = null)
        {
                super (Type.Character, locale, text);
        }
}


/*******************************************************************************

        Break iterator that locates word boundaries for a locale.

*******************************************************************************/

class UWordIterator : UBreakIterator
{
        /***********************************************************************

                Rule-status tags for word breaks. Each category spans
                the half-open range [X, XLimit); the values mirror the
                ICU UWordBreak constants.

        ***********************************************************************/

        public enum     Break
                        {
                        None        = 0,
                        NoneLimit   = 100,
                        Number      = 100,
                        NumberLimit = 200,
                        Letter      = 200,
                        LetterLimit = 300,
                        Kana        = 300,
                        KanaLimit   = 400,
                        Ideo        = 400,
                        IdeoLimit   = 500
                        }

        /***********************************************************************

                Open a word-boundary iterator for the given locale.
                The text is optional; when omitted, supply it later via
                setText().

        ***********************************************************************/

        this (inout ULocale locale, UText text = null)
        {
                super (Type.Word, locale, text);
        }

        /***********************************************************************

                Return the status from the break rule that determined
                the most recently returned break position.

        ***********************************************************************/

        void getStatus (inout Break b)
        {
                b = cast(Break) super.getStatus();
        }
}


/*******************************************************************************

        Break iterator that locates line-break opportunities for a locale.

*******************************************************************************/

class ULineIterator : UBreakIterator
{
        /***********************************************************************

                Rule-status tags for line breaks. Each category spans
                the half-open range [X, XLimit); the values mirror the
                ICU ULineBreakTag constants.

        ***********************************************************************/

        public enum     Break
                        {
                        Soft      = 0,
                        SoftLimit = 100,
                        Hard      = 100,
                        HardLimit = 200
                        }

        /***********************************************************************

                Open a line-boundary iterator for the given locale.
                The text is optional; when omitted, supply it later via
                setText().

        ***********************************************************************/

        this (inout ULocale locale, UText text = null)
        {
                super (Type.Line, locale, text);
        }

        /***********************************************************************

                Return the status from the break rule that determined
                the most recently returned break position.

        ***********************************************************************/

        void getStatus (inout Break b)
        {
                b = cast(Break) super.getStatus();
        }
}


/*******************************************************************************

        Break iterator that locates sentence boundaries for a locale.

*******************************************************************************/

class USentenceIterator : UBreakIterator
{
        /***********************************************************************

                Rule-status tags for sentence breaks. Each category spans
                the half-open range [X, XLimit); the values mirror the
                ICU USentenceBreakTag constants.

                SepLimit follows the naming used by the other limits
                (and by ICU's UBRK_SENTENCE_SEP_LIMIT). Limit carries
                the same value and is retained so that existing callers
                continue to compile.

        ***********************************************************************/

        public enum     Break
                        {
                        Term      = 0,
                        TermLimit = 100,
                        Sep       = 100,
                        SepLimit  = 200,
                        Limit     = 200
                        }

        /***********************************************************************

                Open a sentence-boundary iterator for the given locale.
                The text is optional; when omitted, supply it later via
                setText().

        ***********************************************************************/

        this (inout ULocale locale, UText text = null)
        {
                super (Type.Sentence, locale, text);
        }

        /***********************************************************************

                Return the status from the break rule that determined
                the most recently returned break position.

        ***********************************************************************/

        void getStatus (inout Break b)
        {
                b = cast(Break) super.getStatus();
        }
}


/*******************************************************************************

        Break iterator that locates title-casing positions for a locale.

*******************************************************************************/

class UTitleIterator : UBreakIterator
{
        /***********************************************************************

                Open a title-boundary iterator for the given locale.
                The text is optional; when omitted, supply it later via
                setText().

        ***********************************************************************/

        this (inout ULocale locale, UText text = null)
        {
                super (Type.Title, locale, text);
        }
}


/*******************************************************************************

        Break iterator driven by caller-supplied breaking rules rather
        than by a locale.

*******************************************************************************/

class URuleIterator : UBreakIterator
{
        /***********************************************************************

                Open a new UBreakIterator for locating text boundaries
                using specified breaking rules.

                The text is optional; when omitted, supply it later via
                setText().

        ***********************************************************************/

        this (UText rules, UText text = null)
        {
                Error e;

                // text defaults to null, so it must not be dereferenced
                // unconditionally; ICU accepts a null text pointer here
                if (text)
                    handle = ubrk_openRules (rules.get, rules.length, text.get, text.length, null, e);
                else
                   handle = ubrk_openRules (rules.get, rules.length, null, 0, null, e);

                testError (e, "failed to open rule iterator");
        }
}


/*******************************************************************************

        BreakIterator defines methods for finding the location of boundaries
        in text. A UBreakIterator maintains a current position and scans
        over text, returning the index of characters where boundaries occur.

        Line boundary analysis determines where a text string can be broken
        when line-wrapping. The mechanism correctly handles punctuation and
        hyphenated words.

        Sentence boundary analysis allows selection with correct interpretation
        of periods within numbers and abbreviations, and trailing punctuation
        marks such as quotation marks and parentheses.

        Word boundary analysis is used by search and replace functions, as well
        as within text editing applications that allow the user to select words
        with a double click. Word selection provides correct interpretation of
        punctuation marks within and following words. Characters that are not
        part of a word, such as symbols or punctuation marks, have word-breaks
        on both sides.

        Character boundary analysis allows users to interact with characters
        as they expect to, for example, when moving the cursor through a text
        string. Character boundary analysis provides correct navigation
        through character strings, regardless of how the character is stored.
        For example, an accented character might be stored as a base character
        and a diacritical mark. What users consider to be a character can differ
        between languages.

        Title boundary analysis locates all positions, typically starts of
        words, that should be set to Title Case when title casing the text.

        See <A HREF="http://oss.software.ibm.com/icu/apiref/ubrk_8h.html">
        this page</A> for full details.

*******************************************************************************/

private class UBreakIterator : ICU
{
        package Handle  handle;

        // this is returned by next(), previous() etc ...
        const uint Done = uint.max;

        /***********************************************************************

                internal types passed to C API

        ***********************************************************************/

        private enum    Type
                        {
                        Character,
                        Word,
                        Line,
                        Sentence,
                        Title
                        }


        /***********************************************************************

                Internal use only! Leaves the handle unset; used by
                subclasses that open the iterator themselves.

        ***********************************************************************/

        private this ()
        {
        }

        /***********************************************************************

                Open a new UBreakIterator for locating text boundaries for
                a specified locale. A UBreakIterator may be used for detecting
                character, line, word, and sentence breaks in text.

                The text may be null, in which case setText() should be
                used to specify the text to iterate over.

        ***********************************************************************/

        this (Type type, inout ULocale locale, UText text)
        {
                Error e;

                // every subclass constructor defaults text to null, so it
                // must not be dereferenced unconditionally; ICU accepts a
                // null text pointer here
                if (text)
                    handle = ubrk_open (type, toString(locale.name), text.get, text.length, e);
                else
                   handle = ubrk_open (type, toString(locale.name), null, 0, e);

                testError (e, "failed to create break iterator");
        }

        /***********************************************************************

                Close a UBreakIterator

        ***********************************************************************/

        ~this ()
        {
                ubrk_close (handle);
        }

        /***********************************************************************

                Sets an existing iterator to point to a new piece of text

        ***********************************************************************/

        void setText (UText text)
        {
                Error e;
                ubrk_setText (handle, text.get, text.length, e);
                testError (e, "failed to set iterator text");
        }

        /***********************************************************************

                Determine the most recently-returned text boundary

        ***********************************************************************/

        uint current ()
        {
                return ubrk_current (handle);
        }

        /***********************************************************************

                Determine the text boundary following the current text
                boundary, or Done if all text boundaries have been
                returned.

                If offset is specified, determines the text boundary
                following the specified offset: the value returned
                is always greater than offset, or Done.

                Note that uint.max is the "no offset" marker, so it
                cannot itself be used as an offset.

        ***********************************************************************/

        uint next (uint offset = uint.max)
        {
                if (offset == uint.max)
                    return ubrk_next (handle);
                return ubrk_following (handle, offset);
        }

        /***********************************************************************

                Determine the text boundary preceding the current text
                boundary, or Done if all text boundaries have been returned.

                If offset is specified, determines the text boundary preceding
                the specified offset. The value returned is always smaller than
                offset, or Done.

                Note that uint.max is the "no offset" marker, so it
                cannot itself be used as an offset.

        ***********************************************************************/

        uint previous (uint offset = uint.max)
        {
                if (offset == uint.max)
                    return ubrk_previous (handle);
                return ubrk_preceding (handle, offset);
        }

        /***********************************************************************

                Determine the index of the first character in the text
                being scanned. This is not always the same as index 0
                of the text.

        ***********************************************************************/

        uint first ()
        {
                return ubrk_first (handle);
        }

        /***********************************************************************

                Determine the index immediately beyond the last character
                in the text being scanned. This is not the same as the
                index of the last character.

        ***********************************************************************/

        uint last ()
        {
                return ubrk_last (handle);
        }

        /***********************************************************************

                Returns true if the specified position is a boundary position.
                As a side effect, leaves the iterator pointing to the first
                boundary position at or after "offset".

        ***********************************************************************/

        bool isBoundary (uint offset)
        {
                return ubrk_isBoundary (handle, offset) != 0;
        }

        /***********************************************************************

                Return the status from the break rule that determined
                the most recently returned break position.

        ***********************************************************************/

        void getStatus (inout uint s)
        {
                s = getStatus ();
        }

        /***********************************************************************

                Return the status from the break rule that determined
                the most recently returned break position.

                The values appear in the rule source within brackets,
                {123}, for example. For rules that do not specify a status,
                a default value of 0 is returned.

                For word, line and sentence iterators, the possible
                values are described by the Break enum of the respective
                subclass (e.g. UWordIterator.Break).

        ***********************************************************************/

        private uint getStatus ()
        {
                return ubrk_getRuleStatus (handle);
        }


        /***********************************************************************

                Bind the ICU functions from a shared library. This is
                complicated by the issues regarding D and DLLs on the
                Windows platform

        ***********************************************************************/

        private static void* library;

        /***********************************************************************

                Function pointers for the ICU ubrk_* C API; these are
                filled in by the static constructor.

        ***********************************************************************/

        private static extern (C)
        {
                Handle  function (uint, char*, wchar*, uint, inout Error) ubrk_open;
                Handle  function (wchar*, uint, wchar*, uint, void*, inout Error) ubrk_openRules;
                void    function (Handle) ubrk_close;
                void    function (Handle, wchar*, uint, inout Error) ubrk_setText;
                uint    function (Handle) ubrk_current;
                uint    function (Handle) ubrk_next;
                uint    function (Handle) ubrk_previous;
                uint    function (Handle) ubrk_first;
                uint    function (Handle) ubrk_last;
                uint    function (Handle, uint) ubrk_preceding;
                uint    function (Handle, uint) ubrk_following;
                byte    function (Handle, uint) ubrk_isBoundary;
                uint    function (Handle) ubrk_getRuleStatus;
        }

        /***********************************************************************

                Table pairing each function pointer above with the name
                of the symbol it is bound to.

        ***********************************************************************/

        static  FunctionLoader.Bind[] targets =
                [
                {cast(void**) &ubrk_open,          "ubrk_open"},
                {cast(void**) &ubrk_close,         "ubrk_close"},
                {cast(void**) &ubrk_openRules,     "ubrk_openRules"},
                {cast(void**) &ubrk_setText,       "ubrk_setText"},
                {cast(void**) &ubrk_current,       "ubrk_current"},
                {cast(void**) &ubrk_next,          "ubrk_next"},
                {cast(void**) &ubrk_previous,      "ubrk_previous"},
                {cast(void**) &ubrk_first,         "ubrk_first"},
                {cast(void**) &ubrk_last,          "ubrk_last"},
                {cast(void**) &ubrk_preceding,     "ubrk_preceding"},
                {cast(void**) &ubrk_following,     "ubrk_following"},
                {cast(void**) &ubrk_isBoundary,    "ubrk_isBoundary"},
                {cast(void**) &ubrk_getRuleStatus, "ubrk_getRuleStatus"},
                ];

        /**********************************************************************

                Bind the function table when the module is loaded.

        **********************************************************************/

        static this ()
        {
                library = FunctionLoader.bind (icuuc, targets);
        }

        /**********************************************************************

                Release the binding when the module is unloaded.

        **********************************************************************/

        static ~this ()
        {
                FunctionLoader.unbind (library);
        }
}