00001 /******************************************************************************* 00002 00003 @file USearch.d 00004 00005 Copyright (c) 2004 Kris Bell 00006 00007 This software is provided 'as-is', without any express or implied 00008 warranty. In no event will the authors be held liable for damages 00009 of any kind arising from the use of this software. 00010 00011 Permission is hereby granted to anyone to use this software for any 00012 purpose, including commercial applications, and to alter it and/or 00013 redistribute it freely, subject to the following restrictions: 00014 00015 1. The origin of this software must not be misrepresented; you must 00016 not claim that you wrote the original software. If you use this 00017 software in a product, an acknowledgment within documentation of 00018 said product would be appreciated but is not required. 00019 00020 2. Altered source versions must be plainly marked as such, and must 00021 not be misrepresented as being the original software. 00022 00023 3. This notice may not be removed or altered from any distribution 00024 of the source. 00025 00026 4. Derivative works are permitted, but they must carry this notice 00027 in full and credit the original source. 00028 00029 00030 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00031 00032 00033 @version Initial version, November 2004 00034 @author Kris 00035 00036 Note that this package and documentation is built around the ICU 00037 project (http://oss.software.ibm.com/icu/). Below is the license 00038 statement as specified by that software: 00039 00040 00041 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00042 00043 00044 ICU License - ICU 1.8.1 and later 00045 00046 COPYRIGHT AND PERMISSION NOTICE 00047 00048 Copyright (c) 1995-2003 International Business Machines Corporation and 00049 others. 00050 00051 All rights reserved. 00052 00053 Permission is hereby granted, free of charge, to any person obtaining a 00054 copy of this software and associated documentation files (the 00055 "Software"), to deal in the Software without restriction, including 00056 without limitation the rights to use, copy, modify, merge, publish, 00057 distribute, and/or sell copies of the Software, and to permit persons 00058 to whom the Software is furnished to do so, provided that the above 00059 copyright notice(s) and this permission notice appear in all copies of 00060 the Software and that both the above copyright notice(s) and this 00061 permission notice appear in supporting documentation. 00062 00063 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 00064 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00065 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 00066 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 00067 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL 00068 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING 00069 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, 00070 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION 00071 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 00072 00073 Except as contained in this notice, the name of a copyright holder 00074 shall not be used in advertising or otherwise to promote the sale, use 00075 or other dealings in this Software without prior written authorization 00076 of the copyright holder. 00077 00078 ---------------------------------------------------------------------- 00079 00080 All trademarks and registered trademarks mentioned herein are the 00081 property of their respective owners. 00082 00083 *******************************************************************************/ 00084 00085 module mango.icu.USearch; 00086 00087 private import mango.icu.ICU; 00088 00089 public import mango.icu.ULocale, 00090 mango.icu.UString, 00091 mango.icu.UCollator, 00092 mango.icu.UBreakIterator; 00093 00094 /******************************************************************************* 00095 00096 Apis for an engine that provides language-sensitive text 00097 searching based on the comparison rules defined in a UCollator 00098 data struct. This ensures that language eccentricity can be handled, 00099 e.g. for the German collator, characters ß and SS will be matched 00100 if case is chosen to be ignored. See the "ICU Collation Design 00101 Document" for more information. 00102 00103 The algorithm implemented is a modified form of the Boyer Moore's 00104 search. For more information see "Efficient Text Searching in Java", 00105 published in Java Report in February, 1999, for further information 00106 on the algorithm. 00107 00108 There are 2 match options for selection: Let S' be the sub-string 00109 of a text string S between the offsets start and end <start, end>. A 00110 pattern string P matches a text string S at the offsets <start, end> if 00111 00112 - option 1. Some canonical equivalent of P matches some canonical 00113 equivalent of S' 00114 00115 - option 2. P matches S' and if P starts or ends with a combining 00116 mark, there exists no non-ignorable combining mark before 00117 or after S' in S respectively. 00118 00119 Option 2 will be the default 00120 00121 This search has APIs similar to that of other text iteration 00122 mechanisms such as the break iterators in ubrk.h. Using these 00123 APIs, it is easy to scan through text looking for all occurances 00124 of a given pattern. This search iterator allows changing of 00125 direction by calling a reset followed by a next or previous. 00126 Though a direction change can occur without calling reset first, 00127 this operation comes with some speed penalty. Generally, match 00128 results in the forward direction will match the result matches 00129 in the backwards direction in the reverse order 00130 00131 USearch provides APIs to specify the starting position within 00132 the text string to be searched, e.g. setOffset(), previous(x) 00133 and next(x). Since the starting position will be set as it 00134 is specified, please take note that there are some dangerous 00135 positions which the search may render incorrect results: 00136 00137 - The midst of a substring that requires normalization. 00138 00139 - If the following match is to be found, the position should 00140 not be the second character which requires to be swapped 00141 with the preceding character. Vice versa, if the preceding 00142 match is to be found, position to search from should not be 00143 the first character which requires to be swapped with the 00144 next character. E.g certain Thai and Lao characters require 00145 swapping. 00146 00147 - If a following pattern match is to be found, any position 00148 within a contracting sequence except the first will fail. 00149 Vice versa if a preceding pattern match is to be found, 00150 a invalid starting point would be any character within a 00151 contracting sequence except the last. 00152 00153 A breakiterator can be used if only matches at logical breaks are 00154 desired. Using a breakiterator will only give you results that 00155 exactly matches the boundaries given by the breakiterator. For 00156 instance the pattern "e" will not be found in the string "\u00e9" 00157 if a character break iterator is used. 00158 00159 Options are provided to handle overlapping matches. E.g. In 00160 English, overlapping matches produces the result 0 and 2 for 00161 the pattern "abab" in the text "ababab", where else mutually 00162 exclusive matches only produce the result of 0. 00163 00164 Though collator attributes will be taken into consideration while 00165 performing matches, there are no APIs here for setting and getting 00166 the attributes. These attributes can be set by getting the collator 00167 from getCollator() and using the APIs in UCollator. Lastly to update 00168 String Search to the new collator attributes, reset() has to be called. 00169 00170 See http://oss.software.ibm.com/icu/apiref/usearch_8h.html for full 00171 details. 00172 00173 *******************************************************************************/ 00174 00175 class USearch : ICU 00176 { 00177 private Handle handle; 00178 private UBreakIterator iterator; 00179 00180 // DONE is returned by previous() and next() after all valid 00181 // matches have been returned, and by first() and last() if 00182 // there are no matches at all. 00183 const uint Done = uint.max; 00184 00185 //Possible types of searches 00186 public enum Attribute 00187 { 00188 Overlap, 00189 CanonicalMatch, 00190 Count 00191 } 00192 00193 public enum AttributeValue 00194 { 00195 Default = -1, 00196 Off, 00197 On, 00198 Count 00199 } 00200 00201 /*********************************************************************** 00202 00203 Creating a search iterator data struct using the argument 00204 locale language rule set 00205 00206 ***********************************************************************/ 00207 00208 this (UText pattern, UText text, inout ULocale locale, UBreakIterator iterator = null) 00209 { 00210 Error e; 00211 00212 this.iterator = iterator; 00213 handle = usearch_open (pattern.get, pattern.length, text.get, text.length, toString(locale.name), iterator, e); 00214 testError (e, "failed to open search"); 00215 } 00216 00217 /*********************************************************************** 00218 00219 Creating a search iterator data struct using the argument 00220 locale language rule set 00221 00222 ***********************************************************************/ 00223 00224 this (UText pattern, UText text, UCollator col, UBreakIterator iterator = null) 00225 { 00226 Error e; 00227 00228 this.iterator = iterator; 00229 handle = usearch_openFromCollator (pattern.get, pattern.length, text.get, text.length, col.handle, iterator, e); 00230 testError (e, "failed to open search from collator"); 00231 } 00232 00233 /*********************************************************************** 00234 00235 Close this USearch 00236 00237 ***********************************************************************/ 00238 00239 ~this () 00240 { 00241 usearch_close (handle); 00242 } 00243 00244 /*********************************************************************** 00245 00246 Sets the current position in the text string which the 00247 next search will start from. 00248 00249 ***********************************************************************/ 00250 00251 void setOffset (uint position) 00252 { 00253 Error e; 00254 00255 usearch_setOffset (handle, position, e); 00256 testError (e, "failed to set search offset"); 00257 } 00258 00259 /*********************************************************************** 00260 00261 Return the current index in the string text being searched 00262 00263 ***********************************************************************/ 00264 00265 uint getOffset () 00266 { 00267 return usearch_getOffset (handle); 00268 } 00269 00270 /*********************************************************************** 00271 00272 Returns the index to the match in the text string that was 00273 searched 00274 00275 ***********************************************************************/ 00276 00277 uint getMatchedStart () 00278 { 00279 return usearch_getMatchedStart (handle); 00280 } 00281 00282 /*********************************************************************** 00283 00284 Returns the length of text in the string which matches the 00285 search pattern 00286 00287 ***********************************************************************/ 00288 00289 uint getMatchedLength () 00290 { 00291 return usearch_getMatchedLength (handle); 00292 } 00293 00294 /*********************************************************************** 00295 00296 Returns the text that was matched by the most recent call to 00297 first(), next(), previous(), or last(). 00298 00299 ***********************************************************************/ 00300 00301 void getMatchedText (UString s) 00302 { 00303 uint fmt (wchar* dst, uint length, inout Error e) 00304 { 00305 return usearch_getMatchedText (handle, dst, length, e); 00306 } 00307 00308 s.format (&fmt, "failed to extract matched text"); 00309 } 00310 00311 /*********************************************************************** 00312 00313 Set the string text to be searched. 00314 00315 ***********************************************************************/ 00316 00317 void setText (UText t) 00318 { 00319 Error e; 00320 00321 usearch_setText (handle, t.get, t.length, e); 00322 testError (e, "failed to set search text"); 00323 } 00324 00325 /*********************************************************************** 00326 00327 Return the string text to be searched. Note that this 00328 returns a read-only reference to the search text. 00329 00330 ***********************************************************************/ 00331 00332 UText getText () 00333 { 00334 uint len; 00335 00336 wchar *x = usearch_getText (handle, &len); 00337 return new UText (x[0..len]); 00338 } 00339 00340 /*********************************************************************** 00341 00342 Sets the pattern used for matching 00343 00344 ***********************************************************************/ 00345 00346 void setPattern (UText t) 00347 { 00348 Error e; 00349 00350 usearch_setPattern (handle, t.get, t.length, e); 00351 testError (e, "failed to set search pattern"); 00352 } 00353 00354 /*********************************************************************** 00355 00356 Gets the search pattern. Note that this returns a 00357 read-only reference to the pattern. 00358 00359 ***********************************************************************/ 00360 00361 UText getPattern () 00362 { 00363 uint len; 00364 00365 wchar *x = usearch_getPattern (handle, &len); 00366 return new UText (x[0..len]); 00367 } 00368 00369 /*********************************************************************** 00370 00371 Set the BreakIterator that will be used to restrict the 00372 points at which matches are detected. 00373 00374 ***********************************************************************/ 00375 00376 void setIterator (UBreakIterator iterator) 00377 { 00378 Error e; 00379 00380 this.iterator = iterator; 00381 usearch_setBreakIterator (handle, iterator.handle, e); 00382 testError (e, "failed to set search iterator"); 00383 } 00384 00385 /*********************************************************************** 00386 00387 Get the BreakIterator that will be used to restrict the 00388 points at which matches are detected. 00389 00390 ***********************************************************************/ 00391 00392 UBreakIterator getIterator () 00393 { 00394 return iterator; 00395 } 00396 00397 /*********************************************************************** 00398 00399 Returns the first index at which the string text matches 00400 the search pattern 00401 00402 ***********************************************************************/ 00403 00404 uint first () 00405 { 00406 Error e; 00407 00408 uint x = usearch_first (handle, e); 00409 testError (e, "failed on first search"); 00410 return x; 00411 } 00412 00413 /*********************************************************************** 00414 00415 Returns the last index in the target text at which it 00416 matches the search pattern 00417 00418 ***********************************************************************/ 00419 00420 uint last () 00421 { 00422 Error e; 00423 00424 uint x = usearch_last (handle, e); 00425 testError (e, "failed on last search"); 00426 return x; 00427 } 00428 00429 /*********************************************************************** 00430 00431 Returns the index of the next point at which the string 00432 text matches the search pattern, starting from the current 00433 position. 00434 00435 If pos is specified, returns the first index greater than 00436 pos at which the string text matches the search pattern 00437 00438 ***********************************************************************/ 00439 00440 uint next (uint pos = uint.max) 00441 { 00442 Error e; 00443 uint x; 00444 00445 x = (pos == uint.max) ? usearch_next (handle, e) : 00446 usearch_following (handle, pos, e); 00447 00448 testError (e, "failed on next search"); 00449 return x; 00450 } 00451 00452 /*********************************************************************** 00453 00454 Returns the index of the previous point at which the 00455 string text matches the search pattern, starting at 00456 the current position. 00457 00458 If pos is specified, returns the first index less 00459 than pos at which the string text matches the search 00460 pattern. 00461 00462 ***********************************************************************/ 00463 00464 uint previous (uint pos = uint.max) 00465 { 00466 Error e; 00467 uint x; 00468 00469 x = (pos == uint.max) ? usearch_previous (handle, e) : 00470 usearch_preceding (handle, pos, e); 00471 00472 testError (e, "failed on next search"); 00473 return x; 00474 } 00475 00476 /*********************************************************************** 00477 00478 Search will begin at the start of the text string if a 00479 forward iteration is initiated before a backwards iteration. 00480 Otherwise if a backwards iteration is initiated before a 00481 forwards iteration, the search will begin at the end of the 00482 text string 00483 00484 ***********************************************************************/ 00485 00486 void reset () 00487 { 00488 usearch_reset (handle); 00489 } 00490 00491 /*********************************************************************** 00492 00493 Gets the collator used for the language rules. 00494 00495 ***********************************************************************/ 00496 00497 UCollator getCollator () 00498 { 00499 return new UCollator (usearch_getCollator (handle)); 00500 } 00501 00502 /*********************************************************************** 00503 00504 Sets the collator used for the language rules. This 00505 method causes internal data such as Boyer-Moore shift 00506 tables to be recalculated, but the iterator's position 00507 is unchanged 00508 00509 ***********************************************************************/ 00510 00511 void setCollator (UCollator col) 00512 { 00513 Error e; 00514 00515 usearch_setCollator (handle, col.handle, e); 00516 testError (e, "failed to set search collator"); 00517 } 00518 00519 00520 /*********************************************************************** 00521 00522 Bind the ICU functions from a shared library. This is 00523 complicated by the issues regarding D and DLLs on the 00524 Windows platform 00525 00526 ***********************************************************************/ 00527 00528 private static void* library; 00529 00530 /*********************************************************************** 00531 00532 ***********************************************************************/ 00533 00534 private static extern (C) 00535 { 00536 Handle function (wchar*, uint, wchar*, uint, char*, void*, inout Error) usearch_open; 00537 Handle function (wchar*, uint, wchar*, uint, Handle, void*, inout Error) usearch_openFromCollator; 00538 void function (Handle) usearch_close; 00539 void function (Handle, uint, inout Error) usearch_setOffset; 00540 uint function (Handle) usearch_getOffset; 00541 uint function (Handle) usearch_getMatchedStart; 00542 uint function (Handle) usearch_getMatchedLength; 00543 uint function (Handle, wchar*, uint, inout Error) usearch_getMatchedText; 00544 void function (Handle, wchar*, uint, inout Error) usearch_setText; 00545 wchar* function (Handle, uint*) usearch_getText; 00546 void function (Handle, wchar*, uint, inout Error) usearch_setPattern; 00547 wchar* function (Handle, uint*) usearch_getPattern; 00548 uint function (Handle, inout Error) usearch_first; 00549 uint function (Handle, inout Error) usearch_last; 00550 uint function (Handle, inout Error) usearch_next; 00551 uint function (Handle, inout Error) usearch_previous; 00552 uint function (Handle, uint, inout Error) usearch_following; 00553 uint function (Handle, uint, inout Error) usearch_preceding; 00554 void function (Handle) usearch_reset; 00555 void function (Handle, Handle, inout Error) usearch_setBreakIterator; 00556 Handle function (Handle) usearch_getCollator; 00557 void function (Handle, Handle, inout Error) usearch_setCollator; 00558 } 00559 00560 /*********************************************************************** 00561 00562 ***********************************************************************/ 00563 00564 static FunctionLoader.Bind[] targets = 00565 [ 00566 {cast(void**) &usearch_open, "usearch_open"}, 00567 {cast(void**) &usearch_openFromCollator, "usearch_openFromCollator"}, 00568 {cast(void**) &usearch_close, "usearch_close"}, 00569 {cast(void**) &usearch_setOffset, "usearch_setOffset"}, 00570 {cast(void**) &usearch_getOffset, "usearch_getOffset"}, 00571 {cast(void**) &usearch_getMatchedStart, "usearch_getMatchedStart"}, 00572 {cast(void**) &usearch_getMatchedLength, "usearch_getMatchedLength"}, 00573 {cast(void**) &usearch_getMatchedText, "usearch_getMatchedText"}, 00574 {cast(void**) &usearch_setText, "usearch_setText"}, 00575 {cast(void**) &usearch_getText, "usearch_getText"}, 00576 {cast(void**) &usearch_setPattern, "usearch_setPattern"}, 00577 {cast(void**) &usearch_getPattern, "usearch_getPattern"}, 00578 {cast(void**) &usearch_first, "usearch_first"}, 00579 {cast(void**) &usearch_last, "usearch_last"}, 00580 {cast(void**) &usearch_next, "usearch_next"}, 00581 {cast(void**) &usearch_previous, "usearch_previous"}, 00582 {cast(void**) &usearch_following, "usearch_following"}, 00583 {cast(void**) &usearch_preceding, "usearch_preceding"}, 00584 {cast(void**) &usearch_reset, "usearch_reset"}, 00585 {cast(void**) &usearch_setBreakIterator, "usearch_setBreakIterator"}, 00586 {cast(void**) &usearch_getCollator, "usearch_getCollator"}, 00587 {cast(void**) &usearch_setCollator, "usearch_setCollator"}, 00588 ]; 00589 00590 /*********************************************************************** 00591 00592 ***********************************************************************/ 00593 00594 static this () 00595 { 00596 library = FunctionLoader.bind (icuin, targets); 00597 } 00598 00599 /*********************************************************************** 00600 00601 ***********************************************************************/ 00602 00603 static ~this () 00604 { 00605 FunctionLoader.unbind (library); 00606 } 00607 }