Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members | Related Pages

USearch.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file USearch.d
00004         
00005         Copyright (c) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026         4. Derivative works are permitted, but they must carry this notice
00027            in full and credit the original source.
00028 
00029 
00030                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00031 
00032 
00033         @version        Initial version, November 2004      
00034         @author         Kris
00035 
00036         Note that this package and documentation is built around the ICU 
00037         project (http://oss.software.ibm.com/icu/). Below is the license 
00038         statement as specified by that software:
00039 
00040 
00041                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00042 
00043 
00044         ICU License - ICU 1.8.1 and later
00045 
00046         COPYRIGHT AND PERMISSION NOTICE
00047 
00048         Copyright (c) 1995-2003 International Business Machines Corporation and 
00049         others.
00050 
00051         All rights reserved.
00052 
00053         Permission is hereby granted, free of charge, to any person obtaining a
00054         copy of this software and associated documentation files (the
00055         "Software"), to deal in the Software without restriction, including
00056         without limitation the rights to use, copy, modify, merge, publish,
00057         distribute, and/or sell copies of the Software, and to permit persons
00058         to whom the Software is furnished to do so, provided that the above
00059         copyright notice(s) and this permission notice appear in all copies of
00060         the Software and that both the above copyright notice(s) and this
00061         permission notice appear in supporting documentation.
00062 
00063         THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
00064         OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00065         MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
00066         OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
00067         HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
00068         INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
00069         FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
00070         NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
00071         WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
00072 
00073         Except as contained in this notice, the name of a copyright holder
00074         shall not be used in advertising or otherwise to promote the sale, use
00075         or other dealings in this Software without prior written authorization
00076         of the copyright holder.
00077 
00078         ----------------------------------------------------------------------
00079 
00080         All trademarks and registered trademarks mentioned herein are the 
00081         property of their respective owners.
00082 
00083 *******************************************************************************/
00084 
00085 module mango.icu.USearch;
00086 
00087 private import  mango.icu.ICU;
00088 
00089 public  import  mango.icu.ULocale,
00090                 mango.icu.UString,
00091                 mango.icu.UCollator,
00092                 mango.icu.UBreakIterator;
00093 
00094 /*******************************************************************************
00095 
00096         Apis for an engine that provides language-sensitive text 
00097         searching based on the comparison rules defined in a UCollator 
00098         data struct. This ensures that language eccentricity can be handled, 
00099         e.g. for the German collator, characters ß and SS will be matched 
00100         if case is chosen to be ignored. See the "ICU Collation Design 
00101         Document" for more information.
00102 
00103         The algorithm implemented is a modified form of the Boyer Moore's 
00104         search. For more information see "Efficient Text Searching in Java", 
00105         published in Java Report in February, 1999, for further information 
00106         on the algorithm.
00107 
00108         There are 2 match options for selection: Let S' be the sub-string 
00109         of a text string S between the offsets start and end <start, end>. A 
00110         pattern string P matches a text string S at the offsets <start, end> if
00111  
00112         - option 1. Some canonical equivalent of P matches some canonical
00113                     equivalent of S'
00114 
00115         - option 2. P matches S' and if P starts or ends with a combining 
00116                     mark, there exists no non-ignorable combining mark before 
00117                     or after S’ n S respectively. 
00118  
00119         Option 2 will be the default·
00120 
00121         This search has APIs similar to that of other text iteration 
00122         mechanisms such as the break iterators in ubrk.h. Using these 
00123         APIs, it is easy to scan through text looking for all occurances 
00124         of a given pattern. This search iterator allows changing of 
00125         direction by calling a reset followed by a next or previous. 
00126         Though a direction change can occur without calling reset first, 
00127         this operation comes with some speed penalty. Generally, match 
00128         results in the forward direction will match the result matches 
00129         in the backwards direction in the reverse order
00130 
00131         USearch provides APIs to specify the starting position within 
00132         the text string to be searched, e.g. setOffset(), previous(x) 
00133         and next(x). Since the starting position will be set as it 
00134         is specified, please take note that there are some dangerous 
00135         positions which the search may render incorrect results:
00136 
00137         - The midst of a substring that requires normalization.
00138 
00139         - If the following match is to be found, the position should 
00140           not be the second character which requires to be swapped 
00141           with the preceding character. Vice versa, if the preceding 
00142           match is to be found, position to search from should not be 
00143           the first character which requires to be swapped with the 
00144           next character. E.g certain Thai and Lao characters require 
00145           swapping.
00146 
00147         - If a following pattern match is to be found, any position 
00148           within a contracting sequence except the first will fail. 
00149           Vice versa if a preceding pattern match is to be found, 
00150           a invalid starting point would be any character within a 
00151           contracting sequence except the last.
00152 
00153         A breakiterator can be used if only matches at logical breaks are 
00154         desired. Using a breakiterator will only give you results that 
00155         exactly matches the boundaries given by the breakiterator. For 
00156         instance the pattern "e" will not be found in the string "\u00e9" 
00157         if a character break iterator is used.
00158 
00159         Options are provided to handle overlapping matches. E.g. In 
00160         English, overlapping matches produces the result 0 and 2 for 
00161         the pattern "abab" in the text "ababab", where else mutually 
00162         exclusive matches only produce the result of 0.
00163 
00164         Though collator attributes will be taken into consideration while 
00165         performing matches, there are no APIs here for setting and getting 
00166         the attributes. These attributes can be set by getting the collator 
00167         from getCollator() and using the APIs in UCollator. Lastly to update 
00168         String Search to the new collator attributes, reset() has to be called.
00169 
00170         See http://oss.software.ibm.com/icu/apiref/usearch_8h.html for full 
00171         details.
00172 
00173 *******************************************************************************/
00174 
00175 class USearch : ICU
00176 {       
00177         private Handle          handle;
00178         private UBreakIterator  iterator;
00179 
00180         // DONE is returned by previous() and next() after all valid 
00181         // matches have been returned, and by first() and last() if 
00182         // there are no matches at all.
00183         const uint      Done = uint.max;
00184 
00185         //Possible types of searches
00186         public enum     Attribute 
00187                         {
00188                         Overlap, 
00189                         CanonicalMatch, 
00190                         Count
00191                         }
00192 
00193         public enum     AttributeValue 
00194                         {
00195                         Default = -1, 
00196                         Off, 
00197                         On, 
00198                         Count
00199                         }
00200 
00201         /***********************************************************************
00202 
00203                 Creating a search iterator data struct using the argument 
00204                 locale language rule set
00205 
00206         ***********************************************************************/
00207 
00208         this (UText pattern, UText text, inout ULocale locale, UBreakIterator iterator = null)
00209         {
00210                 Error e;
00211 
00212                 this.iterator = iterator;
00213                 handle = usearch_open (pattern.get, pattern.length, text.get, text.length, toString(locale.name), iterator, e);
00214                 testError (e, "failed to open search");
00215         }
00216 
00217         /***********************************************************************
00218 
00219                 Creating a search iterator data struct using the argument 
00220                 locale language rule set
00221 
00222         ***********************************************************************/
00223 
00224         this (UText pattern, UText text, UCollator col, UBreakIterator iterator = null)
00225         {
00226                 Error e;
00227 
00228                 this.iterator = iterator;
00229                 handle = usearch_openFromCollator (pattern.get, pattern.length, text.get, text.length, col.handle, iterator, e);
00230                 testError (e, "failed to open search from collator");
00231         }
00232 
00233         /***********************************************************************
00234         
00235                 Close this USearch
00236 
00237         ***********************************************************************/
00238 
00239         ~this ()
00240         {
00241                 usearch_close (handle);
00242         }
00243 
00244         /***********************************************************************
00245         
00246                 Sets the current position in the text string which the 
00247                 next search will start from.
00248                 
00249         ***********************************************************************/
00250 
00251         void setOffset (uint position)
00252         {       
00253                 Error e;
00254 
00255                 usearch_setOffset (handle, position, e);
00256                 testError (e, "failed to set search offset");
00257         }
00258 
00259         /***********************************************************************
00260         
00261                 Return the current index in the string text being searched
00262 
00263         ***********************************************************************/
00264 
00265         uint getOffset ()
00266         {       
00267                 return usearch_getOffset (handle);
00268         }
00269 
00270         /***********************************************************************
00271         
00272                 Returns the index to the match in the text string that was 
00273                 searched
00274 
00275         ***********************************************************************/
00276 
00277         uint getMatchedStart ()
00278         {       
00279                 return usearch_getMatchedStart (handle);
00280         }
00281 
00282         /***********************************************************************
00283         
00284                 Returns the length of text in the string which matches the 
00285                 search pattern
00286 
00287         ***********************************************************************/
00288 
00289         uint getMatchedLength ()
00290         {       
00291                 return usearch_getMatchedLength (handle);
00292         }
00293 
00294         /***********************************************************************
00295         
00296                 Returns the text that was matched by the most recent call to 
00297                 first(), next(), previous(), or last().
00298 
00299         ***********************************************************************/
00300 
00301         void getMatchedText (UString s)
00302         {       
00303                 uint fmt (wchar* dst, uint length, inout Error e)
00304                 {
00305                         return usearch_getMatchedText (handle, dst, length, e);
00306                 }
00307 
00308                 s.format (&fmt, "failed to extract matched text");
00309         }
00310 
00311         /***********************************************************************
00312         
00313                 Set the string text to be searched.
00314 
00315         ***********************************************************************/
00316 
00317         void setText (UText t)
00318         {       
00319                 Error e;
00320 
00321                 usearch_setText (handle, t.get, t.length, e);
00322                 testError (e, "failed to set search text");
00323         }
00324 
00325         /***********************************************************************
00326                 
00327                 Return the string text to be searched. Note that this 
00328                 returns a read-only reference to the search text.
00329 
00330         ***********************************************************************/
00331 
00332         UText getText ()
00333         {       
00334                 uint len;
00335 
00336                 wchar *x = usearch_getText (handle, &len);
00337                 return new UText (x[0..len]);
00338         }
00339 
00340         /***********************************************************************
00341         
00342                 Sets the pattern used for matching
00343 
00344         ***********************************************************************/
00345 
00346         void setPattern (UText t)
00347         {       
00348                 Error e;
00349 
00350                 usearch_setPattern (handle, t.get, t.length, e);
00351                 testError (e, "failed to set search pattern");
00352         }
00353 
00354         /***********************************************************************
00355                 
00356                 Gets the search pattern. Note that this returns a 
00357                 read-only reference to the pattern.
00358 
00359         ***********************************************************************/
00360 
00361         UText getPattern ()
00362         {       
00363                 uint len;
00364 
00365                 wchar *x = usearch_getPattern (handle, &len);
00366                 return new UText (x[0..len]);
00367         }
00368 
00369         /***********************************************************************
00370         
00371                 Set the BreakIterator that will be used to restrict the 
00372                 points at which matches are detected.
00373 
00374         ***********************************************************************/
00375 
00376         void setIterator (UBreakIterator iterator)
00377         {       
00378                 Error e;
00379 
00380                 this.iterator = iterator;
00381                 usearch_setBreakIterator (handle, iterator.handle, e);
00382                 testError (e, "failed to set search iterator");
00383         }
00384 
00385         /***********************************************************************
00386         
00387                 Get the BreakIterator that will be used to restrict the 
00388                 points at which matches are detected.
00389 
00390         ***********************************************************************/
00391 
00392         UBreakIterator getIterator ()
00393         {       
00394                 return iterator;
00395         }
00396 
00397         /***********************************************************************
00398                 
00399                 Returns the first index at which the string text matches 
00400                 the search pattern
00401                                         
00402         ***********************************************************************/
00403 
00404         uint first ()
00405         {     
00406                 Error e;
00407                 
00408                 uint x = usearch_first (handle, e);
00409                 testError (e, "failed on first search");  
00410                 return x;
00411         }
00412 
00413         /***********************************************************************
00414                 
00415                 Returns the last index in the target text at which it 
00416                 matches the search pattern
00417 
00418         ***********************************************************************/
00419 
00420         uint last ()
00421         {     
00422                 Error e;
00423                 
00424                 uint x = usearch_last (handle, e);
00425                 testError (e, "failed on last search");  
00426                 return x;
00427         }
00428 
00429         /***********************************************************************
00430                 
00431                 Returns the index of the next point at which the string 
00432                 text matches the search pattern, starting from the current
00433                 position.
00434 
00435                 If pos is specified, returns the first index greater than 
00436                 pos at which the string text matches the search pattern
00437 
00438         ***********************************************************************/
00439 
00440         uint next (uint pos = uint.max)
00441         {     
00442                 Error e;
00443                 uint  x;
00444 
00445                 x = (pos == uint.max) ? usearch_next (handle, e) : 
00446                                         usearch_following (handle, pos, e);
00447 
00448                 testError (e, "failed on next search");  
00449                 return x;
00450         }
00451 
00452         /***********************************************************************
00453 
00454                 Returns the index of the previous point at which the 
00455                 string text matches the search pattern, starting at 
00456                 the current position.                 
00457 
00458                 If pos is specified, returns the first index less 
00459                 than pos at which the string text matches the search 
00460                 pattern.
00461 
00462         ***********************************************************************/
00463 
00464         uint previous (uint pos = uint.max)
00465         {     
00466                 Error e;
00467                 uint  x;
00468 
00469                 x = (pos == uint.max) ? usearch_previous  (handle, e) : 
00470                                         usearch_preceding (handle, pos, e);
00471 
00472                 testError (e, "failed on next search");  
00473                 return x;
00474         }
00475 
00476         /***********************************************************************
00477         
00478                 Search will begin at the start of the text string if a 
00479                 forward iteration is initiated before a backwards iteration. 
00480                 Otherwise if a backwards iteration is initiated before a 
00481                 forwards iteration, the search will begin at the end of the 
00482                 text string
00483 
00484         ***********************************************************************/
00485 
00486         void reset ()
00487         {     
00488                 usearch_reset (handle);
00489         }
00490 
00491         /***********************************************************************
00492         
00493                 Gets the collator used for the language rules. 
00494 
00495         ***********************************************************************/
00496 
00497         UCollator getCollator ()
00498         {
00499                 return new UCollator (usearch_getCollator (handle));
00500         }
00501 
00502         /***********************************************************************
00503         
00504                 Sets the collator used for the language rules. This 
00505                 method causes internal data such as Boyer-Moore shift 
00506                 tables to be recalculated, but the iterator's position 
00507                 is unchanged
00508 
00509         ***********************************************************************/
00510 
00511         void setCollator (UCollator col)
00512         {
00513                 Error e;
00514 
00515                 usearch_setCollator (handle, col.handle, e);
00516                 testError (e, "failed to set search collator");  
00517         }
00518 
00519 
00520         /***********************************************************************
00521 
00522                 Bind the ICU functions from a shared library. This is
00523                 complicated by the issues regarding D and DLLs on the
00524                 Windows platform
00525         
00526         ***********************************************************************/
00527               
00528         private static void* library;
00529 
00530         /***********************************************************************
00531 
00532         ***********************************************************************/
00533 
00534         private static extern (C) 
00535         {
00536                 Handle  function (wchar*, uint, wchar*, uint, char*, void*, inout Error) usearch_open;
00537                 Handle  function (wchar*, uint, wchar*, uint, Handle, void*, inout Error) usearch_openFromCollator;
00538                 void    function (Handle) usearch_close;
00539                 void    function (Handle, uint, inout Error) usearch_setOffset;
00540                 uint    function (Handle) usearch_getOffset;
00541                 uint    function (Handle) usearch_getMatchedStart;
00542                 uint    function (Handle) usearch_getMatchedLength;
00543                 uint    function (Handle, wchar*, uint, inout Error) usearch_getMatchedText;
00544                 void    function (Handle, wchar*, uint, inout Error) usearch_setText;
00545                 wchar*  function (Handle, uint*) usearch_getText;
00546                 void    function (Handle, wchar*, uint, inout Error) usearch_setPattern;
00547                 wchar*  function (Handle, uint*) usearch_getPattern;
00548                 uint    function (Handle, inout Error) usearch_first;
00549                 uint    function (Handle, inout Error) usearch_last;
00550                 uint    function (Handle, inout Error) usearch_next;
00551                 uint    function (Handle, inout Error) usearch_previous;
00552                 uint    function (Handle, uint, inout Error) usearch_following;
00553                 uint    function (Handle, uint, inout Error) usearch_preceding;
00554                 void    function (Handle) usearch_reset;
00555                 void    function (Handle, Handle, inout Error) usearch_setBreakIterator;
00556                 Handle  function (Handle) usearch_getCollator;
00557                 void    function (Handle, Handle, inout Error) usearch_setCollator;
00558         }
00559 
00560         /***********************************************************************
00561 
00562         ***********************************************************************/
00563 
00564         static  FunctionLoader.Bind[] targets = 
00565                 [
00566                 {cast(void**) &usearch_open,             "usearch_open"}, 
00567                 {cast(void**) &usearch_openFromCollator, "usearch_openFromCollator"}, 
00568                 {cast(void**) &usearch_close,            "usearch_close"},
00569                 {cast(void**) &usearch_setOffset,        "usearch_setOffset"},
00570                 {cast(void**) &usearch_getOffset,        "usearch_getOffset"},
00571                 {cast(void**) &usearch_getMatchedStart,  "usearch_getMatchedStart"},
00572                 {cast(void**) &usearch_getMatchedLength, "usearch_getMatchedLength"},
00573                 {cast(void**) &usearch_getMatchedText,   "usearch_getMatchedText"},
00574                 {cast(void**) &usearch_setText,          "usearch_setText"},
00575                 {cast(void**) &usearch_getText,          "usearch_getText"},
00576                 {cast(void**) &usearch_setPattern,       "usearch_setPattern"},
00577                 {cast(void**) &usearch_getPattern,       "usearch_getPattern"},
00578                 {cast(void**) &usearch_first,            "usearch_first"},
00579                 {cast(void**) &usearch_last,             "usearch_last"},
00580                 {cast(void**) &usearch_next,             "usearch_next"},
00581                 {cast(void**) &usearch_previous,         "usearch_previous"},
00582                 {cast(void**) &usearch_following,        "usearch_following"},
00583                 {cast(void**) &usearch_preceding,        "usearch_preceding"},
00584                 {cast(void**) &usearch_reset,            "usearch_reset"},
00585                 {cast(void**) &usearch_setBreakIterator, "usearch_setBreakIterator"},
00586                 {cast(void**) &usearch_getCollator,      "usearch_getCollator"},
00587                 {cast(void**) &usearch_setCollator,      "usearch_setCollator"},
00588                 ];
00589 
00590         /***********************************************************************
00591 
00592         ***********************************************************************/
00593 
00594         static this ()
00595         {
00596                 library = FunctionLoader.bind (icuin, targets);
00597         }
00598 
00599         /***********************************************************************
00600 
00601         ***********************************************************************/
00602 
00603         static ~this ()
00604         {
00605                 FunctionLoader.unbind (library);
00606         }
00607 }

Generated on Tue Jan 25 21:18:25 2005 for Mango by doxygen 1.3.6