Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members | Related Pages

URegex.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file URegex.d
00004         
00005         Copyright (c) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026         4. Derivative works are permitted, but they must carry this notice
00027            in full and credit the original source.
00028 
00029 
00030                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00031 
00032 
00033         @version        Initial version, November 2004      
00034         @author         Kris
00035 
00036         Note that this package and documentation is built around the ICU 
00037         project (http://oss.software.ibm.com/icu/). Below is the license 
00038         statement as specified by that software:
00039 
00040 
00041                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00042 
00043 
00044         ICU License - ICU 1.8.1 and later
00045 
00046         COPYRIGHT AND PERMISSION NOTICE
00047 
00048         Copyright (c) 1995-2003 International Business Machines Corporation and 
00049         others.
00050 
00051         All rights reserved.
00052 
00053         Permission is hereby granted, free of charge, to any person obtaining a
00054         copy of this software and associated documentation files (the
00055         "Software"), to deal in the Software without restriction, including
00056         without limitation the rights to use, copy, modify, merge, publish,
00057         distribute, and/or sell copies of the Software, and to permit persons
00058         to whom the Software is furnished to do so, provided that the above
00059         copyright notice(s) and this permission notice appear in all copies of
00060         the Software and that both the above copyright notice(s) and this
00061         permission notice appear in supporting documentation.
00062 
00063         THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
00064         OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00065         MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
00066         OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
00067         HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
00068         INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
00069         FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
00070         NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
00071         WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
00072 
00073         Except as contained in this notice, the name of a copyright holder
00074         shall not be used in advertising or otherwise to promote the sale, use
00075         or other dealings in this Software without prior written authorization
00076         of the copyright holder.
00077 
00078         ----------------------------------------------------------------------
00079 
00080         All trademarks and registered trademarks mentioned herein are the 
00081         property of their respective owners.
00082 
00083 *******************************************************************************/
00084 
00085 module mango.icu.URegex;
00086 
00087 private import  mango.icu.ICU;
00088 
00089 public  import  mango.icu.ULocale,
00090                 mango.icu.UString,
00091                 mango.icu.UCollator,
00092                 mango.icu.UBreakIterator;
00093 
00094 
00095 /*******************************************************************************
00096 
00097         Set of slices to return for group matching. See URegex.groups()
00098 
00099 *******************************************************************************/
00100 
00101 class Groups : ICU
00102 {       // URegex.groups() fills these by stepping a pointer forward from &g0, so keep this order
00103         public wchar[] g0;      // the entire match
00104         public wchar[] g1;      // g1 through g9: the parenthesised sub-expressions
00105         public wchar[] g2;
00106         public wchar[] g3;
00107         public wchar[] g4;
00108         public wchar[] g5;
00109         public wchar[] g6;
00110         public wchar[] g7;
00111         public wchar[] g8;
00112         public wchar[] g9;
00113 }
00114 
00115 /*******************************************************************************
00116 
00117         Apis for an engine that provides regular-expression searching of
00118         UTF16 strings.
00119 
00120         See http://icu.sourceforge.net/apiref/icu4c/uregex_8h.html for full
00121         details.
00122 
00123 *******************************************************************************/
00124 
00125 class URegex : Groups
00126 {       
00127         private Handle  handle;         // ICU regex handle: assigned by the constructors, passed to uregex_close() by the destructor
00128         private UText   theText;        // subject text last given to setText(); no constructor assigns it
00129 
00130         // Regex modes 
00131         public enum     Flag 
00132                         {
00133                         None            = 0,
00134 
00135                         // Letter case is ignored while matching
00136                         CaseInsensitive = 1 << 1,       // 2
00137 
00138                         // White space and comments may appear inside the pattern
00139                         Comments        = 1 << 2,       // 4
00140 
00141                         // Governs "$" and "^". When set, line terminators inside
00142                         // the string are recognised; when clear, the two match
00143                         // only at the start and end of the input string.
00144                         MultiLine       = 1 << 3,       // 8
00145 
00146                         // When set, '.' also matches line terminators; when 
00147                         // clear, '.' stops matching at the end of a line
00148                         DotAll          = 1 << 5,       // 32
00149                         
00150                         // Forces normalization of pattern and strings
00151                         CanonEq         = 1 << 7,       // 128
00152 
00153                         // When set, word boundaries follow the Unicode TR 29 
00154                         // definition. Warning: these differ considerably from 
00155                         // traditional regular expression word boundaries. 
00156                         // See http://unicode.org/reports/tr29/#Word_Boundaries
00157                         UWord           = 1 << 8,       // 256
00158                         }
00159 
00160         /***********************************************************************
00161 
00162                 Compiles the regular expression in string form into an 
00163                 internal representation using the specified match mode 
00164                 flags. The resulting regular expression handle can then 
00165                 be used to perform various matching operations.
00166 
00167         ***********************************************************************/
00168 
00169         this (wchar[] pattern, Flag flags=Flag.None, ParseError* pe=null)
00170         {       // compile 'pattern' under 'flags'; 'pe' is handed to uregex_open untouched
00171                 Error e;        // status written by each ICU call below
00172                 handle = uregex_open (pattern.ptr, pattern.length, flags, pe, e);
00173                 testError (e, "failed to open regex");
00174                 uregex_setText (handle, "", 0, e);      // begin with an empty subject string
00175                 testError (e, "failed to set regex text");      // this status used to be discarded unchecked
00176         }
00177 
00178         /***********************************************************************
00179 
00180                 Compiles the regular expression in string form into an 
00181                 internal representation using the specified match mode 
00182                 flags. The resulting regular expression handle can then 
00183                 be used to perform various matching operations.
00184 
00185         ***********************************************************************/
00186 
00187         this (UText pattern, Flag flags=Flag.None, ParseError* pe=null)
00188         {       // forward the pattern's content, with the same flags and parse-error slot
00189                 this (pattern.get(), flags, pe);
00190         }
00191 
00192         /***********************************************************************
00193 
00194                 Internal constructor; used for cloning
00195 
00196         ***********************************************************************/
00197 
00198         private this (Handle handle)
00199         {       // wrap an existing ICU handle; clone() is the caller within this class
00200                 Error e;
00201                 this.handle = handle;   // the parameter shadows the member, hence 'this.'
00202                 uregex_setText (handle, "", 0, e);      // begin with an empty subject string
00203                 testError (e, "failed to set regex text");      // this status used to be discarded unchecked
00204         }
00205 
00206         /***********************************************************************
00207         
00208                 Close the regular expression, recovering all resources (memory) 
00209                 it was holding
00210 
00211         ***********************************************************************/
00212 
00213         ~this ()
00214         {       // give the handle back to ICU so the resources behind it are recovered
00215                 uregex_close (this.handle);
00216         }
00217 
00218         /***********************************************************************
00219         
00220                 Cloning a regular expression is faster than opening a second 
00221                 instance from the source form of the expression, and requires 
00222                 less memory.
00223 
00224                 Note that the current input string and the position of any 
00225                 matched text within it are not cloned; only the pattern itself 
00226                 and the match mode flags are copied.
00227 
00228                 Cloning can be particularly useful to threaded applications 
00229                 that perform multiple match operations in parallel. Each 
00230                 concurrent RE operation requires its own instance of a 
00231                 URegularExpression.
00232 
00233         ***********************************************************************/
00234 
00235         URegex clone ()
00236         {       
00237                 Error status;
00238                 // have ICU duplicate this handle, then wrap the duplicate in a new URegex
00239                 Handle copy = uregex_clone (handle, status);
00240                 testError (status, "failed to clone regex");
00241                 return new URegex (copy);
00242         }
00243 
00244         /***********************************************************************
00245 
00246                 Return a copy of the source form of the pattern for this 
00247                 regular expression
00248 
00249         ***********************************************************************/
00250 
00251         UString getPattern ()
00252         {       
00253                 uint  extent;           // receives the pattern length from ICU
00254                 Error status;
00255                 // ICU hands back a pointer to the pattern text together with its length
00256                 wchar* text = uregex_pattern (handle, extent, status);
00257                 testError (status, "failed to extract regex pattern");
00258                 return new UString (text[0..extent]);
00259         }
00260 
00261         /***********************************************************************
00262 
00263                 Get the match mode flags that were specified when compiling 
00264                 this regular expression        
00265 
00266         ***********************************************************************/
00267 
00268         Flag getFlags ()
00269         {       
00270                 Error status;
00271                 // ICU reports the flags as a plain uint; they are cast back to Flag on return
00272                 uint bits = uregex_flags (handle, status);
00273                 testError (status, "failed to get regex flags");
00274                 return cast(Flag) bits;
00275         }
00276 
00277         /***********************************************************************
00278         
00279                 Set the subject text string upon which the regular expression 
00280                 will look for matches.
00281 
00282                 This function may be called any number of times, allowing the 
00283                 regular expression pattern to be applied to different strings.
00284 
00285                 Regular expression matching operations work directly on the 
00286                 application's string data. No copy is made. The subject string 
00287                 data must not be altered after calling this function until after 
00288                 all regular expression operations involving this string data are 
00289                 completed.
00290 
00291                 Zero length strings are permitted. In this case, no subsequent 
00292                 match operation will dereference the text string pointer.
00293 
00294         ***********************************************************************/
00295 
00296         void setText (UText t)
00297         {       
00298                 Error e;
00299                 // pass the buffer by explicit .ptr (as the constructor does) and its length
00300                 uregex_setText (handle, t.get().ptr, t.length, e);
00301                 testError (e, "failed to set regex text");
00302                 theText = t;    // recorded only once ICU has accepted it, so getText()/groups() stay in step with ICU
00303         }
00304 
00305         /***********************************************************************
00306                 
00307                 Get the subject text that is currently associated with this 
00308                 regular expression object. This simply returns whatever was
00309                 previously supplied via setText(). 
00310 
00311                 Note that this returns a read-only reference to the text.
00312 
00313         ***********************************************************************/
00314 
00315         UText getText ()
00316         {       // whatever setText() stored last; nothing else in this class assigns it
00317                 return this.theText;
00318         }
00319 
00320         /***********************************************************************
00321 
00322                 Return a set of slices representing the parenthesised groups.
00323                 This can be used in the following manner:               
00324 
00325                 @code
00326                 wchar[] msg;
00327 
00328                 if (regex.next())
00329                     with (regex.groups())
00330                           msg ~= g1 ~ ":" ~ g2;
00331                 @endcode
00332 
00333                 Note that g0 represents the entire match, whereas g1 through
00334                 g9 represent the parenthesised expressions.
00335                 
00336         ***********************************************************************/
00337 
00338         Groups groups ()
00339         {       // refresh g0..g9 from the latest match and return this object viewed as Groups
00340                 wchar[]*        slot = &g0;             // walks the inherited fields g0, g1, ... in declaration order
00341                 uint            last = groupCount();
00342                 wchar[]         content = theText ? theText.get() : cast(wchar[]) null;  // theText is null until setText() runs
00343                 for (uint i=0; i <= last && i <= 9; ++slot, ++i)        // g0..g9 is all the room there is
00344                     {
00345                     uint from = start(i), to = end(i);  // a group outside the match yields -1, seen here as uint.max
00346                     *slot = (from == uint.max || to == uint.max) ? cast(wchar[]) null : content [from..to];
00347                     }
00348                 return this;
00349         }
00350 
00351         /***********************************************************************
00352 
00353                 Extract the string for the specified matching expression or 
00354                 subexpression. UString 's' is the destination for the match.
00355 
00356                 Group #0 is the complete string of matched text. Group #1 is 
00357                 the text matched by the first set of capturing parentheses.
00358         
00359         ***********************************************************************/
00360 
00361         void group (UString s, uint index)
00362         {       // copy the text of capture group 'index' into 's'
00363                 uint extract (wchar* dst, uint length, inout Error e)
00364                 {
00365                         return uregex_group (handle, index, dst, length, e);
00366                 }
00367                 // s.format receives the nested function above; how it drives it is defined in UString (not shown here)
00368                 s.format (&extract, "failed to extract regex group text");
00369         }
00370 
00371         /***********************************************************************
00372         
00373                 Get the number of capturing groups in this regular 
00374                 expression's pattern
00375 
00376         ***********************************************************************/
00377 
00378         uint groupCount ()
00379         {       
00380                 Error status;
00381                 // ask ICU for the number of capturing groups in the pattern
00382                 uint total = uregex_groupCount (handle, status);
00383                 testError (status, "failed to get regex group-count");
00384                 return total;
00385         }
00386 
00387         /***********************************************************************
00388                 
00389                 Returns the index in the input string of the start of the 
00390                 text matched by the specified capture group during the 
00391                 previous match operation.
00392 
00393                 Return -1 if the capture group was not part of the last 
00394                 match. Group #0 refers to the complete range of matched 
00395                 text. Group #1 refers to the text matched by the first 
00396                 set of capturing parentheses
00397 
00398         ***********************************************************************/
00399 
00400         uint start (uint index = 0)
00401         {       
00402                 Error status;
00403                 // the return type is unsigned, so the documented -1 reaches the caller as uint.max
00404                 uint offset = uregex_start (handle, index, status);
00405                 testError (status, "failed to get regex start");
00406                 return offset;
00407         }
00408 
00409         /***********************************************************************
00410 
00411                 Returns the index in the input string of the position 
00412                 following the end of the text matched by the specified 
00413                 capture group.
00414 
00415                 Return -1 if the capture group was not part of the last 
00416                 match. Group #0 refers to the complete range of matched 
00417                 text. Group #1 refers to the text matched by the first 
00418                 set of capturing parentheses.
00419         
00420         ***********************************************************************/
00421 
00422         uint end (uint index = 0)
00423         {       
00424                 Error status;
00425                 // the return type is unsigned, so the documented -1 reaches the caller as uint.max
00426                 uint offset = uregex_end (handle, index, status);
00427                 testError (status, "failed to get regex end");
00428                 return offset;
00429         }
00430 
00431         /***********************************************************************
00432 
00433                 Reset any saved state from the previous match.
00434 
00435                 Has the effect of causing uregex_findNext to begin at the 
00436                 specified index, and causing uregex_start(), uregex_end() 
00437                 and uregex_group() to return an error indicating that there 
00438                 is no match information available.
00439         
00440         ***********************************************************************/
00441 
00442         void reset (uint startIndex)
00443         {       
00444                 Error status;
00445                 // discard saved match state so that searching resumes from startIndex
00446                 uregex_reset (handle, startIndex, status);
00447                 testError (status, "failed to set regex next-index");
00448         }
00449 
00450         /***********************************************************************
00451         
00452                 Attempts to match the input string, beginning at startIndex, 
00453                 against the pattern.
00454 
00455                 To succeed, the match must extend to the end of the input 
00456                 string
00457 
00458         ***********************************************************************/
00459 
00460         bool match (uint startIndex)
00461         {       
00462                 Error status;
00463                 // whole-input match attempt beginning at startIndex
00464                 bool matched = uregex_matches (handle, startIndex, status);
00465                 testError (status, "failed while matching regex");
00466                 return matched;
00467         }
00468 
00469         /***********************************************************************
00470 
00471                 Attempts to match the input string, starting from the 
00472                 specified index, against the pattern.
00473 
00474                 The match may be of any length, and is not required to 
00475                 extend to the end of the input string. Contrast with match()        
00476 
00477         ***********************************************************************/
00478 
00479         bool probe (uint startIndex)
00480         {       
00481                 Error status;
00482                 // match attempt from startIndex that need not reach the end of the input
00483                 bool matched = uregex_lookingAt (handle, startIndex, status);
00484                 testError (status, "failed while looking at regex");
00485                 return matched;
00486         }
00487 
00488         /***********************************************************************
00489                 
00490                 Returns whether the text matches the search pattern, starting 
00491                 from the current position.
00492 
00493                 If startIndex is specified, the current position is moved to 
00494                 the specified location before the search is initiated.
00495 
00496         ***********************************************************************/
00497 
00498         bool next (uint startIndex = uint.max)
00499         {     
00500                 Error status;
00501                 bool  found;
00502                 // uint.max is the "no index supplied" default: carry on from the current position
00503                 if (startIndex == uint.max) found = uregex_findNext (handle, status);
00504                 else                        found = uregex_find     (handle, startIndex, status);
00505 
00506                 testError (status, "failed on next regex");  
00507                 return found;
00508         }
00509 
00510         /***********************************************************************
00511         
00512                 Replaces every substring of the input that matches the pattern 
00513                 with the given replacement string.
00514 
00515                 This is a convenience function that provides a complete 
00516                 find-and-replace-all operation.
00517 
00518                 This method scans the input string looking for matches of 
00519                 the pattern. Input that is not part of any match is copied 
00520                 unchanged to the destination buffer. Matched regions are 
00521                 replaced in the output buffer by the replacement string. 
00522                 The replacement string may contain references to capture 
00523                 groups; these take the form of $1, $2, etc.
00524 
00525                 The provided 'result' will contain the results, and should
00526                 be set with a length sufficient to house the entire result.
00527                 Upon completion, the 'result' is shortened appropriately 
00528                 and the total extent (length) of the operation is returned. 
00529                 Set the initial length of 'result' using the UString method
00530                 truncate().
00531 
00532                 The returned extent should be checked to ensure it is not
00533                 longer than the length of 'result'. If it is longer, then
00534                 the result has been truncated.
00535                 
00536         ***********************************************************************/
00537 
00538         uint replaceAll (UText replace, UString result)
00539         {
00540                 Error status;
00541                 // result's current length is passed to ICU as the capacity of the destination
00542                 uint extent = uregex_replaceAll (handle, replace.get, replace.length, result.get, result.length, status);
00543                 testError (status, "failed during regex replace");  
00544                 result.truncate (extent);       // shorten 'result' to the extent ICU reported
00545                 return extent;
00546         }
00547 
00548         /***********************************************************************
00549         
00550                 Replaces the first substring of the input that matches the 
00551                 pattern with the given replacement string.
00552 
00553                 This is a convenience function that provides a complete 
00554                 find-and-replace operation.
00555 
00556                 This method scans the input string looking for a match of 
00557                 the pattern. All input that is not part of the match is 
00558                 copied unchanged to the destination buffer. The matched 
00559                 region is replaced in the output buffer by the replacement 
00560                 string. The replacement string may contain references to 
00561                 capture groups; these take the form of $1, $2, etc
00562 
00563                 The provided 'result' will contain the results, and should
00564                 be set with a length sufficient to house the entire result.
00565                 Upon completion, the 'result' is shortened appropriately 
00566                 and the total extent (length) of the operation is returned. 
00567                 Set the initial length of 'result' using the UString method
00568                 truncate().
00569 
00570                 The returned extent should be checked to ensure it is not
00571                 longer than the length of 'result'. If it is longer, then
00572                 the result has been truncated.
00573                 
00574         ***********************************************************************/
00575 
00576         uint replaceFirst (UText replace, UString result)
00577         {
00578                 Error status;
00579                 // result's current length is passed to ICU as the capacity of the destination
00580                 uint extent = uregex_replaceFirst (handle, replace.get, replace.length, result.get, result.length, status);
00581                 testError (status, "failed during regex replace");  
00582                 result.truncate (extent);       // shorten 'result' to the extent ICU reported
00583                 return extent;
00584         }
00585 
00586         /***********************************************************************
00587         
00588                 Split the text up into slices (fields), where each slice 
00589                 represents the text situated between each pattern matched
00590                 within the text. The pattern is expected to represent one
00591                 or more slice delimiters.
00592 
00593         ***********************************************************************/
00594 
00595         uint split (wchar[][] fields)
00596         {       // fill 'fields' with the pieces lying between delimiter matches; returns how many were stored
00597                 Error           e;
00598                 uint            pos,            // offset just past the previous delimiter
00599                                 count;          // fields stored so far
00600                 wchar[]         content;
00601                 // theText stays null until setText() has been called
00602                 if (theText)
00603                     content = theText.get();
00604 
00605                 while (count < fields.length && uregex_findNext (handle, e) && e == e.OK)
00606                       {
00607                       uint i = start();
00608                       // a delimiter at the very start has no field in front of it
00609                       if (i)
00610                           fields[count++] = content[pos..i];
00611                       pos = end ();
00612                       }
00613                 testError (e, "failed during split");  
00614                 // the text after the final delimiter is a field too (it used to be dropped)
00615                 if (count < fields.length && pos < content.length)
00616                     fields[count++] = content[pos..content.length];
00617                 return count;
00618         }
00619 
00620 
00621         /***********************************************************************
00622 
00623                 Bind the ICU functions from a shared library. This is
00624                 complicated by the issues regarding D and DLLs on the
00625                 Windows platform
00626         
00627         ***********************************************************************/
00628               
00629         private static void* library;   // value returned by FunctionLoader.bind in static this(); handed to unbind in static ~this()
00630 
00631         /***********************************************************************
00632 
00633         ***********************************************************************/
00634 
00635         private static extern (C)       // C-linkage function pointers; their addresses are listed in 'targets' for FunctionLoader.bind
00636         {
00637                 Handle  function (wchar*, uint, uint, ParseError*, inout Error) uregex_open;
00638                 void    function (Handle) uregex_close;
00639                 Handle  function (Handle, inout Error) uregex_clone;
00640                 wchar*  function (Handle, inout uint, inout Error) uregex_pattern;
00641                 uint    function (Handle, inout Error) uregex_flags;
00642                 void    function (Handle, wchar*, uint, inout Error) uregex_setText;
00643                 wchar*  function (Handle, inout uint, inout Error) uregex_getText;      // bound, but never called by this class
00644                 uint    function (Handle, uint, wchar*, uint, inout Error) uregex_group;
00645                 uint    function (Handle, inout Error) uregex_groupCount;
00646                 uint    function (Handle, uint, inout Error) uregex_start;      // unsigned here, so a -1 result shows up as uint.max
00647                 uint    function (Handle, uint, inout Error) uregex_end;        // unsigned here, so a -1 result shows up as uint.max
00648                 void    function (Handle, uint, inout Error) uregex_reset;
00649                 bool    function (Handle, uint, inout Error) uregex_matches;
00650                 bool    function (Handle, uint, inout Error) uregex_lookingAt;
00651                 bool    function (Handle, uint, inout Error) uregex_find;
00652                 bool    function (Handle, inout Error) uregex_findNext;
00653                 uint    function (Handle, wchar*, uint, wchar*, uint, inout Error) uregex_replaceAll;
00654                 uint    function (Handle, wchar*, uint, wchar*, uint, inout Error) uregex_replaceFirst;
00655         }
00656 
00657         /***********************************************************************
00658 
00659         ***********************************************************************/
00660 
00661         static  FunctionLoader.Bind[] targets =         // one entry per function pointer declared above: its address plus a symbol name
00662                 [
00663                 {cast(void**) &uregex_open,             "uregex_open"}, 
00664                 {cast(void**) &uregex_close,            "uregex_close"},
00665                 {cast(void**) &uregex_clone,            "uregex_clone"},
00666                 {cast(void**) &uregex_pattern,          "uregex_pattern"},
00667                 {cast(void**) &uregex_flags,            "uregex_flags"},
00668                 {cast(void**) &uregex_setText,          "uregex_setText"},
00669                 {cast(void**) &uregex_getText,          "uregex_getText"},
00670                 {cast(void**) &uregex_group,            "uregex_group"},
00671                 {cast(void**) &uregex_groupCount,       "uregex_groupCount"},
00672                 {cast(void**) &uregex_start,            "uregex_start"},
00673                 {cast(void**) &uregex_end,              "uregex_end"},
00674                 {cast(void**) &uregex_reset,            "uregex_reset"},
00675                 {cast(void**) &uregex_matches,          "uregex_matches"},
00676                 {cast(void**) &uregex_lookingAt,        "uregex_lookingAt"},
00677                 {cast(void**) &uregex_find,             "uregex_find"},
00678                 {cast(void**) &uregex_findNext,         "uregex_findNext"},
00679                 {cast(void**) &uregex_replaceAll,       "uregex_replaceAll"},
00680                 {cast(void**) &uregex_replaceFirst,     "uregex_replaceFirst"},
00681                 ];      // passed to FunctionLoader.bind by static this()
00682 
00683         /***********************************************************************
00684 
00685         ***********************************************************************/
00686 
00687         static this ()
00688         {       // bind every entry of 'targets' against 'icuin' and keep what bind returns for the matching unbind
00689                 library = FunctionLoader.bind (icuin, targets);
00690         }
00691 
00692         /***********************************************************************
00693 
00694         ***********************************************************************/
00695 
00696         static ~this ()
00697         {       // hand back the value that static this() obtained from FunctionLoader.bind
00698                 FunctionLoader.unbind (library);
00699         }
00700 }

Generated on Sat Apr 9 20:11:30 2005 for Mango by doxygen 1.3.6