/*******************************************************************************

        @file URegex.d

        Copyright (c) 2004 Kris Bell

        This software is provided 'as-is', without any express or implied
        warranty. In no event will the authors be held liable for damages
        of any kind arising from the use of this software.

        Permission is hereby granted to anyone to use this software for any
        purpose, including commercial applications, and to alter it and/or
        redistribute it freely, subject to the following restrictions:

        1. The origin of this software must not be misrepresented; you must
           not claim that you wrote the original software. If you use this
           software in a product, an acknowledgment within documentation of
           said product would be appreciated but is not required.

        2. Altered source versions must be plainly marked as such, and must
           not be misrepresented as being the original software.

        3. This notice may not be removed or altered from any distribution
           of the source.

        4. Derivative works are permitted, but they must carry this notice
           in full and credit the original source.


                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


        @version        Initial version, November 2004
        @author         Kris

        Note that this package and documentation is built around the ICU
        project (http://oss.software.ibm.com/icu/). Below is the license
        statement as specified by that software:


                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


        ICU License - ICU 1.8.1 and later

        COPYRIGHT AND PERMISSION NOTICE

        Copyright (c) 1995-2003 International Business Machines Corporation and
        others.

        All rights reserved.

        Permission is hereby granted, free of charge, to any person obtaining a
        copy of this software and associated documentation files (the
        "Software"), to deal in the Software without restriction, including
        without limitation the rights to use, copy, modify, merge, publish,
        distribute, and/or sell copies of the Software, and to permit persons
        to whom the Software is furnished to do so, provided that the above
        copyright notice(s) and this permission notice appear in all copies of
        the Software and that both the above copyright notice(s) and this
        permission notice appear in supporting documentation.

        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
        OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
        MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
        OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
        HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
        INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
        FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
        NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
        WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

        Except as contained in this notice, the name of a copyright holder
        shall not be used in advertising or otherwise to promote the sale, use
        or other dealings in this Software without prior written authorization
        of the copyright holder.

        ----------------------------------------------------------------------

        All trademarks and registered trademarks mentioned herein are the
        property of their respective owners.

*******************************************************************************/

module mango.icu.URegex;

private import  mango.icu.ICU;

public  import  mango.icu.ULocale,
                mango.icu.UString,
                mango.icu.UCollator,
                mango.icu.UBreakIterator;


/*******************************************************************************

        Set of slices to return for group matching. See URegex.groups()

        URegex.groups() fills these fields by stepping a pointer from g0
        onwards, so the ten fields must remain declared together and in
        this order.

*******************************************************************************/

class Groups : ICU
{
        public  wchar[] g0,
                        g1,
                        g2,
                        g3,
                        g4,
                        g5,
                        g6,
                        g7,
                        g8,
                        g9;
}

/*******************************************************************************

        Apis for an engine that provides regular-expression searching of
        UTF16 strings.

        See http://icu.sourceforge.net/apiref/icu4c/uregex_8h.html for full
        details.

*******************************************************************************/

class URegex : Groups
{
        // handle of the compiled ICU regular expression
        private Handle  handle;

        // subject text most recently supplied via setText(); remains
        // null until setText() has been called
        private UText   theText;

        // Regex modes
        public enum     Flag
                        {
                        None            = 0,

                        // Enable case insensitive matching
                        CaseInsensitive = 2,

                        // Allow white space and comments within patterns
                        Comments        = 4,

                        // Control behavior of "$" and "^". If set, recognize
                        // line terminators within string, otherwise, match
                        // only at start and end of input string.
                        MultiLine       = 8,

                        // If set, '.' matches line terminators, otherwise '.'
                        // matching stops at line end
                        DotAll          = 32,

                        // Forces normalization of pattern and strings
                        CanonEq         = 128,

                        // If set, uses the Unicode TR 29 definition of word
                        // boundaries. Warning: Unicode word boundaries are
                        // quite different from traditional regular expression
                        // word boundaries. See http://unicode.org/reports/tr29/#Word_Boundaries
                        UWord           = 256,
                        }

        /***********************************************************************

                Compiles the regular expression in string form into an
                internal representation using the specified match mode
                flags. The resulting regular expression handle can then
                be used to perform various matching operations.

                The subject text is initially set to an empty string;
                use setText() to supply the text to be searched.

        ***********************************************************************/

        this (wchar[] pattern, Flag flags=Flag.None, ParseError* pe=null)
        {
                Error e;

                handle = uregex_open (pattern.ptr, pattern.length, flags, pe, e);
                testError (e, "failed to open regex");

                // start out with empty subject text; the status of this
                // call is not examined
                uregex_setText (handle, "", 0, e);
        }

        /***********************************************************************

                Compiles the regular expression in string form into an
                internal representation using the specified match mode
                flags. The resulting regular expression handle can then
                be used to perform various matching operations.

                This variation takes the pattern as a UText, and simply
                forwards its content to the wchar[] constructor.

        ***********************************************************************/

        this (UText pattern, Flag flags=Flag.None, ParseError* pe=null)
        {
                this (pattern.get, flags, pe);
        }

        /***********************************************************************

                Internal constructor; used for cloning

                Adopts an existing handle, and sets the subject text
                to an empty string.

        ***********************************************************************/

        private this (Handle handle)
        {
                Error e;

                this.handle = handle;

                // the status of this call is not examined
                uregex_setText (handle, "", 0, e);
        }

        /***********************************************************************

                Close the regular expression, recovering all resources (memory)
                it was holding

        ***********************************************************************/

        ~this ()
        {
                uregex_close (handle);
        }

        /***********************************************************************

                Cloning a regular expression is faster than opening a second
                instance from the source form of the expression, and requires
                less memory.

                Note that the current input string and the position of any
                matched text within it are not cloned; only the pattern itself
                and the match mode flags are copied.

                Cloning can be particularly useful to threaded applications
                that perform multiple match operations in parallel. Each
                concurrent RE operation requires its own instance of a
                URegularExpression.

        ***********************************************************************/

        URegex clone ()
        {
                Error e;

                Handle h = uregex_clone (handle, e);
                testError (e, "failed to clone regex");
                return new URegex (h);
        }

        /***********************************************************************

                Return a copy of the source form of the pattern for this
                regular expression

        ***********************************************************************/

        UString getPattern ()
        {
                Error e;
                uint  len;

                wchar* x = uregex_pattern (handle, len, e);
                testError (e, "failed to extract regex pattern");
                return new UString (x[0..len]);
        }

        /***********************************************************************

                Get the match mode flags that were specified when compiling
                this regular expression

        ***********************************************************************/

        Flag getFlags ()
        {
                Error e;

                Flag f = cast(Flag) uregex_flags (handle, e);
                testError (e, "failed to get regex flags");
                return f;
        }

        /***********************************************************************

                Set the subject text string upon which the regular expression
                will look for matches.

                This function may be called any number of times, allowing the
                regular expression pattern to be applied to different strings.

                Regular expression matching operations work directly on the
                application's string data. No copy is made. The subject string
                data must not be altered after calling this function until after
                all regular expression operations involving this string data are
                completed.

                Zero length strings are permitted. In this case, no subsequent
                match operation will dereference the text string pointer.

        ***********************************************************************/

        void setText (UText t)
        {
                Error e;

                // retained for getText(), groups() and split()
                theText = t;
                uregex_setText (handle, t.get, t.length, e);
                testError (e, "failed to set regex text");
        }

        /***********************************************************************

                Get the subject text that is currently associated with this
                regular expression object. This simply returns whatever was
                previously supplied via setText(), or null where setText()
                has not yet been called.

                Note that this returns a read-only reference to the text.

        ***********************************************************************/

        UText getText ()
        {
                return theText;
        }

        /***********************************************************************

                Return a set of slices representing the parenthesised groups.
                This can be used in the following manner:

                @code
                wchar[] msg;

                if (regex.next())
                    with (regex.groups())
                          msg ~= g1 ~ ":" ~ g2;
                @endcode

                Note that g0 represents the entire match, whereas g1 through
                g9 represent the parenthesised expressions. Patterns with
                more than nine groups have only the first nine exposed.

                A group that did not take part in the last match (start()
                or end() reporting -1) is set to a null slice.

                The slices refer directly to the text supplied through
                setText(), which must have been called beforehand. The
                returned object is this URegex instance itself.

        ***********************************************************************/

        Groups groups ()
        {
                wchar[]* p       = &g0;
                uint     count   = groupCount();
                wchar[]  content = theText.get();

                // only g0 through g9 are available
                if (count > 9)
                    count = 9;

                for (uint i=0; i <= count; ++p, ++i)
                    {
                    uint from = start (i);
                    uint to   = end (i);

                    // -1 arrives here as uint.max, since start() and end()
                    // return uint; slicing with it would exceed the bounds
                    // of the content
                    if (from == uint.max || to == uint.max)
                        *p = null;
                    else
                       *p = content [from..to];
                    }
                return this;
        }

        /***********************************************************************

                Extract the string for the specified matching expression or
                subexpression. UString 's' is the destination for the match.

                Group #0 is the complete string of matched text. Group #1 is
                the text matched by the first set of capturing parentheses.

        ***********************************************************************/

        void group (UString s, uint index)
        {
                // callback handed to UString.format(), which supplies
                // the destination buffer
                uint fmt (wchar* dst, uint length, inout Error e)
                {
                        return uregex_group (handle, index, dst, length, e);
                }

                s.format (&fmt, "failed to extract regex group text");
        }

        /***********************************************************************

                Get the number of capturing groups in this regular
                expression's pattern

        ***********************************************************************/

        uint groupCount ()
        {
                Error e;

                uint i = uregex_groupCount (handle, e);
                testError (e, "failed to get regex group-count");
                return i;
        }

        /***********************************************************************

                Returns the index in the input string of the start of the
                text matched by the specified capture group during the
                previous match operation.

                Return -1 if the capture group was not part of the last
                match. Since the return type is unsigned, that value
                compares equal to uint.max. Group #0 refers to the
                complete range of matched text. Group #1 refers to the
                text matched by the first set of capturing parentheses

        ***********************************************************************/

        uint start (uint index = 0)
        {
                Error e;

                uint i = uregex_start (handle, index, e);
                testError (e, "failed to get regex start");
                return i;
        }

        /***********************************************************************

                Returns the index in the input string of the position
                following the end of the text matched by the specified
                capture group.

                Return -1 if the capture group was not part of the last
                match. Since the return type is unsigned, that value
                compares equal to uint.max. Group #0 refers to the
                complete range of matched text. Group #1 refers to the
                text matched by the first set of capturing parentheses.

        ***********************************************************************/

        uint end (uint index = 0)
        {
                Error e;

                uint i = uregex_end (handle, index, e);
                testError (e, "failed to get regex end");
                return i;
        }

        /***********************************************************************

                Reset any saved state from the previous match.

                Has the effect of causing uregex_findNext to begin at the
                specified index, and causing uregex_start(), uregex_end()
                and uregex_group() to return an error indicating that there
                is no match information available.

        ***********************************************************************/

        void reset (uint startIndex)
        {
                Error e;

                uregex_reset (handle, startIndex, e);
                testError (e, "failed to set regex next-index");
        }

        /***********************************************************************

                Attempts to match the input string, beginning at startIndex,
                against the pattern.

                To succeed, the match must extend to the end of the input
                string

        ***********************************************************************/

        bool match (uint startIndex)
        {
                Error e;

                bool b = uregex_matches (handle, startIndex, e);
                testError (e, "failed while matching regex");
                return b;
        }

        /***********************************************************************

                Attempts to match the input string, starting from the
                specified index, against the pattern.

                The match may be of any length, and is not required to
                extend to the end of the input string. Contrast with match()

        ***********************************************************************/

        bool probe (uint startIndex)
        {
                Error e;

                bool b = uregex_lookingAt (handle, startIndex, e);
                testError (e, "failed while looking at regex");
                return b;
        }

        /***********************************************************************

                Returns whether the text matches the search pattern, starting
                from the current position.

                If startIndex is specified, the current position is moved to
                the specified location before the search is initiated. The
                default of uint.max means "continue from the current
                position".

        ***********************************************************************/

        bool next (uint startIndex = uint.max)
        {
                Error e;
                bool  b;

                b = (startIndex == uint.max) ? uregex_findNext (handle, e) :
                                               uregex_find (handle, startIndex, e);

                testError (e, "failed on next regex");
                return b;
        }

        /***********************************************************************

                Replaces every substring of the input that matches the pattern
                with the given replacement string.

                This is a convenience function that provides a complete
                find-and-replace-all operation.

                This method scans the input string looking for matches of
                the pattern. Input that is not part of any match is copied
                unchanged to the destination buffer. Matched regions are
                replaced in the output buffer by the replacement string.
                The replacement string may contain references to capture
                groups; these take the form of $1, $2, etc.

                The provided 'result' will contain the results, and should
                be set with a length sufficient to house the entire result.
                Upon completion, the 'result' is shortened appropriately
                and the total extent (length) of the operation is returned.
                Set the initial length of 'result' using the UString method
                truncate().

                The returned extent should be checked to ensure it is not
                longer than the length of 'result'. If it is longer, then
                the result has been truncated.

        ***********************************************************************/

        uint replaceAll (UText replace, UString result)
        {
                Error e;

                uint len = uregex_replaceAll (handle, replace.get, replace.length, result.get, result.length, e);
                testError (e, "failed during regex replace");
                result.truncate (len);
                return len;
        }

        /***********************************************************************

                Replaces the first substring of the input that matches the
                pattern with the given replacement string.

                This is a convenience function that provides a complete
                find-and-replace operation.

                This method scans the input string looking for a match of
                the pattern. All input that is not part of the match is
                copied unchanged to the destination buffer. The matched
                region is replaced in the output buffer by the replacement
                string. The replacement string may contain references to
                capture groups; these take the form of $1, $2, etc

                The provided 'result' will contain the results, and should
                be set with a length sufficient to house the entire result.
                Upon completion, the 'result' is shortened appropriately
                and the total extent (length) of the operation is returned.
                Set the initial length of 'result' using the UString method
                truncate().

                The returned extent should be checked to ensure it is not
                longer than the length of 'result'. If it is longer, then
                the result has been truncated.

        ***********************************************************************/

        uint replaceFirst (UText replace, UString result)
        {
                Error e;

                uint len = uregex_replaceFirst (handle, replace.get, replace.length, result.get, result.length, e);
                testError (e, "failed during regex replace");
                result.truncate (len);
                return len;
        }

        /***********************************************************************

                Split the text up into slices (fields), where each slice
                represents the text situated between each pattern matched
                within the text. The pattern is expected to represent one
                or more slice delimiters.

                Slices are written into the provided 'fields' array, and
                the number of slices populated is returned. Scanning stops
                when 'fields' is full or when no further match is found.

                A delimiter located at the very start of the text does not
                produce a field. Text following the final match is not
                returned as a field.

                The slices refer directly to the text supplied through
                setText(), which must have been called beforehand.

        ***********************************************************************/

        uint split (wchar[][] fields)
        {
                Error   e;
                uint    pos,
                        count;
                wchar[] content = theText.get;

                while (count < fields.length)
                       if (uregex_findNext (handle, e) && e == e.OK)
                          {
                          uint i = start();

                          // text between the prior delimiter and this one
                          fields[count] = content[pos..i];
                          pos = end ();

                          // ignore leading delimiter
                          if (i)
                              ++count;
                          }
                       else
                          break;

                testError (e, "failed during split");
                return count;
        }


        /***********************************************************************

                Bind the ICU functions from a shared library. This is
                complicated by the issues regarding D and DLLs on the
                Windows platform

        ***********************************************************************/

        private static void* library;

        /***********************************************************************

                Function pointers for the ICU uregex api; these are
                populated via the 'targets' table below

        ***********************************************************************/

        private static extern (C)
        {
                Handle  function (wchar*, uint, uint, ParseError*, inout Error) uregex_open;
                void    function (Handle) uregex_close;
                Handle  function (Handle, inout Error) uregex_clone;
                wchar*  function (Handle, inout uint, inout Error) uregex_pattern;
                uint    function (Handle, inout Error) uregex_flags;
                void    function (Handle, wchar*, uint, inout Error) uregex_setText;
                wchar*  function (Handle, inout uint, inout Error) uregex_getText;
                uint    function (Handle, uint, wchar*, uint, inout Error) uregex_group;
                uint    function (Handle, inout Error) uregex_groupCount;
                uint    function (Handle, uint, inout Error) uregex_start;
                uint    function (Handle, uint, inout Error) uregex_end;
                void    function (Handle, uint, inout Error) uregex_reset;
                bool    function (Handle, uint, inout Error) uregex_matches;
                bool    function (Handle, uint, inout Error) uregex_lookingAt;
                bool    function (Handle, uint, inout Error) uregex_find;
                bool    function (Handle, inout Error) uregex_findNext;
                uint    function (Handle, wchar*, uint, wchar*, uint, inout Error) uregex_replaceAll;
                uint    function (Handle, wchar*, uint, wchar*, uint, inout Error) uregex_replaceFirst;
        }

        /***********************************************************************

                Pairs each function pointer above with the name of the
                symbol to be bound to it

        ***********************************************************************/

        static  FunctionLoader.Bind[] targets =
                [
                {cast(void**) &uregex_open,             "uregex_open"},
                {cast(void**) &uregex_close,            "uregex_close"},
                {cast(void**) &uregex_clone,            "uregex_clone"},
                {cast(void**) &uregex_pattern,          "uregex_pattern"},
                {cast(void**) &uregex_flags,            "uregex_flags"},
                {cast(void**) &uregex_setText,          "uregex_setText"},
                {cast(void**) &uregex_getText,          "uregex_getText"},
                {cast(void**) &uregex_group,            "uregex_group"},
                {cast(void**) &uregex_groupCount,       "uregex_groupCount"},
                {cast(void**) &uregex_start,            "uregex_start"},
                {cast(void**) &uregex_end,              "uregex_end"},
                {cast(void**) &uregex_reset,            "uregex_reset"},
                {cast(void**) &uregex_matches,          "uregex_matches"},
                {cast(void**) &uregex_lookingAt,        "uregex_lookingAt"},
                {cast(void**) &uregex_find,             "uregex_find"},
                {cast(void**) &uregex_findNext,         "uregex_findNext"},
                {cast(void**) &uregex_replaceAll,       "uregex_replaceAll"},
                {cast(void**) &uregex_replaceFirst,     "uregex_replaceFirst"},
                ];

        /***********************************************************************

                Bind the function pointers when the module is initialized

        ***********************************************************************/

        static this ()
        {
                library = FunctionLoader.bind (icuin, targets);
        }

        /***********************************************************************

                Release the library binding when the module is finalized

        ***********************************************************************/

        static ~this ()
        {
                FunctionLoader.unbind (library);
        }
}