/*******************************************************************************

        @file Token.d

        Copyright (c) 2004 Kris Bell

        This software is provided 'as-is', without any express or implied
        warranty. In no event will the authors be held liable for damages
        of any kind arising from the use of this software.

        Permission is hereby granted to anyone to use this software for any
        purpose, including commercial applications, and to alter it and/or
        redistribute it freely, subject to the following restrictions:

        1. The origin of this software must not be misrepresented; you must
           not claim that you wrote the original software. If you use this
           software in a product, an acknowledgment within documentation of
           said product would be appreciated but is not required.

        2. Altered source versions must be plainly marked as such, and must
           not be misrepresented as being the original software.

        3. This notice may not be removed or altered from any distribution
           of the source.

        4. Derivative works are permitted, but they must carry this notice
           in full and credit the original source.


                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


        @version        Initial version, December 2005

        @author         Kris


*******************************************************************************/

module mango.text.Token;

private import mango.text.Text;

/*******************************************************************************

        The base class for a set of tokenizers.

        There are two types of tokenizer supported ~ exclusive and
        inclusive. The former are the more common kind, where a token
        is delimited by elements that are considered foreign. Examples
        include space, comma, and end-of-line delineation. Inclusive
        tokens are just the opposite: they look for patterns in the
        text that should be part of the token itself ~ everything else
        is considered foreign. Currently the only inclusive token type
        is exposed by RegexToken; everything else is of the exclusive
        variety.

        The content provided to tokenizers is intended to be entirely
        read-only. All current tokenizers abide by this rule, but a
        user could still mutate the content through a token slice. To
        enforce the desired read-only aspect, the code would either
        have to introduce redundant copying, or the compiler would
        have to support read-only arrays.

        See LineToken, CharToken, RegexToken, QuotedToken, and SetToken.

*******************************************************************************/

class TokenTemplate(T)
{
        static if (!is (T == char) && !is (T == wchar) && !is (T == dchar))
                   pragma (msg, "Template type must be char, wchar, or dchar");


        alias TokenTemplate     Token;

        alias bool delegate(Token token) Refill;

        protected T*    peek,                   // current position
                        last,                   // prior position
                        end;                    // end of content

        package T[]     slice,                  // current token slice
                        content;                // the content to tokenize

        package bool    hasTail,                // sentinel for last call
                        autoTrim;               // trim tokens by default?

        package Refill  refill;                 // where to get new content

        /***********************************************************************

                Locate the next token. If this Token is configured for
                "refills", more content will be requested as needed.

                Returns true if a token is found; false otherwise.

        ***********************************************************************/

        abstract bool next();

        /***********************************************************************

                Construct a token upon the given content. Automatic
                refills are disabled.

        ***********************************************************************/

        this (T[] string)
        {
                prime (string);
                refill = &this.noRefill;
        }

        /***********************************************************************

                Iterate over the set of tokens. This provides read-only
                access to the tokens.

        ***********************************************************************/

        int opApply (int delegate(inout T[]) dg)
        {
                int result = 0;

                while (next)
                {
                        T[] t = get ();
                        result = dg (t);
                        if (result)
                            break;
                }
                return result;
        }
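        /***********************************************************************

                Usage example ~ an illustrative sketch only. LineToken is
                one of the subclasses noted above; its constructor is
                assumed here to accept the content directly, as the one
                above does, and process() stands in for application code:

                LineToken lines = new LineToken ("one\ntwo\nthree");

                foreach (char[] line; lines)
                         process (line);

                Each 'line' is a slice of the original content, so it is
                valid only for as long as that content is.

        ***********************************************************************/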
        /***********************************************************************

                Enable or disable automatic trimming of tokens.

        ***********************************************************************/

        void setAutoTrim (bool enabled)
        {
                autoTrim = enabled;
        }

        /***********************************************************************

                Set the delegate used to refill content once the current
                content has been consumed. A Refill delegate should carry
                the current tail() over into the fresh content before
                handing it to prime(); the tail is thereby prepended to
                the next 'found' token once scanning resumes upon the
                incoming data. Thus, a Refill should do this:

                // copy token.tail() into buffer

                // append fresh content into buffer
                // ...

                token.prime (buffer);
                return true;

        ***********************************************************************/

        void setRefill (Refill refill)
        {
                this.refill = refill;
        }

        /***********************************************************************

                Set the content, ready for next() to start.

        ***********************************************************************/

        T[] prime (T[] content)
        {
                this.content = content;
                peek = (last = content.ptr) - 1;
                end = last + content.length;
                return content;
        }
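        /***********************************************************************

                Refill example ~ an illustrative sketch only. The handler
                below feeds chunks from a hypothetical readChunk() routine,
                carrying the unconsumed tail across each refill so that it
                becomes part of the next token. It would typically be a
                method or nested function, so that &feed yields the Refill
                delegate expected by setRefill():

                bool feed (Token token)
                {
                        char[] fresh = readChunk ();

                        if (fresh.length == 0)
                            return false;

                        // prepend the prior tail to the incoming content
                        token.prime (token.tail() ~ fresh);
                        return true;
                }

                token.setRefill (&feed);

        ***********************************************************************/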
        /***********************************************************************

                Return the current token as a slice of the content

        ***********************************************************************/

        T[] get()
        {
                return slice;
        }

        /***********************************************************************

                Return the current content-tail. This is typically used
                by "refill" handlers when they stream more content in.

        ***********************************************************************/

        T[] tail()
        {
                return last [0 .. end - last];
        }

        /***********************************************************************

                Return how much content has been consumed

        ***********************************************************************/

        uint eaten()
        {
                return last - content.ptr;
        }

        /***********************************************************************

                Return the index of the current token. This is different
                from eaten() in that the current token may not yet have
                been consumed. Thus index() will always be less than or
                equal to eaten()

        ***********************************************************************/

        uint index()
        {
                return slice.ptr - content.ptr;
        }

        /***********************************************************************

                Trim spaces from the left and right of the current token.
                Note that this is done in-place on the current slice. The
                content itself is not affected.

        ***********************************************************************/

        Token trim ()
        {
                slice = TextTemplate!(T).trim (slice);
                return this;
        }

        /***********************************************************************

                Internal method for subclasses to call when they locate a
                token

        ***********************************************************************/

        protected bool found (int offset = 0)
        {
                slice = last [0 .. (peek - offset) - last];
                last = peek + 1;
                if (autoTrim)
                    trim ();
                return true;
        }

        /***********************************************************************

                Internal method for subclasses to call when they run out
                of content to scan. This invokes the "refill" facilities
                to provide additional content.

        ***********************************************************************/

        protected bool getMore ()
        {
                hasTail = false;
                if (last < end || content.ptr is null)
                {
                        // more content available?
                        if (refill (this))
                            return true;

                        // set the last slice for this content
                        hasTail = true;
                        slice = tail;
                        if (autoTrim)
                            trim();
                        last = end;
                }
                return false;
        }

        /***********************************************************************

                Default "refill" handler, which indicates there's no more
                content to be had

        ***********************************************************************/

        private bool noRefill (Token token)
        {
                return false;
        }
}

//alias TokenTemplate!(char) Token;
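/*******************************************************************************

        Illustrative sketch only ~ not part of the original module. A minimal
        concrete tokenizer built upon TokenTemplate, showing how a subclass
        typically drives peek/last/end together with found() and getMore().
        The SpaceTokenTemplate name and its space-delimited scanning policy
        are assumptions made for demonstration; see CharToken et al for the
        real implementations.

*******************************************************************************/

version (TokenTemplateExample)
{
        class SpaceTokenTemplate(T) : TokenTemplate!(T)
        {
                this (T[] content)
                {
                        super (content);
                }

                bool next ()
                {
                        // scan forward from the prior position until a
                        // space delimiter (or the end of content) is met
                        while (++peek < end)
                               if (*peek == ' ')
                                   return found ();

                        // out of content: a successful refill primes fresh
                        // data (so scan again), otherwise getMore() leaves
                        // whatever remains as the current slice
                        if (getMore ())
                            return next ();
                        return hasTail && slice.length > 0;
                }
        }

        alias SpaceTokenTemplate!(char) SpaceToken;
}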