00001 /******************************************************************************* 00002 00003 @file Tokenizer.d 00004 00005 Copyright (c) 2004 Kris Bell 00006 00007 This software is provided 'as-is', without any express or implied 00008 warranty. In no event will the authors be held liable for damages 00009 of any kind arising from the use of this software. 00010 00011 Permission is hereby granted to anyone to use this software for any 00012 purpose, including commercial applications, and to alter it and/or 00013 redistribute it freely, subject to the following restrictions: 00014 00015 1. The origin of this software must not be misrepresented; you must 00016 not claim that you wrote the original software. If you use this 00017 software in a product, an acknowledgment within documentation of 00018 said product would be appreciated but is not required. 00019 00020 2. Altered source versions must be plainly marked as such, and must 00021 not be misrepresented as being the original software. 00022 00023 3. This notice may not be removed or altered from any distribution 00024 of the source. 00025 00026 4. Derivative works are permitted, but they must carry this notice 00027 in full and credit the original source. 00028 00029 00030 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00031 00032 00033 @version Initial version, March 2004 00034 @author Kris 00035 00036 00037 *******************************************************************************/ 00038 00039 module mango.io.Tokenizer; 00040 00041 private import std.ctype; 00042 00043 private import mango.io.Token, 00044 mango.io.Exception; 00045 00046 private import mango.io.model.IReader, 00047 mango.io.model.IBuffer, 00048 mango.io.model.IConduit; 00049 00050 /******************************************************************************* 00051 00052 Extract tokens from an IBuffer. This is the base-class for all 00053 Tokenizers, but can also be used outside of the ITokenizer model. 

*******************************************************************************/

class Scanner
{
        /***********************************************************************

                Scan the given IBuffer for another token, and place the
                results in the provided token. Note that this should be
                completely thread-safe so one can instantiate singleton
                tokenizers without issue.

                Each Token is expected to be stripped of the delimiter.
                An end-of-file condition causes trailing content to be
                placed into the token. Requests made beyond Eof result
                in empty tokens (length == zero).

                Returns true if a token was isolated, false otherwise.

        ***********************************************************************/

        bool next (IBuffer buffer, int delegate (char[]) scan)
        {
                // keep presenting buffer content to the scan delegate until
                // it reports success (a non-Eof return). The cast adapts the
                // char[] delegate to the void[] signature IBuffer.read expects;
                // both slice types share the same layout here
                while (buffer.read (cast(int delegate(void[])) scan) == IConduit.Eof)
                      {
                      IConduit conduit = buffer.getConduit();
                      if (conduit is null)
                         {
                         // no conduit to refill from: drain whatever remains
                         // and report that no token was isolated
                         buffer.skip (buffer.readable());
                         return false;
                         }
                      else
                         {
                         // no more space in the buffer?
                         if (! buffer.writable())
                            {
                            // did we start at the beginning?
                            if (buffer.getPosition ())
                                // nope - move partial token to start of buffer
                                buffer.compress ();
                            else
                               throw new TokenException ("Token is too large to fit within buffer");
                            }

                         // read another chunk of data
                         if (conduit.read (buffer) == IConduit.Eof)
                            {
                            // conduit exhausted: drain trailing content and
                            // report failure (the scan delegate has already
                            // mapped the remainder via notFound, where used)
                            buffer.skip (buffer.readable());
                            return false;
                            }
                         }
                      }
                return true;
        }

        /***********************************************************************

                Clean up after we fail to find a token. Trailing content
                is placed into the token, and the scanner is told to try
                and load some more content (where available).

        ***********************************************************************/

        int notFound (Token token, char[] content)
        {
                // map the (possibly partial) remaining content into the token,
                // then signal the caller to attempt another buffer fill
                token.set (content);
                return IConduit.Eof;
        }
}


/*******************************************************************************

        Interface to define how Tokenizers should expose their functionality.

*******************************************************************************/

interface ITokenizer
{
        /***********************************************************************

                Isolate the next token from buffer into token; returns
                true if a token was located, false otherwise.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token);
}


/*******************************************************************************

        A simple delimiting tokenizer. Use this to tokenize simple streams
        such as comma-separated text.

*******************************************************************************/

class SimpleTokenizer : Scanner, ITokenizer
{
        // the single character that terminates each token
        private char delimiter;

        /***********************************************************************

                Construct a SimpleTokenizer with the given delimiter char.
                More sophisticated delimiters can be constructed by using
                a RegexTokenizer instead.

        ***********************************************************************/

        this (char delimiter)
        {
                this.delimiter = delimiter;
        }


        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token.
You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                // report the offset just past the delimiter, or defer to
                // notFound() when no delimiter is visible yet
                int scan (char[] content)
                {
                        for (int index = 0; index < content.length; ++index)
                             if (content[index] == delimiter)
                                {
                                token.set (content[0..index]);
                                return index + 1;
                                }

                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}


/*******************************************************************************

        A tokenizer that isolates content enclosed by whitespace.

*******************************************************************************/

class SpaceTokenizer : Scanner, ITokenizer
{
        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                // terminate the token at the first whitespace character
                int scan (char[] content)
                {
                        for (int index = 0; index < content.length; ++index)
                             if (isspace (content[index]))
                                {
                                token.set (content[0..index]);
                                return index + 1;
                                }

                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}


/*******************************************************************************

        A tokenizer for handling both whitespace and punctuation delimiters.

*******************************************************************************/

class PunctTokenizer : Scanner, ITokenizer
{
        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.
00271 00272 ***********************************************************************/ 00273 00274 bool next (IBuffer buffer, Token token) 00275 { 00276 int scan (char[] content) 00277 { 00278 foreach (int i, char c; content) 00279 if (isspace(c) || ispunct(c)) 00280 { 00281 token.set (content[0..i]); 00282 return i+1; 00283 } 00284 00285 return notFound (token, content); 00286 } 00287 00288 return super.next (buffer, &scan); 00289 } 00290 } 00291 00292 00293 /******************************************************************************* 00294 00295 Tokenize an entire line delimited by a single '\\n' character, or 00296 by a "\r\n" pair. 00297 00298 *******************************************************************************/ 00299 00300 class LineTokenizer : Scanner, ITokenizer 00301 { 00302 /*********************************************************************** 00303 00304 Locate the next token from the provided buffer, and map a 00305 buffer reference into token. Returns true if a token was 00306 located, false otherwise. 00307 00308 Note that the buffer content is not duplicated. Instead, a 00309 slice of the buffer is referenced by the token. You can use 00310 Token.clone() or Token.toString().dup() to copy content per 00311 your application needs. 00312 00313 Note also that there may still be one token left in a buffer 00314 that was not terminated correctly (as in eof conditions). In 00315 such cases, tokens are mapped onto remaining content and the 00316 buffer will have no more readable content. 
00317 00318 ***********************************************************************/ 00319 00320 bool next (IBuffer buffer, Token token) 00321 { 00322 int scan (char[] content) 00323 { 00324 foreach (int i, char c; content) 00325 if (c == '\n') 00326 { 00327 int slice = i; 00328 if (i && content[i-1] == '\r') 00329 --slice; 00330 token.set (content[0..slice]); 00331 return i+1; 00332 } 00333 00334 return notFound (token, content); 00335 } 00336 00337 return super.next (buffer, &scan); 00338 } 00339 } 00340 00341 00342 /******************************************************************************* 00343 00344 Eat everything until we reach a newline. Use this with a Reader, 00345 where you wish to discard everything else in the current line. 00346 00347 *******************************************************************************/ 00348 00349 class LineScanner : Scanner, IReadable 00350 { 00351 /*********************************************************************** 00352 00353 IReadable interface to support Reader.get() & Reader.opShl() 00354 00355 ***********************************************************************/ 00356 00357 void read (IReader r) 00358 { 00359 next (r.getBuffer()); 00360 } 00361 00362 /*********************************************************************** 00363 00364 Eat all content until we see a '\n' character. The content 00365 is simply discarded. 00366 00367 ***********************************************************************/ 00368 00369 bool next (IBuffer buffer) 00370 { 00371 int scan (char[] content) 00372 { 00373 foreach (int i, char c; content) 00374 if (c == '\n') 00375 return i+1; 00376 return IConduit.Eof; 00377 } 00378 00379 return super.next (buffer, &scan); 00380 } 00381 } 00382 00383 00384 /******************************************************************************* 00385 00386 Wrap a tokenizer around the std.RegExp class. 
This is useful for
        situations where you can't load the entire source into memory at
        one time. In other words, this adapts RegExp into an incremental
        scanner.

        Note that the associated buffer must be large enough to contain
        an entire RegExp match. For example, if you have a regex pattern
        that matches an entire file then the buffer must be at least the
        size of the file. In such cases, one might be advised to find a
        more effective solution.

*******************************************************************************/

class RegexTokenizer : Scanner, ITokenizer
{
        import std.regexp;

        // the compiled pattern used to isolate each token
        private RegExp exp;

        /***********************************************************************

                Construct a RegexTokenizer with the provided RegExp.

        ***********************************************************************/

        this (RegExp exp)
        {
                this.exp = exp;
        }

        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                int scan (char[] content)
                {
                        //printf ("'%.*s' : %d\n", content, content.length);

                        // did we find a match?
                        if (exp.test (content))
                           {
                           // NOTE(review): assumes pmatch[0] exposes the full
                           // match bounds after test() -- confirm against the
                           // std.regexp implementation in use
                           int start = exp.pmatch[0].rm_so;
                           int end = exp.pmatch[0].rm_eo;

                           // yep: stuff it into the token and go home
                           token.set (content[start..end]);
                           return end;
                           }

                        // this is a bit tricky since RegExp doesn't tell
                        // us when it has a partial match. To compensate,
                        // we force the buffer to load as much as it can
                        // after a failure within a *partial* buffer.
                        if (buffer.getPosition())
                            buffer.compress();
                        else
                           // skip past everything that didn't match. The
                           // entire buffer may still be a partial match,
                           // but then it should be made bigger to begin
                           // with.
                           buffer.skip (content.length);

                        // say we found nothing
                        return notFound (token, content);
                }

                // return the next token using this tokenizer
                return super.next (buffer, &scan);
        }
}


/*******************************************************************************

        It's convenient to have some simple tokenizers available without
        constructing them, so we provide a few to get going with.

        Note that these Tokenizers do not maintain any state of their own.
        Thus they are all thread-safe.

*******************************************************************************/

struct Tokenizers
{
        static LineScanner eol;
        static LineTokenizer line;
        static SpaceTokenizer space;
        static PunctTokenizer punct;
        static SimpleTokenizer comma;

        /***********************************************************************

                Build the shared tokenizer instances once, at module
                startup. Each is stateless, so sharing them across
                threads is safe.

        ***********************************************************************/

        static this ()
        {
                comma = new SimpleTokenizer(',');
                punct = new PunctTokenizer();
                space = new SpaceTokenizer();
                line = new LineTokenizer();
                eol = new LineScanner();
        }
}