/*******************************************************************************

        @file Tokenizer.d

        Copyright (c) 2004 Kris Bell

        This software is provided 'as-is', without any express or implied
        warranty. In no event will the authors be held liable for damages
        of any kind arising from the use of this software.

        Permission is hereby granted to anyone to use this software for any
        purpose, including commercial applications, and to alter it and/or
        redistribute it freely, subject to the following restrictions:

        1. The origin of this software must not be misrepresented; you must
           not claim that you wrote the original software. If you use this
           software in a product, an acknowledgment within documentation of
           said product would be appreciated but is not required.

        2. Altered source versions must be plainly marked as such, and must
           not be misrepresented as being the original software.

        3. This notice may not be removed or altered from any distribution
           of the source.

        4. Derivative works are permitted, but they must carry this notice
           in full and credit the original source.


                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


        @version        Initial version, March 2004
        @author         Kris

        Altered source: reformatted; fixed the module-scope ITokenizer alias
        (referenced an undefined T) and added missing template instantiations
        of TokenTemplate in method signatures.

*******************************************************************************/

module mango.io.Tokenizer;

version (Ares)
         private import std.c.ctype;
else
         private import std.ctype;

private import  mango.io.Token,
                mango.io.Exception;

private import  mango.io.model.IReader,
                mango.io.model.IBuffer,
                mango.io.model.IConduit;

/*******************************************************************************

        Extract tokens from an IBuffer. This is the base-class for all
        Tokenizers, but can also be used outside of the ITokenizer model.

*******************************************************************************/

class ScannerTemplate(T)
{
        /***********************************************************************

                Scan the given IBuffer for another token, and place the
                results in the provided token. Note that this should be
                completely thread-safe so one can instantiate singleton
                tokenizers without issue.

                Each Token is expected to be stripped of the delimiter.
                An end-of-file condition causes trailing content to be
                placed into the token. Requests made beyond Eof result
                in empty tokens (length == zero).

                The scan delegate returns the number of elements consumed
                on success, or IConduit.Eof when no complete token was
                found; in the latter case the buffer is (re)filled from
                its conduit and the scan retried.

                Returns true if a token was isolated, false otherwise.

        ***********************************************************************/

        bool next (IBuffer buffer, uint delegate (T[]) scan)
        {
                // NOTE(review): this cast reinterprets a T[] delegate as a
                // void[] delegate; for multi-byte T the length semantics of
                // the two slices differ -- confirm buffer.read() compensates
                while (buffer.read (cast(uint delegate(void[])) scan) == IConduit.Eof)
                      {
                      IConduit conduit = buffer.getConduit();
                      if (conduit is null)
                         {
                         // no conduit to refill from: consume the remains
                         buffer.skip (buffer.readable());
                         return false;
                         }
                      else
                         {
                         // no more space in the buffer?
                         if (! buffer.writable())
                            {
                            // did we start at the beginning?
                            if (buffer.getPosition ())
                                // nope - move partial token to start of buffer
                                buffer.compress ();
                            else
                               throw new TokenException ("Token is too large to fit within buffer");
                            }

                         // read another chunk of data
                         if (buffer.fill (conduit) == IConduit.Eof)
                            {
                            buffer.skip (buffer.readable());
                            return false;
                            }
                         }
                      }
                return true;
        }

        /***********************************************************************

                Clean up after we fail to find a token. Trailing content
                is placed into the token, and the scanner is told to try
                and load some more content (where available).

        ***********************************************************************/

        uint notFound (TokenTemplate!(T) token, T[] content)
        {
                token.set (content);
                return IConduit.Eof;
        }
}

alias ScannerTemplate!(char) Scanner;


/*******************************************************************************

        Interface to define how Tokenizers should expose their functionality.

*******************************************************************************/

interface ITokenizerTemplate(T)
{
        /***********************************************************************

                Isolate the next token within the given buffer; returns
                true if a token was located, false otherwise.

        ***********************************************************************/

        // fix: TokenTemplate must be instantiated with T
        bool next (IBuffer buffer, TokenTemplate!(T) token);
}

// fix: was "alias ITokenizerTemplate!(T) ITokenizer" with T undefined at
// module scope; subclasses use ITokenizer!(T), so alias the raw template
alias ITokenizerTemplate ITokenizer;



/*******************************************************************************

        A simple delimiting tokenizer. Use this to tokenize simple streams
        such as comma-separated text.

*******************************************************************************/

class SimpleTokenizerTemplate(T) : ScannerTemplate!(T), ITokenizer!(T)
{
        private T delimiter;

        /***********************************************************************

                Construct a SimpleTokenizer with the given delimiter char.
                More sophisticated delimiters can be constructed by using
                a RegexTokenizer instead.

        ***********************************************************************/

        this (T delimiter)
        {
                this.delimiter = delimiter;
        }


        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, TokenTemplate!(T) token)
        {
                uint scan (T[] content)
                {
                        foreach (int i, T c; content)
                                 if (c == delimiter)
                                    {
                                    token.set (content[0..i]);
                                    return i+1;
                                    }

                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}

alias SimpleTokenizerTemplate!(char) SimpleTokenizer;



/*******************************************************************************

        A tokenizer that isolates content enclosed by whitespace.

*******************************************************************************/

class SpaceTokenizerTemplate(T) : ScannerTemplate!(T), ITokenizer!(T)
{
        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, TokenTemplate!(T) token)
        {
                uint scan (T[] content)
                {
                        foreach (int i, T c; content)
                                 if (isspace (c))
                                    {
                                    token.set (content[0..i]);
                                    return i+1;
                                    }

                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}

alias SpaceTokenizerTemplate!(char) SpaceTokenizer;


/*******************************************************************************

        A tokenizer for handling both whitespace and punctuation delimiters.

*******************************************************************************/

class PunctTokenizerTemplate(T) : ScannerTemplate!(T), ITokenizer!(T)
{
        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, TokenTemplate!(T) token)
        {
                uint scan (T[] content)
                {
                        foreach (int i, T c; content)
                                 if (isspace(c) || ispunct(c))
                                    {
                                    token.set (content[0..i]);
                                    return i+1;
                                    }

                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}

alias PunctTokenizerTemplate!(char) PunctTokenizer;



/*******************************************************************************

        Tokenize an entire line delimited by a single '\n' character, or
        by a "\r\n" pair.

*******************************************************************************/

class LineTokenizerTemplate(T) : ScannerTemplate!(T), ITokenizer!(T)
{
        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, TokenTemplate!(T) token)
        {
                uint scan (T[] content)
                {
                        foreach (int i, T c; content)
                                 if (c == '\n')
                                    {
                                    int slice = i;
                                    // strip a preceding '\r' from the token
                                    if (i && content[i-1] == '\r')
                                        --slice;
                                    token.set (content[0..slice]);
                                    return i+1;
                                    }

                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}

alias LineTokenizerTemplate!(char) LineTokenizer;


/*******************************************************************************

        Eat everything until we reach a newline. Use this with a Reader,
        where you wish to discard everything else in the current line.

*******************************************************************************/

class LineScannerTemplate(T) : ScannerTemplate!(T), IReadable!(T)
{
        /***********************************************************************

                IReadable interface to support Reader.get()

        ***********************************************************************/

        void read (IReader r)
        {
                next (r.getBuffer());
        }

        /***********************************************************************

                Eat all content until we see a '\n' character. The content
                is simply discarded.

        ***********************************************************************/

        bool next (IBuffer buffer)
        {
                uint scan (T[] content)
                {
                        foreach (int i, T c; content)
                                 if (c == '\n')
                                     return i+1;
                        return IConduit.Eof;
                }

                return super.next (buffer, &scan);
        }
}

alias LineScannerTemplate!(char) LineScanner;


version (Ares) {}
else
{
/*******************************************************************************

        Wrap a tokenizer around the std.RegExp class. This is useful for
        situations where you can't load the entire source into memory at
        one time. In other words, this adapts RegExp into an incremental
        scanner.

        Note that the associated buffer must be large enough to contain
        an entire RegExp match. For example, if you have a regex pattern
        that matches an entire file then the buffer must be at least the
        size of the file. In such cases, one might be advised to find a
        more effective solution.

*******************************************************************************/

class RegexTokenizerTemplate(T) : ScannerTemplate!(T), ITokenizer!(T)
{
        import std.regexp;

        // NOTE(review): std.regexp.RegExp is not templated in stock Phobos;
        // this presumes a templated RegExp variant -- confirm against the
        // version of std.regexp this module builds with
        private RegExp!(T) exp;

        /***********************************************************************

                Construct a RegexTokenizer with the provided RegExp.

        ***********************************************************************/

        this (RegExp!(T) exp)
        {
                this.exp = exp;
        }

        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, TokenTemplate!(T) token)
        {
                uint scan (T[] content)
                {
                        // did we find a match?
                        if (exp.test (content))
                           {
                           int start = exp.pmatch[0].rm_so;
                           int end   = exp.pmatch[0].rm_eo;

                           // yep: stuff it into the token and go home
                           token.set (content[start..end]);
                           return end;
                           }

                        // this is a bit tricky since RegExp doesn't tell
                        // us when it has a partial match. To compensate,
                        // we force the buffer to load as much as it can
                        // after a failure within a *partial* buffer.
                        if (buffer.getPosition())
                            buffer.compress();
                        else
                           // skip past everything that didn't match. The
                           // entire buffer may still be a partial match,
                           // but then it should be made bigger to begin
                           // with.
                           buffer.skip (content.length);

                        // say we found nothing
                        return notFound (token, content);
                }

                // return the next token using this tokenizer
                return super.next (buffer, &scan);
        }
}
}

/*******************************************************************************

        It's convenient to have some simple tokenizers available without
        constructing them, so we provide a few to get going with.

        Note that these Tokenizers do not maintain any state of their own.
        Thus they are all thread-safe.

*******************************************************************************/

struct Tokenizers
{
        static LineScanner      eol;
        static LineTokenizer    line;
        static SpaceTokenizer   space;
        static PunctTokenizer   punct;
        static SimpleTokenizer  comma;

        /***********************************************************************

                Make a few common tokenizers available as singletons

        ***********************************************************************/

        static this ()
        {
                eol   = new LineScanner();
                line  = new LineTokenizer();
                space = new SpaceTokenizer();
                punct = new PunctTokenizer();
                comma = new SimpleTokenizer(',');
        }
}