00001 /******************************************************************************* 00002 00003 @file Tokenizer.d 00004 00005 Copyright (C) 2004 Kris Bell 00006 00007 This software is provided 'as-is', without any express or implied 00008 warranty. In no event will the authors be held liable for damages 00009 of any kind arising from the use of this software. 00010 00011 Permission is hereby granted to anyone to use this software for any 00012 purpose, including commercial applications, and to alter it and/or 00013 redistribute it freely, subject to the following restrictions: 00014 00015 1. The origin of this software must not be misrepresented; you must 00016 not claim that you wrote the original software. If you use this 00017 software in a product, an acknowledgment within documentation of 00018 said product would be appreciated but is not required. 00019 00020 2. Altered source versions must be plainly marked as such, and must 00021 not be misrepresented as being the original software. 00022 00023 3. This notice may not be removed or altered from any distribution 00024 of the source. 00025 00026 00027 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00028 00029 00030 @version Initial version, March 2004 00031 @author Kris 00032 00033 00034 *******************************************************************************/ 00035 00036 module mango.io.Tokenizer; 00037 00038 private import std.ctype; 00039 00040 private import mango.io.Token, 00041 mango.io.Exception; 00042 00043 private import mango.io.model.IReader, 00044 mango.io.model.IBuffer, 00045 mango.io.model.IConduit; 00046 00047 /******************************************************************************* 00048 00049 Extract tokens from an IBuffer. This is the base-class for all 00050 Tokenizers, but can also be used outside of the ITokenizer model. 
*******************************************************************************/

class Scanner
{
        /***********************************************************************

                Scan the given IBuffer for another token, and place the
                results in the provided token. Note that this should be
                completely thread-safe so one can instantiate singleton
                tokenizers without issue.

                Each Token is expected to be stripped of the delimiter.
                An end-of-file condition causes trailing content to be
                placed into the token. Requests made beyond Eof result
                in empty tokens (length == zero).

                The supplied 'scan' delegate inspects a chunk of buffer
                content and returns either the number of chars consumed
                (token found) or IConduit.Eof (more data required).

                Returns true if a token was isolated, false otherwise.

        ***********************************************************************/

        bool next (IBuffer buffer, int delegate (char[]) scan)
        {
                // NOTE(review): the cast assumes an int delegate(char[])
                // and an int delegate(void[]) share an identical calling
                // convention/layout -- TODO confirm this holds on every
                // supported compiler
                while (buffer.read (cast(int delegate(void[])) scan) == IConduit.Eof)
                {
                        IConduit conduit = buffer.getConduit();
                        if (conduit is null)
                        {
                                // no backing conduit: hand remaining content
                                // to the token (scan already saw it) and
                                // drain the buffer
                                buffer.skip (buffer.readable());
                                return false;
                        }
                        else
                        {
                                // no more space in the buffer?
                                if (! buffer.writable())
                                {
                                        // did we start at the beginning?
                                        if (buffer.getPosition ())
                                            // nope - move partial token to start of buffer
                                            buffer.compress ();
                                        else
                                           // buffer is full and already compressed:
                                           // the token simply cannot fit
                                           throw new TokenException ("Token is too large to fit within buffer");
                                }

                                // read another chunk of data
                                if (conduit.read (buffer) == IConduit.Eof)
                                {
                                        // underlying conduit is exhausted:
                                        // drain what's left and report failure
                                        buffer.skip (buffer.readable());
                                        return false;
                                }
                        }
                }
                return true;
        }

        /***********************************************************************

                Clean up after we fail to find a token. Trailing content
                is placed into the token, and IConduit.Eof is returned so
                the caller (next, above) tries to load more content where
                available.

        ***********************************************************************/

        int notFound (Token token, char[] content)
        {
                token.set (content);
                return IConduit.Eof;
        }
}


/*******************************************************************************

        Interface to define how Tokenizers should expose their functionality.

*******************************************************************************/

interface ITokenizer
{
        /***********************************************************************

                Isolate the next token from the buffer; returns true if a
                token was located, false otherwise.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token);
}


/*******************************************************************************

        A simple delimiting tokenizer. Use this to tokenize simple streams
        such as comma-seperated text.

*******************************************************************************/

class SimpleTokenizer : Scanner, ITokenizer
{
        // the single char that terminates each token
        private char delimiter;

        /***********************************************************************

                Construct a SimpleTokenizer with the given delimiter char.
                More sophisticated delimiters can be constructed by using
                a RegexTokenizer instead.

        ***********************************************************************/

        this (char delimiter)
        {
                this.delimiter = delimiter;
        }


        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                // local scanner: slice up to (not including) the delimiter,
                // consume the delimiter itself via the i+1 return
                int scan (char[] content)
                {
                        foreach (int i, char c; content)
                                 if (c == delimiter)
                                 {
                                     token.set (content[0..i]);
                                     return i+1;
                                 }

                        // delimiter not present in this chunk
                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}


/*******************************************************************************

        A tokenizer that isolates content enclosed by whitespace.

*******************************************************************************/

class SpaceTokenizer : Scanner, ITokenizer
{
        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                // delimit upon any std.ctype whitespace character
                int scan (char[] content)
                {
                        foreach (int i, char c; content)
                                 if (isspace (c))
                                 {
                                     token.set (content[0..i]);
                                     return i+1;
                                 }

                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}


/*******************************************************************************

        A tokenizer for handling both whitespace and punctuation delimiters.

*******************************************************************************/

class PunctTokenizer : Scanner, ITokenizer
{
        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                // delimit upon whitespace or punctuation (std.ctype)
                int scan (char[] content)
                {
                        foreach (int i, char c; content)
                                 if (isspace(c) || ispunct(c))
                                 {
                                     token.set (content[0..i]);
                                     return i+1;
                                 }

                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}


/*******************************************************************************

        Tokenize an entire line delimited by a single '\n' character, or
        by a "\r\n" pair.

*******************************************************************************/

class LineTokenizer : Scanner, ITokenizer
{
        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                int scan (char[] content)
                {
                        foreach (int i, char c; content)
                                 if (c == '\n')
                                 {
                                     // exclude a preceding '\r' from the
                                     // token, so "\r\n" and "\n" both map
                                     // to the same bare-line content
                                     int slice = i;
                                     if (i && content[i-1] == '\r')
                                         --slice;
                                     token.set (content[0..slice]);
                                     return i+1;
                                 }

                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}


/*******************************************************************************

        Eat everything until we reach a newline. Use this with a Reader,
        where you wish to discard everything else in the current line.

*******************************************************************************/

class LineScanner : Scanner, IReadable
{
        /***********************************************************************

                IReadable interface to support Reader.get() & Reader.opShl()

        ***********************************************************************/

        void read (IReader r)
        {
                next (r.getBuffer());
        }

        /***********************************************************************

                Eat all content until we see a '\n' character. The content
                is simply discarded (no token is populated).

        ***********************************************************************/

        bool next (IBuffer buffer)
        {
                int scan (char[] content)
                {
                        foreach (int i, char c; content)
                                 if (c == '\n')
                                     return i+1;
                        // no newline yet: ask for more content
                        return IConduit.Eof;
                }

                return super.next (buffer, &scan);
        }
}


/*******************************************************************************

        Wrap a tokenizer around the std.RegExp class.
This is useful for
        situations where you can't load the entire source into memory at
        one time. In other words, this adapts RegExp into an incremental
        scanner.

        Note that the associated buffer must be large enough to contain
        an entire RegExp match. For example, if you have a regex pattern
        that matches an entire file then the buffer must be at least the
        size of the file. In such cases, one might be advised to find an
        more effective solution.

*******************************************************************************/

class RegexTokenizer : Scanner, ITokenizer
{
        import std.regexp;

        // the compiled pattern supplied at construction; shared across
        // calls, so unlike the stateless tokenizers above this instance
        // carries state
        private RegExp exp;

        /***********************************************************************

                Construct a RegexTokenizer with the provided RegExp.

        ***********************************************************************/

        this (RegExp exp)
        {
                this.exp = exp;
        }

        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                int scan (char[] content)
                {
                        //printf ("'%.*s' : %d\n", content, content.length);

                        // did we find a match?
                        if (exp.test (content))
                        {
                                // slice bounds of the first (whole) match
                                int start = exp.pmatch[0].rm_so;
                                int end   = exp.pmatch[0].rm_eo;

                                // yep: stuff it into the token and go home
                                token.set (content[start..end]);
                                return end;
                        }

                        // this is a bit tricky since RegExp doesn't tell
                        // us when it has a partial match. To compensate,
                        // we force the buffer to load as much as it can
                        // after a failure within a *partial* buffer.
                        if (buffer.getPosition())
                            buffer.compress();
                        else
                           // skip past everything that didn't match. The
                           // entire buffer may still be a partial match,
                           // but then it should be made bigger to begin
                           // with.
                           buffer.skip (content.length);

                        // say we found nothing
                        return notFound (token, content);
                }

                // return the next token using this tokenizer
                return super.next (buffer, &scan);
        }
}


/*******************************************************************************

        It's convenient to have some simple tokenizers available without
        constructing them, so we provide a few to get going with.

        Note that these Tokenizers do not maintain any state of their own.
        Thus they are all thread-safe.
00478 00479 *******************************************************************************/ 00480 00481 struct Tokenizers 00482 { 00483 static LineScanner eol; 00484 static LineTokenizer line; 00485 static SpaceTokenizer space; 00486 static PunctTokenizer punct; 00487 static SimpleTokenizer comma; 00488 00489 /*********************************************************************** 00490 00491 Make a few common tokenizers available as singletons 00492 00493 ***********************************************************************/ 00494 00495 static this () 00496 { 00497 eol = new LineScanner(); 00498 line = new LineTokenizer(); 00499 space = new SpaceTokenizer(); 00500 punct = new PunctTokenizer(); 00501 comma = new SimpleTokenizer(','); 00502 } 00503 } 00504 00505 00506