/*******************************************************************************

        @file Tokenizer.d

        Copyright (c) 2004 Kris Bell

        This software is provided 'as-is', without any express or implied
        warranty. In no event will the authors be held liable for damages
        of any kind arising from the use of this software.

        Permission is hereby granted to anyone to use this software for any
        purpose, including commercial applications, and to alter it and/or
        redistribute it freely, subject to the following restrictions:

        1. The origin of this software must not be misrepresented; you must
           not claim that you wrote the original software. If you use this
           software in a product, an acknowledgment within documentation of
           said product would be appreciated but is not required.

        2. Altered source versions must be plainly marked as such, and must
           not be misrepresented as being the original software.

        3. This notice may not be removed or altered from any distribution
           of the source.

        4. Derivative works are permitted, but they must carry this notice
           in full and credit the original source.


                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


        @version        Initial version, March 2004
        @author         Kris


*******************************************************************************/

module mango.io.Tokenizer;

version (Ares)
         private import std.c.ctype;
else
   private import std.ctype;

private import mango.io.Token,
               mango.io.Exception;

private import mango.io.model.IReader,
               mango.io.model.IBuffer,
               mango.io.model.IConduit;

/*******************************************************************************

        Extract tokens from an IBuffer. This is the base-class for all
        Tokenizers, but can also be used outside of the ITokenizer model.

*******************************************************************************/

class Scanner
{
        /***********************************************************************

                Scan the given IBuffer for another token, and place the
                results in the provided token. Note that this should be
                completely thread-safe so one can instantiate singleton
                tokenizers without issue.

                Each Token is expected to be stripped of the delimiter.
                An end-of-file condition causes trailing content to be
                placed into the token. Requests made beyond Eof result
                in empty tokens (length == zero).

                The scan delegate inspects a chunk of buffer content and
                returns either the number of elements consumed (token
                found) or IConduit.Eof (more data needed).

                Returns true if a token was isolated, false otherwise.

        ***********************************************************************/

        bool next (IBuffer buffer, uint delegate (char[]) scan)
        {
                // NOTE(review): the cast assumes uint delegate(char[]) is
                // layout-compatible with uint delegate(void[]) — confirm
                // this holds on each target compiler
                while (buffer.read (cast(uint delegate(void[])) scan) == IConduit.Eof)
                      {
                      IConduit conduit = buffer.getConduit();
                      if (conduit is null)
                         {
                         // no backing conduit: drain trailing content and quit
                         buffer.skip (buffer.readable());
                         return false;
                         }
                      else
                         {
                         // no more space in the buffer?
                         if (! buffer.writable())
                            {
                            // did we start at the beginning?
                            if (buffer.getPosition ())
                                // nope - move partial token to start of buffer
                                buffer.compress ();
                            else
                               // a token must fit entirely within the buffer
                               throw new TokenException ("Token is too large to fit within buffer");
                            }

                         // read another chunk of data
                         if (buffer.fill (conduit) == IConduit.Eof)
                            {
                            // eof: whatever remains was mapped by the scan
                            // delegate; drain it from the buffer
                            buffer.skip (buffer.readable());
                            return false;
                            }
                         }
                      }
                return true;
        }

        /***********************************************************************

                Clean up after we fail to find a token. Trailing content
                is placed into the token, and the scanner is told to try
                and load some more content (where available).

        ***********************************************************************/

        uint notFound (Token token, char[] content)
        {
                token.set (content);
                return IConduit.Eof;
        }
}


/*******************************************************************************

        Interface to define how Tokenizers should expose their functionality.

*******************************************************************************/

interface ITokenizer
{
        /***********************************************************************

                Isolate the next token from buffer and map it into token.
                Returns true if a token was located, false otherwise.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token);
}


/*******************************************************************************

        A simple delimiting tokenizer. Use this to tokenize simple streams
        such as comma-seperated text.

*******************************************************************************/

class SimpleTokenizer : Scanner, ITokenizer
{
        // the single delimiting character, fixed at construction time
        private char delimiter;

        /***********************************************************************

                Construct a SimpleTokenizer with the given delimiter char.
                More sophisticated delimiters can be constructed by using
                a RegexTokenizer instead.

        ***********************************************************************/

        this (char delimiter)
        {
                this.delimiter = delimiter;
        }


        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.
00174 00175 Note that the buffer content is not duplicated. Instead, a 00176 slice of the buffer is referenced by the token. You can use 00177 Token.clone() or Token.toString().dup() to copy content per 00178 your application needs. 00179 00180 Note also that there may still be one token left in a buffer 00181 that was not terminated correctly (as in eof conditions). In 00182 such cases, tokens are mapped onto remaining content and the 00183 buffer will have no more readable content. 00184 00185 ***********************************************************************/ 00186 00187 bool next (IBuffer buffer, Token token) 00188 { 00189 uint scan (char[] content) 00190 { 00191 foreach (int i, char c; content) 00192 if (c == delimiter) 00193 { 00194 token.set (content[0..i]); 00195 return i+1; 00196 } 00197 00198 return notFound (token, content); 00199 } 00200 00201 return super.next (buffer, &scan); 00202 } 00203 } 00204 00205 00206 /******************************************************************************* 00207 00208 A tokenizer that isolates content enclosed by whitespace. 00209 00210 *******************************************************************************/ 00211 00212 class SpaceTokenizer : Scanner, ITokenizer 00213 { 00214 /*********************************************************************** 00215 00216 Locate the next token from the provided buffer, and map a 00217 buffer reference into token. Returns true if a token was 00218 located, false otherwise. 00219 00220 Note that the buffer content is not duplicated. Instead, a 00221 slice of the buffer is referenced by the token. You can use 00222 Token.clone() or Token.toString().dup() to copy content per 00223 your application needs. 00224 00225 Note also that there may still be one token left in a buffer 00226 that was not terminated correctly (as in eof conditions). In 00227 such cases, tokens are mapped onto remaining content and the 00228 buffer will have no more readable content. 
00229 00230 ***********************************************************************/ 00231 00232 bool next (IBuffer buffer, Token token) 00233 { 00234 uint scan (char[] content) 00235 { 00236 foreach (int i, char c; content) 00237 if (isspace (c)) 00238 { 00239 token.set (content[0..i]); 00240 return i+1; 00241 } 00242 00243 return notFound (token, content); 00244 } 00245 00246 return super.next (buffer, &scan); 00247 } 00248 } 00249 00250 00251 /******************************************************************************* 00252 00253 A tokenizer for handling both whitespace and punctuation delimiters. 00254 00255 *******************************************************************************/ 00256 00257 class PunctTokenizer : Scanner, ITokenizer 00258 { 00259 /*********************************************************************** 00260 00261 Locate the next token from the provided buffer, and map a 00262 buffer reference into token. Returns true if a token was 00263 located, false otherwise. 00264 00265 Note that the buffer content is not duplicated. Instead, a 00266 slice of the buffer is referenced by the token. You can use 00267 Token.clone() or Token.toString().dup() to copy content per 00268 your application needs. 00269 00270 Note also that there may still be one token left in a buffer 00271 that was not terminated correctly (as in eof conditions). In 00272 such cases, tokens are mapped onto remaining content and the 00273 buffer will have no more readable content. 
00274 00275 ***********************************************************************/ 00276 00277 bool next (IBuffer buffer, Token token) 00278 { 00279 uint scan (char[] content) 00280 { 00281 foreach (int i, char c; content) 00282 if (isspace(c) || ispunct(c)) 00283 { 00284 token.set (content[0..i]); 00285 return i+1; 00286 } 00287 00288 return notFound (token, content); 00289 } 00290 00291 return super.next (buffer, &scan); 00292 } 00293 } 00294 00295 00296 /******************************************************************************* 00297 00298 Tokenize an entire line delimited by a single '\\n' character, or 00299 by a "\r\n" pair. 00300 00301 *******************************************************************************/ 00302 00303 class LineTokenizer : Scanner, ITokenizer 00304 { 00305 /*********************************************************************** 00306 00307 Locate the next token from the provided buffer, and map a 00308 buffer reference into token. Returns true if a token was 00309 located, false otherwise. 00310 00311 Note that the buffer content is not duplicated. Instead, a 00312 slice of the buffer is referenced by the token. You can use 00313 Token.clone() or Token.toString().dup() to copy content per 00314 your application needs. 00315 00316 Note also that there may still be one token left in a buffer 00317 that was not terminated correctly (as in eof conditions). In 00318 such cases, tokens are mapped onto remaining content and the 00319 buffer will have no more readable content. 
00320 00321 ***********************************************************************/ 00322 00323 bool next (IBuffer buffer, Token token) 00324 { 00325 uint scan (char[] content) 00326 { 00327 foreach (int i, char c; content) 00328 if (c == '\n') 00329 { 00330 int slice = i; 00331 if (i && content[i-1] == '\r') 00332 --slice; 00333 token.set (content[0..slice]); 00334 return i+1; 00335 } 00336 00337 return notFound (token, content); 00338 } 00339 00340 return super.next (buffer, &scan); 00341 } 00342 } 00343 00344 00345 /******************************************************************************* 00346 00347 Eat everything until we reach a newline. Use this with a Reader, 00348 where you wish to discard everything else in the current line. 00349 00350 *******************************************************************************/ 00351 00352 class LineScanner : Scanner, IReadable 00353 { 00354 /*********************************************************************** 00355 00356 IReadable interface to support Reader.get() 00357 00358 ***********************************************************************/ 00359 00360 void read (IReader r) 00361 { 00362 next (r.getBuffer()); 00363 } 00364 00365 /*********************************************************************** 00366 00367 Eat all content until we see a '\n' character. The content 00368 is simply discarded. 00369 00370 ***********************************************************************/ 00371 00372 bool next (IBuffer buffer) 00373 { 00374 uint scan (char[] content) 00375 { 00376 foreach (int i, char c; content) 00377 if (c == '\n') 00378 return i+1; 00379 return IConduit.Eof; 00380 } 00381 00382 return super.next (buffer, &scan); 00383 } 00384 } 00385 00386 version (Ares) {} 00387 else 00388 { 00389 /******************************************************************************* 00390 00391 Wrap a tokenizer around the std.RegExp class. 
This is useful for
        situations where you can't load the entire source into memory at
        one time. In other words, this adapts RegExp into an incremental
        scanner.

        Note that the associated buffer must be large enough to contain
        an entire RegExp match. For example, if you have a regex pattern
        that matches an entire file then the buffer must be at least the
        size of the file. In such cases, one might be advised to find a
        more effective solution.

*******************************************************************************/

class RegexTokenizer : Scanner, ITokenizer
{
        import std.regexp;

        // NOTE(review): RegExp carries per-match state (pmatch), so unlike
        // the other tokenizers, sharing one instance across threads is
        // presumably unsafe — confirm before using as a singleton
        private RegExp exp;

        /***********************************************************************

                Construct a RegexTokenizer with the provided RegExp.

        ***********************************************************************/

        this (RegExp exp)
        {
                this.exp = exp;
        }

        /***********************************************************************

                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was
                located, false otherwise.

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer
                that was not terminated correctly (as in eof conditions). In
                such cases, tokens are mapped onto remaining content and the
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                uint scan (char[] content)
                {
                        //printf ("'%.*s' : %d\n", content, content.length);

                        // did we find a match?
                        if (exp.test (content))
                           {
                           // pmatch[0] holds the offsets of the full match
                           int start = exp.pmatch[0].rm_so;
                           int end = exp.pmatch[0].rm_eo;

                           // yep: stuff it into the token and go home
                           token.set (content[start..end]);
                           return end;
                           }

                        // this is a bit tricky since RegExp doesn't tell
                        // us when it has a partial match. To compensate,
                        // we force the buffer to load as much as it can
                        // after a failure within a *partial* buffer.
                        if (buffer.getPosition())
                            buffer.compress();
                        else
                           // skip past everything that didn't match. The
                           // entire buffer may still be a partial match,
                           // but then it should be made bigger to begin
                           // with.
                           buffer.skip (content.length);

                        // say we found nothing
                        return notFound (token, content);
                }

                // return the next token using this tokenizer
                return super.next (buffer, &scan);
        }
}
}

/*******************************************************************************

        It's convenient to have some simple tokenizers available without
        constructing them, so we provide a few to get going with.

        Note that these Tokenizers do not maintain any state of their own,
        thus they are all thread-safe.

*******************************************************************************/

struct Tokenizers
{
        // shared singleton instances, created by the static ctor below
        static LineScanner eol;
        static LineTokenizer line;
        static SpaceTokenizer space;
        static PunctTokenizer punct;
        static SimpleTokenizer comma;

        /***********************************************************************

                Make a few common tokenizers available as singletons

        ***********************************************************************/

        static this ()
        {
                eol = new LineScanner();
                line = new LineTokenizer();
                space = new SpaceTokenizer();
                punct = new PunctTokenizer();
                comma = new SimpleTokenizer(',');
        }
}