Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members | Related Pages

Tokenizer.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file Tokenizer.d     
00004 
00005         Copyright (c) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026         4. Derivative works are permitted, but they must carry this notice
00027            in full and credit the original source.
00028 
00029 
00030                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00031 
00032 
00033         @version        Initial version, March 2004
00034         @author         Kris
00035 
00036 
00037 *******************************************************************************/
00038 
00039 module mango.io.Tokenizer;
00040 
00041 private import  std.ctype;
00042 
00043 private import  mango.io.Token,
00044                 mango.io.Exception;
00045 
00046 private import  mango.io.model.IReader,
00047                 mango.io.model.IBuffer,
00048                 mango.io.model.IConduit;
00049 
00050 /*******************************************************************************
00051 
00052         Extract tokens from an IBuffer. This is the base-class for all
00053         Tokenizers, but can also be used outside of the ITokenizer model.
00054 
00055 *******************************************************************************/
00056 
00057 class Scanner
00058 { 
00059         /***********************************************************************
00060         
00061                 Scan the given IBuffer for another token, and place the
00062                 results in the provided token. Note that this should be
00063                 completely thread-safe so one can instantiate singleton
00064                 tokenizers without issue.
00065 
00066                 Each Token is expected to be stripped of the delimiter.
00067                 An end-of-file condition causes trailing content to be 
00068                 placed into the token. Requests made beyond Eof result
00069                 in empty tokens (length == zero).
00070 
00071                 Returns true if a token was isolated, false otherwise.
00072 
00073         ***********************************************************************/
00074 
00075         bool next (IBuffer buffer, int delegate (char[]) scan)
00076         {
00077                 while (buffer.read (cast(int delegate(void[])) scan) == IConduit.Eof)  // Eof from scan means no complete token in current content
00078                       {
00079                       IConduit conduit = buffer.getConduit();
00080                       if (conduit is null)  // no backing conduit to refill from
00081                          {
00082                          buffer.skip (buffer.readable());  // consume the unread remainder; scan has already seen it
00083                          return false;
00084                          }
00085                       else
00086                          {
0087                          // no more space in the buffer?
00088                          if (! buffer.writable())
00089                             {
00090                             // did we start at the beginning?
00091                             if (buffer.getPosition ())
00092                                 // nope - move partial token to start of buffer
00093                                 buffer.compress ();
00094                             else
00095                                throw new TokenException ("Token is too large to fit within buffer");
00096                             }
00097 
00098                          // read another chunk of data
00099                          if (conduit.read (buffer) == IConduit.Eof)
00100                             {
00101                             buffer.skip (buffer.readable());  // conduit exhausted: consume leftovers and report no token
00102                             return false;
00103                             }
00104                          }
00105                       }
00106                 return true;  // scan isolated a token
00107         }
00108 
00109         /***********************************************************************
00110         
00111                 Clean up after we fail to find a token. Trailing content
00112                 is placed into the token, and the scanner is told to try
00113                 and load some more content (where available).
00114                 
00115         ***********************************************************************/
00116 
00117         int notFound (Token token, char[] content)
00118         {
00119                 token.set (content);  // expose trailing content via the token
00120                 return IConduit.Eof;  // signals next() to attempt a refill
00121         }
00122 }
00123 
00124 
00125 /*******************************************************************************
00126 
00127         Interface to define how Tokenizers should expose their functionality.
00128 
00129 *******************************************************************************/
00130 
00131 interface ITokenizer
00132 { 
00133         /***********************************************************************
00134                 Isolate the next token from buffer into token. Returns
00135         ***********************************************************************/
00136 
00137         bool next (IBuffer buffer, Token token);
00138 }
00139 
00140 
00141 /*******************************************************************************
00142 
00143         A simple delimiting tokenizer. Use this to tokenize simple streams
00144         such as comma-separated text.
00145 
00146 *******************************************************************************/
00147 
00148 class SimpleTokenizer : Scanner, ITokenizer
00149 {
00150         private char delimiter;  // single-character token separator
00151 
00152         /***********************************************************************
00153         
00154                 Construct a SimpleTokenizer with the given delimiter char.
00155                 More sophisticated delimiters can be constructed by using
00156                 a RegexTokenizer instead. 
00157 
00158         ***********************************************************************/
00159 
00160         this (char delimiter)
00161         {
00162                 this.delimiter = delimiter;
00163         }
00164      
00165 
00166         /***********************************************************************
00167         
00168                 Locate the next token from the provided buffer, and map a
00169                 buffer reference into token. Returns true if a token was 
00170                 located, false otherwise. 
00171 
00172                 Note that the buffer content is not duplicated. Instead, a
00173                 slice of the buffer is referenced by the token. You can use
00174                 Token.clone() or Token.toString().dup() to copy content per
00175                 your application needs.
00176 
00177                 Note also that there may still be one token left in a buffer 
00178                 that was not terminated correctly (as in eof conditions). In 
00179                 such cases, tokens are mapped onto remaining content and the 
00180                 buffer will have no more readable content.
00181 
00182         ***********************************************************************/
00183 
00184         bool next (IBuffer buffer, Token token)
00185         {
00186                 int scan (char[] content)
00187                 {
00188                         foreach (int i, char c; content)
00189                                  if (c == delimiter)
00190                                     {
00191                                     token.set (content[0..i]);  // token excludes the delimiter itself
00192                                     return i+1;  // consume token plus delimiter
00193                                     }
00194 
00195                         return notFound (token, content);  // no delimiter seen: ask Scanner for more content
00196                 }
00197 
00198                 return super.next (buffer, &scan);
00199         }
00200 }
00201 
00202 
00203 /*******************************************************************************
00204 
00205         A tokenizer that isolates content enclosed by whitespace.
00206 
00207 *******************************************************************************/
00208 
00209 class SpaceTokenizer : Scanner, ITokenizer
00210 {
00211         /***********************************************************************
00212         
00213                 Locate the next token from the provided buffer, and map a
00214                 buffer reference into token. Returns true if a token was 
00215                 located, false otherwise. 
00216 
00217                 Note that the buffer content is not duplicated. Instead, a
00218                 slice of the buffer is referenced by the token. You can use
00219                 Token.clone() or Token.toString().dup() to copy content per
00220                 your application needs.
00221 
00222                 Note also that there may still be one token left in a buffer 
00223                 that was not terminated correctly (as in eof conditions). In 
00224                 such cases, tokens are mapped onto remaining content and the 
00225                 buffer will have no more readable content.
00226 
00227         ***********************************************************************/
00228 
00229         bool next (IBuffer buffer, Token token)
00230         {
00231                 int scan (char[] content)
00232                 {
00233                         foreach (int i, char c; content)
00234                                  if (isspace (c))  // std.ctype whitespace test
00235                                     {
00236                                     token.set (content[0..i]);  // token excludes the whitespace delimiter
00237                                     return i+1;  // consume token plus delimiter
00238                                     }
00239 
00240                         return notFound (token, content);  // no whitespace seen: ask Scanner for more content
00241                 }
00242 
00243                 return super.next (buffer, &scan);
00244         }
00245 }
00246 
00247 
00248 /*******************************************************************************
00249 
00250         A tokenizer for handling both whitespace and punctuation delimiters.
00251 
00252 *******************************************************************************/
00253 
00254 class PunctTokenizer : Scanner, ITokenizer
00255 {
00256         /***********************************************************************
00257         
00258                 Locate the next token from the provided buffer, and map a
00259                 buffer reference into token. Returns true if a token was 
00260                 located, false otherwise. 
00261 
00262                 Note that the buffer content is not duplicated. Instead, a
00263                 slice of the buffer is referenced by the token. You can use
00264                 Token.clone() or Token.toString().dup() to copy content per
00265                 your application needs.
00266 
00267                 Note also that there may still be one token left in a buffer 
00268                 that was not terminated correctly (as in eof conditions). In 
00269                 such cases, tokens are mapped onto remaining content and the 
00270                 buffer will have no more readable content.
00271 
00272         ***********************************************************************/
00273 
00274         bool next (IBuffer buffer, Token token)
00275         {
00276                 int scan (char[] content)
00277                 {
00278                         foreach (int i, char c; content)
00279                                  if (isspace(c) || ispunct(c))  // either whitespace or punctuation delimits
00280                                     {
00281                                     token.set (content[0..i]);  // token excludes the delimiter
00282                                     return i+1;  // consume token plus delimiter
00283                                     }
00284 
00285                         return notFound (token, content);  // no delimiter seen: ask Scanner for more content
00286                 }
00287 
00288                 return super.next (buffer, &scan);
00289         }
00290 }
00291 
00292 
00293 /*******************************************************************************
00294 
00295         Tokenize an entire line delimited by a single '\\n' character, or
00296         by a "\r\n" pair.
00297 
00298 *******************************************************************************/
00299 
00300 class LineTokenizer : Scanner, ITokenizer
00301 {
00302         /***********************************************************************
00303         
00304                 Locate the next token from the provided buffer, and map a
00305                 buffer reference into token. Returns true if a token was 
00306                 located, false otherwise. 
00307 
00308                 Note that the buffer content is not duplicated. Instead, a
00309                 slice of the buffer is referenced by the token. You can use
00310                 Token.clone() or Token.toString().dup() to copy content per
00311                 your application needs.
00312 
00313                 Note also that there may still be one token left in a buffer 
00314                 that was not terminated correctly (as in eof conditions). In 
00315                 such cases, tokens are mapped onto remaining content and the 
00316                 buffer will have no more readable content.
00317 
00318         ***********************************************************************/
00319 
00320         bool next (IBuffer buffer, Token token)
00321         {
00322                 int scan (char[] content)
00323                 {      
00324                         foreach (int i, char c; content)
00325                                  if (c == '\n')
00326                                     {
00327                                     int slice = i;  // token length, before checking for a CR
00328                                     if (i && content[i-1] == '\r')
00329                                         --slice;  // trim the '\r' of a "\r\n" pair
00330                                     token.set (content[0..slice]);  // token excludes line terminator(s)
00331                                     return i+1;  // consume line plus '\n'
00332                                     }
00333 
00334                         return notFound (token, content);  // no '\n' seen: ask Scanner for more content
00335                 }
00336 
00337                 return super.next (buffer, &scan);
00338         }
00339 }
00340    
00341      
00342 /*******************************************************************************
00343 
00344         Eat everything until we reach a newline. Use this with a Reader, 
00345         where you wish to discard everything else in the current line. 
00346 
00347 *******************************************************************************/
00348 
00349 class LineScanner : Scanner, IReadable
00350 {       
00351         /***********************************************************************
00352         
00353                 IReadable interface to support Reader.get()
00354 
00355         ***********************************************************************/
00356 
00357         void read (IReader r)
00358         {
00359                 next (r.getBuffer());  // discard the rest of the current line
00360         }
00361                 
00362         /***********************************************************************
00363         
00364                 Eat all content until we see a '\n' character. The content
00365                 is simply discarded.
00366 
00367         ***********************************************************************/
00368 
00369         bool next (IBuffer buffer)
00370         {
00371                 int scan (char[] content)
00372                 {      
00373                         foreach (int i, char c; content)
00374                                  if (c == '\n')
00375                                      return i+1;  // consume through (and including) the newline
00376                         return IConduit.Eof;  // no newline yet: request more content (no token to fill)
00377                 }
00378 
00379                 return super.next (buffer, &scan);
00380         }
00381 }
00382 
00383 
00384 /*******************************************************************************
00385 
00386         Wrap a tokenizer around the std.RegExp class. This is useful for
00387         situations where you can't load the entire source into memory at
00388         one time. In other words, this adapts RegExp into an incremental
00389         scanner.
00390 
00391         Note that the associated buffer must be large enough to contain
00392         an entire RegExp match. For example, if you have a regex pattern
00393         that matches an entire file then the buffer must be at least the
00394         size of the file. In such cases, one might be advised to find a 
00395         more effective solution.
00396 
00397 *******************************************************************************/
00398 
00399 class RegexTokenizer : Scanner, ITokenizer
00400 {
00401         import std.regexp;
00402     
00403         private RegExp exp;  // pattern used to isolate each token
00404 
00405         /***********************************************************************
00406         
00407                 Construct a RegexTokenizer with the provided RegExp.
00408 
00409         ***********************************************************************/
00410 
00411         this (RegExp exp)
00412         {
00413                 this.exp = exp;
00414         }
00415 
00416         /***********************************************************************
00417         
00418                 Locate the next token from the provided buffer, and map a
00419                 buffer reference into token. Returns true if a token was 
00420                 located, false otherwise. 
00421 
00422                 Note that the buffer content is not duplicated. Instead, a
00423                 slice of the buffer is referenced by the token. You can use
00424                 Token.clone() or Token.toString().dup() to copy content per
00425                 your application needs.
00426 
00427                 Note also that there may still be one token left in a buffer 
00428                 that was not terminated correctly (as in eof conditions). In 
00429                 such cases, tokens are mapped onto remaining content and the 
00430                 buffer will have no more readable content.
00431 
00432         ***********************************************************************/
00433 
00434         bool next (IBuffer buffer, Token token)
00435         {
00436                 int scan (char[] content)
00437                 {      
00438                         //printf ("'%.*s' : %d\n", content, content.length);
00439 
00440                         // did we find a match?
00441                         if (exp.test (content))
00442                            {
00443                            int start = exp.pmatch[0].rm_so;  // match start offset within content
00444                            int end   = exp.pmatch[0].rm_eo;  // match end offset (one past last matched char)
00445 
00446                            // yep: stuff it into the token and go home
00447                            token.set (content[start..end]);
00448                            return end;  // consume through end of match
00449                            }
00450                         
00451                         // this is a bit tricky since RegExp doesn't tell
00452                         // us when it has a partial match. To compensate,
00453                         // we force the buffer to load as much as it can
00454                         // after a failure within a *partial* buffer.
00455                         if (buffer.getPosition())
00456                             buffer.compress();
00457                         else
00458                            // skip past everything that didn't match. The
00459                            // entire buffer may still be a partial match,
00460                            // but then it should be made bigger to begin
00461                            // with.
00462                            buffer.skip (content.length);
00463 
00464                         // say we found nothing
00465                         return notFound (token, content);
00466                 }
00467 
00468                 // return the next token using this tokenizer
00469                 return super.next (buffer, &scan);
00470         }
00471 }
00472    
00473      
00474 /*******************************************************************************
00475 
00476         It's convenient to have some simple tokenizers available without 
00477         constructing them, so we provide a few to get going with.
00478 
00479         Note that these Tokenizers do not maintain any state of their own. 
00480         Thus they are all thread-safe.
00481 
00482 *******************************************************************************/
00483 
00484 struct Tokenizers
00485 {       
00486         static LineScanner      eol;    // discards content through end-of-line
00487         static LineTokenizer    line;   // isolates '\n' / "\r\n" delimited lines
00488         static SpaceTokenizer   space;  // whitespace-delimited tokens
00489         static PunctTokenizer   punct;  // whitespace- or punctuation-delimited tokens
00490         static SimpleTokenizer  comma;  // comma-delimited tokens
00491          
00492         /***********************************************************************
00493 
00494                 Make a few common tokenizers available as singletons      
00495 
00496         ***********************************************************************/
00497 
00498         static this ()
00499         {
00500                 eol = new LineScanner();
00501                 line = new LineTokenizer();           
00502                 space = new SpaceTokenizer();
00503                 punct = new PunctTokenizer();
00504                 comma = new SimpleTokenizer(',');
00505         }
00506 }
00507  
00508 
00509 

Generated on Sat Apr 9 20:11:28 2005 for Mango by doxygen 1.3.6