Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members | Related Pages

Tokenizer.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file Tokenizer.d     
00004 
00005         Copyright (C) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026 
00027                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00028 
00029 
00030         @version        Initial version, March 2004
00031         @author         Kris
00032 
00033 
00034 *******************************************************************************/
00035 
00036 module mango.io.Tokenizer;
00037 
00038 private import  std.ctype;
00039 
00040 private import  mango.io.Token,
00041                 mango.io.Exception;
00042 
00043 private import  mango.io.model.IReader,
00044                 mango.io.model.IBuffer,
00045                 mango.io.model.IConduit;
00046 
00047 /*******************************************************************************
00048 
00049         Extract tokens from an IBuffer. This is the base-class for all
00050         Tokenizers, but can also be used outside of the ITokenizer model.
00051 
00052 *******************************************************************************/
00053 
class Scanner
{ 
        /***********************************************************************
        
                Scan the given IBuffer for another token, and place the
                results in the provided token. Note that this should be
                completely thread-safe so one can instantiate singleton
                tokenizers without issue.

                The 'scan' delegate inspects a chunk of buffer content
                and returns either the number of chars consumed (a token
                was found) or IConduit.Eof (no complete token yet, so
                more content should be loaded).

                Each Token is expected to be stripped of the delimiter.
                An end-of-file condition causes trailing content to be 
                placed into the token. Requests made beyond Eof result
                in empty tokens (length == zero).

                Returns true if a token was isolated, false otherwise.

        ***********************************************************************/

        bool next (IBuffer buffer, int delegate (char[]) scan)
        {
                // keep presenting buffer content to the delegate until it
                // reports a hit; the cast adapts our char[] delegate to the
                // void[] signature that IBuffer.read() expects
                while (buffer.read (cast(int delegate(void[])) scan) == IConduit.Eof)
                      {
                      IConduit conduit = buffer.getConduit();
                      if (conduit is null)
                         {
                         // no backing conduit to refill from: consume the
                         // remainder (the delegate already mapped it into
                         // the token via notFound) and report failure
                         buffer.skip (buffer.readable());
                         return false;
                         }
                      else
                         {
                         // no more space in the buffer?
                         if (! buffer.writable())
                            {
                            // did we start at the beginning?
                            if (buffer.getPosition ())
                                // nope - move partial token to start of buffer
                                buffer.compress ();
                            else
                               throw new TokenException ("Token is too large to fit within buffer");
                            }

                         // read another chunk of data
                         if (conduit.read (buffer) == IConduit.Eof)
                            {
                            // conduit is drained: hand back trailing content
                            buffer.skip (buffer.readable());
                            return false;
                            }
                         }
                      }
                return true;
        }

        /***********************************************************************
        
                Clean up after we fail to find a token. Trailing content
                is placed into the token, and the scanner is told to try
                and load some more content (where available).

                Returns IConduit.Eof so the enclosing next() loop knows
                to attempt a refill (or to terminate at true end-of-file).
                
        ***********************************************************************/

        int notFound (Token token, char[] content)
        {
                token.set (content);
                return IConduit.Eof;
        }
}
00120 
00121 
00122 /*******************************************************************************
00123 
00124         Interface to define how Tokenizers should expose their functionality.
00125 
00126 *******************************************************************************/
00127 
interface ITokenizer
{ 
        /***********************************************************************
        
                Isolate the next token from the provided buffer, mapping
                buffer content into 'token'. Returns true if a token was
                located, false otherwise (e.g. once the underlying
                conduit is exhausted).

        ***********************************************************************/

        bool next (IBuffer buffer, Token token);
}
00136 
00137 
00138 /*******************************************************************************
00139 
00140         A simple delimiting tokenizer. Use this to tokenize simple streams
00141         such as comma-separated text.
00142 
00143 *******************************************************************************/
00144 
class SimpleTokenizer : Scanner, ITokenizer
{
        private char delimiter;

        /***********************************************************************
        
                Build a tokenizer that splits upon the given delimiter
                character. Where a single char is insufficient, consider
                using a RegexTokenizer instead.

        ***********************************************************************/

        this (char delimiter)
        {
                this.delimiter = delimiter;
        }
     

        /***********************************************************************
        
                Isolate the next delimited token within the buffer, and
                map a buffer slice into 'token'. Returns true if a token
                was located, false otherwise.

                Buffer content is not duplicated: the token references a
                slice of the buffer. Use Token.clone() or the .dup of
                Token.toString() when a private copy is needed.

                On eof conditions an unterminated trailing token (if any)
                is mapped into 'token', and the buffer is left with no
                readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                int split (char[] content)
                {
                        // stop at the first occurrence of the delimiter
                        for (int i = 0; i < content.length; ++i)
                             if (content[i] == delimiter)
                                {
                                token.set (content[0..i]);
                                return i + 1;
                                }

                        // delimiter absent from this chunk; ask for more
                        return notFound (token, content);
                }

                return super.next (buffer, &split);
        }
}
00198 
00199 
00200 /*******************************************************************************
00201 
00202         A tokenizer that isolates content enclosed by whitespace.
00203 
00204 *******************************************************************************/
00205 
class SpaceTokenizer : Scanner, ITokenizer
{
        /***********************************************************************
        
                Isolate the next whitespace-delimited token within the
                buffer, and map a buffer slice into 'token'. Returns true
                if a token was located, false otherwise.

                Buffer content is not duplicated: the token references a
                slice of the buffer. Use Token.clone() or the .dup of
                Token.toString() when a private copy is needed.

                On eof conditions an unterminated trailing token (if any)
                is mapped into 'token', and the buffer is left with no
                readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                int split (char[] content)
                {
                        // stop at the first whitespace character
                        for (int i = 0; i < content.length; ++i)
                             if (isspace (content[i]))
                                {
                                token.set (content[0..i]);
                                return i + 1;
                                }

                        // no whitespace within this chunk; ask for more
                        return notFound (token, content);
                }

                return super.next (buffer, &split);
        }
}
00243 
00244 
00245 /*******************************************************************************
00246 
00247         A tokenizer for handling both whitespace and punctuation delimiters.
00248 
00249 *******************************************************************************/
00250 
class PunctTokenizer : Scanner, ITokenizer
{
        /***********************************************************************
        
                Isolate the next token delimited by either whitespace or
                punctuation, mapping a buffer slice into 'token'. Returns
                true if a token was located, false otherwise.

                Buffer content is not duplicated: the token references a
                slice of the buffer. Use Token.clone() or the .dup of
                Token.toString() when a private copy is needed.

                On eof conditions an unterminated trailing token (if any)
                is mapped into 'token', and the buffer is left with no
                readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                int split (char[] content)
                {
                        // stop at the first whitespace or punctuation char
                        for (int i = 0; i < content.length; ++i)
                             if (isspace (content[i]) || ispunct (content[i]))
                                {
                                token.set (content[0..i]);
                                return i + 1;
                                }

                        // no delimiter within this chunk; ask for more
                        return notFound (token, content);
                }

                return super.next (buffer, &split);
        }
}
00288 
00289 
00290 /*******************************************************************************
00291 
00292         Tokenize an entire line delimited by a single '\n' character, or
00293         by a "\r\n" pair.
00294 
00295 *******************************************************************************/
00296 
class LineTokenizer : Scanner, ITokenizer
{
        /***********************************************************************
        
                Isolate the next line from the buffer, where a line is
                terminated by either a lone '\n' or a "\r\n" pair (the
                terminator is excluded from the token). Returns true if
                a line was located, false otherwise.

                Buffer content is not duplicated: the token references a
                slice of the buffer. Use Token.clone() or the .dup of
                Token.toString() when a private copy is needed.

                On eof conditions an unterminated trailing line (if any)
                is mapped into 'token', and the buffer is left with no
                readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                int split (char[] content)
                {      
                        for (int i = 0; i < content.length; ++i)
                             if (content[i] == '\n')
                                {
                                // trim a preceding '\r' so "\r\n" pairs
                                // yield the same token as a bare '\n'
                                int end = (i > 0 && content[i-1] == '\r') ? i-1 : i;
                                token.set (content[0..end]);
                                return i + 1;
                                }

                        // no line terminator in this chunk; ask for more
                        return notFound (token, content);
                }

                return super.next (buffer, &split);
        }
}
00337    
00338      
00339 /*******************************************************************************
00340 
00341         Eat everything until we reach a newline. Use this with a Reader, 
00342         where you wish to discard everything else in the current line. 
00343 
00344 *******************************************************************************/
00345 
class LineScanner : Scanner, IReadable
{       
        /***********************************************************************
        
                IReadable hook, enabling use via Reader.get() and the
                Reader opShl operator.

        ***********************************************************************/

        void read (IReader r)
        {
                next (r.getBuffer());
        }
                
        /***********************************************************************
        
                Discard buffer content up to and including the next '\n'
                character. Returns true once a newline was consumed,
                false when the conduit is exhausted first.

        ***********************************************************************/

        bool next (IBuffer buffer)
        {
                int eat (char[] content)
                {      
                        for (int i = 0; i < content.length; ++i)
                             if (content[i] == '\n')
                                 return i + 1;

                        // newline not reached yet; request more content
                        return IConduit.Eof;
                }

                return super.next (buffer, &eat);
        }
}
00379 
00380 
00381 /*******************************************************************************
00382 
00383         Wrap a tokenizer around the std.RegExp class. This is useful for
00384         situations where you can't load the entire source into memory at
00385         one time. In other words, this adapts RegExp into an incremental
00386         scanner.
00387 
00388         Note that the associated buffer must be large enough to contain
00389         an entire RegExp match. For example, if you have a regex pattern
00390         that matches an entire file then the buffer must be at least the
00391         size of the file. In such cases, one might be advised to find a 
00392         more effective solution.
00393 
00394 *******************************************************************************/
00395 
class RegexTokenizer : Scanner, ITokenizer
{
        import std.regexp;
    
        // the pattern used to isolate tokens; shared across calls, so a
        // given instance should not be used concurrently from multiple
        // threads (RegExp keeps match state in pmatch)
        private RegExp exp;

        /***********************************************************************
        
                Construct a RegexTokenizer with the provided RegExp.

        ***********************************************************************/

        this (RegExp exp)
        {
                this.exp = exp;
        }

        /***********************************************************************
        
                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was 
                located, false otherwise. 

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer 
                that was not terminated correctly (as in eof conditions). In 
                such cases, tokens are mapped onto remaining content and the 
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                int scan (char[] content)
                {      
                        //printf ("'%.*s' : %d\n", content, content.length);

                        // did we find a match?
                        if (exp.test (content))
                           {
                           // pmatch[0] holds the span of the full match
                           int start = exp.pmatch[0].rm_so;
                           int end   = exp.pmatch[0].rm_eo;

                           // yep: stuff it into the token and go home
                           token.set (content[start..end]);
                           return end;
                           }
                        
                        // this is a bit tricky since RegExp doesn't tell
                        // us when it has a partial match. To compensate,
                        // we force the buffer to load as much as it can
                        // after a failure within a *partial* buffer.
                        if (buffer.getPosition())
                            buffer.compress();
                        else
                           // skip past everything that didn't match. The
                           // entire buffer may still be a partial match,
                           // but then it should be made bigger to begin
                           // with.
                           buffer.skip (content.length);

                        // say we found nothing
                        return notFound (token, content);
                }

                // return the next token using this tokenizer
                return super.next (buffer, &scan);
        }
}
00469    
00470      
00471 /*******************************************************************************
00472 
00473         It's convenient to have some simple tokenizers available without 
00474         constructing them, so we provide a few to get going with.
00475 
00476         Note that these Tokenizers do not maintain any state of their own. 
00477         Thus they are all thread-safe.
00478 
00479 *******************************************************************************/
00480 
struct Tokenizers
{       
        static LineScanner      eol;            // discard up to '\n'
        static LineTokenizer    line;           // one line per token
        static SpaceTokenizer   space;          // whitespace delimited
        static PunctTokenizer   punct;          // whitespace/punctuation
        static SimpleTokenizer  comma;          // split upon ','
         
        /***********************************************************************

                Construct the shared singletons at module startup. These
                tokenizers carry no per-call state, so sharing them across
                threads is safe.

        ***********************************************************************/

        static this ()
        {
                comma = new SimpleTokenizer (',');
                punct = new PunctTokenizer();
                space = new SpaceTokenizer();
                line  = new LineTokenizer();
                eol   = new LineScanner();
        }
}
00504  
00505 
00506 

Generated on Sun Nov 7 19:06:53 2004 for Mango by doxygen 1.3.6