Main Page | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Class Members | File Members | Related Pages

Tokenizer.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file Tokenizer.d     
00004 
00005         Copyright (c) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026         4. Derivative works are permitted, but they must carry this notice
00027            in full and credit the original source.
00028 
00029 
00030                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00031 
00032 
00033         @version        Initial version, March 2004
00034         @author         Kris
00035 
00036 
00037 *******************************************************************************/
00038 
00039 module mango.io.Tokenizer;
00040 
00041 version (Ares)
00042          private import  std.c.ctype;
00043       else
00044          private import  std.ctype;
00045 
00046 private import  mango.io.Token,
00047                 mango.io.Exception;
00048 
00049 private import  mango.io.model.IReader,
00050                 mango.io.model.IBuffer,
00051                 mango.io.model.IConduit;
00052 
/*******************************************************************************

        Extract tokens from an IBuffer. This is the base-class for all
        Tokenizers, but can also be used outside of the ITokenizer model.

*******************************************************************************/

class Scanner
{ 
        /***********************************************************************
        
                Scan the given IBuffer for another token, and place the
                results in the provided token. Note that this should be
                completely thread-safe so one can instantiate singleton
                tokenizers without issue.

                The scan delegate examines a chunk of buffer content and
                either returns the number of chars consumed (having set
                the token) or IConduit.Eof to indicate it needs more
                content before a token can be isolated.

                Each Token is expected to be stripped of the delimiter.
                An end-of-file condition causes trailing content to be 
                placed into the token. Requests made beyond Eof result
                in empty tokens (length == zero).

                Returns true if a token was isolated, false otherwise.

        ***********************************************************************/

        bool next (IBuffer buffer, uint delegate (char[]) scan)
        {
                // adapt the char[] delegate to the void[] signature expected
                // by IBuffer.read(); Eof from scan means "incomplete token"
                while (buffer.read (cast(uint delegate(void[])) scan) == IConduit.Eof)
                      {
                      IConduit conduit = buffer.getConduit();
                      if (conduit is null)
                         {
                         // nothing to refill from: trailing content was
                         // already handed to the token by the scan delegate
                         // (via notFound), so drain the buffer and report
                         // that no delimited token was isolated
                         buffer.skip (buffer.readable());
                         return false;
                         }
                      else
                         {
                         // no more space in the buffer?
                         if (! buffer.writable())
                            {
                            // did we start at the beginning?
                            if (buffer.getPosition ())
                                // nope - move partial token to start of buffer
                                buffer.compress ();
                            else
                               // the partial token already spans the whole
                               // buffer; it can never fit
                               throw new TokenException ("Token is too large to fit within buffer");
                            }

                         // read another chunk of data
                         if (buffer.fill (conduit) == IConduit.Eof)
                            {
                            // conduit exhausted: same trailing-content
                            // treatment as the null-conduit case above
                            buffer.skip (buffer.readable());
                            return false;
                            }
                         }
                      }
                return true;
        }

        /***********************************************************************
        
                Clean up after we fail to find a token. Trailing content
                is placed into the token, and the scanner is told to try
                and load some more content (where available).
                
        ***********************************************************************/

        uint notFound (Token token, char[] content)
        {
                token.set (content);
                return IConduit.Eof;
        }
}
00126 
00127 
/*******************************************************************************

        Interface to define how Tokenizers should expose their functionality.

*******************************************************************************/

interface ITokenizer
{ 
        /***********************************************************************
        
                Locate the next token from the provided buffer, and map a
                buffer reference into the provided token. Implementations
                return true if a token was isolated, false otherwise.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token);
}
00142 
00143 
/*******************************************************************************

        A simple delimiting tokenizer. Use this to tokenize simple streams
        such as comma-separated text.

*******************************************************************************/

class SimpleTokenizer : Scanner, ITokenizer
{
        private char delimiter;

        /***********************************************************************
        
                Construct a SimpleTokenizer with the given delimiter char.
                More sophisticated delimiters can be constructed by using
                a RegexTokenizer instead. 

        ***********************************************************************/

        this (char delimiter)
        {
                this.delimiter = delimiter;
        }
     

        /***********************************************************************
        
                Locate the next token within the given buffer, mapping a
                buffer reference into the token. Returns true when a token
                was isolated, and false otherwise.

                The buffer content is not duplicated: the token merely
                references a slice of the buffer. Use Token.clone() or
                Token.toString().dup() where a private copy is required.

                One final token may remain unterminated within a buffer
                (as in eof conditions); such a token is mapped onto the
                remaining content, leaving the buffer with no readable
                content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                uint scan (char[] content)
                {
                        // locate the delimiter; the token excludes it
                        for (int pos = 0; pos < content.length; ++pos)
                             if (content[pos] == delimiter)
                                {
                                token.set (content[0..pos]);
                                return pos + 1;
                                }

                        // no delimiter in this chunk; ask for more content
                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}
00204 
00205 
/*******************************************************************************

        A tokenizer that isolates content enclosed by whitespace.

*******************************************************************************/

class SpaceTokenizer : Scanner, ITokenizer
{
        /***********************************************************************
        
                Locate the next token within the given buffer, mapping a
                buffer reference into the token. Returns true when a token
                was isolated, and false otherwise.

                The buffer content is not duplicated: the token merely
                references a slice of the buffer. Use Token.clone() or
                Token.toString().dup() where a private copy is required.

                One final token may remain unterminated within a buffer
                (as in eof conditions); such a token is mapped onto the
                remaining content, leaving the buffer with no readable
                content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                uint scan (char[] content)
                {
                        // whitespace terminates the token (and is consumed)
                        for (int pos = 0; pos < content.length; ++pos)
                             if (isspace (content[pos]))
                                {
                                token.set (content[0..pos]);
                                return pos + 1;
                                }

                        // no whitespace in this chunk; ask for more content
                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}
00249 
00250 
/*******************************************************************************

        A tokenizer for handling both whitespace and punctuation delimiters.

*******************************************************************************/

class PunctTokenizer : Scanner, ITokenizer
{
        /***********************************************************************
        
                Locate the next token within the given buffer, mapping a
                buffer reference into the token. Returns true when a token
                was isolated, and false otherwise.

                The buffer content is not duplicated: the token merely
                references a slice of the buffer. Use Token.clone() or
                Token.toString().dup() where a private copy is required.

                One final token may remain unterminated within a buffer
                (as in eof conditions); such a token is mapped onto the
                remaining content, leaving the buffer with no readable
                content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                uint scan (char[] content)
                {
                        // either whitespace or punctuation terminates the
                        // token (and the delimiter itself is consumed)
                        for (int pos = 0; pos < content.length; ++pos)
                            {
                            char ch = content[pos];
                            if (isspace(ch) || ispunct(ch))
                               {
                               token.set (content[0..pos]);
                               return pos + 1;
                               }
                            }

                        // no delimiter in this chunk; ask for more content
                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}
00294 
00295 
/*******************************************************************************

        Tokenize an entire line delimited by a single '\n' character, or
        by a "\r\n" pair.

*******************************************************************************/

class LineTokenizer : Scanner, ITokenizer
{
        /***********************************************************************
        
                Locate the next token within the given buffer, mapping a
                buffer reference into the token. Returns true when a token
                was isolated, and false otherwise.

                The buffer content is not duplicated: the token merely
                references a slice of the buffer. Use Token.clone() or
                Token.toString().dup() where a private copy is required.

                One final token may remain unterminated within a buffer
                (as in eof conditions); such a token is mapped onto the
                remaining content, leaving the buffer with no readable
                content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                uint scan (char[] content)
                {      
                        for (int pos = 0; pos < content.length; ++pos)
                             if (content[pos] == '\n')
                                {
                                // trim a preceding '\r' from the token so
                                // both "\r\n" and bare '\n' line endings
                                // yield the same token content
                                int end = (pos && content[pos-1] == '\r') ? pos - 1 : pos;
                                token.set (content[0..end]);
                                return pos + 1;
                                }

                        // no newline in this chunk; ask for more content
                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}
00343    
00344      
/*******************************************************************************

        Eat everything until we reach a newline. Use this with a Reader, 
        where you wish to discard everything else in the current line. 

*******************************************************************************/

class LineScanner : Scanner, IReadable
{       
        /***********************************************************************
        
                IReadable interface to support Reader.get()

        ***********************************************************************/

        void read (IReader r)
        {
                // delegate to next() against the reader's buffer
                next (r.getBuffer());
        }
                
        /***********************************************************************
        
                Eat all content until we see a '\n' character. The content
                is simply discarded.

        ***********************************************************************/

        bool next (IBuffer buffer)
        {
                uint scan (char[] content)
                {      
                        // consume up to and including the newline; nothing
                        // is placed into a token
                        for (int pos = 0; pos < content.length; ++pos)
                             if (content[pos] == '\n')
                                 return pos + 1;

                        // no newline yet; ask for more content
                        return IConduit.Eof;
                }

                return super.next (buffer, &scan);
        }
}
00385 
version (Ares) {}
else
{
/*******************************************************************************

        Wrap a tokenizer around the std.RegExp class. This is useful for
        situations where you can't load the entire source into memory at
        one time. In other words, this adapts RegExp into an incremental
        scanner.

        Note that the associated buffer must be large enough to contain
        an entire RegExp match. For example, if you have a regex pattern
        that matches an entire file then the buffer must be at least the
        size of the file. In such cases, one might be advised to find a 
        more effective solution.

        NOTE(review): unlike the other tokenizers here, RegExp carries
        match state (pmatch) inside the shared exp instance, so a single
        RegexTokenizer is likely not thread-safe -- confirm before
        sharing one across threads.

*******************************************************************************/

class RegexTokenizer : Scanner, ITokenizer
{
        import std.regexp;
    
        // the pattern applied by every scan; shared across next() calls
        private RegExp exp;

        /***********************************************************************
        
                Construct a RegexTokenizer with the provided RegExp.

        ***********************************************************************/

        this (RegExp exp)
        {
                this.exp = exp;
        }

        /***********************************************************************
        
                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was 
                located, false otherwise. 

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer 
                that was not terminated correctly (as in eof conditions). In 
                such cases, tokens are mapped onto remaining content and the 
                buffer will have no more readable content.

        ***********************************************************************/

        bool next (IBuffer buffer, Token token)
        {
                uint scan (char[] content)
                {      
                        //printf ("'%.*s' : %d\n", content, content.length);

                        // did we find a match?
                        if (exp.test (content))
                           {
                           // pmatch[0] holds the span of the whole match
                           int start = exp.pmatch[0].rm_so;
                           int end   = exp.pmatch[0].rm_eo;

                           // yep: stuff it into the token and go home
                           token.set (content[start..end]);
                           return end;
                           }
                        
                        // this is a bit tricky since RegExp doesn't tell
                        // us when it has a partial match. To compensate,
                        // we force the buffer to load as much as it can
                        // after a failure within a *partial* buffer.
                        if (buffer.getPosition())
                            buffer.compress();
                        else
                           // skip past everything that didn't match. The
                           // entire buffer may still be a partial match,
                           // but then it should be made bigger to begin
                           // with.
                           buffer.skip (content.length);

                        // say we found nothing
                        return notFound (token, content);
                }

                // return the next token using this tokenizer
                return super.next (buffer, &scan);
        }
}
}   
00478      
/*******************************************************************************

        It's convenient to have some simple tokenizers available without 
        constructing them, so we provide a few to get going with.

        Note that these Tokenizers do not maintain any state of their own. 
        Thus they are all thread-safe.

*******************************************************************************/

struct Tokenizers
{       
        static LineScanner      eol;    // discards content up to and including '\n'
        static LineTokenizer    line;   // isolates '\n' (or "\r\n") delimited lines
        static SpaceTokenizer   space;  // whitespace-delimited tokens
        static PunctTokenizer   punct;  // whitespace- or punctuation-delimited tokens
        static SimpleTokenizer  comma;  // comma-delimited tokens
         
        /***********************************************************************

                Make a few common tokenizers available as singletons      

        ***********************************************************************/

        static this ()
        {
                eol = new LineScanner();
                line = new LineTokenizer();           
                space = new SpaceTokenizer();
                punct = new PunctTokenizer();
                comma = new SimpleTokenizer(',');
        }
}
00512  
00513 
00514 

Generated on Sat Dec 24 17:28:34 2005 for Mango by  doxygen 1.4.0