Main Page | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Class Members | File Members | Related Pages

Copy of Tokenizer.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file Tokenizer.d     
00004 
00005         Copyright (c) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026         4. Derivative works are permitted, but they must carry this notice
00027            in full and credit the original source.
00028 
00029 
00030                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00031 
00032 
00033         @version        Initial version, March 2004
00034         @author         Kris
00035 
00036 
00037 *******************************************************************************/
00038 
00039 module mango.io.Tokenizer;
00040 
00041 version (Ares)
00042          private import  std.c.ctype;
00043       else
00044          private import  std.ctype;
00045 
00046 private import  mango.io.Token,
00047                 mango.io.Exception;
00048 
00049 private import  mango.io.model.IReader,
00050                 mango.io.model.IBuffer,
00051                 mango.io.model.IConduit;
00052 
00053 /*******************************************************************************
00054 
00055         Extract tokens from an IBuffer. This is the base-class for all
00056         Tokenizers, but can also be used outside of the ITokenizer model.
00057 
00058 *******************************************************************************/
00059 
class ScannerTemplate(T)
{ 
        /***********************************************************************
        
                Scan the given IBuffer for another token, and place the
                results in the provided token. Note that this should be
                completely thread-safe so one can instantiate singleton
                tokenizers without issue.

                Each Token is expected to be stripped of the delimiter.
                An end-of-file condition causes trailing content to be 
                placed into the token. Requests made beyond Eof result
                in empty tokens (length == zero).

                The scan delegate examines a chunk of buffer content and
                returns either the number of elements consumed, or
                IConduit.Eof when no complete token is present in that
                chunk (see notFound below).

                Returns true if a token was isolated, false otherwise.

        ***********************************************************************/

        bool next (IBuffer buffer, uint delegate (T[]) scan)
        {
                // NOTE(review): the cast assumes a T[] slice can be viewed
                // as void[] by the buffer; for multi-byte T (wchar/dchar)
                // the element-count vs byte-count convention should be
                // confirmed against IBuffer.read
                while (buffer.read (cast(uint delegate(void[])) scan) == IConduit.Eof)
                      {
                      // no token in what's buffered so far; see whether we
                      // can pull more content from the backing conduit
                      IConduit conduit = buffer.getConduit();
                      if (conduit is null)
                         {
                         // no conduit attached: discard leftovers and quit
                         buffer.skip (buffer.readable());
                         return false;
                         }
                      else
                         {
                         // no more space in the buffer?
                         if (! buffer.writable())
                            {
                            // did we start at the beginning?
                            if (buffer.getPosition ())
                                // nope - move partial token to start of buffer
                                buffer.compress ();
                            else
                               throw new TokenException ("Token is too large to fit within buffer");
                            }

                         // read another chunk of data
                         if (buffer.fill (conduit) == IConduit.Eof)
                            {
                            // source exhausted: hand back trailing content
                            buffer.skip (buffer.readable());
                            return false;
                            }
                         }
                      }
                return true;
        }

        /***********************************************************************
        
                Clean up after we fail to find a token. Trailing content
                is placed into the token, and the scanner is told to try
                and load some more content (where available).

                Returning IConduit.Eof signals next() above to attempt a
                buffer refill before giving up.
                
        ***********************************************************************/

        uint notFound (TokenTemplate!(T) token, T[] content)
        {
                token.set (content);
                return IConduit.Eof;
        }
}

// convenience instantiation for the common char case
alias ScannerTemplate!(char) Scanner;
00128 
00129 
00130 /*******************************************************************************
00131 
00132         Interface to define how Tokenizers should expose their functionality.
00133 
00134 *******************************************************************************/
00135 
interface ITokenizerTemplate(T)
{ 
        /***********************************************************************
        
                Extract the next token from the provided buffer into the
                given token. Implementations return true when a token was
                isolated, false otherwise.

        ***********************************************************************/

        bool next (IBuffer buffer, TokenTemplate!(T) token);
}

// Alias the template itself (not an instantiation) so that ITokenizer!(T)
// can be named as a base interface below. The original alias referenced
// an undefined symbol T at module scope.
alias ITokenizerTemplate ITokenizer;
00146 
00147 
00148 
00149 /*******************************************************************************
00150 
00151         A simple delimiting tokenizer. Use this to tokenize simple streams
00152         such as comma-separated text.
00153 
00154 *******************************************************************************/
00155 
class SimpleTokenizerTemplate(T) : ScannerTemplate!(T), ITokenizer!(T)
{
        // the single element that terminates each token
        private T delimiter;

        /***********************************************************************
        
                Construct a SimpleTokenizer with the given delimiter char.
                More sophisticated delimiters can be constructed by using
                a RegexTokenizer instead. 

        ***********************************************************************/

        this (T delimiter)
        {
                this.delimiter = delimiter;
        }
     

        /***********************************************************************
        
                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was 
                located, false otherwise. 

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer 
                that was not terminated correctly (as in eof conditions). In 
                such cases, tokens are mapped onto remaining content and the 
                buffer will have no more readable content.

        ***********************************************************************/

        // fix: token parameter must be the instantiated TokenTemplate!(T)
        // to match notFound() and the T[] slices passed to token.set()
        bool next (IBuffer buffer, TokenTemplate!(T) token)
        {
                uint scan (T[] content)
                {
                        // token excludes the delimiter; consume one past it
                        foreach (int i, T c; content)
                                 if (c == delimiter)
                                    {
                                    token.set (content[0..i]);
                                    return i+1;
                                    }

                        // no delimiter here: defer to base-class recovery
                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}

// common char instantiation
alias SimpleTokenizerTemplate!(char) SimpleTokenizer;
00211 
00212 
00213 
00214 /*******************************************************************************
00215 
00216         A tokenizer that isolates content enclosed by whitespace.
00217 
00218 *******************************************************************************/
00219 
class SpaceTokenizerTemplate(T) : ScannerTemplate!(T), ITokenizer!(T)
{
        /***********************************************************************
        
                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was 
                located, false otherwise. 

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer 
                that was not terminated correctly (as in eof conditions). In 
                such cases, tokens are mapped onto remaining content and the 
                buffer will have no more readable content.

        ***********************************************************************/

        // fix: token parameter must be the instantiated TokenTemplate!(T)
        // to match notFound() and the T[] slices passed to token.set()
        bool next (IBuffer buffer, TokenTemplate!(T) token)
        {
                uint scan (T[] content)
                {
                        // NOTE(review): isspace takes an int codepoint;
                        // fine for char, confirm behaviour for wide T
                        foreach (int i, T c; content)
                                 if (isspace (c))
                                    {
                                    token.set (content[0..i]);
                                    return i+1;
                                    }

                        // no whitespace found: defer to base-class recovery
                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}

// common char instantiation
alias SpaceTokenizerTemplate!(char) SpaceTokenizer;
00259 
00260 
00261 /*******************************************************************************
00262 
00263         A tokenizer for handling both whitespace and punctuation delimiters.
00264 
00265 *******************************************************************************/
00266 
class PunctTokenizerTemplate(T) : ScannerTemplate!(T), ITokenizer!(T)
{
        /***********************************************************************
        
                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was 
                located, false otherwise. 

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer 
                that was not terminated correctly (as in eof conditions). In 
                such cases, tokens are mapped onto remaining content and the 
                buffer will have no more readable content.

        ***********************************************************************/

        // fix: token parameter must be the instantiated TokenTemplate!(T)
        // to match notFound() and the T[] slices passed to token.set()
        bool next (IBuffer buffer, TokenTemplate!(T) token)
        {
                uint scan (T[] content)
                {
                        // delimit on either whitespace or punctuation;
                        // the token excludes the delimiter itself
                        foreach (int i, T c; content)
                                 if (isspace(c) || ispunct(c))
                                    {
                                    token.set (content[0..i]);
                                    return i+1;
                                    }

                        // no delimiter found: defer to base-class recovery
                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}

// common char instantiation
alias PunctTokenizerTemplate!(char) PunctTokenizer;
00306 
00307 
00308 
00309 /*******************************************************************************
00310 
00311         Tokenize an entire line delimited by a single '\n' character, or
00312         by a "\r\n" pair.
00313 
00314 *******************************************************************************/
00315 
class LineTokenizerTemplate(T) : ScannerTemplate!(T), ITokenizer!(T)
{
        /***********************************************************************
        
                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was 
                located, false otherwise. 

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer 
                that was not terminated correctly (as in eof conditions). In 
                such cases, tokens are mapped onto remaining content and the 
                buffer will have no more readable content.

        ***********************************************************************/

        // fix: token parameter must be the instantiated TokenTemplate!(T)
        // to match notFound() and the T[] slices passed to token.set()
        bool next (IBuffer buffer, TokenTemplate!(T) token)
        {
                uint scan (T[] content)
                {      
                        foreach (int i, T c; content)
                                 if (c == '\n')
                                    {
                                    // drop a preceding '\r' so "\r\n" and
                                    // "\n" both yield a clean line slice
                                    int slice = i;
                                    if (i && content[i-1] == '\r')
                                        --slice;
                                    token.set (content[0..slice]);
                                    return i+1;
                                    }

                        // no newline found: defer to base-class recovery
                        return notFound (token, content);
                }

                return super.next (buffer, &scan);
        }
}
   
// common char instantiation
alias LineTokenizerTemplate!(char) LineTokenizer;
00358 
00359      
00360 /*******************************************************************************
00361 
00362         Eat everything until we reach a newline. Use this with a Reader, 
00363         where you wish to discard everything else in the current line. 
00364 
00365 *******************************************************************************/
00366 
class LineScannerTemplate(T) : ScannerTemplate!(T), IReadable!(T)
{       
        /***********************************************************************
        
                IReadable hook so that Reader.get() can drive this scanner
                directly against the reader's buffer.

        ***********************************************************************/

        void read (IReader r)
        {
                next (r.getBuffer());
        }
                
        /***********************************************************************
        
                Consume and discard buffer content up to and including the
                next '\n' character. Returns true when a newline was seen,
                false once the source is exhausted.

        ***********************************************************************/

        bool next (IBuffer buffer)
        {
                // report how many elements to swallow, or Eof when this
                // chunk holds no newline (prompting a refill upstream)
                uint eat (T[] content)
                {
                        for (int i = 0; i < content.length; ++i)
                             if (content[i] == '\n')
                                 return i + 1;
                        return IConduit.Eof;
                }

                return super.next (buffer, &eat);
        }
}

// common char instantiation
alias LineScannerTemplate!(char) LineScanner;
00402 
00403 
00404 version (Ares) {}
00405 else
00406 {
00407 /*******************************************************************************
00408 
00409         Wrap a tokenizer around the std.RegExp class. This is useful for
00410         situations where you can't load the entire source into memory at
00411         one time. In other words, this adapts RegExp into an incremental
00412         scanner.
00413 
00414         Note that the associated buffer must be large enough to contain
00415         an entire RegExp match. For example, if you have a regex pattern
00416         that matches an entire file then the buffer must be at least the
00417         size of the file. In such cases, one might be advised to find a 
00418         more effective solution.
00419 
00420 *******************************************************************************/
00421 
class RegexTokenizerTemplate(T) : ScannerTemplate!(T), ITokenizer!(T)
{
        // NOTE(review): classic std.regexp exposes a plain (char-only)
        // RegExp class; the RegExp!(T) instantiations below presume a
        // templated variant -- confirm against the Phobos version in use
        import std.regexp;
    
        private RegExp!(T) exp;

        /***********************************************************************
        
                Construct a RegexTokenizer with the provided RegExp. The
                expression instance is retained (not copied), so sharing
                one tokenizer across threads is subject to RegExp's own
                thread-safety.

        ***********************************************************************/

        this (RegExp!(T) exp)
        {
                this.exp = exp;
        }

        /***********************************************************************
        
                Locate the next token from the provided buffer, and map a
                buffer reference into token. Returns true if a token was 
                located, false otherwise. 

                Note that the buffer content is not duplicated. Instead, a
                slice of the buffer is referenced by the token. You can use
                Token.clone() or Token.toString().dup() to copy content per
                your application needs.

                Note also that there may still be one token left in a buffer 
                that was not terminated correctly (as in eof conditions). In 
                such cases, tokens are mapped onto remaining content and the 
                buffer will have no more readable content.

        ***********************************************************************/

        // fix: token parameter must be the instantiated TokenTemplate!(T)
        // to match notFound() and the T[] slices passed to token.set()
        bool next (IBuffer buffer, TokenTemplate!(T) token)
        {
                uint scan (T[] content)
                {      
                        // did we find a match?
                        if (exp.test (content))
                           {
                           // pmatch[0] holds the offsets of the overall match
                           int start = exp.pmatch[0].rm_so;
                           int end   = exp.pmatch[0].rm_eo;

                           // yep: stuff it into the token and go home
                           token.set (content[start..end]);
                           return end;
                           }
                        
                        // this is a bit tricky since RegExp doesn't tell
                        // us when it has a partial match. To compensate,
                        // we force the buffer to load as much as it can
                        // after a failure within a *partial* buffer.
                        if (buffer.getPosition())
                            buffer.compress();
                        else
                           // skip past everything that didn't match. The
                           // entire buffer may still be a partial match,
                           // but then it should be made bigger to begin
                           // with.
                           buffer.skip (content.length);

                        // say we found nothing
                        return notFound (token, content);
                }

                // return the next token using this tokenizer
                return super.next (buffer, &scan);
        }
}
00495 }   
00496      
00497 /*******************************************************************************
00498 
00499         It's convenient to have some simple tokenizers available without 
00500         constructing them, so we provide a few to get going with.
00501 
00502         Note that these Tokenizers do not maintain any state of their own. 
00503         Thus they are all thread-safe.
00504 
00505 *******************************************************************************/
00506 
struct Tokenizers
{       
        static LineScanner      eol;    // discards through end-of-line
        static LineTokenizer    line;   // one token per line
        static SpaceTokenizer   space;  // whitespace-delimited tokens
        static PunctTokenizer   punct;  // whitespace/punctuation delimited
        static SimpleTokenizer  comma;  // comma-delimited tokens
         
        /***********************************************************************

                Build the shared singleton instances at module startup
                so common tokenizers are ready to use without explicit
                construction.

        ***********************************************************************/

        static this ()
        {
                comma = new SimpleTokenizer (',');
                punct = new PunctTokenizer ();
                space = new SpaceTokenizer ();
                line  = new LineTokenizer ();
                eol   = new LineScanner ();
        }
}
00530  
00531 
00532 

Generated on Sat Dec 24 17:28:32 2005 for Mango by  doxygen 1.4.0