Main Page | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members | Related Pages

UNormalize.d

Go to the documentation of this file.
00001 /*******************************************************************************
00002 
00003         @file UNormalize.d
00004         
00005         Copyright (c) 2004 Kris Bell
00006         
00007         This software is provided 'as-is', without any express or implied
00008         warranty. In no event will the authors be held liable for damages
00009         of any kind arising from the use of this software.
00010         
00011         Permission is hereby granted to anyone to use this software for any 
00012         purpose, including commercial applications, and to alter it and/or 
00013         redistribute it freely, subject to the following restrictions:
00014         
00015         1. The origin of this software must not be misrepresented; you must 
00016            not claim that you wrote the original software. If you use this 
00017            software in a product, an acknowledgment within documentation of 
00018            said product would be appreciated but is not required.
00019 
00020         2. Altered source versions must be plainly marked as such, and must 
00021            not be misrepresented as being the original software.
00022 
00023         3. This notice may not be removed or altered from any distribution
00024            of the source.
00025 
00026         4. Derivative works are permitted, but they must carry this notice
00027            in full and credit the original source.
00028 
00029 
00030                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00031 
00032 
00033         @version        Initial version, October 2004      
00034         @author         Kris
00035 
00036         Note that this package and documentation is built around the ICU 
00037         project (http://oss.software.ibm.com/icu/). Below is the license 
00038         statement as specified by that software:
00039 
00040 
00041                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
00042 
00043 
00044         ICU License - ICU 1.8.1 and later
00045 
00046         COPYRIGHT AND PERMISSION NOTICE
00047 
00048         Copyright (c) 1995-2003 International Business Machines Corporation and 
00049         others.
00050 
00051         All rights reserved.
00052 
00053         Permission is hereby granted, free of charge, to any person obtaining a
00054         copy of this software and associated documentation files (the
00055         "Software"), to deal in the Software without restriction, including
00056         without limitation the rights to use, copy, modify, merge, publish,
00057         distribute, and/or sell copies of the Software, and to permit persons
00058         to whom the Software is furnished to do so, provided that the above
00059         copyright notice(s) and this permission notice appear in all copies of
00060         the Software and that both the above copyright notice(s) and this
00061         permission notice appear in supporting documentation.
00062 
00063         THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
00064         OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00065         MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
00066         OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
00067         HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
00068         INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
00069         FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
00070         NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
00071         WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
00072 
00073         Except as contained in this notice, the name of a copyright holder
00074         shall not be used in advertising or otherwise to promote the sale, use
00075         or other dealings in this Software without prior written authorization
00076         of the copyright holder.
00077 
00078         ----------------------------------------------------------------------
00079 
00080         All trademarks and registered trademarks mentioned herein are the 
00081         property of their respective owners.
00082 
00083 *******************************************************************************/
00084 
00085 module mango.icu.UNormalize;
00086 
00087 private import  mango.icu.ICU,
00088                 mango.icu.UString,
00089                 mango.icu.ULocale;
00090 
00091 /*******************************************************************************
00092 
00093         UNormalize transforms Unicode text into an equivalent composed or 
00094         decomposed form, allowing for easier sorting and searching 
00095         of text. UNormalize supports the standard normalization forms 
00096         described in http://www.unicode.org/unicode/reports/tr15/
00097 
00098         Characters with accents or other adornments can be encoded 
00099         in several different ways in Unicode. For example, take the 
00100         character A-acute. In Unicode, this can be encoded as a single 
00101         character (the "composed" form):
00102         
00103                 00C1 LATIN CAPITAL LETTER A WITH ACUTE
00104 
00105         or as two separate characters (the "decomposed" form):
00106 
00107                 0041 LATIN CAPITAL LETTER A 0301 COMBINING ACUTE ACCENT
00108 
00109         To a user of your program, however, both of these sequences 
00110         should be treated as the same "user-level" character "A with 
00111         acute accent". When you are searching or comparing text, you 
00112         must ensure that these two sequences are treated equivalently. 
00113         In addition, you must handle characters with more than one 
00114         accent. Sometimes the order of a character's combining accents 
00115         is significant, while in other cases accent sequences in different 
00116         orders are really equivalent.
00117 
00118         Similarly, the string "ffi" can be encoded as three separate 
00119         letters:
00120 
00121                 0066 LATIN SMALL LETTER F 0066 LATIN SMALL LETTER F 
00122                 0069 LATIN SMALL LETTER I
00123 
00124         or as the single character
00125 
00126                 FB03 LATIN SMALL LIGATURE FFI
00127 
00128         The ffi ligature is not a distinct semantic character, and strictly 
00129         speaking it shouldn't be in Unicode at all, but it was included for 
00130         compatibility with existing character sets that already provided it. 
00131         The Unicode standard identifies such characters by giving them 
00132         "compatibility" decompositions into the corresponding semantic 
00133         characters. When sorting and searching, you will often want to use 
00134         these mappings.
00135 
00136         unorm_normalize helps solve these problems by transforming text into 
00137         the canonical composed and decomposed forms as shown in the first 
00138         example above. In addition, you can have it perform compatibility 
00139         decompositions so that you can treat compatibility characters the 
00140         same as their equivalents. Finally, UNormalize rearranges 
00141         accents into the proper canonical order, so that you do not have 
00142         to worry about accent rearrangement on your own.
00143 
00144         Form FCD, "Fast C or D", is also designed for collation. It allows 
00145         an algorithm that works under "canonical closure" (as in collation) 
00146         to operate on strings that are not necessarily normalized; i.e., 
00147         such an algorithm treats precomposed characters and their 
00148         decomposed equivalents the same.
00149 
00150         It is not a normalization form because it does not provide for 
00151         uniqueness of representation. Multiple strings may be canonically 
00152         equivalent (their NFDs are identical) and may all conform to FCD 
00153         without being identical themselves.
00154 
00155         The form is defined such that the "raw decomposition", the 
00156         recursive canonical decomposition of each character, results 
00157         in a string that is canonically ordered. This means that 
00158         precomposed characters are allowed for as long as their 
00159         decompositions do not need canonical reordering.
00160 
00161         Its advantage for a process like collation is that all NFD 
00162         and most NFC texts - and many unnormalized texts - already 
00163         conform to FCD and do not need to be normalized (NFD) for 
00164         such a process. The FCD quick check will return UNORM_YES 
00165         for most strings in practice.
00166 
00167         For more details on FCD see the collation design document: 
00168         http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/collation/ICU_collation_design.htm
00169 
00170         ICU collation performs either NFD or FCD normalization 
00171         automatically if normalization is turned on for the collator 
00172         object. Beyond collation and string search, normalized strings 
00173         may be useful for string equivalence comparisons, transliteration/
00174         transcription, unique representations, etc.
00175 
00176         The W3C generally recommends exchanging texts in NFC. Note also 
00177         that most legacy character encodings use only precomposed forms 
00178         and often do not encode any combining marks by themselves. For 
00179         conversion to such character encodings the Unicode text needs to 
00180         be normalized to NFC. For more usage examples, see the Unicode 
00181         Standard Annex.         
00182 
00183         See <A HREF="http://oss.software.ibm.com/icu/apiref/unorm_8h.html">
00184         this page</A> for full details.
00185 
00186 
00187 *******************************************************************************/
00188 
00189 class UNormalize : ICU
00190 {
00191         /***********************************************************************
00192 
00193                 Normalization forms accepted by normalize(), check(), 
00194                 isNormalized() and concatenate(). The value is handed 
00195                 unchanged to the bound ICU function; the numbering 
00196                 presumably mirrors ICU's UNormalizationMode (confirm 
00197                 against ICU's unorm.h before changing it).
00198 
00199         ***********************************************************************/
00200 
00201         enum    Mode 
00202                 {
00203                 None    = 1, 
00204                 NFD     = 2, 
00205                 NFKD    = 3, 
00206                 NFC     = 4,
00207                 Default = NFC, 
00208                 NFKC    = 5, 
00209                 FCD     = 6, 
00210                 Count
00211                 }
00212 
00213         /***********************************************************************
00214 
00215                 Outcome of the quick check performed by check()
00216 
00217         ***********************************************************************/
00218 
00219         enum    Check 
00220                 { 
00221                 No, 
00222                 Yes, 
00223                 Maybe  
00224                 }
00225 
00226         /***********************************************************************
00227 
00228                 Option bits, handed unchanged to the bound ICU functions
00229 
00230         ***********************************************************************/
00231 
00232         enum    Options
00233                 { 
00234                 None      = 0x00,
00235                 Unicode32 = 0x20 
00236                 }
00237 
00238         /***********************************************************************
00239 
00240                 Normalize a string. The string will be normalized according 
00241                 to the specified normalization mode and options; the result 
00242                 is delivered into dst via dst.format().
00243 
00244         ***********************************************************************/
00245 
00246         static void normalize (UText src, UString dst, Mode mode, Options o = Options.None)
00247         {
00248                 // the output pointer is called 'p', as in concatenate(), so
00249                 // that it cannot be confused with the enclosing 'dst' argument
00250                 uint fmt (wchar* p, uint len, inout Error e)
00251                 {
00252                         return unorm_normalize (src.get, src.len, mode, o, p, len, e);
00253                 }
00254 
00255                 dst.format (&fmt, "failed to normalize");
00256         }
00257 
00258         /***********************************************************************
00259 
00260                 Perform a quick check on a string, to quickly determine 
00261                 if the string is in a particular normalization format.
00262 
00263                 Three types of result can be returned: Yes, No or Maybe. 
00264                 Result Yes indicates that the argument string is in the 
00265                 desired normalized format, No indicates that the argument 
00266                 string is not in the desired normalized format. A Maybe 
00267                 result indicates that a more thorough check is required; 
00268                 the user may have to put the string in its normalized 
00269                 form and compare the results.
00270 
00271         ***********************************************************************/
00272 
00273         static Check check (UText t, Mode mode, Options o = Options.None)
00274         {      
00275                 Error e; 
00276 
00277                 Check c = cast(Check) unorm_quickCheckWithOptions (t.get, t.len, mode, o, e);
00278                 testError (e, "failed to perform normalization check");
00279                 return c;
00280         }
00281 
00282         /***********************************************************************
00283 
00284                 Test if a string is in a given normalization form. 
00285 
00286                 Unlike check(), this function returns a definitive result, 
00287                 never a "maybe". For NFD, NFKD, and FCD, both functions 
00288                 work exactly the same. For NFC and NFKC, where check() 
00289                 may return "maybe", this function will perform further 
00290                 tests to arrive at a true/false result.
00291 
00292         ***********************************************************************/
00293 
00294         static bool isNormalized (UText t, Mode mode, Options o = Options.None)
00295         {      
00296                 Error e; 
00297 
00298                 byte b = unorm_isNormalizedWithOptions (t.get, t.len, mode, o, e);
00299                 testError (e, "failed to perform normalization test");
00300                 return b != 0;
00301         }
00302 
00303         /***********************************************************************
00304 
00305                 Concatenate normalized strings, making sure that the result 
00306                 is normalized as well. If both the left and the right strings 
00307                 are in the normalization form according to "mode/options", 
00308                 then the result will be
00309 
00310                         dest=normalize(left+right, mode, options)
00311 
00312                 With the input strings already being normalized, this function 
00313                 will use unorm_next() and unorm_previous() to find the adjacent 
00314                 end pieces of the input strings. Only the concatenation of these 
00315                 end pieces will be normalized and then concatenated with the 
00316                 remaining parts of the input strings.
00317 
00318                 It is allowed to have dst==left to avoid copying the entire 
00319                 left string.
00320 
00321         ***********************************************************************/
00322 
00323         static void concatenate (UText left, UText right, UString dst, Mode mode, Options o = Options.None)
00324         {      
00325                 uint fmt (wchar* p, uint len, inout Error e)
00326                 {
00327                         return unorm_concatenate (left.get, left.len, right.get, right.len, p, len, mode, o, e);
00328                 }
00329 
00330                 dst.format (&fmt, "failed to concatenate");
00331         }
00332 
00333         /***********************************************************************
00334 
00335                 Compare two strings for canonical equivalence. Further 
00336                 options include case-insensitive comparison and code 
00337                 point order (as opposed to code unit order).
00338 
00339                 Canonical equivalence between two strings is defined as 
00340                 their normalized forms (NFD or NFC) being identical. 
00341                 This function compares strings incrementally instead of
00342                 normalizing (and optionally case-folding) both strings 
00343                 entirely, improving performance significantly.
00344 
00345                 Bulk normalization is only necessary if the strings do 
00346                 not fulfill the FCD conditions. Only in this case, and 
00347                 only if the strings are relatively long, is memory 
00348                 allocated temporarily. For FCD strings and short non-FCD 
00349                 strings there is no memory allocation.
00350 
00351                 The value produced by unorm_compare is returned unchanged.
00352 
00353                 NOTE(review): the Options enum of this class declares 
00354                 neither a case-insensitive nor a code-point-order flag, 
00355                 and unorm_compare presumably expects its own option bit 
00356                 layout (including a shifted Unicode32) -- confirm against 
00357                 the ICU unorm_compare documentation.
00358 
00359         ***********************************************************************/
00360 
00361         static int compare (UText left, UText right, Options o = Options.None)
00362         {      
00363                 Error e; 
00364 
00365                 int i = unorm_compare (left.get, left.len, right.get, right.len, o, e);
00366                 testError (e, "failed to compare");
00367                 return i;
00368         }
00369 
00370 
00371         /***********************************************************************
00372         
00373                 Bind the ICU functions from a shared library. This is
00374                 complicated by the issues regarding D and DLLs on the
00375                 Windows platform
00376 
00377         ***********************************************************************/
00378 
00379         private static void* library;
00380 
00381         /***********************************************************************
00382 
00383                 C-linkage pointers to the ICU entry points; they are 
00384                 listed in 'targets' and resolved by the static 
00385                 constructor. unorm_compare is declared as returning 
00386                 int because compare() consumes its result as a signed 
00387                 value.
00388 
00389         ***********************************************************************/
00390 
00391         private static extern (C) 
00392         {
00393                 uint  function (wchar*, uint, uint, uint, wchar*, uint, inout Error) unorm_normalize;
00394                 uint  function (wchar*, uint, uint, uint, inout Error) unorm_quickCheckWithOptions;
00395                 byte  function (wchar*, uint, uint, uint, inout Error) unorm_isNormalizedWithOptions;
00396                 uint  function (wchar*, uint, wchar*, uint, wchar*, uint, uint, uint, inout Error) unorm_concatenate;
00397                 int   function (wchar*, uint, wchar*, uint, uint, inout Error) unorm_compare;
00398         }
00399 
00400         /***********************************************************************
00401 
00402                 Table pairing the address of each function pointer 
00403                 above with the name of the symbol to be bound to it
00404 
00405         ***********************************************************************/
00406 
00407         static  FunctionLoader.Bind[] targets = 
00408                 [
00409                 {cast(void**) &unorm_normalize,                 "unorm_normalize"},
00410                 {cast(void**) &unorm_quickCheckWithOptions,     "unorm_quickCheckWithOptions"},
00411                 {cast(void**) &unorm_isNormalizedWithOptions,   "unorm_isNormalizedWithOptions"},
00412                 {cast(void**) &unorm_concatenate,               "unorm_concatenate"},
00413                 {cast(void**) &unorm_compare,                   "unorm_compare"},
00414                 ];
00415 
00416         /***********************************************************************
00417 
00418                 Bind every entry of 'targets' via FunctionLoader.bind(), 
00419                 using the library identified by icuuc, and keep the 
00420                 returned handle for the static destructor
00421 
00422         ***********************************************************************/
00423 
00424         static this ()
00425         {
00426                 library = FunctionLoader.bind (icuuc, targets);
00427         }
00428 
00429         /***********************************************************************
00430 
00431                 Hand the handle obtained by the static constructor 
00432                 back to FunctionLoader.unbind()
00433 
00434         ***********************************************************************/
00435 
00436         static ~this ()
00437         {
00438                 FunctionLoader.unbind (library);
00439         }
00440 }

Generated on Sat Apr 9 20:11:29 2005 for Mango by doxygen 1.3.6