00001 /******************************************************************************* 00002 00003 @file UNormalize.d 00004 00005 Copyright (c) 2004 Kris Bell 00006 00007 This software is provided 'as-is', without any express or implied 00008 warranty. In no event will the authors be held liable for damages 00009 of any kind arising from the use of this software. 00010 00011 Permission is hereby granted to anyone to use this software for any 00012 purpose, including commercial applications, and to alter it and/or 00013 redistribute it freely, subject to the following restrictions: 00014 00015 1. The origin of this software must not be misrepresented; you must 00016 not claim that you wrote the original software. If you use this 00017 software in a product, an acknowledgment within documentation of 00018 said product would be appreciated but is not required. 00019 00020 2. Altered source versions must be plainly marked as such, and must 00021 not be misrepresented as being the original software. 00022 00023 3. This notice may not be removed or altered from any distribution 00024 of the source. 00025 00026 4. Derivative works are permitted, but they must carry this notice 00027 in full and credit the original source. 00028 00029 00030 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00031 00032 00033 @version Initial version, October 2004 00034 @author Kris 00035 00036 Note that this package and documentation is built around the ICU 00037 project (http://oss.software.ibm.com/icu/). Below is the license 00038 statement as specified by that software: 00039 00040 00041 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 00042 00043 00044 ICU License - ICU 1.8.1 and later 00045 00046 COPYRIGHT AND PERMISSION NOTICE 00047 00048 Copyright (c) 1995-2003 International Business Machines Corporation and 00049 others. 00050 00051 All rights reserved. 
00052 00053 Permission is hereby granted, free of charge, to any person obtaining a 00054 copy of this software and associated documentation files (the 00055 "Software"), to deal in the Software without restriction, including 00056 without limitation the rights to use, copy, modify, merge, publish, 00057 distribute, and/or sell copies of the Software, and to permit persons 00058 to whom the Software is furnished to do so, provided that the above 00059 copyright notice(s) and this permission notice appear in all copies of 00060 the Software and that both the above copyright notice(s) and this 00061 permission notice appear in supporting documentation. 00062 00063 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 00064 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00065 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 00066 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 00067 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL 00068 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING 00069 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, 00070 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION 00071 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 00072 00073 Except as contained in this notice, the name of a copyright holder 00074 shall not be used in advertising or otherwise to promote the sale, use 00075 or other dealings in this Software without prior written authorization 00076 of the copyright holder. 00077 00078 ---------------------------------------------------------------------- 00079 00080 All trademarks and registered trademarks mentioned herein are the 00081 property of their respective owners. 
00082 00083 *******************************************************************************/ 00084 00085 module mango.icu.UNormalize; 00086 00087 private import mango.icu.ICU, 00088 mango.icu.UString, 00089 mango.icu.ULocale; 00090 00091 /******************************************************************************* 00092 00093 transforms Unicode text into an equivalent composed or 00094 decomposed form, allowing for easier sorting and searching 00095 of text. UNormalize supports the standard normalization forms 00096 described in http://www.unicode.org/unicode/reports/tr15/ 00097 00098 Characters with accents or other adornments can be encoded 00099 in several different ways in Unicode. For example, take the 00100 character A-acute. In Unicode, this can be encoded as a single 00101 character (the "composed" form): 00102 00103 00C1 LATIN CAPITAL LETTER A WITH ACUTE 00104 00105 or as two separate characters (the "decomposed" form): 00106 00107 0041 LATIN CAPITAL LETTER A 0301 COMBINING ACUTE ACCENT 00108 00109 To a user of your program, however, both of these sequences 00110 should be treated as the same "user-level" character "A with 00111 acute accent". When you are searching or comparing text, you 00112 must ensure that these two sequences are treated equivalently. 00113 In addition, you must handle characters with more than one 00114 accent. Sometimes the order of a character's combining accents 00115 is significant, while in other cases accent sequences in different 00116 orders are really equivalent. 
00117 00118 Similarly, the string "ffi" can be encoded as three separate 00119 letters: 00120 00121 0066 LATIN SMALL LETTER F 0066 LATIN SMALL LETTER F 00122 0069 LATIN SMALL LETTER I 00123 00124 or as the single character 00125 00126 FB03 LATIN SMALL LIGATURE FFI 00127 00128 The ffi ligature is not a distinct semantic character, and strictly 00129 speaking it shouldn't be in Unicode at all, but it was included for 00130 compatibility with existing character sets that already provided it. 00131 The Unicode standard identifies such characters by giving them 00132 "compatibility" decompositions into the corresponding semantic 00133 characters. When sorting and searching, you will often want to use 00134 these mappings. 00135 00136 unorm_normalize helps solve these problems by transforming text into 00137 the canonical composed and decomposed forms as shown in the first 00138 example above. In addition, you can have it perform compatibility 00139 decompositions so that you can treat compatibility characters the 00140 same as their equivalents. Finally, UNormalize rearranges 00141 accents into the proper canonical order, so that you do not have 00142 to worry about accent rearrangement on your own. 00143 00144 Form FCD, "Fast C or D", is also designed for collation. It allows 00145 to work on strings that are not necessarily normalized with an 00146 algorithm (like in collation) that works under "canonical closure", 00147 i.e., it treats precomposed characters and their decomposed 00148 equivalents the same. 00149 00150 It is not a normalization form because it does not provide for 00151 uniqueness of representation. Multiple strings may be canonically 00152 equivalent (their NFDs are identical) and may all conform to FCD 00153 without being identical themselves. 00154 00155 The form is defined such that the "raw decomposition", the 00156 recursive canonical decomposition of each character, results 00157 in a string that is canonically ordered. 
This means that 00158 precomposed characters are allowed for as long as their 00159 decompositions do not need canonical reordering. 00160 00161 Its advantage for a process like collation is that all NFD 00162 and most NFC texts - and many unnormalized texts - already 00163 conform to FCD and do not need to be normalized (NFD) for 00164 such a process. The FCD quick check will return UNORM_YES 00165 for most strings in practice. 00166 00167 For more details on FCD see the collation design document: 00168 http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/collation/ICU_collation_design.htm 00169 00170 ICU collation performs either NFD or FCD normalization 00171 automatically if normalization is turned on for the collator 00172 object. Beyond collation and string search, normalized strings 00173 may be useful for string equivalence comparisons, transliteration/ 00174 transcription, unique representations, etc. 00175 00176 The W3C generally recommends to exchange texts in NFC. Note also 00177 that most legacy character encodings use only precomposed forms 00178 and often do not encode any combining marks by themselves. For 00179 conversion to such character encodings the Unicode text needs to 00180 be normalized to NFC. For more usage examples, see the Unicode 00181 Standard Annex. 00182 00183 See <A HREF="http://oss.software.ibm.com/icu/apiref/unorm_8h.html"> 00184 this page</A> for full details. 
*******************************************************************************/

class UNormalize : ICU
{
        /***********************************************************************

                Normalization modes. The numeric value of each member
                is handed unchanged to the bound unorm_* functions as
                their "mode" argument. Default is an alias for NFC,
                and Count is the value following FCD.

        ***********************************************************************/

        enum Mode
        {
                None    = 1,
                NFD     = 2,
                NFKD    = 3,
                NFC     = 4,
                Default = NFC,
                NFKC    = 5,
                FCD     = 6,
                Count
        }

        /***********************************************************************

                Result of check(): the value returned by
                unorm_quickCheckWithOptions is cast to this type.

        ***********************************************************************/

        enum Check
        {
                No,
                Yes,
                Maybe
        }

        /***********************************************************************

                Option bits.

                None and Unicode32 are normalization options and are
                accepted by every method of this class.

                CodePointOrder, IgnoreCase and InputIsFCD carry the
                values ICU defines for U_COMPARE_CODE_POINT_ORDER,
                U_COMPARE_IGNORE_CASE and UNORM_INPUT_IS_FCD. They are
                meaningful to compare() only; do not pass them to the
                other methods.

        ***********************************************************************/

        enum Options
        {
                None            = 0x00,
                Unicode32       = 0x20,

                CodePointOrder  = 0x8000,
                IgnoreCase      = 0x10000,
                InputIsFCD      = 0x20000
        }

        /***********************************************************************

                Bit position at which unorm_compare() expects to find
                normalization options within its own options word
                (ICU's UNORM_COMPARE_NORM_OPTIONS_SHIFT).

        ***********************************************************************/

        private enum
        {
                NormOptionsShift = 20
        }

        /***********************************************************************

                Normalize a string. The string will be normalized
                according to the specified normalization mode and
                options.

                The work is done by a nested callback which forwards
                the output buffer supplied by dst.format() to
                unorm_normalize and returns whatever that call returns.
                The message "failed to normalize" is handed to
                dst.format() alongside the callback.

        ***********************************************************************/

        static void normalize (UText src, UString dst, Mode mode, Options o = Options.None)
        {
                // the buffer parameter is named 'p' (as in concatenate)
                // so that it does not shadow the outer UString 'dst'
                uint fmt (wchar* p, uint len, inout Error e)
                {
                        return unorm_normalize (src.get, src.len, mode, o, p, len, e);
                }

                dst.format (&fmt, "failed to normalize");
        }

        /***********************************************************************

                Perform a quick check on a string, to quickly determine
                whether the string is in a particular normalization
                format.

                Three types of result can be returned: Yes, No or Maybe.
                Result Yes indicates that the argument string is in the
                desired normalized format, No determines that argument
                string is not in the desired normalized format. A Maybe
                result indicates that a more thorough check is required;
                the user may have to put the string in its normalized
                form and compare the results.

                The ICU status is passed to testError() together with
                the message "failed to perform normalization check"
                before the result is returned.

        ***********************************************************************/

        static Check check (UText t, Mode mode, Options o = Options.None)
        {
                Error e;

                Check c = cast(Check) unorm_quickCheckWithOptions (t.get, t.len, mode, o, e);
                testError (e, "failed to perform normalization check");
                return c;
        }

        /***********************************************************************

                Test if a string is in a given normalization form.

                Unlike check(), this function returns a definitive
                result, never a "maybe". For NFD, NFKD, and FCD, both
                functions work exactly the same. For NFC and NFKC where
                quickCheck may return "maybe", this function will
                perform further tests to arrive at a true/false result.

                Returns true when unorm_isNormalizedWithOptions yields
                a non-zero value. The ICU status is passed to
                testError() first.

        ***********************************************************************/

        static bool isNormalized (UText t, Mode mode, Options o = Options.None)
        {
                Error e;

                byte b = unorm_isNormalizedWithOptions (t.get, t.len, mode, o, e);
                testError (e, "failed to perform normalization test");
                return b != 0;
        }

        /***********************************************************************

                Concatenate normalized strings, making sure that the
                result is normalized as well. If both the left and the
                right strings are in the normalization form according
                to "mode/options", then the result will be

                        dest=normalize(left+right, mode, options)

                With the input strings already being normalized, this
                function will use unorm_next() and unorm_previous() to
                find the adjacent end pieces of the input strings. Only
                the concatenation of these end pieces will be normalized
                and then concatenated with the remaining parts of the
                input strings.

                It is allowed to have dst==left to avoid copying the
                entire left string.

        ***********************************************************************/

        static void concatenate (UText left, UText right, UString dst, Mode mode, Options o = Options.None)
        {
                uint fmt (wchar* p, uint len, inout Error e)
                {
                        return unorm_concatenate (left.get, left.len, right.get, right.len, p, len, mode, o, e);
                }

                dst.format (&fmt, "failed to concatenate");
        }

        /***********************************************************************

                Compare two strings for canonical equivalence. Further
                options include case-insensitive comparison
                (Options.IgnoreCase) and code point order
                (Options.CodePointOrder, as opposed to code unit order).

                Canonical equivalence between two strings is defined as
                their normalized forms (NFD or NFC) being identical.
                This function compares strings incrementally instead of
                normalizing (and optionally case-folding) both strings
                entirely, improving performance significantly.

                Bulk normalization is only necessary if the strings do
                not fulfill the FCD conditions. Only in this case, and
                only if the strings are relatively long, is memory
                allocated temporarily. For FCD strings and short non-FCD
                strings there is no memory allocation.

                Returns the signed value produced by unorm_compare;
                per the ICU documentation this is negative, zero or
                positive as usual for string comparisons. The ICU
                status is passed to testError() before returning.

        ***********************************************************************/

        static int compare (UText left, UText right, Options o = Options.None)
        {
                Error e;

                // unorm_compare() keeps the low bits of its options word
                // for its own flags and expects normalization options in
                // bits 20 and up. Move the Unicode32 bit there, and pass
                // all other bits through untouched
                uint raw   = cast(uint) o;
                uint norm  = raw & cast(uint) Options.Unicode32;
                uint flags = (raw ^ norm) | (norm << NormOptionsShift);

                int i = unorm_compare (left.get, left.len, right.get, right.len, flags, e);
                testError (e, "failed to compare");
                return i;
        }


        /***********************************************************************

                Bind the ICU functions from a shared library. This is
                complicated by the issues regarding D and DLLs on the
                Windows platform.

                Handle returned by FunctionLoader.bind(); kept so that
                the static destructor can hand it back to
                FunctionLoader.unbind().

        ***********************************************************************/

        private static void* library;

        /***********************************************************************

                Function pointers for the ICU entry points used above.
                They are filled in by the static constructor through
                the 'targets' table. Parameter order follows the calls
                made by the wrapper methods; unorm_compare returns a
                signed value, matching ICU's int32_t result.

        ***********************************************************************/

        private static extern (C)
        {
                uint function (wchar*, uint, uint, uint, wchar*, uint, inout Error) unorm_normalize;
                uint function (wchar*, uint, uint, uint, inout Error) unorm_quickCheckWithOptions;
                byte function (wchar*, uint, uint, uint, inout Error) unorm_isNormalizedWithOptions;
                uint function (wchar*, uint, wchar*, uint, wchar*, uint, uint, uint, inout Error) unorm_concatenate;
                int  function (wchar*, uint, wchar*, uint, uint, inout Error) unorm_compare;
        }

        /***********************************************************************

                Binding table: pairs the address of each function
                pointer declared above with the symbol name to be
                looked up in the shared library.

        ***********************************************************************/

        static FunctionLoader.Bind[] targets =
                [
                {cast(void**) &unorm_normalize,               "unorm_normalize"},
                {cast(void**) &unorm_quickCheckWithOptions,   "unorm_quickCheckWithOptions"},
                {cast(void**) &unorm_isNormalizedWithOptions, "unorm_isNormalizedWithOptions"},
                {cast(void**) &unorm_concatenate,             "unorm_concatenate"},
                {cast(void**) &unorm_compare,                 "unorm_compare"},
                ];

        /***********************************************************************

                Static constructor: asks FunctionLoader to bind every
                entry of 'targets' against the library named by icuuc,
                and remembers the returned handle.

        ***********************************************************************/

        static this ()
        {
                library = FunctionLoader.bind (icuuc, targets);
        }

        /***********************************************************************

                Static destructor: hands the library handle obtained by
                the static constructor back to FunctionLoader.unbind().

        ***********************************************************************/

        static ~this ()
        {
                FunctionLoader.unbind (library);
        }
}