1 /*
2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package java.lang;
27
28 import java.text.BreakIterator;
29 import java.util.HashSet;
30 import java.util.Hashtable;
31 import java.util.Iterator;
32 import java.util.Locale;
33 import sun.text.Normalizer;
34
35
36 /**
37 * This is a utility class for <code>String.toLowerCase()</code> and
38 * <code>String.toUpperCase()</code>, that handles special casing with
39 * conditions. In other words, it handles the mappings with conditions
40 * that are defined in
41 * <a href="http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt">Special
42 * Casing Properties</a> file.
43 * <p>
44 * Note that the unconditional case mappings (including 1:M mappings)
45 * are handled in <code>Character.toLower/UpperCase()</code>.
46 */
47 final class ConditionalSpecialCasing {
48
49 // context conditions.
50 static final int FINAL_CASED = 1;
51 static final int AFTER_SOFT_DOTTED = 2;
52 static final int MORE_ABOVE = 3;
53 static final int AFTER_I = 4;
54 static final int NOT_BEFORE_DOT = 5;
55
56 // combining class definitions
57 static final int COMBINING_CLASS_ABOVE = 230;
58
59 // Special case mapping entries
60 static Entry[] entry = {
61 //# ================================================================================
62 //# Conditional mappings
63 //# ================================================================================
64 new Entry(0x03A3, new char[]{0x03C2}, new char[]{0x03A3}, null, FINAL_CASED), // # GREEK CAPITAL LETTER SIGMA
65 new Entry(0x0130, new char[]{0x0069, 0x0307}, new char[]{0x0130}, null, 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
66
67 //# ================================================================================
68 //# Locale-sensitive mappings
69 //# ================================================================================
70 //# Lithuanian
71 new Entry(0x0307, new char[]{0x0307}, new char[]{}, "lt", AFTER_SOFT_DOTTED), // # COMBINING DOT ABOVE
72 new Entry(0x0049, new char[]{0x0069, 0x0307}, new char[]{0x0049}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I
73 new Entry(0x004A, new char[]{0x006A, 0x0307}, new char[]{0x004A}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER J
74 new Entry(0x012E, new char[]{0x012F, 0x0307}, new char[]{0x012E}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I WITH OGONEK
75 new Entry(0x00CC, new char[]{0x0069, 0x0307, 0x0300}, new char[]{0x00CC}, "lt", 0), // # LATIN CAPITAL LETTER I WITH GRAVE
76 new Entry(0x00CD, new char[]{0x0069, 0x0307, 0x0301}, new char[]{0x00CD}, "lt", 0), // # LATIN CAPITAL LETTER I WITH ACUTE
77 new Entry(0x0128, new char[]{0x0069, 0x0307, 0x0303}, new char[]{0x0128}, "lt", 0), // # LATIN CAPITAL LETTER I WITH TILDE
78
79 //# ================================================================================
80 //# Turkish and Azeri
81 new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
82 new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
83 new Entry(0x0307, new char[]{}, new char[]{0x0307}, "tr", AFTER_I), // # COMBINING DOT ABOVE
84 new Entry(0x0307, new char[]{}, new char[]{0x0307}, "az", AFTER_I), // # COMBINING DOT ABOVE
85 new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "tr", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
86 new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "az", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
87 new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN SMALL LETTER I
88 new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "az", 0) // # LATIN SMALL LETTER I
89 };
90
91 // A hash table that contains the above entries
92 static Hashtable<Integer, HashSet<Entry>> entryTable = new Hashtable<>();
93 static {
94 // create hashtable from the entry
95 for (Entry cur : entry) {
96 Integer cp = cur.getCodePoint();
97 HashSet<Entry> set = entryTable.get(cp);
98 if (set == null) {
99 set = new HashSet<>();
100 entryTable.put(cp, set);
101 }
102 set.add(cur);
103 }
104 }
105
106 static int toLowerCaseEx(String src, int index, Locale locale) {
107 char[] result = lookUpTable(src, index, locale, true);
108
109 if (result != null) {
110 if (result.length == 1) {
111 return result[0];
112 } else {
113 return Character.ERROR;
114 }
115 } else {
116 // default to Character class' one
117 return Character.toLowerCase(src.codePointAt(index));
118 }
119 }
120
121 static int toUpperCaseEx(String src, int index, Locale locale) {
122 char[] result = lookUpTable(src, index, locale, false);
123
124 if (result != null) {
125 if (result.length == 1) {
126 return result[0];
127 } else {
128 return Character.ERROR;
129 }
130 } else {
131 // default to Character class' one
132 return Character.toUpperCaseEx(src.codePointAt(index));
133 }
134 }
135
136 static char[] toLowerCaseCharArray(String src, int index, Locale locale) {
137 return lookUpTable(src, index, locale, true);
138 }
139
140 static char[] toUpperCaseCharArray(String src, int index, Locale locale) {
141 char[] result = lookUpTable(src, index, locale, false);
142 if (result != null) {
143 return result;
144 } else {
145 return Character.toUpperCaseCharArray(src.codePointAt(index));
146 }
147 }
148
149 private static char[] lookUpTable(String src, int index, Locale locale, boolean bLowerCasing) {
150 HashSet<Entry> set = entryTable.get(src.codePointAt(index));
151 char[] ret = null;
152
153 if (set != null) {
154 Iterator<Entry> iter = set.iterator();
155 String currentLang = locale.getLanguage();
156 while (iter.hasNext()) {
157 Entry entry = iter.next();
158 String conditionLang = entry.getLanguage();
159 if (((conditionLang == null) || (conditionLang.equals(currentLang))) &&
160 isConditionMet(src, index, locale, entry.getCondition())) {
161 ret = bLowerCasing ? entry.getLowerCase() : entry.getUpperCase();
162 if (conditionLang != null) {
163 break;
164 }
165 }
166 }
167 }
168
169 return ret;
170 }
171
172 private static boolean isConditionMet(String src, int index, Locale locale, int condition) {
173 switch (condition) {
174 case FINAL_CASED:
175 return isFinalCased(src, index, locale);
176
177 case AFTER_SOFT_DOTTED:
178 return isAfterSoftDotted(src, index);
179
180 case MORE_ABOVE:
181 return isMoreAbove(src, index);
182
183 case AFTER_I:
184 return isAfterI(src, index);
185
186 case NOT_BEFORE_DOT:
187 return !isBeforeDot(src, index);
188
189 default:
190 return true;
191 }
192 }
193
194 /**
195 * Implements the "Final_Cased" condition
196 *
197 * Specification: Within the closest word boundaries containing C, there is a cased
198 * letter before C, and there is no cased letter after C.
199 *
200 * Regular Expression:
201 * Before C: [{cased==true}][{wordBoundary!=true}]*
202 * After C: !([{wordBoundary!=true}]*[{cased}])
203 */
204 private static boolean isFinalCased(String src, int index, Locale locale) {
205 BreakIterator wordBoundary = BreakIterator.getWordInstance(locale);
206 wordBoundary.setText(src);
207 int ch;
208
209 // Look for a preceding 'cased' letter
210 for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i);
211 i -= Character.charCount(ch)) {
212
213 ch = src.codePointBefore(i);
214 if (isCased(ch)) {
215
216 int len = src.length();
217 // Check that there is no 'cased' letter after the index
218 for (i = index + Character.charCount(src.codePointAt(index));
219 (i < len) && !wordBoundary.isBoundary(i);
220 i += Character.charCount(ch)) {
221
222 ch = src.codePointAt(i);
223 if (isCased(ch)) {
224 return false;
225 }
226 }
227
228 return true;
229 }
230 }
231
232 return false;
233 }
234
235 /**
236 * Implements the "After_I" condition
237 *
238 * Specification: The last preceding base character was an uppercase I,
239 * and there is no intervening combining character class 230 (ABOVE).
240 *
241 * Regular Expression:
242 * Before C: [I]([{cc!=230}&{cc!=0}])*
243 */
244 private static boolean isAfterI(String src, int index) {
245 int ch;
246 int cc;
247
248 // Look for the last preceding base character
249 for (int i = index; i > 0; i -= Character.charCount(ch)) {
250
251 ch = src.codePointBefore(i);
252
253 if (ch == 'I') {
254 return true;
255 } else {
256 cc = Normalizer.getCombiningClass(ch);
257 if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
258 return false;
259 }
260 }
261 }
262
263 return false;
264 }
265
266 /**
267 * Implements the "After_Soft_Dotted" condition
268 *
269 * Specification: The last preceding character with combining class
270 * of zero before C was Soft_Dotted, and there is no intervening
271 * combining character class 230 (ABOVE).
272 *
273 * Regular Expression:
274 * Before C: [{Soft_Dotted==true}]([{cc!=230}&{cc!=0}])*
275 */
276 private static boolean isAfterSoftDotted(String src, int index) {
277 int ch;
278 int cc;
279
280 // Look for the last preceding character
281 for (int i = index; i > 0; i -= Character.charCount(ch)) {
282
283 ch = src.codePointBefore(i);
284
285 if (isSoftDotted(ch)) {
286 return true;
287 } else {
288 cc = Normalizer.getCombiningClass(ch);
289 if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
290 return false;
291 }
292 }
293 }
294
295 return false;
296 }
297
298 /**
299 * Implements the "More_Above" condition
300 *
301 * Specification: C is followed by one or more characters of combining
302 * class 230 (ABOVE) in the combining character sequence.
303 *
304 * Regular Expression:
305 * After C: [{cc!=0}]*[{cc==230}]
306 */
307 private static boolean isMoreAbove(String src, int index) {
308 int ch;
309 int cc;
310 int len = src.length();
311
312 // Look for a following ABOVE combining class character
313 for (int i = index + Character.charCount(src.codePointAt(index));
314 i < len; i += Character.charCount(ch)) {
315
316 ch = src.codePointAt(i);
317 cc = Normalizer.getCombiningClass(ch);
318
319 if (cc == COMBINING_CLASS_ABOVE) {
320 return true;
321 } else if (cc == 0) {
322 return false;
323 }
324 }
325
326 return false;
327 }
328
329 /**
330 * Implements the "Before_Dot" condition
331 *
332 * Specification: C is followed by <code>U+0307 COMBINING DOT ABOVE</code>.
333 * Any sequence of characters with a combining class that is
334 * neither 0 nor 230 may intervene between the current character
335 * and the combining dot above.
336 *
337 * Regular Expression:
338 * After C: ([{cc!=230}&{cc!=0}])*[\u0307]
339 */
340 private static boolean isBeforeDot(String src, int index) {
341 int ch;
342 int cc;
343 int len = src.length();
344
345 // Look for a following COMBINING DOT ABOVE
346 for (int i = index + Character.charCount(src.codePointAt(index));
347 i < len; i += Character.charCount(ch)) {
348
349 ch = src.codePointAt(i);
350
351 if (ch == '\u0307') {
352 return true;
353 } else {
354 cc = Normalizer.getCombiningClass(ch);
355 if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
356 return false;
357 }
358 }
359 }
360
361 return false;
362 }
363
364 /**
365 * Examines whether a character is 'cased'.
366 *
367 * A character C is defined to be 'cased' if and only if at least one of
368 * following are true for C: uppercase==true, or lowercase==true, or
369 * general_category==titlecase_letter.
370 *
371 * The uppercase and lowercase property values are specified in the data
372 * file DerivedCoreProperties.txt in the Unicode Character Database.
373 */
374 private static boolean isCased(int ch) {
375 int type = Character.getType(ch);
376 if (type == Character.LOWERCASE_LETTER ||
377 type == Character.UPPERCASE_LETTER ||
378 type == Character.TITLECASE_LETTER) {
379 return true;
380 } else {
381 // Check for Other_Lowercase and Other_Uppercase
382 //
383 if ((ch >= 0x02B0) && (ch <= 0x02B8)) {
384 // MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y
385 return true;
386 } else if ((ch >= 0x02C0) && (ch <= 0x02C1)) {
387 // MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP
388 return true;
389 } else if ((ch >= 0x02E0) && (ch <= 0x02E4)) {
390 // MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
391 return true;
392 } else if (ch == 0x0345) {
393 // COMBINING GREEK YPOGEGRAMMENI
394 return true;
395 } else if (ch == 0x037A) {
396 // GREEK YPOGEGRAMMENI
397 return true;
398 } else if ((ch >= 0x1D2C) && (ch <= 0x1D61)) {
399 // MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI
400 return true;
401 } else if ((ch >= 0x2160) && (ch <= 0x217F)) {
402 // ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND
403 // SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND
404 return true;
405 } else if ((ch >= 0x24B6) && (ch <= 0x24E9)) {
406 // CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z
407 // CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
408 return true;
409 } else {
410 return false;
411 }
412 }
413 }
414
415 private static boolean isSoftDotted(int ch) {
416 switch (ch) {
417 case 0x0069: // Soft_Dotted # L& LATIN SMALL LETTER I
418 case 0x006A: // Soft_Dotted # L& LATIN SMALL LETTER J
419 case 0x012F: // Soft_Dotted # L& LATIN SMALL LETTER I WITH OGONEK
420 case 0x0268: // Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE
421 case 0x0456: // Soft_Dotted # L& CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
422 case 0x0458: // Soft_Dotted # L& CYRILLIC SMALL LETTER JE
423 case 0x1D62: // Soft_Dotted # L& LATIN SUBSCRIPT SMALL LETTER I
424 case 0x1E2D: // Soft_Dotted # L& LATIN SMALL LETTER I WITH TILDE BELOW
425 case 0x1ECB: // Soft_Dotted # L& LATIN SMALL LETTER I WITH DOT BELOW
426 case 0x2071: // Soft_Dotted # L& SUPERSCRIPT LATIN SMALL LETTER I
427 return true;
428 default:
429 return false;
430 }
431 }
432
433 /**
434 * An internal class that represents an entry in the Special Casing Properties.
435 */
436 static class Entry {
437 int ch;
438 char [] lower;
439 char [] upper;
440 String lang;
441 int condition;
442
443 Entry(int ch, char[] lower, char[] upper, String lang, int condition) {
444 this.ch = ch;
445 this.lower = lower;
446 this.upper = upper;
447 this.lang = lang;
448 this.condition = condition;
449 }
450
451 int getCodePoint() {
452 return ch;
453 }
454
455 char[] getLowerCase() {
456 return lower;
457 }
458
459 char[] getUpperCase() {
460 return upper;
461 }
462
463 String getLanguage() {
464 return lang;
465 }
466
467 int getCondition() {
468 return condition;
469 }
470 }
471 }
472