1
25
26 package java.util.regex;
27
28 import java.util.HashMap;
29 import java.util.Locale;
30 import java.util.regex.Pattern.CharPredicate;
31 import java.util.regex.Pattern.BmpCharPredicate;
32
33 class CharPredicates {
34
35 static final CharPredicate ALPHABETIC() {
36 return Character::isAlphabetic;
37 }
38
39
40 static final CharPredicate DIGIT() {
41 return Character::isDigit;
42 }
43
44 static final CharPredicate LETTER() {
45 return Character::isLetter;
46 }
47
48 static final CharPredicate IDEOGRAPHIC() {
49 return Character::isIdeographic;
50 }
51
52 static final CharPredicate LOWERCASE() {
53 return Character::isLowerCase;
54 }
55
56 static final CharPredicate UPPERCASE() {
57 return Character::isUpperCase;
58 }
59
60 static final CharPredicate TITLECASE() {
61 return Character::isTitleCase;
62 }
63
64
65 static final CharPredicate WHITE_SPACE() {
66 return ch ->
67 ((((1 << Character.SPACE_SEPARATOR) |
68 (1 << Character.LINE_SEPARATOR) |
69 (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
70 != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
71 }
72
73
74 static final CharPredicate CONTROL() {
75 return ch -> Character.getType(ch) == Character.CONTROL;
76 }
77
78
79 static final CharPredicate PUNCTUATION() {
80 return ch ->
81 ((((1 << Character.CONNECTOR_PUNCTUATION) |
82 (1 << Character.DASH_PUNCTUATION) |
83 (1 << Character.START_PUNCTUATION) |
84 (1 << Character.END_PUNCTUATION) |
85 (1 << Character.OTHER_PUNCTUATION) |
86 (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
87 (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
88 != 0;
89 }
90
91
92
93 static final CharPredicate HEX_DIGIT() {
94 return DIGIT().union(ch -> (ch >= 0x0030 && ch <= 0x0039) ||
95 (ch >= 0x0041 && ch <= 0x0046) ||
96 (ch >= 0x0061 && ch <= 0x0066) ||
97 (ch >= 0xFF10 && ch <= 0xFF19) ||
98 (ch >= 0xFF21 && ch <= 0xFF26) ||
99 (ch >= 0xFF41 && ch <= 0xFF46));
100 }
101
102 static final CharPredicate ASSIGNED() {
103 return ch -> Character.getType(ch) != Character.UNASSIGNED;
104 }
105
106
107 static final CharPredicate NONCHARACTER_CODE_POINT() {
108 return ch -> (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
109 }
110
111
112
113 static final CharPredicate ALNUM() {
114 return ALPHABETIC().union(DIGIT());
115 }
116
117
118
119
120
121 static final CharPredicate BLANK() {
122 return ch ->
123 Character.getType(ch) == Character.SPACE_SEPARATOR ||
124 ch == 0x9;
125 }
126
127
128
129
130
131
132 static final CharPredicate GRAPH() {
133 return ch ->
134 ((((1 << Character.SPACE_SEPARATOR) |
135 (1 << Character.LINE_SEPARATOR) |
136 (1 << Character.PARAGRAPH_SEPARATOR) |
137 (1 << Character.CONTROL) |
138 (1 << Character.SURROGATE) |
139 (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
140 == 0;
141 }
142
143
144
145
146 static final CharPredicate PRINT() {
147 return GRAPH().union(BLANK()).and(CONTROL().negate());
148 }
149
150
151 static final CharPredicate JOIN_CONTROL() {
152 return ch -> ch == 0x200C || ch == 0x200D;
153 }
154
155
156
157
158
159
160 static final CharPredicate WORD() {
161 return ALPHABETIC().union(ch -> ((((1 << Character.NON_SPACING_MARK) |
162 (1 << Character.ENCLOSING_MARK) |
163 (1 << Character.COMBINING_SPACING_MARK) |
164 (1 << Character.DECIMAL_DIGIT_NUMBER) |
165 (1 << Character.CONNECTOR_PUNCTUATION))
166 >> Character.getType(ch)) & 1) != 0,
167 JOIN_CONTROL());
168 }
169
170
171
172 private static CharPredicate getPosixPredicate(String name) {
173 switch (name) {
174 case "ALPHA": return ALPHABETIC();
175 case "LOWER": return LOWERCASE();
176 case "UPPER": return UPPERCASE();
177 case "SPACE": return WHITE_SPACE();
178 case "PUNCT": return PUNCTUATION();
179 case "XDIGIT": return HEX_DIGIT();
180 case "ALNUM": return ALNUM();
181 case "CNTRL": return CONTROL();
182 case "DIGIT": return DIGIT();
183 case "BLANK": return BLANK();
184 case "GRAPH": return GRAPH();
185 case "PRINT": return PRINT();
186 default: return null;
187 }
188 }
189
190 private static CharPredicate getUnicodePredicate(String name) {
191 switch (name) {
192 case "ALPHABETIC": return ALPHABETIC();
193 case "ASSIGNED": return ASSIGNED();
194 case "CONTROL": return CONTROL();
195 case "HEXDIGIT": return HEX_DIGIT();
196 case "IDEOGRAPHIC": return IDEOGRAPHIC();
197 case "JOINCONTROL": return JOIN_CONTROL();
198 case "LETTER": return LETTER();
199 case "LOWERCASE": return LOWERCASE();
200 case "NONCHARACTERCODEPOINT": return NONCHARACTER_CODE_POINT();
201 case "TITLECASE": return TITLECASE();
202 case "PUNCTUATION": return PUNCTUATION();
203 case "UPPERCASE": return UPPERCASE();
204 case "WHITESPACE": return WHITE_SPACE();
205 case "WORD": return WORD();
206 case "WHITE_SPACE": return WHITE_SPACE();
207 case "HEX_DIGIT": return HEX_DIGIT();
208 case "NONCHARACTER_CODE_POINT": return NONCHARACTER_CODE_POINT();
209 case "JOIN_CONTROL": return JOIN_CONTROL();
210 default: return null;
211 }
212 }
213
214 public static CharPredicate forUnicodeProperty(String propName) {
215 propName = propName.toUpperCase(Locale.ROOT);
216 CharPredicate p = getUnicodePredicate(propName);
217 if (p != null)
218 return p;
219 return getPosixPredicate(propName);
220 }
221
222 public static CharPredicate forPOSIXName(String propName) {
223 return getPosixPredicate(propName.toUpperCase(Locale.ENGLISH));
224 }
225
226
227
228
232 static CharPredicate forUnicodeScript(String name) {
233 final Character.UnicodeScript script;
234 try {
235 script = Character.UnicodeScript.forName(name);
236 return ch -> script == Character.UnicodeScript.of(ch);
237 } catch (IllegalArgumentException iae) {}
238 return null;
239 }
240
241
244 static CharPredicate forUnicodeBlock(String name) {
245 final Character.UnicodeBlock block;
246 try {
247 block = Character.UnicodeBlock.forName(name);
248 return ch -> block == Character.UnicodeBlock.of(ch);
249 } catch (IllegalArgumentException iae) {}
250 return null;
251 }
252
253
254
255
256
257 static CharPredicate forProperty(String name) {
258
259
260 switch (name) {
261 case "Cn": return category(1<<Character.UNASSIGNED);
262 case "Lu": return category(1<<Character.UPPERCASE_LETTER);
263 case "Ll": return category(1<<Character.LOWERCASE_LETTER);
264 case "Lt": return category(1<<Character.TITLECASE_LETTER);
265 case "Lm": return category(1<<Character.MODIFIER_LETTER);
266 case "Lo": return category(1<<Character.OTHER_LETTER);
267 case "Mn": return category(1<<Character.NON_SPACING_MARK);
268 case "Me": return category(1<<Character.ENCLOSING_MARK);
269 case "Mc": return category(1<<Character.COMBINING_SPACING_MARK);
270 case "Nd": return category(1<<Character.DECIMAL_DIGIT_NUMBER);
271 case "Nl": return category(1<<Character.LETTER_NUMBER);
272 case "No": return category(1<<Character.OTHER_NUMBER);
273 case "Zs": return category(1<<Character.SPACE_SEPARATOR);
274 case "Zl": return category(1<<Character.LINE_SEPARATOR);
275 case "Zp": return category(1<<Character.PARAGRAPH_SEPARATOR);
276 case "Cc": return category(1<<Character.CONTROL);
277 case "Cf": return category(1<<Character.FORMAT);
278 case "Co": return category(1<<Character.PRIVATE_USE);
279 case "Cs": return category(1<<Character.SURROGATE);
280 case "Pd": return category(1<<Character.DASH_PUNCTUATION);
281 case "Ps": return category(1<<Character.START_PUNCTUATION);
282 case "Pe": return category(1<<Character.END_PUNCTUATION);
283 case "Pc": return category(1<<Character.CONNECTOR_PUNCTUATION);
284 case "Po": return category(1<<Character.OTHER_PUNCTUATION);
285 case "Sm": return category(1<<Character.MATH_SYMBOL);
286 case "Sc": return category(1<<Character.CURRENCY_SYMBOL);
287 case "Sk": return category(1<<Character.MODIFIER_SYMBOL);
288 case "So": return category(1<<Character.OTHER_SYMBOL);
289 case "Pi": return category(1<<Character.INITIAL_QUOTE_PUNCTUATION);
290 case "Pf": return category(1<<Character.FINAL_QUOTE_PUNCTUATION);
291 case "L": return category(((1<<Character.UPPERCASE_LETTER) |
292 (1<<Character.LOWERCASE_LETTER) |
293 (1<<Character.TITLECASE_LETTER) |
294 (1<<Character.MODIFIER_LETTER) |
295 (1<<Character.OTHER_LETTER)));
296 case "M": return category(((1<<Character.NON_SPACING_MARK) |
297 (1<<Character.ENCLOSING_MARK) |
298 (1<<Character.COMBINING_SPACING_MARK)));
299 case "N": return category(((1<<Character.DECIMAL_DIGIT_NUMBER) |
300 (1<<Character.LETTER_NUMBER) |
301 (1<<Character.OTHER_NUMBER)));
302 case "Z": return category(((1<<Character.SPACE_SEPARATOR) |
303 (1<<Character.LINE_SEPARATOR) |
304 (1<<Character.PARAGRAPH_SEPARATOR)));
305 case "C": return category(((1<<Character.CONTROL) |
306 (1<<Character.FORMAT) |
307 (1<<Character.PRIVATE_USE) |
308 (1<<Character.SURROGATE) |
309 (1<<Character.UNASSIGNED)));
310 case "P": return category(((1<<Character.DASH_PUNCTUATION) |
311 (1<<Character.START_PUNCTUATION) |
312 (1<<Character.END_PUNCTUATION) |
313 (1<<Character.CONNECTOR_PUNCTUATION) |
314 (1<<Character.OTHER_PUNCTUATION) |
315 (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
316 (1<<Character.FINAL_QUOTE_PUNCTUATION)));
317 case "S": return category(((1<<Character.MATH_SYMBOL) |
318 (1<<Character.CURRENCY_SYMBOL) |
319 (1<<Character.MODIFIER_SYMBOL) |
320 (1<<Character.OTHER_SYMBOL)));
321 case "LC": return category(((1<<Character.UPPERCASE_LETTER) |
322 (1<<Character.LOWERCASE_LETTER) |
323 (1<<Character.TITLECASE_LETTER)));
324 case "LD": return category(((1<<Character.UPPERCASE_LETTER) |
325 (1<<Character.LOWERCASE_LETTER) |
326 (1<<Character.TITLECASE_LETTER) |
327 (1<<Character.MODIFIER_LETTER) |
328 (1<<Character.OTHER_LETTER) |
329 (1<<Character.DECIMAL_DIGIT_NUMBER)));
330 case "L1": return range(0x00, 0xFF);
331 case "all": return Pattern.ALL();
332
333
334 case "ASCII": return range(0x00, 0x7F);
335 case "Alnum": return ctype(ASCII.ALNUM);
336 case "Alpha": return ctype(ASCII.ALPHA);
337 case "Blank": return ctype(ASCII.BLANK);
338 case "Cntrl": return ctype(ASCII.CNTRL);
339 case "Digit": return range('0', '9');
340 case "Graph": return ctype(ASCII.GRAPH);
341 case "Lower": return range('a', 'z');
342 case "Print": return range(0x20, 0x7E);
343 case "Punct": return ctype(ASCII.PUNCT);
344 case "Space": return ctype(ASCII.SPACE);
345 case "Upper": return range('A', 'Z');
346 case "XDigit": return ctype(ASCII.XDIGIT);
347
348
349 case "javaLowerCase": return java.lang.Character::isLowerCase;
350 case "javaUpperCase": return Character::isUpperCase;
351 case "javaAlphabetic": return java.lang.Character::isAlphabetic;
352 case "javaIdeographic": return java.lang.Character::isIdeographic;
353 case "javaTitleCase": return java.lang.Character::isTitleCase;
354 case "javaDigit": return java.lang.Character::isDigit;
355 case "javaDefined": return java.lang.Character::isDefined;
356 case "javaLetter": return java.lang.Character::isLetter;
357 case "javaLetterOrDigit": return java.lang.Character::isLetterOrDigit;
358 case "javaJavaIdentifierStart": return java.lang.Character::isJavaIdentifierStart;
359 case "javaJavaIdentifierPart": return java.lang.Character::isJavaIdentifierPart;
360 case "javaUnicodeIdentifierStart": return java.lang.Character::isUnicodeIdentifierStart;
361 case "javaUnicodeIdentifierPart": return java.lang.Character::isUnicodeIdentifierPart;
362 case "javaIdentifierIgnorable": return java.lang.Character::isIdentifierIgnorable;
363 case "javaSpaceChar": return java.lang.Character::isSpaceChar;
364 case "javaWhitespace": return java.lang.Character::isWhitespace;
365 case "javaISOControl": return java.lang.Character::isISOControl;
366 case "javaMirrored": return java.lang.Character::isMirrored;
367 default: return null;
368 }
369 }
370
371 private static CharPredicate category(final int typeMask) {
372 return ch -> (typeMask & (1 << Character.getType(ch))) != 0;
373 }
374
375 private static CharPredicate range(final int lower, final int upper) {
376 return (BmpCharPredicate)ch -> lower <= ch && ch <= upper;
377 }
378
379 private static CharPredicate ctype(final int ctype) {
380 return (BmpCharPredicate)ch -> ch < 128 && ASCII.isType(ch, ctype);
381 }
382
383
384
385
388 static final BmpCharPredicate ASCII_DIGIT() {
389 return ch -> ch < 128 && ASCII.isDigit(ch);
390 }
391 static final BmpCharPredicate ASCII_WORD() {
392 return ch -> ch < 128 && ASCII.isWord(ch);
393 }
394 static final BmpCharPredicate ASCII_SPACE() {
395 return ch -> ch < 128 && ASCII.isSpace(ch);
396 }
397
398 }
399