1 /*
2  * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */

25
26 package java.util.regex;
27
28 import java.util.HashMap;
29 import java.util.Locale;
30 import java.util.regex.Pattern.CharPredicate;
31 import java.util.regex.Pattern.BmpCharPredicate;
32
33 class CharPredicates {
34
35     static final CharPredicate ALPHABETIC() {
36         return Character::isAlphabetic;
37     }
38
39     // \p{gc=Decimal_Number}
40     static final CharPredicate DIGIT() {
41         return Character::isDigit;
42     }
43
44     static final CharPredicate LETTER() {
45         return Character::isLetter;
46     }
47
48     static final CharPredicate IDEOGRAPHIC() {
49         return Character::isIdeographic;
50     }
51
52     static final CharPredicate LOWERCASE() {
53         return Character::isLowerCase;
54     }
55
56     static final CharPredicate UPPERCASE() {
57         return Character::isUpperCase;
58     }
59
60     static final CharPredicate TITLECASE() {
61         return Character::isTitleCase;
62     }
63
64     // \p{Whitespace}
65     static final CharPredicate WHITE_SPACE() {
66         return ch ->
67             ((((1 << Character.SPACE_SEPARATOR) |
68                (1 << Character.LINE_SEPARATOR) |
69                (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
70             != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
71     }
72
73     // \p{gc=Control}
74     static final CharPredicate CONTROL() {
75         return ch -> Character.getType(ch) == Character.CONTROL;
76     }
77
78     // \p{gc=Punctuation}
79     static final CharPredicate PUNCTUATION() {
80         return ch ->
81             ((((1 << Character.CONNECTOR_PUNCTUATION) |
82                (1 << Character.DASH_PUNCTUATION) |
83                (1 << Character.START_PUNCTUATION) |
84                (1 << Character.END_PUNCTUATION) |
85                (1 << Character.OTHER_PUNCTUATION) |
86                (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
87                (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
88             != 0;
89     }
90
91     // \p{gc=Decimal_Number}
92     // \p{Hex_Digit}    -> PropList.txt: Hex_Digit
93     static final CharPredicate HEX_DIGIT() {
94         return DIGIT().union(ch -> (ch >= 0x0030 && ch <= 0x0039) ||
95                 (ch >= 0x0041 && ch <= 0x0046) ||
96                 (ch >= 0x0061 && ch <= 0x0066) ||
97                 (ch >= 0xFF10 && ch <= 0xFF19) ||
98                 (ch >= 0xFF21 && ch <= 0xFF26) ||
99                 (ch >= 0xFF41 && ch <= 0xFF46));
100     }
101
102     static final CharPredicate ASSIGNED() {
103         return ch -> Character.getType(ch) != Character.UNASSIGNED;
104     }
105
106     // PropList.txt:Noncharacter_Code_Point
107     static final CharPredicate NONCHARACTER_CODE_POINT() {
108         return ch -> (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
109     }
110
111     // \p{alpha}
112     // \p{digit}
113     static final CharPredicate ALNUM() {
114         return ALPHABETIC().union(DIGIT());
115     }
116
117     // \p{Whitespace} --
118     // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}  -> 0xa, 0xb, 0xc, 0xd, 0x85
119     //  \p{gc=Line_Separator}
120     //  \p{gc=Paragraph_Separator}]
121     static final CharPredicate BLANK() {
122         return ch ->
123             Character.getType(ch) == Character.SPACE_SEPARATOR ||
124             ch == 0x9; // \N{HT}
125     }
126
127     // [^
128     //  \p{space}
129     //  \p{gc=Control}
130     //  \p{gc=Surrogate}
131     //  \p{gc=Unassigned}]
132     static final CharPredicate GRAPH() {
133         return ch ->
134             ((((1 << Character.SPACE_SEPARATOR) |
135                (1 << Character.LINE_SEPARATOR) |
136                (1 << Character.PARAGRAPH_SEPARATOR) |
137                (1 << Character.CONTROL) |
138                (1 << Character.SURROGATE) |
139                (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
140             == 0;
141     }
142
143     // \p{graph}
144     // \p{blank}
145     // -- \p{cntrl}
146     static final CharPredicate PRINT() {
147         return GRAPH().union(BLANK()).and(CONTROL().negate());
148     }
149
150     //  200C..200D    PropList.txt:Join_Control
151     static final CharPredicate JOIN_CONTROL() {
152         return ch -> ch == 0x200C || ch == 0x200D;
153     }
154
155     //  \p{alpha}
156     //  \p{gc=Mark}
157     //  \p{digit}
158     //  \p{gc=Connector_Punctuation}
159     //  \p{Join_Control}    200C..200D
160     static final CharPredicate WORD() {
161         return ALPHABETIC().union(ch -> ((((1 << Character.NON_SPACING_MARK) |
162                                   (1 << Character.ENCLOSING_MARK) |
163                                   (1 << Character.COMBINING_SPACING_MARK) |
164                                   (1 << Character.DECIMAL_DIGIT_NUMBER) |
165                                   (1 << Character.CONNECTOR_PUNCTUATION))
166                                  >> Character.getType(ch)) & 1) != 0,
167                          JOIN_CONTROL());
168     }
169
170     /////////////////////////////////////////////////////////////////////////////
171
172     private static CharPredicate getPosixPredicate(String name) {
173         switch (name) {
174             case "ALPHA"return ALPHABETIC();
175             case "LOWER"return LOWERCASE();
176             case "UPPER"return UPPERCASE();
177             case "SPACE"return WHITE_SPACE();
178             case "PUNCT"return PUNCTUATION();
179             case "XDIGIT"return HEX_DIGIT();
180             case "ALNUM"return ALNUM();
181             case "CNTRL"return CONTROL();
182             case "DIGIT"return DIGIT();
183             case "BLANK"return BLANK();
184             case "GRAPH"return GRAPH();
185             case "PRINT"return PRINT();
186             defaultreturn null;
187         }
188     }
189
190     private static CharPredicate getUnicodePredicate(String name) {
191         switch (name) {
192             case "ALPHABETIC"return ALPHABETIC();
193             case "ASSIGNED"return ASSIGNED();
194             case "CONTROL"return CONTROL();
195             case "HEXDIGIT"return HEX_DIGIT();
196             case "IDEOGRAPHIC"return IDEOGRAPHIC();
197             case "JOINCONTROL"return JOIN_CONTROL();
198             case "LETTER"return LETTER();
199             case "LOWERCASE"return LOWERCASE();
200             case "NONCHARACTERCODEPOINT"return NONCHARACTER_CODE_POINT();
201             case "TITLECASE"return TITLECASE();
202             case "PUNCTUATION"return PUNCTUATION();
203             case "UPPERCASE"return UPPERCASE();
204             case "WHITESPACE"return WHITE_SPACE();
205             case "WORD"return WORD();
206             case "WHITE_SPACE"return WHITE_SPACE();
207             case "HEX_DIGIT"return HEX_DIGIT();
208             case "NONCHARACTER_CODE_POINT"return NONCHARACTER_CODE_POINT();
209             case "JOIN_CONTROL"return JOIN_CONTROL();
210             defaultreturn null;
211         }
212     }
213
214     public static CharPredicate forUnicodeProperty(String propName) {
215         propName = propName.toUpperCase(Locale.ROOT);
216         CharPredicate p = getUnicodePredicate(propName);
217         if (p != null)
218             return p;
219         return getPosixPredicate(propName);
220     }
221
222     public static CharPredicate forPOSIXName(String propName) {
223         return getPosixPredicate(propName.toUpperCase(Locale.ENGLISH));
224     }
225
226     /////////////////////////////////////////////////////////////////////////////
227
228     /**
229      * Returns a predicate matching all characters belong to a named
230      * UnicodeScript.
231      */

232     static CharPredicate forUnicodeScript(String name) {
233         final Character.UnicodeScript script;
234         try {
235             script = Character.UnicodeScript.forName(name);
236             return ch -> script == Character.UnicodeScript.of(ch);
237         } catch (IllegalArgumentException iae) {}
238         return null;
239     }
240
241     /**
242      * Returns a predicate matching all characters in a UnicodeBlock.
243      */

244     static CharPredicate forUnicodeBlock(String name) {
245         final Character.UnicodeBlock block;
246         try {
247             block = Character.UnicodeBlock.forName(name);
248             return ch -> block == Character.UnicodeBlock.of(ch);
249         } catch (IllegalArgumentException iae) {}
250          return null;
251     }
252
253     /////////////////////////////////////////////////////////////////////////////
254
255     // unicode categories, aliases, properties, java methods ...
256
257     static CharPredicate forProperty(String name) {
258         // Unicode character property aliases, defined in
259         // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
260         switch (name) {
261             case "Cn"return category(1<<Character.UNASSIGNED);
262             case "Lu"return category(1<<Character.UPPERCASE_LETTER);
263             case "Ll"return category(1<<Character.LOWERCASE_LETTER);
264             case "Lt"return category(1<<Character.TITLECASE_LETTER);
265             case "Lm"return category(1<<Character.MODIFIER_LETTER);
266             case "Lo"return category(1<<Character.OTHER_LETTER);
267             case "Mn"return category(1<<Character.NON_SPACING_MARK);
268             case "Me"return category(1<<Character.ENCLOSING_MARK);
269             case "Mc"return category(1<<Character.COMBINING_SPACING_MARK);
270             case "Nd"return category(1<<Character.DECIMAL_DIGIT_NUMBER);
271             case "Nl"return category(1<<Character.LETTER_NUMBER);
272             case "No"return category(1<<Character.OTHER_NUMBER);
273             case "Zs"return category(1<<Character.SPACE_SEPARATOR);
274             case "Zl"return category(1<<Character.LINE_SEPARATOR);
275             case "Zp"return category(1<<Character.PARAGRAPH_SEPARATOR);
276             case "Cc"return category(1<<Character.CONTROL);
277             case "Cf"return category(1<<Character.FORMAT);
278             case "Co"return category(1<<Character.PRIVATE_USE);
279             case "Cs"return category(1<<Character.SURROGATE);
280             case "Pd"return category(1<<Character.DASH_PUNCTUATION);
281             case "Ps"return category(1<<Character.START_PUNCTUATION);
282             case "Pe"return category(1<<Character.END_PUNCTUATION);
283             case "Pc"return category(1<<Character.CONNECTOR_PUNCTUATION);
284             case "Po"return category(1<<Character.OTHER_PUNCTUATION);
285             case "Sm"return category(1<<Character.MATH_SYMBOL);
286             case "Sc"return category(1<<Character.CURRENCY_SYMBOL);
287             case "Sk"return category(1<<Character.MODIFIER_SYMBOL);
288             case "So"return category(1<<Character.OTHER_SYMBOL);
289             case "Pi"return category(1<<Character.INITIAL_QUOTE_PUNCTUATION);
290             case "Pf"return category(1<<Character.FINAL_QUOTE_PUNCTUATION);
291             case "L"return category(((1<<Character.UPPERCASE_LETTER) |
292                               (1<<Character.LOWERCASE_LETTER) |
293                               (1<<Character.TITLECASE_LETTER) |
294                               (1<<Character.MODIFIER_LETTER)  |
295                               (1<<Character.OTHER_LETTER)));
296             case "M"return category(((1<<Character.NON_SPACING_MARK) |
297                               (1<<Character.ENCLOSING_MARK)   |
298                               (1<<Character.COMBINING_SPACING_MARK)));
299             case "N"return category(((1<<Character.DECIMAL_DIGIT_NUMBER) |
300                               (1<<Character.LETTER_NUMBER)        |
301                               (1<<Character.OTHER_NUMBER)));
302             case "Z"return category(((1<<Character.SPACE_SEPARATOR) |
303                               (1<<Character.LINE_SEPARATOR)  |
304                               (1<<Character.PARAGRAPH_SEPARATOR)));
305             case "C"return category(((1<<Character.CONTROL)     |
306                               (1<<Character.FORMAT)      |
307                               (1<<Character.PRIVATE_USE) |
308                               (1<<Character.SURROGATE)   |
309                               (1<<Character.UNASSIGNED))); // Other
310             case "P"return category(((1<<Character.DASH_PUNCTUATION)      |
311                               (1<<Character.START_PUNCTUATION)     |
312                               (1<<Character.END_PUNCTUATION)       |
313                               (1<<Character.CONNECTOR_PUNCTUATION) |
314                               (1<<Character.OTHER_PUNCTUATION)     |
315                               (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
316                               (1<<Character.FINAL_QUOTE_PUNCTUATION)));
317             case "S"return category(((1<<Character.MATH_SYMBOL)     |
318                               (1<<Character.CURRENCY_SYMBOL) |
319                               (1<<Character.MODIFIER_SYMBOL) |
320                               (1<<Character.OTHER_SYMBOL)));
321             case "LC"return category(((1<<Character.UPPERCASE_LETTER) |
322                                (1<<Character.LOWERCASE_LETTER) |
323                                (1<<Character.TITLECASE_LETTER)));
324             case "LD"return category(((1<<Character.UPPERCASE_LETTER) |
325                                (1<<Character.LOWERCASE_LETTER) |
326                                (1<<Character.TITLECASE_LETTER) |
327                                (1<<Character.MODIFIER_LETTER)  |
328                                (1<<Character.OTHER_LETTER)     |
329                                (1<<Character.DECIMAL_DIGIT_NUMBER)));
330             case "L1"return range(0x00, 0xFF); // Latin-1
331             case "all"return Pattern.ALL();
332             // Posix regular expression character classes, defined in
333             // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
334             case "ASCII"return range(0x00, 0x7F);   // ASCII
335             case "Alnum"return ctype(ASCII.ALNUM);  // Alphanumeric characters
336             case "Alpha"return ctype(ASCII.ALPHA);  // Alphabetic characters
337             case "Blank"return ctype(ASCII.BLANK);  // Space and tab characters
338             case "Cntrl"return ctype(ASCII.CNTRL);  // Control characters
339             case "Digit"return range('0', '9');     // Numeric characters
340             case "Graph"return ctype(ASCII.GRAPH);  // printable and visible
341             case "Lower"return range('a', 'z');     // Lower-case alphabetic
342             case "Print"return range(0x20, 0x7E);   // Printable characters
343             case "Punct"return ctype(ASCII.PUNCT);  // Punctuation characters
344             case "Space"return ctype(ASCII.SPACE);  // Space characters
345             case "Upper"return range('A', 'Z');     // Upper-case alphabetic
346             case "XDigit"return ctype(ASCII.XDIGIT); // hexadecimal digits
347
348             // Java character properties, defined by methods in Character.java
349             case "javaLowerCase"return java.lang.Character::isLowerCase;
350             case "javaUpperCase"return  Character::isUpperCase;
351             case "javaAlphabetic"return java.lang.Character::isAlphabetic;
352             case "javaIdeographic"return java.lang.Character::isIdeographic;
353             case "javaTitleCase"return java.lang.Character::isTitleCase;
354             case "javaDigit"return java.lang.Character::isDigit;
355             case "javaDefined"return java.lang.Character::isDefined;
356             case "javaLetter"return java.lang.Character::isLetter;
357             case "javaLetterOrDigit"return java.lang.Character::isLetterOrDigit;
358             case "javaJavaIdentifierStart"return java.lang.Character::isJavaIdentifierStart;
359             case "javaJavaIdentifierPart"return java.lang.Character::isJavaIdentifierPart;
360             case "javaUnicodeIdentifierStart"return java.lang.Character::isUnicodeIdentifierStart;
361             case "javaUnicodeIdentifierPart"return java.lang.Character::isUnicodeIdentifierPart;
362             case "javaIdentifierIgnorable"return java.lang.Character::isIdentifierIgnorable;
363             case "javaSpaceChar"return java.lang.Character::isSpaceChar;
364             case "javaWhitespace"return java.lang.Character::isWhitespace;
365             case "javaISOControl"return java.lang.Character::isISOControl;
366             case "javaMirrored"return java.lang.Character::isMirrored;
367             defaultreturn null;
368         }
369     }
370
371     private static CharPredicate category(final int typeMask) {
372         return ch -> (typeMask & (1 << Character.getType(ch))) != 0;
373     }
374
375     private static CharPredicate range(final int lower, final int upper) {
376         return (BmpCharPredicate)ch -> lower <= ch && ch <= upper;
377     }
378
379     private static CharPredicate ctype(final int ctype) {
380         return (BmpCharPredicate)ch -> ch < 128 && ASCII.isType(ch, ctype);
381     }
382
383     /////////////////////////////////////////////////////////////////////////////
384
385     /**
386      * Posix ASCII variants, not in the lookup map
387      */

388     static final BmpCharPredicate ASCII_DIGIT() {
389         return ch -> ch < 128 && ASCII.isDigit(ch);
390     }
391     static final BmpCharPredicate ASCII_WORD() {
392         return ch -> ch < 128 && ASCII.isWord(ch);
393     }
394     static final BmpCharPredicate ASCII_SPACE() {
395         return ch -> ch < 128 && ASCII.isSpace(ch);
396     }
397
398 }
399