Monitoring JavaMelody sur /demo

1 /*

2  * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved.

3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.

4  *

5  * This code is free software; you can redistribute it and/or modify it

6  * under the terms of the GNU General Public License version 2 only, as

7  * published by the Free Software Foundation.  Oracle designates this

8  * particular file as subject to the "Classpath" exception as provided

9  * by Oracle in the LICENSE file that accompanied this code.

10  *

11  * This code is distributed in the hope that it will be useful, but WITHOUT

12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or

13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License

14  * version 2 for more details (a copy is included in the LICENSE file that

15  * accompanied this code).

16  *

17  * You should have received a copy of the GNU General Public License version

18  * 2 along with this work; if not, write to the Free Software Foundation,

19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.

20  *

21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA

22  * or visit www.oracle.com if you need additional information or have any

23  * questions.

24  */

25 

26 package java.util.regex;

27 

28 import java.text.Normalizer;

29 import java.text.Normalizer.Form;

30 import java.util.Locale;

31 import java.util.Iterator;

32 import java.util.Map;

33 import java.util.ArrayList;

34 import java.util.HashMap;

35 import java.util.LinkedHashSet;

36 import java.util.List;

37 import java.util.Set;

38 import java.util.Arrays;

39 import java.util.NoSuchElementException;

40 import java.util.Spliterator;

41 import java.util.Spliterators;

42 import java.util.function.Predicate;

43 import java.util.stream.Stream;

44 import java.util.stream.StreamSupport;

45 

46 

47 /**

48  * A compiled representation of a regular expression.

49  *

50  * <p> A regular expression, specified as a string, must first be compiled into

51  * an instance of this class.  The resulting pattern can then be used to create

52  * a {@link Matcher} object that can match arbitrary {@linkplain

53  * java.lang.CharSequence character sequences} against the regular

54  * expression.  All of the state involved in performing a match resides in the

55  * matcher, so many matchers can share the same pattern.

56  *

57  * <p> A typical invocation sequence is thus

58  *

59  * <blockquote><pre>

60  * Pattern p = Pattern.{@link #compile compile}("a*b");

61  * Matcher m = p.{@link #matcher matcher}("aaaaab");

62  * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote>

63  *

64  * <p> A {@link #matches matches} method is defined by this class as a

65  * convenience for when a regular expression is used just once.  This method

66  * compiles an expression and matches an input sequence against it in a single

67  * invocation.  The statement

68  *

69  * <blockquote><pre>

70  * boolean b = Pattern.matches("a*b", "aaaaab");</pre></blockquote>

71  *

72  * is equivalent to the three statements above, though for repeated matches it

73  * is less efficient since it does not allow the compiled pattern to be reused.

74  *

75  * <p> Instances of this class are immutable and are safe for use by multiple

76  * concurrent threads.  Instances of the {@link Matcher} class are not safe for

77  * such use.

78  *

79  *

80  * <h3><a id="sum">Summary of regular-expression constructs</a></h3>

81  *

82  * <table class="borderless">

83  * <caption style="display:none">Regular expression constructs, and what they match</caption>

84  * <thead style="text-align:left">

85  * <tr>

86  * <th id="construct">Construct</th>

87  * <th id="matches">Matches</th>

88  * </tr>

89  * </thead>

90  * <tbody style="text-align:left">

91  *

92  * <tr><th colspan="2" style="padding-top:20px" id="characters">Characters</th></tr>

93  *

94  * <tr><th style="vertical-align:top; font-weight: normal" id="x"><i>x</i></th>

95  *     <td headers="matches characters x">The character <i>x</i></td></tr>

96  * <tr><th style="vertical-align:top; font-weight: normal" id="backslash">{@code \\}</th>

97  *     <td headers="matches characters backslash">The backslash character</td></tr>

98  * <tr><th style="vertical-align:top; font-weight: normal" id="octal_n">{@code \0}<i>n</i></th>

99  *     <td headers="matches characters octal_n">The character with octal value {@code 0}<i>n</i>

100  *         (0&nbsp;{@code <=}&nbsp;<i>n</i>&nbsp;{@code <=}&nbsp;7)</td></tr>

101  * <tr><th style="vertical-align:top; font-weight: normal" id="octal_nn">{@code \0}<i>nn</i></th>

102  *     <td headers="matches characters octal_nn">The character with octal value {@code 0}<i>nn</i>

103  *         (0&nbsp;{@code <=}&nbsp;<i>n</i>&nbsp;{@code <=}&nbsp;7)</td></tr>

104  * <tr><th style="vertical-align:top; font-weight: normal" id="octal_nnn">{@code \0}<i>mnn</i></th>

105  *     <td headers="matches characters octal_nnn">The character with octal value {@code 0}<i>mnn</i>

106  *         (0&nbsp;{@code <=}&nbsp;<i>m</i>&nbsp;{@code <=}&nbsp;3,

107  *         0&nbsp;{@code <=}&nbsp;<i>n</i>&nbsp;{@code <=}&nbsp;7)</td></tr>

108  * <tr><th style="vertical-align:top; font-weight: normal" id="hex_hh">{@code \x}<i>hh</i></th>

109  *     <td headers="matches characters hex_hh">The character with hexadecimal value {@code 0x}<i>hh</i></td></tr>

110  * <tr><th style="vertical-align:top; font-weight: normal" id="hex_hhhh"><code>&#92;u</code><i>hhhh</i></th>

111  *     <td headers="matches characters hex_hhhh">The character with hexadecimal&nbsp;value&nbsp;{@code 0x}<i>hhhh</i></td></tr>

112  * <tr><th style="vertical-align:top; font-weight: normal" id="hex_h_h"><code>&#92;x</code><i>{h...h}</i></th>

113  *     <td headers="matches characters hex_h_h">The character with hexadecimal value {@code 0x}<i>h...h</i>

114  *         ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}

115  *         &nbsp;&lt;=&nbsp;{@code 0x}<i>h...h</i>&nbsp;&lt;=&nbsp;

116  *          {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>

117  * <tr><th style="vertical-align:top; font-weight: normal" id="unicode_name"><code>&#92;N{</code><i>name</i><code>}</code></th>

118  *     <td headers="matches characters unicode_name">The character with Unicode character name <i>'name'</i></td></tr>

119  * <tr><th style="vertical-align:top; font-weight:normal" id="tab">{@code \t}</th>

120  *     <td headers="matches characters tab">The tab character (<code>'&#92;u0009'</code>)</td></tr>

121  * <tr><th style="vertical-align:top; font-weight:normal" id="newline">{@code \n}</th>

122  *     <td headers="matches characters newline">The newline (line feed) character (<code>'&#92;u000A'</code>)</td></tr>

123  * <tr><th style="vertical-align:top; font-weight:normal" id="return">{@code \r}</th>

124  *     <td headers="matches characters return">The carriage-return character (<code>'&#92;u000D'</code>)</td></tr>

125  * <tr><th style="vertical-align:top; font-weight:normal" id="form_feed">{@code \f}</th>

126  *     <td headers="matches characters form_feed">The form-feed character (<code>'&#92;u000C'</code>)</td></tr>

127  * <tr><th style="vertical-align:top; font-weight:normal" id="bell">{@code \a}</th>

128  *     <td headers="matches characters bell">The alert (bell) character (<code>'&#92;u0007'</code>)</td></tr>

129  * <tr><th style="vertical-align:top; font-weight:normal" id="escape">{@code \e}</th>

130  *     <td headers="matches characters escape">The escape character (<code>'&#92;u001B'</code>)</td></tr>

131  * <tr><th style="vertical-align:top; font-weight:normal" id="ctrl_x">{@code \c}<i>x</i></th>

132  *     <td headers="matches characters ctrl_x">The control character corresponding to <i>x</i></td></tr>

133  *

134  *  <tr><th colspan="2" style="padding-top:20px" id="classes">Character classes</th></tr>

135  *

136  * <tr><th style="vertical-align:top; font-weight:normal" id="simple">{@code [abc]}</th>

137  *     <td headers="matches classes simple">{@code a}, {@code b}, or {@code c} (simple class)</td></tr>

138  * <tr><th style="vertical-align:top; font-weight:normal" id="negation">{@code [^abc]}</th>

139  *     <td headers="matches classes negation">Any character except {@code a}, {@code b}, or {@code c} (negation)</td></tr>

140  * <tr><th style="vertical-align:top; font-weight:normal" id="range">{@code [a-zA-Z]}</th>

141  *     <td headers="matches classes range">{@code a} through {@code z}

142  *         or {@code A} through {@code Z}, inclusive (range)</td></tr>

143  * <tr><th style="vertical-align:top; font-weight:normal" id="union">{@code [a-d[m-p]]}</th>

144  *     <td headers="matches classes union">{@code a} through {@code d},

145  *      or {@code m} through {@code p}: {@code [a-dm-p]} (union)</td></tr>

146  * <tr><th style="vertical-align:top; font-weight:normal" id="intersection">{@code [a-z&&[def]]}</th>

147  *     <td headers="matches classes intersection">{@code d}, {@code e}, or {@code f} (intersection)</td></tr>

148  * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction1">{@code [a-z&&[^bc]]}</th>

149  *     <td headers="matches classes subtraction1">{@code a} through {@code z},

150  *         except for {@code b} and {@code c}: {@code [ad-z]} (subtraction)</td></tr>

151  * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction2">{@code [a-z&&[^m-p]]}</th>

152  *     <td headers="matches classes subtraction2">{@code a} through {@code z},

153  *          and not {@code m} through {@code p}: {@code [a-lq-z]} (subtraction)</td></tr>

154  *

155  * <tr><th colspan="2" style="padding-top:20px" id="predef">Predefined character classes</th></tr>

156  *

157  * <tr><th style="vertical-align:top; font-weight:normal" id="any">{@code .}</th>

158  *     <td headers="matches predef any">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>

159  * <tr><th style="vertical-align:top; font-weight:normal" id="digit">{@code \d}</th>

160  *     <td headers="matches predef digit">A digit: {@code [0-9]}</td></tr>

161  * <tr><th style="vertical-align:top; font-weight:normal" id="non_digit">{@code \D}</th>

162  *     <td headers="matches predef non_digit">A non-digit: {@code [^0-9]}</td></tr>

163  * <tr><th style="vertical-align:top; font-weight:normal" id="horiz_white">{@code \h}</th>

164  *     <td headers="matches predef horiz_white">A horizontal whitespace character:

165  *     <code>[ \t\xA0&#92;u1680&#92;u180e&#92;u2000-&#92;u200a&#92;u202f&#92;u205f&#92;u3000]</code></td></tr>

166  * <tr><th style="vertical-align:top; font-weight:normal" id="non_horiz_white">{@code \H}</th>

167  *     <td headers="matches predef non_horiz_white">A non-horizontal whitespace character: {@code [^\h]}</td></tr>

168  * <tr><th style="vertical-align:top; font-weight:normal" id="white">{@code \s}</th>

169  *     <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr>

170  * <tr><th style="vertical-align:top; font-weight:normal" id="non_white">{@code \S}</th>

171  *     <td headers="matches predef non_white">A non-whitespace character: {@code [^\s]}</td></tr>

172  * <tr><th style="vertical-align:top; font-weight:normal" id="vert_white">{@code \v}</th>

173  *     <td headers="matches predef vert_white">A vertical whitespace character: <code>[\n\x0B\f\r\x85&#92;u2028&#92;u2029]</code>

174  *     </td></tr>

175  * <tr><th style="vertical-align:top; font-weight:normal" id="non_vert_white">{@code \V}</th>

176  *     <td headers="matches predef non_vert_white">A non-vertical whitespace character: {@code [^\v]}</td></tr>

177  * <tr><th style="vertical-align:top; font-weight:normal" id="word">{@code \w}</th>

178  *     <td headers="matches predef word">A word character: {@code [a-zA-Z_0-9]}</td></tr>

179  * <tr><th style="vertical-align:top; font-weight:normal" id="non_word">{@code \W}</th>

180  *     <td headers="matches predef non_word">A non-word character: {@code [^\w]}</td></tr>

181  *

182  * <tr><th colspan="2" style="padding-top:20px" id="posix"><b>POSIX character classes (US-ASCII only)</b></th></tr>

183  *

184  * <tr><th style="vertical-align:top; font-weight:normal" id="Lower">{@code \p{Lower}}</th>

185  *     <td headers="matches posix Lower">A lower-case alphabetic character: {@code [a-z]}</td></tr>

186  * <tr><th style="vertical-align:top; font-weight:normal" id="Upper">{@code \p{Upper}}</th>

187  *     <td headers="matches posix Upper">An upper-case alphabetic character: {@code [A-Z]}</td></tr>

188  * <tr><th style="vertical-align:top; font-weight:normal" id="ASCII">{@code \p{ASCII}}</th>

189  *     <td headers="matches posix ASCII">All ASCII: {@code [\x00-\x7F]}</td></tr>

190  * <tr><th style="vertical-align:top; font-weight:normal" id="Alpha">{@code \p{Alpha}}</th>

191  *     <td headers="matches posix Alpha">An alphabetic character: {@code [\p{Lower}\p{Upper}]}</td></tr>

192  * <tr><th style="vertical-align:top; font-weight:normal" id="Digit">{@code \p{Digit}}</th>

193  *     <td headers="matches posix Digit">A decimal digit: {@code [0-9]}</td></tr>

194  * <tr><th style="vertical-align:top; font-weight:normal" id="Alnum">{@code \p{Alnum}}</th>

195  *     <td headers="matches posix Alnum">An alphanumeric character: {@code [\p{Alpha}\p{Digit}]}</td></tr>

196  * <tr><th style="vertical-align:top; font-weight:normal" id="Punct">{@code \p{Punct}}</th>

197  *     <td headers="matches posix Punct">Punctuation: One of {@code !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~}</td></tr>

198  *     <!-- {@code [\!"#\$%&'\(\)\*\+,\-\./:;\<=\>\?@\[\\\]\^_`\{\|\}~]}

199  *          {@code [\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]} -->

200  * <tr><th style="vertical-align:top; font-weight:normal" id="Graph">{@code \p{Graph}}</th>

201  *     <td headers="matches posix Graph">A visible character: {@code [\p{Alnum}\p{Punct}]}</td></tr>

202  * <tr><th style="vertical-align:top; font-weight:normal" id="Print">{@code \p{Print}}</th>

203  *     <td headers="matches posix Print">A printable character: {@code [\p{Graph}\x20]}</td></tr>

204  * <tr><th style="vertical-align:top; font-weight:normal" id="Blank">{@code \p{Blank}}</th>

205  *     <td headers="matches posix Blank">A space or a tab: {@code [ \t]}</td></tr>

206  * <tr><th style="vertical-align:top; font-weight:normal" id="Cntrl">{@code \p{Cntrl}}</th>

207  *     <td headers="matches posix Cntrl">A control character: {@code [\x00-\x1F\x7F]}</td></tr>

208  * <tr><th style="vertical-align:top; font-weight:normal" id="XDigit">{@code \p{XDigit}}</th>

209  *     <td headers="matches posix XDigit">A hexadecimal digit: {@code [0-9a-fA-F]}</td></tr>

210  * <tr><th style="vertical-align:top; font-weight:normal" id="Space">{@code \p{Space}}</th>

211  *     <td headers="matches posix Space">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr>

212  *

213  * <tr><th colspan="2" style="padding-top:20px" id="java">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr>

214  *

215  * <tr><th style="vertical-align:top; font-weight:normal" id="javaLowerCase">{@code \p{javaLowerCase}}</th>

216  *     <td headers="matches java javaLowerCase">Equivalent to java.lang.Character.isLowerCase()</td></tr>

217  * <tr><th style="vertical-align:top; font-weight:normal" id="javaUpperCase">{@code \p{javaUpperCase}}</th>

218  *     <td headers="matches java javaUpperCase">Equivalent to java.lang.Character.isUpperCase()</td></tr>

219  * <tr><th style="vertical-align:top; font-weight:normal" id="javaWhitespace">{@code \p{javaWhitespace}}</th>

220  *     <td headers="matches java javaWhitespace">Equivalent to java.lang.Character.isWhitespace()</td></tr>

221  * <tr><th style="vertical-align:top; font-weight:normal" id="javaMirrored">{@code \p{javaMirrored}}</th>

222  *     <td headers="matches java javaMirrored">Equivalent to java.lang.Character.isMirrored()</td></tr>

223  *

224  * <tr><th colspan="2" style="padding-top:20px"  id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr>

225  *

226  * <tr><th style="vertical-align:top; font-weight:normal" id="IsLatin">{@code \p{IsLatin}}</th>

227  *     <td headers="matches unicode IsLatin">A Latin&nbsp;script character (<a href="#usc">script</a>)</td></tr>

228  * <tr><th style="vertical-align:top; font-weight:normal" id="InGreek">{@code \p{InGreek}}</th>

229  *     <td headers="matches unicode InGreek">A character in the Greek&nbsp;block (<a href="#ubc">block</a>)</td></tr>

230  * <tr><th style="vertical-align:top; font-weight:normal" id="Lu">{@code \p{Lu}}</th>

231  *     <td headers="matches unicode Lu">An uppercase letter (<a href="#ucc">category</a>)</td></tr>

232  * <tr><th style="vertical-align:top; font-weight:normal" id="IsAlphabetic">{@code \p{IsAlphabetic}}</th>

233  *     <td headers="matches unicode IsAlphabetic">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>

234  * <tr><th style="vertical-align:top; font-weight:normal" id="Sc">{@code \p{Sc}}</th>

235  *     <td headers="matches unicode Sc">A currency symbol</td></tr>

236  * <tr><th style="vertical-align:top; font-weight:normal" id="not_InGreek">{@code \P{InGreek}}</th>

237  *     <td headers="matches unicode not_InGreek">Any character except one in the Greek block (negation)</td></tr>

238  * <tr><th style="vertical-align:top; font-weight:normal" id="not_uppercase">{@code [\p{L}&&[^\p{Lu}]]}</th>

239  *     <td headers="matches unicode not_uppercase">Any letter except an uppercase letter (subtraction)</td></tr>

240  *

241  * <tr><th colspan="2" style="padding-top:20px" id="bounds">Boundary matchers</th></tr>

242  *

243  * <tr><th style="vertical-align:top; font-weight:normal" id="begin_line">{@code ^}</th>

244  *     <td headers="matches bounds begin_line">The beginning of a line</td></tr>

245  * <tr><th style="vertical-align:top; font-weight:normal" id="end_line">{@code $}</th>

246  *     <td headers="matches bounds end_line">The end of a line</td></tr>

247  * <tr><th style="vertical-align:top; font-weight:normal" id="word_boundary">{@code \b}</th>

248  *     <td headers="matches bounds word_boundary">A word boundary</td></tr>

249  * <tr><th style="vertical-align:top; font-weight:normal" id="grapheme_cluster_boundary">{@code \b{g}}</th>

250  *     <td headers="matches bounds grapheme_cluster_boundary">A Unicode extended grapheme cluster boundary</td></tr>

251  * <tr><th style="vertical-align:top; font-weight:normal" id="non_word_boundary">{@code \B}</th>

252  *     <td headers="matches bounds non_word_boundary">A non-word boundary</td></tr>

253  * <tr><th style="vertical-align:top; font-weight:normal" id="begin_input">{@code \A}</th>

254  *     <td headers="matches bounds begin_input">The beginning of the input</td></tr>

255  * <tr><th style="vertical-align:top; font-weight:normal" id="end_prev_match">{@code \G}</th>

256  *     <td headers="matches bounds end_prev_match">The end of the previous match</td></tr>

257  * <tr><th style="vertical-align:top; font-weight:normal" id="end_input_except_term">{@code \Z}</th>

258  *     <td headers="matches bounds end_input_except_term">The end of the input but for the final

259  *         <a href="#lt">terminator</a>, if&nbsp;any</td></tr>

260  * <tr><th style="vertical-align:top; font-weight:normal" id="end_input">{@code \z}</th>

261  *     <td headers="matches bounds end_input">The end of the input</td></tr>

262  *

263  * <tr><th colspan="2" style="padding-top:20px" id="linebreak">Linebreak matcher</th></tr>

264  *

265  * <tr><th style="vertical-align:top; font-weight:normal" id="any_unicode_linebreak">{@code \R}</th>

266  *     <td headers="matches linebreak any_unicode_linebreak">Any Unicode linebreak sequence; equivalent to

267  *     <code>&#92;u000D&#92;u000A|[&#92;u000A&#92;u000B&#92;u000C&#92;u000D&#92;u0085&#92;u2028&#92;u2029]

268  *     </code></td></tr>

269  *

270  * <tr><th colspan="2" style="padding-top:20px" id="grapheme">Unicode Extended Grapheme matcher</th></tr>

271  *

272  * <tr><th style="vertical-align:top; font-weight:normal" id="grapheme_any">{@code \X}</th>

273  *     <td headers="matches grapheme grapheme_any">Any Unicode extended grapheme cluster</td></tr>

274  *

275  * <tr><th colspan="2" style="padding-top:20px" id="greedy">Greedy quantifiers</th></tr>

276  *

277  * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_once_or_not"><i>X</i>{@code ?}</th>

278  *     <td headers="matches greedy greedy_once_or_not"><i>X</i>, once or not at all</td></tr>

279  * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_zero_or_more"><i>X</i>{@code *}</th>

280  *     <td headers="matches greedy greedy_zero_or_more"><i>X</i>, zero or more times</td></tr>

281  * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_one_or_more"><i>X</i>{@code +}</th>

282  *     <td headers="matches greedy greedy_one_or_more"><i>X</i>, one or more times</td></tr>

283  * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_exactly"><i>X</i><code>{</code><i>n</i><code>}</code></th>

284  *     <td headers="matches greedy greedy_exactly"><i>X</i>, exactly <i>n</i> times</td></tr>

285  * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_at_least"><i>X</i><code>{</code><i>n</i><code>,}</code></th>

286  *     <td headers="matches greedy greedy_at_least"><i>X</i>, at least <i>n</i> times</td></tr>

287  * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_at_least_up_to"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}</code></th>

288  *     <td headers="matches greedy greedy_at_least_up_to"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>

289  *

290  * <tr><th colspan="2" style="padding-top:20px" id="reluc">Reluctant quantifiers</th></tr>

291  *

292  * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_once_or_not"><i>X</i>{@code ??}</th>

293  *     <td headers="matches reluc reluc_once_or_not"><i>X</i>, once or not at all</td></tr>

294  * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_zero_or_more"><i>X</i>{@code *?}</th>

295  *     <td headers="matches reluc reluc_zero_or_more"><i>X</i>, zero or more times</td></tr>

296  * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_one_or_more"><i>X</i>{@code +?}</th>

297  *     <td headers="matches reluc reluc_one_or_more"><i>X</i>, one or more times</td></tr>

298  * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_exactly"><i>X</i><code>{</code><i>n</i><code>}?</code></th>

299  *     <td headers="matches reluc reluc_exactly"><i>X</i>, exactly <i>n</i> times</td></tr>

300  * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_at_least"><i>X</i><code>{</code><i>n</i><code>,}?</code></th>

301  *     <td headers="matches reluc reluc_at_least"><i>X</i>, at least <i>n</i> times</td></tr>

302  * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_at_least_up_to"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}?</code></th>

303  *     <td headers="matches reluc reluc_at_least_up_to"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>

304  *

305  * <tr><th colspan="2" style="padding-top:20px" id="poss">Possessive quantifiers</th></tr>

306  *

307  * <tr><th style="vertical-align:top; font-weight:normal" id="poss_once_or_not"><i>X</i>{@code ?+}</th>

308  *     <td headers="matches poss poss_once_or_not"><i>X</i>, once or not at all</td></tr>

309  * <tr><th style="vertical-align:top; font-weight:normal" id="poss_zero_or_more"><i>X</i>{@code *+}</th>

310  *     <td headers="matches poss poss_zero_or_more"><i>X</i>, zero or more times</td></tr>

311  * <tr><th style="vertical-align:top; font-weight:normal" id="poss_one_or_more"><i>X</i>{@code ++}</th>

312  *     <td headers="matches poss poss_one_or_more"><i>X</i>, one or more times</td></tr>

313  * <tr><th style="vertical-align:top; font-weight:normal" id="poss_exactly"><i>X</i><code>{</code><i>n</i><code>}+</code></th>

314  *     <td headers="matches poss poss_exactly"><i>X</i>, exactly <i>n</i> times</td></tr>

315  * <tr><th style="vertical-align:top; font-weight:normal" id="poss_at_least"><i>X</i><code>{</code><i>n</i><code>,}+</code></th>

316  *     <td headers="matches poss poss_at_least"><i>X</i>, at least <i>n</i> times</td></tr>

317  * <tr><th style="vertical-align:top; font-weight:normal" id="poss_at_least_up_to"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}+</code></th>

318  *     <td headers="matches poss poss_at_least_up_to"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>

319  *

320  * <tr><th colspan="2" style="padding-top:20px" id="logical">Logical operators</th></tr>

321  *

322  * <tr><th style="vertical-align:top; font-weight:normal" id="concat"><i>XY</i></th>

323  *     <td headers="matches logical concat"><i>X</i> followed by <i>Y</i></td></tr>

324  * <tr><th style="vertical-align:top; font-weight:normal" id="alternate"><i>X</i>{@code |}<i>Y</i></th>

325  *     <td headers="matches logical alternate">Either <i>X</i> or <i>Y</i></td></tr>

326  * <tr><th style="vertical-align:top; font-weight:normal" id="group">{@code (}<i>X</i>{@code )}</th>

327  *     <td headers="matches logical group"><i>X</i>, as a <a href="#cg">capturing group</a></td></tr>

328  *

329  * <tr><th colspan="2" style="padding-top:20px" id="backref">Back references</th></tr>

330  *

331  * <tr><th style="vertical-align:top; font-weight:normal" id="back_nth">{@code \}<i>n</i></th>

332  *     <td headers="matches backref back_nth">Whatever the <i>n</i><sup>th</sup>

333  *     <a href="#cg">capturing group</a> matched</td></tr>

334  * <tr><th style="vertical-align:top; font-weight:normal" id="back_named">{@code \}<i>k</i>&lt;<i>name</i>&gt;</th>

335  *     <td headers="matches backref back_named">Whatever the

336  *     <a href="#groupname">named-capturing group</a> "name" matched</td></tr>

337  *

338  * <tr><th colspan="2" style="padding-top:20px" id="quote">Quotation</th></tr>

339  *

340  * <tr><th style="vertical-align:top; font-weight:normal" id="quote_follow">{@code \}</th>

341  *     <td headers="matches quote quote_follow">Nothing, but quotes the following character</td></tr>

342  * <tr><th style="vertical-align:top; font-weight:normal" id="quote_begin">{@code \Q}</th>

343  *     <td headers="matches quote quote_begin">Nothing, but quotes all characters until {@code \E}</td></tr>

344  * <tr><th style="vertical-align:top; font-weight:normal" id="quote_end">{@code \E}</th>

345  *     <td headers="matches quote quote_end">Nothing, but ends quoting started by {@code \Q}</td></tr>

346  *     <!-- Metachars: !$()*+.<>?[\]^{|} -->

347  *

348  * <tr><th colspan="2" style="padding-top:20px" id="special">Special constructs (named-capturing and non-capturing)</th></tr>

349  *

350  * <tr><th style="vertical-align:top; font-weight:normal" id="named_group"><code>(?&lt;<a href="#groupname">name</a>&gt;</code><i>X</i>{@code )}</th>

351  *     <td headers="matches special named_group"><i>X</i>, as a named-capturing group</td></tr>

352  * <tr><th style="vertical-align:top; font-weight:normal" id="non_capture_group">{@code (?:}<i>X</i>{@code )}</th>

353  *     <td headers="matches special non_capture_group"><i>X</i>, as a non-capturing group</td></tr>

354  * <tr><th style="vertical-align:top; font-weight:normal" id="flags"><code>(?idmsuxU-idmsuxU)&nbsp;</code></th>

355  *     <td headers="matches special flags">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>

356  * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>

357  * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a>

358  * on - off</td></tr>

359  * <tr><th style="vertical-align:top; font-weight:normal" id="non_capture_group_flags"><code>(?idmsux-idmsux:</code><i>X</i>{@code )}&nbsp;&nbsp;</th>

360  *     <td headers="matches special non_capture_group_flags"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the

361  *         given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>

362  * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a>

363  * <a href="#COMMENTS">x</a> on - off</td></tr>

364  * <tr><th style="vertical-align:top; font-weight:normal" id="pos_lookahead">{@code (?=}<i>X</i>{@code )}</th>

365  *     <td headers="matches special pos_lookahead"><i>X</i>, via zero-width positive lookahead</td></tr>

366  * <tr><th style="vertical-align:top; font-weight:normal" id="neg_lookahead">{@code (?!}<i>X</i>{@code )}</th>

367  *     <td headers="matches special neg_lookahead"><i>X</i>, via zero-width negative lookahead</td></tr>

368  * <tr><th style="vertical-align:top; font-weight:normal" id="pos_lookbehind">{@code (?<=}<i>X</i>{@code )}</th>

369  *     <td headers="matches special pos_lookbehind"><i>X</i>, via zero-width positive lookbehind</td></tr>

370  * <tr><th style="vertical-align:top; font-weight:normal" id="neg_lookbehind">{@code (?<!}<i>X</i>{@code )}</th>

371  *     <td headers="matches special neg_lookbehind"><i>X</i>, via zero-width negative lookbehind</td></tr>

372  * <tr><th style="vertical-align:top; font-weight:normal" id="indep_non_capture_group">{@code (?>}<i>X</i>{@code )}</th>

373  *     <td headers="matches special indep_non_capture_group"><i>X</i>, as an independent, non-capturing group</td></tr>

374  *

375  * </tbody>

376  * </table>

377  *

378  * <hr>

379  *

380  *

381  * <h3><a id="bs">Backslashes, escapes, and quoting</a></h3>

382  *

383  * <p> The backslash character ({@code '\'}) serves to introduce escaped

384  * constructs, as defined in the table above, as well as to quote characters

385  * that otherwise would be interpreted as unescaped constructs.  Thus the

386  * expression {@code \\} matches a single backslash and <code>\{</code> matches a

387  * left brace.

388  *

389  * <p> It is an error to use a backslash prior to any alphabetic character that

390  * does not denote an escaped construct; these are reserved for future

391  * extensions to the regular-expression language.  A backslash may be used

392  * prior to a non-alphabetic character regardless of whether that character is

393  * part of an unescaped construct.

394  *

395  * <p> Backslashes within string literals in Java source code are interpreted

396  * as required by

397  * <cite>The Java&trade; Language Specification</cite>

398  * as either Unicode escapes (section 3.3) or other character escapes (section 3.10.6).

399  * It is therefore necessary to double backslashes in string

400  * literals that represent regular expressions to protect them from

401  * interpretation by the Java bytecode compiler.  The string literal

402  * <code>"&#92;b"</code>, for example, matches a single backspace character when

403  * interpreted as a regular expression, while {@code "\\b"} matches a

404  * word boundary.  The string literal {@code "\(hello\)"} is illegal

405  * and leads to a compile-time error; in order to match the string

406  * {@code (hello)} the string literal {@code "\\(hello\\)"}

407  * must be used.

408  *

409  * <h3><a id="cc">Character Classes</a></h3>

410  *

411  *    <p> Character classes may appear within other character classes, and

412  *    may be composed by the union operator (implicit) and the intersection

413  *    operator ({@code &&}).

414  *    The union operator denotes a class that contains every character that is

415  *    in at least one of its operand classes.  The intersection operator

416  *    denotes a class that contains every character that is in both of its

417  *    operand classes.

418  *

419  *    <p> The precedence of character-class operators is as follows, from

420  *    highest to lowest:

421  *

422  *    <table class="striped" style="margin-left: 2em;">

423  *      <caption style="display:none">Precedence of character class operators.</caption>

424  *      <thead>

425  *      <tr><th scope="col">Precedence<th scope="col">Name<th scope="col">Example

426  *      </thead>

427  *      <tbody>

428  *      <tr><th scope="row">1</th>

429  *        <td>Literal escape&nbsp;&nbsp;&nbsp;&nbsp;</td>

430  *        <td>{@code \x}</td></tr>

431  *     <tr><th scope="row">2</th>

432  *        <td>Grouping</td>

433  *        <td>{@code [...]}</td></tr>

434  *     <tr><th scope="row">3</th>

435  *        <td>Range</td>

436  *        <td>{@code a-z}</td></tr>

437  *      <tr><th scope="row">4</th>

438  *        <td>Union</td>

439  *        <td>{@code [a-e][i-u]}</td></tr>

440  *      <tr><th scope="row">5</th>

441  *        <td>Intersection</td>

442  *        <td>{@code [a-z&&[aeiou]]}</td></tr>

443  *      </tbody>

444  *    </table>

445  *

446  *    <p> Note that a different set of metacharacters is in effect inside

447  *    a character class than outside a character class. For instance, the

448  *    regular expression {@code .} loses its special meaning inside a

449  *    character class, while the expression {@code -} becomes a range

450  *    forming metacharacter.

451  *

452  * <h3><a id="lt">Line terminators</a></h3>

453  *

454  * <p> A <i>line terminator</i> is a one- or two-character sequence that marks

455  * the end of a line of the input character sequence.  The following are

456  * recognized as line terminators:

457  *

458  * <ul>

459  *

460  *   <li> A newline (line feed) character ({@code '\n'}),

461  *

462  *   <li> A carriage-return character followed immediately by a newline

463  *   character ({@code "\r\n"}),

464  *

465  *   <li> A standalone carriage-return character ({@code '\r'}),

466  *

467  *   <li> A next-line character (<code>'&#92;u0085'</code>),

468  *

469  *   <li> A line-separator character (<code>'&#92;u2028'</code>), or

470  *

471  *   <li> A paragraph-separator character (<code>'&#92;u2029'</code>).

472  *

473  * </ul>

474  * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators

475  * recognized are newline characters.

476  *

477  * <p> The regular expression {@code .} matches any character except a line

478  * terminator unless the {@link #DOTALL} flag is specified.

479  *

480  * <p> By default, the regular expressions {@code ^} and {@code $} ignore

481  * line terminators and only match at the beginning and the end, respectively,

482  * of the entire input sequence. If {@link #MULTILINE} mode is activated then

483  * {@code ^} matches at the beginning of input and after any line terminator

484  * except at the end of input. When in {@link #MULTILINE} mode {@code $}

485  * matches just before a line terminator or the end of the input sequence.

486  *

487  * <h3><a id="cg">Groups and capturing</a></h3>

488  *

489  * <h4><a id="gnumber">Group number</a></h4>

490  * <p> Capturing groups are numbered by counting their opening parentheses from

491  * left to right.  In the expression {@code ((A)(B(C)))}, for example, there

492  * are four such groups: </p>

493  *

494  * <ol style="margin-left:2em;">

495  *   <li> {@code ((A)(B(C)))}

496  *   <li> {@code (A)}

497  *   <li> {@code (B(C))}

498  *   <li> {@code (C)}

499  * </ol>

500  *

501  * <p> Group zero always stands for the entire expression.

502  *

503  * <p> Capturing groups are so named because, during a match, each subsequence

504  * of the input sequence that matches such a group is saved.  The captured

505  * subsequence may be used later in the expression, via a back reference, and

506  * may also be retrieved from the matcher once the match operation is complete.

507  *

508  * <h4><a id="groupname">Group name</a></h4>

509  * <p>A capturing group can also be assigned a "name", a {@code named-capturing group},

510  * and then be back-referenced later by the "name". Group names are composed of

511  * the following characters. The first character must be a {@code letter}.

512  *

513  * <ul>

514  *   <li> The uppercase letters {@code 'A'} through {@code 'Z'}

515  *        (<code>'&#92;u0041'</code>&nbsp;through&nbsp;<code>'&#92;u005a'</code>),

516  *   <li> The lowercase letters {@code 'a'} through {@code 'z'}

517  *        (<code>'&#92;u0061'</code>&nbsp;through&nbsp;<code>'&#92;u007a'</code>),

518  *   <li> The digits {@code '0'} through {@code '9'}

519  *        (<code>'&#92;u0030'</code>&nbsp;through&nbsp;<code>'&#92;u0039'</code>),

520  * </ul>

521  *

522  * <p> A {@code named-capturing group} is still numbered as described in

523  * <a href="#gnumber">Group number</a>.

524  *

525  * <p> The captured input associated with a group is always the subsequence

526  * that the group most recently matched.  If a group is evaluated a second time

527  * because of quantification then its previously-captured value, if any, will

528  * be retained if the second evaluation fails.  Matching the string

529  * {@code "aba"} against the expression {@code (a(b)?)+}, for example, leaves

530  * group two set to {@code "b"}.  All captured input is discarded at the

531  * beginning of each match.

532  *

533  * <p> Groups beginning with {@code (?} are either pure, <i>non-capturing</i> groups

534  * that do not capture text and do not count towards the group total, or

535  * <i>named-capturing</i> groups.

536  *

537  * <h3> Unicode support </h3>

538  *

539  * <p> This class is in conformance with Level 1 of <a

540  * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical

541  * Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1

542  * Canonical Equivalents.

543  * <p>

544  * <b>Unicode escape sequences</b> such as <code>&#92;u2014</code> in Java source code

545  * are processed as described in section 3.3 of

546  * <cite>The Java&trade; Language Specification</cite>.

547  * Such escape sequences are also implemented directly by the regular-expression

548  * parser so that Unicode escapes can be used in expressions that are read from

549  * files or from the keyboard.  Thus the strings <code>"&#92;u2014"</code> and

550  * {@code "\\u2014"}, while not equal, compile into the same pattern, which

551  * matches the character with hexadecimal value {@code 0x2014}.

552  * <p>

553  * A Unicode character can also be represented by using its <b>Hex notation</b>

554  * (hexadecimal code point value) directly as described in construct

555  * <code>&#92;x{...}</code>, for example a supplementary character U+2011F can be

556  * specified as <code>&#92;x{2011F}</code>, instead of two consecutive Unicode escape

557  * sequences of the surrogate pair <code>&#92;uD840</code><code>&#92;uDD1F</code>.

558  * <p>

559  * <b>Unicode character names</b> are supported by the named character construct

560  * <code>\N{</code>...<code>}</code>, for example, <code>\N{WHITE SMILING FACE}</code>

561  * specifies character <code>&#92;u263A</code>. The character names supported

562  * by this class are the valid Unicode character names matched by

563  * {@link java.lang.Character#codePointOf(String) Character.codePointOf(name)}.

564  * <p>

565  * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">

566  * <b>Unicode extended grapheme clusters</b></a> are supported by the grapheme

567  * cluster matcher {@code \X} and the corresponding boundary matcher {@code \b{g}}.

568  * <p>

569  * Unicode scripts, blocks, categories and binary properties are written with

570  * the {@code \p} and {@code \P} constructs as in Perl.

571  * <code>\p{</code><i>prop</i><code>}</code> matches if

572  * the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code>

573  * does not match if the input has that property.

574  * <p>

575  * Scripts, blocks, categories and binary properties can be used both inside

576  * and outside of a character class.

577  *

578  * <p>

579  * <b><a id="usc">Scripts</a></b> are specified either with the prefix {@code Is}, as in

580  * {@code IsHiragana}, or by using  the {@code script} keyword (or its short

581  * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.

582  * <p>

583  * The script names supported by {@code Pattern} are the valid script names

584  * accepted and defined by

585  * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.

586  *

587  * <p>

588  * <b><a id="ubc">Blocks</a></b> are specified with the prefix {@code In}, as in

589  * {@code InMongolian}, or by using the keyword {@code block} (or its short

590  * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.

591  * <p>

592  * The block names supported by {@code Pattern} are the valid block names

593  * accepted and defined by

594  * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.

595  * <p>

596  *

597  * <b><a id="ucc">Categories</a></b> may be specified with the optional prefix {@code Is}:

598  * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode

599  * letters. Same as scripts and blocks, categories can also be specified

600  * by using the keyword {@code general_category} (or its short form

601  * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.

602  * <p>

603  * The supported categories are those of

604  * <a href="http://www.unicode.org/unicode/standard/standard.html">

605  * <i>The Unicode Standard</i></a> in the version specified by the

606  * {@link java.lang.Character Character} class. The category names are those

607  * defined in the Standard, both normative and informative.

608  * <p>

609  *

610  * <b><a id="ubpc">Binary properties</a></b> are specified with the prefix {@code Is}, as in

611  * {@code IsAlphabetic}. The binary properties supported by {@code Pattern}

612  * are

613  * <ul>

614  *   <li> Alphabetic

615  *   <li> Ideographic

616  *   <li> Letter

617  *   <li> Lowercase

618  *   <li> Uppercase

619  *   <li> Titlecase

620  *   <li> Punctuation

621  *   <li> Control

622  *   <li> White_Space

623  *   <li> Digit

624  *   <li> Hex_Digit

625  *   <li> Join_Control

626  *   <li> Noncharacter_Code_Point

627  *   <li> Assigned

628  * </ul>

629  * <p>

630  * The following <b>Predefined Character classes</b> and <b>POSIX character classes</b>

631  * are in conformance with the recommendation of <i>Annex C: Compatibility Properties</i>

632  * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expressions

633  * </i></a>, when {@link #UNICODE_CHARACTER_CLASS} flag is specified.

634  *

635  * <table class="striped">

636  * <caption style="display:none">predefined and posix character classes in Unicode mode</caption>

637  * <thead>

638  * <tr>

639  * <th scope="col" id="predef_classes">Classes</th>

640  * <th scope="col" id="predef_matches">Matches</th>

641  * </tr>

642  * </thead>

643  * <tbody>

644  * <tr><th scope="row">{@code \p{Lower}}</th>

645  *     <td>A lowercase character:{@code \p{IsLowercase}}</td></tr>

646  * <tr><th scope="row">{@code \p{Upper}}</th>

647  *     <td>An uppercase character:{@code \p{IsUppercase}}</td></tr>

648  * <tr><th scope="row">{@code \p{ASCII}}</th>

649  *     <td>All ASCII:{@code [\x00-\x7F]}</td></tr>

650  * <tr><th scope="row">{@code \p{Alpha}}</th>

651  *     <td>An alphabetic character:{@code \p{IsAlphabetic}}</td></tr>

652  * <tr><th scope="row">{@code \p{Digit}}</th>

653  *     <td>A decimal digit character:{@code \p{IsDigit}}</td></tr>

654  * <tr><th scope="row">{@code \p{Alnum}}</th>

655  *     <td>An alphanumeric character:{@code [\p{IsAlphabetic}\p{IsDigit}]}</td></tr>

656  * <tr><th scope="row">{@code \p{Punct}}</th>

657  *     <td>A punctuation character:{@code \p{IsPunctuation}}</td></tr>

658  * <tr><th scope="row">{@code \p{Graph}}</th>

659  *     <td>A visible character: {@code [^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]}</td></tr>

660  * <tr><th scope="row">{@code \p{Print}}</th>

661  *     <td>A printable character: {@code [\p{Graph}\p{Blank}&&[^\p{Cntrl}]]}</td></tr>

662  * <tr><th scope="row">{@code \p{Blank}}</th>

663  *     <td>A space or a tab: {@code [\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]}</td></tr>

664  * <tr><th scope="row">{@code \p{Cntrl}}</th>

665  *     <td>A control character: {@code \p{gc=Cc}}</td></tr>

666  * <tr><th scope="row">{@code \p{XDigit}}</th>

667  *     <td>A hexadecimal digit: {@code [\p{gc=Nd}\p{IsHex_Digit}]}</td></tr>

668  * <tr><th scope="row">{@code \p{Space}}</th>

669  *     <td>A whitespace character:{@code \p{IsWhite_Space}}</td></tr>

670  * <tr><th scope="row">{@code \d}</th>

671  *     <td>A digit: {@code \p{IsDigit}}</td></tr>

672  * <tr><th scope="row">{@code \D}</th>

673  *     <td>A non-digit: {@code [^\d]}</td></tr>

674  * <tr><th scope="row">{@code \s}</th>

675  *     <td>A whitespace character: {@code \p{IsWhite_Space}}</td></tr>

676  * <tr><th scope="row">{@code \S}</th>

677  *     <td>A non-whitespace character: {@code [^\s]}</td></tr>

678  * <tr><th scope="row">{@code \w}</th>

679  *     <td>A word character: {@code [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}\p{IsJoin_Control}]}</td></tr>

680  * <tr><th scope="row">{@code \W}</th>

681  *     <td>A non-word character: {@code [^\w]}</td></tr>

682  * </tbody>

683  * </table>

684  * <p>

685  * <a id="jcc">

686  * Categories that behave like the java.lang.Character

687  * boolean is<i>methodname</i> methods (except for the deprecated ones) are

688  * available through the same <code>\p{</code><i>prop</i><code>}</code> syntax where

689  * the specified property has the name <code>java<i>methodname</i></code></a>.

690  *

691  * <h3> Comparison to Perl 5 </h3>

692  *

693  * <p>The {@code Pattern} engine performs traditional NFA-based matching

694  * with ordered alternation as occurs in Perl 5.

695  *

696  * <p> Perl constructs not supported by this class: </p>

697  *

698  * <ul>

699  *    <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for

700  *    the <i>n</i><sup>th</sup> <a href="#cg">capturing group</a> and

701  *    <code>\g{</code><i>name</i><code>}</code> for

702  *    <a href="#groupname">named-capturing group</a>.

703  *    </p></li>

704  *

705  *    <li><p> The conditional constructs

706  *    {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and

707  *    {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )},

708  *    </p></li>

709  *

710  *    <li><p> The embedded code constructs <code>(?{</code><i>code</i><code>})</code>

711  *    and <code>(??{</code><i>code</i><code>})</code>,</p></li>

712  *

713  *    <li><p> The embedded comment syntax {@code (?#comment)}, and </p></li>

714  *

715  *    <li><p> The preprocessing operations {@code \l}, <code>&#92;u</code>,

716  *    {@code \L}, and {@code \U}.  </p></li>

717  *

718  * </ul>

719  *

720  * <p> Constructs supported by this class but not by Perl: </p>

721  *

722  * <ul>

723  *

724  *    <li><p> Character-class union and intersection as described

725  *    <a href="#cc">above</a>.</p></li>

726  *

727  * </ul>

728  *

729  * <p> Notable differences from Perl: </p>

730  *

731  * <ul>

732  *

733  *    <li><p> In Perl, {@code \1} through {@code \9} are always interpreted

734  *    as back references; a backslash-escaped number greater than {@code 9} is

735  *    treated as a back reference if at least that many subexpressions exist,

736  *    otherwise it is interpreted, if possible, as an octal escape.  In this

737  *    class octal escapes must always begin with a zero. In this class,

738  *    {@code \1} through {@code \9} are always interpreted as back

739  *    references, and a larger number is accepted as a back reference if at

740  *    least that many subexpressions exist at that point in the regular

741  *    expression, otherwise the parser will drop digits until the number is

742  *    smaller than or equal to the existing number of groups or it is one digit.

743  *    </p></li>

744  *

745  *    <li><p> Perl uses the {@code g} flag to request a match that resumes

746  *    where the last match left off.  This functionality is provided implicitly

747  *    by the {@link Matcher} class: Repeated invocations of the {@link

748  *    Matcher#find find} method will resume where the last match left off,

749  *    unless the matcher is reset.  </p></li>

750  *

751  *    <li><p> In Perl, embedded flags at the top level of an expression affect

752  *    the whole expression.  In this class, embedded flags always take effect

753  *    at the point at which they appear, whether they are at the top level or

754  *    within a group; in the latter case, flags are restored at the end of the

755  *    group just as in Perl.  </p></li>

756  *

757  * </ul>

758  *

759  *

760  * <p> For a more precise description of the behavior of regular expression

761  * constructs, please see <a href="http://www.oreilly.com/catalog/regex3/">

762  * <i>Mastering Regular Expressions, 3rd Edition</i>, Jeffrey E. F. Friedl,

763  * O'Reilly and Associates, 2006.</a>

764  * </p>

765  *

766  * @see java.lang.String#split(String, int)

767  * @see java.lang.String#split(String)

768  *

769  * @author      Mike McCloskey

770  * @author      Mark Reinhold

771  * @author      JSR-51 Expert Group

772  * @since       1.4

773  * @spec        JSR-51

774  */

775 

776 public final class Pattern

777     implements java.io.Serializable

778 {

779 

780     /**

781      * Regular expression modifier values.  Instead of being passed as

782      * arguments, they can also be passed as inline modifiers.

783      * For example, the following statements have the same effect.

784      * <pre>

785      * Pattern p1 = Pattern.compile("abc", Pattern.CASE_INSENSITIVE|Pattern.MULTILINE);

786      * Pattern p2 = Pattern.compile("(?im)abc", 0);

787      * </pre>

788      */

789 

790     /**

791      * Enables Unix lines mode.

792      *

793      * <p> In this mode, only the {@code '\n'} line terminator is recognized

794      * in the behavior of {@code .}, {@code ^}, and {@code $}.

795      *

796      * <p> Unix lines mode can also be enabled via the embedded flag

797      * expression&nbsp;{@code (?d)}.

798      */

799     public static final int UNIX_LINES = 0x01;

800 

801     /**

802      * Enables case-insensitive matching.

803      *

804      * <p> By default, case-insensitive matching assumes that only characters

805      * in the US-ASCII charset are being matched.  Unicode-aware

806      * case-insensitive matching can be enabled by specifying the {@link

807      * #UNICODE_CASE} flag in conjunction with this flag.

808      *

809      * <p> Case-insensitive matching can also be enabled via the embedded flag

810      * expression&nbsp;{@code (?i)}.

811      *

812      * <p> Specifying this flag may impose a slight performance penalty.  </p>

813      */

814     public static final int CASE_INSENSITIVE = 0x02;

815 

816     /**

817      * Permits whitespace and comments in pattern.

818      *

819      * <p> In this mode, whitespace is ignored, and embedded comments starting

820      * with {@code #} are ignored until the end of a line.

821      *

822      * <p> Comments mode can also be enabled via the embedded flag

823      * expression&nbsp;{@code (?x)}.

824      */

825     public static final int COMMENTS = 0x04;

826 

827     /**

828      * Enables multiline mode.

829      *

830      * <p> In multiline mode the expressions {@code ^} and {@code $} match

831      * just after or just before, respectively, a line terminator or the end of

832      * the input sequence.  By default these expressions only match at the

833      * beginning and the end of the entire input sequence.

834      *

835      * <p> Multiline mode can also be enabled via the embedded flag

836      * expression&nbsp;{@code (?m)}.  </p>

837      */

838     public static final int MULTILINE = 0x08;

839 

840     /**

841      * Enables literal parsing of the pattern.

842      *

843      * <p> When this flag is specified then the input string that specifies

844      * the pattern is treated as a sequence of literal characters.

845      * Metacharacters or escape sequences in the input sequence will be

846      * given no special meaning.

847      *

848      * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on

849      * matching when used in conjunction with this flag. The other flags

850      * become superfluous.

851      *

852      * <p> There is no embedded flag character for enabling literal parsing.

853      * @since 1.5

854      */

855     public static final int LITERAL = 0x10;

856 

857     /**

858      * Enables dotall mode.

859      *

860      * <p> In dotall mode, the expression {@code .} matches any character,

861      * including a line terminator.  By default this expression does not match

862      * line terminators.

863      *

864      * <p> Dotall mode can also be enabled via the embedded flag

865      * expression&nbsp;{@code (?s)}.  (The {@code s} is a mnemonic for

866      * "single-line" mode, which is what this is called in Perl.)  </p>

867      */

868     public static final int DOTALL = 0x20;

869 

870     /**

871      * Enables Unicode-aware case folding.

872      *

873      * <p> When this flag is specified then case-insensitive matching, when

874      * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner

875      * consistent with the Unicode Standard.  By default, case-insensitive

876      * matching assumes that only characters in the US-ASCII charset are being

877      * matched.

878      *

879      * <p> Unicode-aware case folding can also be enabled via the embedded flag

880      * expression&nbsp;{@code (?u)}.

881      *

882      * <p> Specifying this flag may impose a performance penalty.  </p>

883      */

884     public static final int UNICODE_CASE = 0x40;

885 

886     /**

887      * Enables canonical equivalence.

888      *

889      * <p> When this flag is specified then two characters will be considered

890      * to match if, and only if, their full canonical decompositions match.

891      * The expression <code>"a&#92;u030A"</code>, for example, will match the

892      * string <code>"&#92;u00E5"</code> when this flag is specified.  By default,

893      * matching does not take canonical equivalence into account.

894      *

895      * <p> There is no embedded flag character for enabling canonical

896      * equivalence.

897      *

898      * <p> Specifying this flag may impose a performance penalty.  </p>

899      */

900     public static final int CANON_EQ = 0x80;

901 

902     /**

903      * Enables the Unicode version of <i>Predefined character classes</i> and

904      * <i>POSIX character classes</i>.

905      *

906      * <p> When this flag is specified then the (US-ASCII only)

907      * <i>Predefined character classes</i> and <i>POSIX character classes</i>

908      * are in conformance with

909      * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical

910      * Standard #18: Unicode Regular Expressions</i></a>

911      * <i>Annex C: Compatibility Properties</i>.

912      * <p>

913      * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded

914      * flag expression&nbsp;{@code (?U)}.

915      * <p>

916      * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case

917      * folding.

918      * <p>

919      * Specifying this flag may impose a performance penalty.  </p>

920      * @since 1.7

921      */

922     public static final int UNICODE_CHARACTER_CLASS = 0x100;

923 

924     /**

925      * Contains all possible flags for compile(regex, flags).

926      */

927     private static final int ALL_FLAGS = CASE_INSENSITIVE | MULTILINE |

928             DOTALL | UNICODE_CASE | CANON_EQ | UNIX_LINES | LITERAL |

929             UNICODE_CHARACTER_CLASS | COMMENTS;

930 

931     /* Pattern has only two serialized components: The pattern string

932      * and the flags, which are all that is needed to recompile the pattern

933      * when it is deserialized.

934      */

935 

936     /** use serialVersionUID from Merlin b59 for interoperability */

937     private static final long serialVersionUID = 5073258162644648461L;

938 

939     /**

940      * The original regular-expression pattern string.

941      *

942      * @serial

943      */

944     private String pattern;

945 

946     /**

947      * The original pattern flags.

948      *

949      * @serial

950      */

951     private int flags;

952 

953     /**

954      * The temporary pattern flags used during compiling. The flags might be turned

955      * on and off by embedded flags.

956      */

957     private transient int flags0;

958 

959     /**

960      * Boolean indicating this Pattern is compiled; this is necessary in order

961      * to lazily compile deserialized Patterns.

962      */

963     private transient volatile boolean compiled;

964 

965     /**

966      * The normalized pattern string.

967      */

968     private transient String normalizedPattern;

969 

970     /**

971      * The starting point of state machine for the find operation.  This allows

972      * a match to start anywhere in the input.

973      */

974     transient Node root;

975 

976     /**

977      * The root of object tree for a match operation.  The pattern is matched

978      * at the beginning.  This may include a find that uses BnM or a First

979      * node.

980      */

981     transient Node matchRoot;

982 

983     /**

984      * Temporary storage used by parsing pattern slice.

985      */

986     transient int[] buffer;

987 

988     /**

989      * A temporary storage used for predicate for double return.

990      */

991     transient CharPredicate predicate;

992 

993     /**

994      * Map the "name" of the "named capturing group" to its group id

995      * node.

996      */

997     transient volatile Map<String, Integer> namedGroups;

998 

999     /**

1000      * Temporary storage used while parsing group references.

1001      */

1002     transient GroupHead[] groupNodes;

1003 

1004     /**

1005      * Temporary storage used to store the top level closure nodes.

1006      */

1007     transient List<Node> topClosureNodes;

1008 

1009     /**

1010      * The number of top greedy closure nodes in this Pattern. Used by

1011      * matchers to allocate storage needed for an IntHashSet to keep the

1012      * beginning pos {@code i} of all failed matches.

1013      */

1014     transient int localTCNCount;

1015 

1016     /*

1017      * Turn off the stop-exponential-backtracking optimization if there

1018      * is a group ref in the pattern.

1019      */

1020     transient boolean hasGroupRef;

1021 

1022     /**

1023      * Temporary null terminated code point array used by pattern compiling.

1024      */

1025     private transient int[] temp;

1026 

1027     /**

1028      * The number of capturing groups in this Pattern. Used by matchers to

1029      * allocate storage needed to perform a match.

1030      */

1031     transient int capturingGroupCount;

1032 

1033     /**

1034      * The local variable count used by parsing tree. Used by matchers to

1035      * allocate storage needed to perform a match.

1036      */

1037     transient int localCount;

1038 

1039     /**

1040      * Index into the pattern string that keeps track of how much has been

1041      * parsed.

1042      */

1043     private transient int cursor;

1044 

1045     /**

1046      * Holds the length of the pattern string.

1047      */

1048     private transient int patternLength;

1049 

1050     /**

1051      * If the Start node might possibly match supplementary characters.

1052      * It is set to true during compiling if

1053      * (1) There is supplementary char in pattern, or

1054      * (2) There is complement node of a "family" CharProperty

1055      */

1056     private transient boolean hasSupplementary;

1057 

1058     /**

1059      * Compiles the given regular expression into a pattern.

1060      *

1061      * @param  regex

1062      *         The expression to be compiled

1063      * @return the given regular expression compiled into a pattern

1064      * @throws  PatternSyntaxException

1065      *          If the expression's syntax is invalid

1066      */

1067     public static Pattern compile(String regex) {

1068         return new Pattern(regex, 0);

1069     }

1070 

1071     /**

1072      * Compiles the given regular expression into a pattern with the given

1073      * flags.

1074      *

1075      * @param  regex

1076      *         The expression to be compiled

1077      *

1078      * @param  flags

1079      *         Match flags, a bit mask that may include

1080      *         {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},

1081      *         {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},

1082      *         {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}

1083      *         and {@link #COMMENTS}

1084      *

1085      * @return the given regular expression compiled into a pattern with the given flags

1086      * @throws  IllegalArgumentException

1087      *          If bit values other than those corresponding to the defined

1088      *          match flags are set in {@code flags}

1089      *

1090      * @throws  PatternSyntaxException

1091      *          If the expression's syntax is invalid

1092      */

1093     public static Pattern compile(String regex, int flags) {

1094         return new Pattern(regex, flags);

1095     }

1096 

1097     /**

1098      * Returns the regular expression from which this pattern was compiled.

1099      *

1100      * @return  The source of this pattern

1101      */

1102     public String pattern() {

1103         return pattern;

1104     }

1105 

1106     /**

1107      * <p>Returns the string representation of this pattern. This

1108      * is the regular expression from which this pattern was

1109      * compiled.</p>

1110      *

1111      * @return  The string representation of this pattern

1112      * @since 1.5

1113      */

1114     public String toString() {

1115         return pattern;

1116     }

1117 

1118     /**

1119      * Creates a matcher that will match the given input against this pattern.

1120      *

1121      * @param  input

1122      *         The character sequence to be matched

1123      *

1124      * @return  A new matcher for this pattern

1125      */

1126     public Matcher matcher(CharSequence input) {

1127         if (!compiled) {

1128             synchronized(this) {

1129                 if (!compiled)

1130                     compile();

1131             }

1132         }

1133         Matcher m = new Matcher(this, input);

1134         return m;

1135     }

1136 

1137     /**

1138      * Returns this pattern's match flags.

1139      *

1140      * @return  The match flags specified when this pattern was compiled

1141      */

1142     public int flags() {

1143         return flags0;

1144     }

1145 

1146     /**

1147      * Compiles the given regular expression and attempts to match the given

1148      * input against it.

1149      *

1150      * <p> An invocation of this convenience method of the form

1151      *

1152      * <blockquote><pre>

1153      * Pattern.matches(regex, input);</pre></blockquote>

1154      *

1155      * behaves in exactly the same way as the expression

1156      *

1157      * <blockquote><pre>

1158      * Pattern.compile(regex).matcher(input).matches()</pre></blockquote>

1159      *

1160      * <p> If a pattern is to be used multiple times, compiling it once and reusing

1161      * it will be more efficient than invoking this method each time.  </p>

1162      *

1163      * @param  regex

1164      *         The expression to be compiled

1165      *

1166      * @param  input

1167      *         The character sequence to be matched

1168      * @return whether or not the regular expression matches on the input

1169      * @throws  PatternSyntaxException

1170      *          If the expression's syntax is invalid

1171      */

1172     public static boolean matches(String regex, CharSequence input) {

1173         Pattern p = Pattern.compile(regex);

1174         Matcher m = p.matcher(input);

1175         return m.matches();

1176     }

1177 

1178     /**

1179      * Splits the given input sequence around matches of this pattern.

1180      *

1181      * <p> The array returned by this method contains each substring of the

1182      * input sequence that is terminated by another subsequence that matches

1183      * this pattern or is terminated by the end of the input sequence.  The

1184      * substrings in the array are in the order in which they occur in the

1185      * input. If this pattern does not match any subsequence of the input then

1186      * the resulting array has just one element, namely the input sequence in

1187      * string form.

1188      *

1189      * <p> When there is a positive-width match at the beginning of the input

1190      * sequence then an empty leading substring is included at the beginning

1191      * of the resulting array. A zero-width match at the beginning however

1192      * never produces such empty leading substring.

1193      *

1194      * <p> The {@code limit} parameter controls the number of times the

1195      * pattern is applied and therefore affects the length of the resulting

1196      * array.

1197      * <ul>

1198      *    <li><p>

1199      *    If the <i>limit</i> is positive then the pattern will be applied

1200      *    at most <i>limit</i>&nbsp;-&nbsp;1 times, the array's length will be

1201      *    no greater than <i>limit</i>, and the array's last entry will contain

1202      *    all input beyond the last matched delimiter.</p></li>

1203      *

1204      *    <li><p>

1205      *    If the <i>limit</i> is zero then the pattern will be applied as

1206      *    many times as possible, the array can have any length, and trailing

1207      *    empty strings will be discarded.</p></li>

1208      *

1209      *    <li><p>

1210      *    If the <i>limit</i> is negative then the pattern will be applied

1211      *    as many times as possible and the array can have any length.</p></li>

1212      * </ul>

1213      *

1214      * <p> The input {@code "boo:and:foo"}, for example, yields the following

1215      * results with these parameters:

1216      *

1217      * <table class="plain" style="margin-left:2em;">

1218      * <caption style="display:none">Split example showing regex, limit, and result</caption>

1219      * <thead>

1220      * <tr>

1221      *     <th scope="col">Regex</th>

1222      *     <th scope="col">Limit</th>

1223      *     <th scope="col">Result</th>

1224      * </tr>

1225      * </thead>

1226      * <tbody>

1227      * <tr><th scope="row" rowspan="3" style="font-weight:normal">:</th>

1228      *     <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">2</th>

1229      *     <td>{@code { "boo", "and:foo" }}</td></tr>

1230      * <tr><!-- : -->

1231      *     <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th>

1232      *     <td>{@code { "boo", "and", "foo" }}</td></tr>

1233      * <tr><!-- : -->

1234      *     <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-2</th>

1235      *     <td>{@code { "boo", "and", "foo" }}</td></tr>

1236      * <tr><th scope="row" rowspan="3" style="font-weight:normal">o</th>

1237      *     <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th>

1238      *     <td>{@code { "b", "", ":and:f", "", "" }}</td></tr>

1239      * <tr><!-- o -->

1240      *     <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-2</th>

1241      *     <td>{@code { "b", "", ":and:f", "", "" }}</td></tr>

1242      * <tr><!-- o -->

1243      *     <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">0</th>

1244      *     <td>{@code { "b", "", ":and:f" }}</td></tr>

1245      * </tbody>

1246      * </table>

1247      *

1248      * @param  input

1249      *         The character sequence to be split

1250      *

1251      * @param  limit

1252      *         The result threshold, as described above

1253      *

1254      * @return  The array of strings computed by splitting the input

1255      *          around matches of this pattern

1256      */

1257     public String[] split(CharSequence input, int limit) {

1258         int index = 0;

1259         boolean matchLimited = limit > 0;

1260         ArrayList<String> matchList = new ArrayList<>();

1261         Matcher m = matcher(input);

1262 

1263         // Add segments before each match found

1264         while(m.find()) {

1265             if (!matchLimited || matchList.size() < limit - 1) {

1266                 if (index == 0 && index == m.start() && m.start() == m.end()) {

1267                     // no empty leading substring included for zero-width match

1268                     // at the beginning of the input char sequence.

1269                     continue;

1270                 }

1271                 String match = input.subSequence(index, m.start()).toString();

1272                 matchList.add(match);

1273                 index = m.end();

1274             } else if (matchList.size() == limit - 1) { // last one

1275                 String match = input.subSequence(index,

1276                                                  input.length()).toString();

1277                 matchList.add(match);

1278                 index = m.end();

1279             }

1280         }

1281 

1282         // If no match was found, return this

1283         if (index == 0)

1284             return new String[] {input.toString()};

1285 

1286         // Add remaining segment

1287         if (!matchLimited || matchList.size() < limit)

1288             matchList.add(input.subSequence(index, input.length()).toString());

1289 

1290         // Construct result

1291         int resultSize = matchList.size();

1292         if (limit == 0)

1293             while (resultSize > 0 && matchList.get(resultSize-1).equals(""))

1294                 resultSize--;

1295         String[] result = new String[resultSize];

1296         return matchList.subList(0, resultSize).toArray(result);

1297     }

1298 

1299     /**

1300      * Splits the given input sequence around matches of this pattern.

1301      *

1302      * <p> This method works as if by invoking the two-argument {@link

1303      * #split(java.lang.CharSequence, int) split} method with the given input

1304      * sequence and a limit argument of zero.  Trailing empty strings are

1305      * therefore not included in the resulting array. </p>

1306      *

1307      * <p> The input {@code "boo:and:foo"}, for example, yields the following

1308      * results with these expressions:

1309      *

1310      * <table class="plain" style="margin-left:2em">

1311      * <caption style="display:none">Split examples showing regex and result</caption>

1312      * <thead>

1313      * <tr>

1314      *  <th scope="col">Regex</th>

1315      *  <th scope="col">Result</th>

1316      * </tr>

1317      * </thead>

1318      * <tbody>

1319      * <tr><th scope="row" style="font-weight:normal">:</th>

1320      *     <td>{@code { "boo", "and", "foo" }}</td></tr>

1321      * <tr><th scope="row" style="font-weight:normal">o</th>

1322      *     <td>{@code { "b", "", ":and:f" }}</td></tr>

1323      * </tbody>

1324      * </table>

1325      *

1326      *

1327      * @param  input

1328      *         The character sequence to be split

1329      *

1330      * @return  The array of strings computed by splitting the input

1331      *          around matches of this pattern

1332      */

1333     public String[] split(CharSequence input) {

1334         return split(input, 0);

1335     }

1336 

1337     /**

1338      * Returns a literal pattern {@code String} for the specified

1339      * {@code String}.

1340      *

1341      * <p>This method produces a {@code String} that can be used to

1342      * create a {@code Pattern} that would match the string

1343      * {@code s} as if it were a literal pattern.</p> Metacharacters

1344      * or escape sequences in the input sequence will be given no special

1345      * meaning.

1346      *

1347      * @param  s The string to be literalized

1348      * @return  A literal string replacement

1349      * @since 1.5

1350      */

1351     public static String quote(String s) {

1352         int slashEIndex = s.indexOf("\\E");

1353         if (slashEIndex == -1)

1354             return "\\Q" + s + "\\E";

1355 

1356         int lenHint = s.length();

1357         lenHint = (lenHint < Integer.MAX_VALUE - 8 - lenHint) ?

1358                 (lenHint << 1) : (Integer.MAX_VALUE - 8);

1359 

1360         StringBuilder sb = new StringBuilder(lenHint);

1361         sb.append("\\Q");

1362         int current = 0;

1363         do {

1364             sb.append(s, current, slashEIndex)

1365                     .append("\\E\\\\E\\Q");

1366             current = slashEIndex + 2;

1367         } while ((slashEIndex = s.indexOf("\\E", current)) != -1);

1368 

1369         return sb.append(s, current, s.length())

1370                 .append("\\E")

1371                 .toString();

1372     }

1373 

1374     /**

1375      * Recompile the Pattern instance from a stream.  The original pattern

1376      * string is read in and the object tree is recompiled from it.

1377      */

1378     private void readObject(java.io.ObjectInputStream s)

1379         throws java.io.IOException, ClassNotFoundException {

1380 

1381         // Read in all fields

1382         s.defaultReadObject();

1383 

1384         // reset the flags

1385         flags0 = flags;

1386 

1387         // Initialize counts

1388         capturingGroupCount = 1;

1389         localCount = 0;

1390         localTCNCount = 0;

1391 

1392         // if length > 0, the Pattern is lazily compiled

1393         if (pattern.isEmpty()) {

1394             root = new Start(lastAccept);

1395             matchRoot = lastAccept;

1396             compiled = true;

1397         }

1398     }

1399 

1400     /**

1401      * This private constructor is used to create all Patterns. The pattern

1402      * string and match flags are all that is needed to completely describe

1403      * a Pattern. An empty pattern string results in an object tree with

1404      * only a Start node and a LastNode node.

1405      */

1406     private Pattern(String p, int f) {

1407         if ((f & ~ALL_FLAGS) != 0) {

1408             throw new IllegalArgumentException("Unknown flag 0x"

1409                                                + Integer.toHexString(f));

1410         }

1411         pattern = p;

1412         flags = f;

1413 

1414         // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present

1415         if ((flags & UNICODE_CHARACTER_CLASS) != 0)

1416             flags |= UNICODE_CASE;

1417 

1418         // 'flags' for compiling

1419         flags0 = flags;

1420 

1421         // Reset group index count

1422         capturingGroupCount = 1;

1423         localCount = 0;

1424         localTCNCount = 0;

1425 

1426         if (!pattern.isEmpty()) {

1427             try {

1428                 compile();

1429             } catch (StackOverflowError soe) {

1430                 throw error("Stack overflow during pattern compilation");

1431             }

1432         } else {

1433             root = new Start(lastAccept);

1434             matchRoot = lastAccept;

1435         }

1436     }

1437 

1438     /**

1439      * The pattern is converted to normalized form ({@link

1440      * java.text.Normalizer.Form.NFC NFC}, canonical decomposition,

1441      * followed by canonical composition for the character class

1442      * part, and {@link java.text.Normalizer.Form.NFD NFD},

1443      * canonical decomposition) for the rest), and then a pure

1444      * group is constructed to match canonical equivalences of the

1445      * characters.

1446      */

1447     private static String normalize(String pattern) {

1448         int plen = pattern.length();

1449         StringBuilder pbuf = new StringBuilder(plen);

1450         char last = 0;

1451         int lastStart = 0;

1452         char cc = 0;

1453         for (int i = 0; i < plen;) {

1454             char c = pattern.charAt(i);

1455             if (cc == 0 &&    // top level

1456                 c == '\\' && i + 1 < plen && pattern.charAt(i + 1) == '\\') {

1457                 i += 2; last = 0;

1458                 continue;

1459             }

1460             if (c == '[' && last != '\\') {

1461                 if (cc == 0) {

1462                     if (lastStart < i)

1463                         normalizeSlice(pattern, lastStart, i, pbuf);

1464                     lastStart = i;

1465                 }

1466                 cc++;

1467             } else if (c == ']' && last != '\\') {

1468                 cc--;

1469                 if (cc == 0) {

1470                     normalizeClazz(pattern, lastStart, i + 1, pbuf);

1471                     lastStart = i + 1;

1472                 }

1473             }

1474             last = c;

1475             i++;

1476         }

1477         assert (cc == 0);

1478         if (lastStart < plen)

1479             normalizeSlice(pattern, lastStart, plen, pbuf);

1480         return pbuf.toString();

1481     }

1482 

1483     private static void normalizeSlice(String src, int off, int limit,

1484                                        StringBuilder dst)

1485     {

1486         int len = src.length();

1487         int off0 = off;

1488         while (off < limit && ASCII.isAscii(src.charAt(off))) {

1489             off++;

1490         }

1491         if (off == limit) {

1492             dst.append(src, off0, limit);

1493             return;

1494         }

1495         off--;

1496         if (off < off0)

1497             off = off0;

1498         else

1499             dst.append(src, off0, off);

1500         while (off < limit) {

1501             int ch0 = src.codePointAt(off);

1502             if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) {

1503                 dst.append((char)ch0);

1504                 off++;

1505                 continue;

1506             }

1507             int j = off + Character.charCount(ch0);

1508             int ch1;

1509             while (j < limit) {

1510                 ch1 = src.codePointAt(j);

1511                 if (Grapheme.isBoundary(ch0, ch1))

1512                     break;

1513                 ch0 = ch1;

1514                 j += Character.charCount(ch1);

1515             }

1516             String seq = src.substring(off, j);

1517             String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD);

1518             off = j;

1519             if (nfd.length() > 1) {

1520                 ch0 = nfd.codePointAt(0);

1521                 ch1 = nfd.codePointAt(Character.charCount(ch0));

1522                 if (Character.getType(ch1) == Character.NON_SPACING_MARK) {

1523                     Set<String> altns = new LinkedHashSet<>();

1524                     altns.add(seq);

1525                     produceEquivalentAlternation(nfd, altns);

1526                     dst.append("(?:");

1527                     altns.forEach( s -> dst.append(s).append('|'));

1528                     dst.delete(dst.length() - 1, dst.length());

1529                     dst.append(")");

1530                     continue;

1531                 }

1532             }

1533             String nfc = Normalizer.normalize(seq, Normalizer.Form.NFC);

1534             if (!seq.equals(nfc) && !nfd.equals(nfc))

1535                 dst.append("(?:" + seq + "|" + nfd  + "|" + nfc + ")");

1536             else if (!seq.equals(nfd))

1537                 dst.append("(?:" + seq + "|" + nfd + ")");

1538             else

1539                 dst.append(seq);

1540         }

1541     }

1542 

1543     private static void normalizeClazz(String src, int off, int limit,

1544                                        StringBuilder dst)

1545     {

1546         dst.append(Normalizer.normalize(src.substring(off, limit), Form.NFC));

1547     }

1548 

1549     /**

1550      * Given a specific sequence composed of a regular character and

1551      * combining marks that follow it, produce the alternation that will

1552      * match all canonical equivalences of that sequence.

1553      */

1554     private static void produceEquivalentAlternation(String src,

1555                                                      Set<String> dst)

1556     {

1557         int len = countChars(src, 0, 1);

1558         if (src.length() == len) {

1559             dst.add(src);  // source has one character.

1560             return;

1561         }

1562         String base = src.substring(0,len);

1563         String combiningMarks = src.substring(len);

1564         String[] perms = producePermutations(combiningMarks);

1565         // Add combined permutations

1566         for(int x = 0; x < perms.length; x++) {

1567             String next = base + perms[x];

1568             dst.add(next);

1569             next = composeOneStep(next);

1570             if (next != null) {

1571                 produceEquivalentAlternation(next, dst);

1572             }

1573         }

1574     }

1575 

/**
 * Returns an array of strings that have all the possible
 * permutations of the characters in the input string.
 * This is used to get a list of all possible orderings
 * of a set of combining marks. Note that some of the permutations
 * are invalid because of combining class collisions, and these
 * possibilities must be removed because they are not canonically
 * equivalent.
 *
 * <p> Inputs of one or two code points are handled directly; longer
 * inputs are handled by removing each code point in turn and
 * recursing on the remainder.
 *
 * @param input the combining marks to permute
 * @return the accepted orderings, with {@code input} itself first
 */
private static String[] producePermutations(String input) {
    // NOTE(review): countChars(s, off, n) presumably returns the number of
    // chars spanned by n code points starting at off -- confirm at its
    // definition.
    // One code point: only one ordering.
    if (input.length() == countChars(input, 0, 1))
        return new String[] {input};

    // Two code points: swap them unless they share a combining class.
    if (input.length() == countChars(input, 0, 2)) {
        int c0 = Character.codePointAt(input, 0);
        int c1 = Character.codePointAt(input, Character.charCount(c0));
        if (getClass(c1) == getClass(c0)) {
            return new String[] {input};
        }
        String[] result = new String[2];
        result[0] = input;
        StringBuilder sb = new StringBuilder(2);
        sb.appendCodePoint(c1);
        sb.appendCodePoint(c0);
        result[1] = sb.toString();
        return result;
    }

    // Upper bound on the number of results: nCodePoints factorial.
    int length = 1;
    int nCodePoints = countCodePoints(input);
    for(int x=1; x<nCodePoints; x++)
        length = length * (x+1);

    String[] temp = new String[length];

    // Combining class of each code point, indexed by code point position.
    int combClass[] = new int[nCodePoints];
    for(int x=0, i=0; x<nCodePoints; x++) {
        int c = Character.codePointAt(input, i);
        combClass[x] = getClass(c);
        i +=  Character.charCount(c);
    }

    // For each char, take it out and add the permutations
    // of the remaining chars
    int index = 0;
    int len;
    // offset maintains the index in code units.
loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
        len = countChars(input, offset, 1);
        // Skip this code point as a leading element when an earlier code
        // point has the same combining class.
        for(int y=x-1; y>=0; y--) {
            if (combClass[y] == combClass[x]) {
                continue loop;
            }
        }
        StringBuilder sb = new StringBuilder(input);
        String otherChars = sb.delete(offset, offset+len).toString();
        String[] subResult = producePermutations(otherChars);

        String prefix = input.substring(offset, offset+len);
        for (String sre : subResult)
            temp[index++] = prefix + sre;
    }
    // Trim the over-allocated array down to the entries actually filled.
    String[] result = new String[index];
    System.arraycopy(temp, 0, result, 0, index);
    return result;
}

1642 

1643     private static int getClass(int c) {

1644         return sun.text.Normalizer.getCombiningClass(c);

1645     }

1646 

1647     /**

1648      * Attempts to compose input by combining the first character

1649      * with the first combining mark following it. Returns a String

1650      * that is the composition of the leading character with its first

1651      * combining mark followed by the remaining combining marks. Returns

1652      * null if the first two characters cannot be further composed.

1653      */

1654     private static String composeOneStep(String input) {

1655         int len = countChars(input, 0, 2);

1656         String firstTwoCharacters = input.substring(0, len);

1657         String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);

1658         if (result.equals(firstTwoCharacters))

1659             return null;

1660         else {

1661             String remainder = input.substring(len);

1662             return result + remainder;

1663         }

1664     }

1665 

/**
 * Preprocess any \Q...\E sequences in `temp', meta-quoting them.
 * See the description of `quotemeta' in perlfunc(1).
 *
 * <p> Everything before the first \Q is kept as is. From there on the
 * pattern is copied into a new array: inside a quote, ASCII characters
 * that are neither letters nor digits get a backslash in front of them,
 * and the \Q and \E markers themselves are dropped. On exit
 * {@code temp} and {@code patternLength} describe the rewritten pattern.
 */
private void RemoveQEQuoting() {
    final int pLen = patternLength;
    int i = 0;
    // Find the first \Q, stepping over any other backslash escape as a pair.
    while (i < pLen-1) {
        if (temp[i] != '\\')
            i += 1;
        else if (temp[i + 1] != 'Q')
            i += 2;
        else
            break;
    }
    if (i >= pLen - 1)    // No \Q sequence found
        return;
    // j: write position in newtemp; i: read position just past the \Q.
    int j = i;
    i += 2;
    int[] newtemp = new int[j + 3*(pLen-i) + 2];
    System.arraycopy(temp, 0, newtemp, 0, j);

    boolean inQuote = true;
    // True only while looking at the first character after a \Q.
    boolean beginQuote = true;
    while (i < pLen) {
        int c = temp[i++];
        if (!ASCII.isAscii(c) || ASCII.isAlpha(c)) {
            // Non-ASCII characters and ASCII letters are copied unescaped.
            newtemp[j++] = c;
        } else if (ASCII.isDigit(c)) {
            if (beginQuote) {
                /*
                 * A unicode escape \[0xu] could be before this quote,
                 * and we don't want this numeric char to processed as
                 * part of the escape.
                 */
                // "\x3" followed by the digit itself spells that digit as
                // a hex escape (the digits are 0x30..0x39).
                newtemp[j++] = '\\';
                newtemp[j++] = 'x';
                newtemp[j++] = '3';
            }
            newtemp[j++] = c;
        } else if (c != '\\') {
            // Any other ASCII character: backslash-escape it while quoting.
            if (inQuote) newtemp[j++] = '\\';
            newtemp[j++] = c;
        } else if (inQuote) {
            // Backslash inside a quote: \E ends the quote, anything else
            // makes the backslash itself a literal.
            if (temp[i] == 'E') {
                i++;
                inQuote = false;
            } else {
                newtemp[j++] = '\\';
                newtemp[j++] = '\\';
            }
        } else {
            // Backslash outside a quote: \Q opens a new quote, any other
            // escape is copied through together with its following char.
            if (temp[i] == 'Q') {
                i++;
                inQuote = true;
                beginQuote = true;
                // Skip the reset below so beginQuote stays set for the
                // first quoted character.
                continue;
            } else {
                newtemp[j++] = c;
                if (i != pLen)
                    newtemp[j++] = temp[i++];
            }
        }

        beginQuote = false;
    }

    patternLength = j;
    temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
}

1736 

1737     /**

1738      * Copies regular expression to an int array and invokes the parsing

1739      * of the expression which will create the object tree.

1740      */

1741     private void compile() {

1742         // Handle canonical equivalences

1743         if (has(CANON_EQ) && !has(LITERAL)) {

1744             normalizedPattern = normalize(pattern);

1745         } else {

1746             normalizedPattern = pattern;

1747         }

1748         patternLength = normalizedPattern.length();

1749 

1750         // Copy pattern to int array for convenience

1751         // Use double zero to terminate pattern

1752         temp = new int[patternLength + 2];

1753 

1754         hasSupplementary = false;

1755         int c, count = 0;

1756         // Convert all chars into code points

1757         for (int x = 0; x < patternLength; x += Character.charCount(c)) {

1758             c = normalizedPattern.codePointAt(x);

1759             if (isSupplementary(c)) {

1760                 hasSupplementary = true;

1761             }

1762             temp[count++] = c;

1763         }

1764 

1765         patternLength = count;   // patternLength now in code points

1766 

1767         if (! has(LITERAL))

1768             RemoveQEQuoting();

1769 

1770         // Allocate all temporary objects here.

1771         buffer = new int[32];

1772         groupNodes = new GroupHead[10];

1773         namedGroups = null;

1774         topClosureNodes = new ArrayList<>(10);

1775 

1776         if (has(LITERAL)) {

1777             // Literal pattern handling

1778             matchRoot = newSlice(temp, patternLength, hasSupplementary);

1779             matchRoot.next = lastAccept;

1780         } else {

1781             // Start recursive descent parsing

1782             matchRoot = expr(lastAccept);

1783             // Check extra pattern characters

1784             if (patternLength != cursor) {

1785                 if (peek() == ')') {

1786                     throw error("Unmatched closing ')'");

1787                 } else {

1788                     throw error("Unexpected internal error");

1789                 }

1790             }

1791         }

1792 

1793         // Peephole optimization

1794         if (matchRoot instanceof Slice) {

1795             root = BnM.optimize(matchRoot);

1796             if (root == matchRoot) {

1797                 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);

1798             }

1799         } else if (matchRoot instanceof Begin || matchRoot instanceof First) {

1800             root = matchRoot;

1801         } else {

1802             root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);

1803         }

1804 

1805         // Optimize the greedy Loop to prevent exponential backtracking, IF there

1806         // is no group ref in this pattern. With a non-negative localTCNCount value,

1807         // the greedy type Loop, Curly will skip the backtracking for any starting

1808         // position "i" that failed in the past.

1809         if (!hasGroupRef) {

1810             for (Node node : topClosureNodes) {

1811                 if (node instanceof Loop) {

1812                     // non-deterministic-greedy-group

1813                     ((Loop)node).posIndex = localTCNCount++;

1814                 }

1815             }

1816         }

1817 

1818         // Release temporary storage

1819         temp = null;

1820         buffer = null;

1821         groupNodes = null;

1822         patternLength = 0;

1823         compiled = true;

1824         topClosureNodes = null;

1825     }

1826 

1827     Map<String, Integer> namedGroups() {

1828         Map<String, Integer> groups = namedGroups;

1829         if (groups == null) {

1830             namedGroups = groups = new HashMap<>(2);

1831         }

1832         return groups;

1833     }

1834 

1835     /**

1836      * Used to accumulate information about a subtree of the object graph

1837      * so that optimizations can be applied to the subtree.

1838      */

1839     static final class TreeInfo {

1840         int minLength;

1841         int maxLength;

1842         boolean maxValid;

1843         boolean deterministic;

1844 

1845         TreeInfo() {

1846             reset();

1847         }

1848         void reset() {

1849             minLength = 0;

1850             maxLength = 0;

1851             maxValid = true;

1852             deterministic = true;

1853         }

1854     }

1855 

1856     /*

1857      * The following private methods are mainly used to improve the

1858      * readability of the code. In order to let the Java compiler easily

1859      * inline them, we should not put many assertions or error checks in them.

1860      */

1861 

1862     /**

1863      * Indicates whether a particular flag is set or not.

1864      */

1865     private boolean has(int f) {

1866         return (flags0 & f) != 0;

1867     }

1868 

1869     /**

1870      * Match next character, signal error if failed.

1871      */

1872     private void accept(int ch, String s) {

1873         int testChar = temp[cursor++];

1874         if (has(COMMENTS))

1875             testChar = parsePastWhitespace(testChar);

1876         if (ch != testChar) {

1877             throw error(s);

1878         }

1879     }

1880 

1881     /**

1882      * Mark the end of pattern with a specific character.

1883      */

1884     private void mark(int c) {

1885         temp[patternLength] = c;

1886     }

1887 

1888     /**

1889      * Peek the next character, and do not advance the cursor.

1890      */

1891     private int peek() {

1892         int ch = temp[cursor];

1893         if (has(COMMENTS))

1894             ch = peekPastWhitespace(ch);

1895         return ch;

1896     }

1897 

1898     /**

1899      * Read the next character, and advance the cursor by one.

1900      */

1901     private int read() {

1902         int ch = temp[cursor++];

1903         if (has(COMMENTS))

1904             ch = parsePastWhitespace(ch);

1905         return ch;

1906     }

1907 

1908     /**

1909      * Read the next character, and advance the cursor by one,

1910      * ignoring the COMMENTS setting

1911      */

1912     private int readEscaped() {

1913         int ch = temp[cursor++];

1914         return ch;

1915     }

1916 

1917     /**

1918      * Advance the cursor by one, and peek the next character.

1919      */

1920     private int next() {

1921         int ch = temp[++cursor];

1922         if (has(COMMENTS))

1923             ch = peekPastWhitespace(ch);

1924         return ch;

1925     }

1926 

1927     /**

1928      * Advance the cursor by one, and peek the next character,

1929      * ignoring the COMMENTS setting

1930      */

1931     private int nextEscaped() {

1932         int ch = temp[++cursor];

1933         return ch;

1934     }

1935 

1936     /**

1937      * If in xmode peek past whitespace and comments.

1938      */

1939     private int peekPastWhitespace(int ch) {

1940         while (ASCII.isSpace(ch) || ch == '#') {

1941             while (ASCII.isSpace(ch))

1942                 ch = temp[++cursor];

1943             if (ch == '#') {

1944                 ch = peekPastLine();

1945             }

1946         }

1947         return ch;

1948     }

1949 

1950     /**

1951      * If in xmode parse past whitespace and comments.

1952      */

1953     private int parsePastWhitespace(int ch) {

1954         while (ASCII.isSpace(ch) || ch == '#') {

1955             while (ASCII.isSpace(ch))

1956                 ch = temp[cursor++];

1957             if (ch == '#')

1958                 ch = parsePastLine();

1959         }

1960         return ch;

1961     }

1962 

1963     /**

1964      * xmode parse past comment to end of line.

1965      */

1966     private int parsePastLine() {

1967         int ch = temp[cursor++];

1968         while (ch != 0 && !isLineSeparator(ch))

1969             ch = temp[cursor++];

1970         if (ch == 0 && cursor > patternLength) {

1971             cursor = patternLength;

1972             ch = temp[cursor++];

1973         }

1974         return ch;

1975     }

1976 

1977     /**

1978      * xmode peek past comment to end of line.

1979      */

1980     private int peekPastLine() {

1981         int ch = temp[++cursor];

1982         while (ch != 0 && !isLineSeparator(ch))

1983             ch = temp[++cursor];

1984         if (ch == 0 && cursor > patternLength) {

1985             cursor = patternLength;

1986             ch = temp[cursor];

1987         }

1988         return ch;

1989     }

1990 

1991     /**

1992      * Determines if character is a line separator in the current mode

1993      */

1994     private boolean isLineSeparator(int ch) {

1995         if (has(UNIX_LINES)) {

1996             return ch == '\n';

1997         } else {

1998             return (ch == '\n' ||

1999                     ch == '\r' ||

2000                     (ch|1) == '\u2029' ||

2001                     ch == '\u0085');

2002         }

2003     }

2004 

2005     /**

2006      * Read the character after the next one, and advance the cursor by two.

2007      */

2008     private int skip() {

2009         int i = cursor;

2010         int ch = temp[i+1];

2011         cursor = i + 2;

2012         return ch;

2013     }

2014 

2015     /**

2016      * Unread one next character, and retreat cursor by one.

2017      */

2018     private void unread() {

2019         cursor--;

2020     }

2021 

2022     /**

2023      * Internal method used for handling all syntax errors. The pattern is

2024      * displayed with a pointer to aid in locating the syntax error.

2025      */

2026     private PatternSyntaxException error(String s) {

2027         return new PatternSyntaxException(s, normalizedPattern,  cursor - 1);

2028     }

2029 

2030     /**

2031      * Determines if there is any supplementary character or unpaired

2032      * surrogate in the specified range.

2033      */

2034     private boolean findSupplementary(int start, int end) {

2035         for (int i = start; i < end; i++) {

2036             if (isSupplementary(temp[i]))

2037                 return true;

2038         }

2039         return false;

2040     }

2041 

2042     /**

2043      * Determines if the specified code point is a supplementary

2044      * character or unpaired surrogate.

2045      */

2046     private static final boolean isSupplementary(int ch) {

2047         return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||

2048                Character.isSurrogate((char)ch);

2049     }

2050 

2051     /**

2052      *  The following methods handle the main parsing. They are sorted

2053      *  according to their precedence order, the lowest one first.

2054      */

2055 

2056     /**

2057      * The expression is parsed with branch nodes added for alternations.

2058      * This may be called recursively to parse sub expressions that may

2059      * contain alternations.

2060      */

2061     private Node expr(Node end) {

2062         Node prev = null;

2063         Node firstTail = null;

2064         Branch branch = null;

2065         Node branchConn = null;

2066 

2067         for (;;) {

2068             Node node = sequence(end);

2069             Node nodeTail = root;      //double return

2070             if (prev == null) {

2071                 prev = node;

2072                 firstTail = nodeTail;

2073             } else {

2074                 // Branch

2075                 if (branchConn == null) {

2076                     branchConn = new BranchConn();

2077                     branchConn.next = end;

2078                 }

2079                 if (node == end) {

2080                     // if the node returned from sequence() is "end"

2081                     // we have an empty expr, set a null atom into

2082                     // the branch to indicate to go "next" directly.

2083                     node = null;

2084                 } else {

2085                     // the "tail.next" of each atom goes to branchConn

2086                     nodeTail.next = branchConn;

2087                 }

2088                 if (prev == branch) {

2089                     branch.add(node);

2090                 } else {

2091                     if (prev == end) {

2092                         prev = null;

2093                     } else {

2094                         // replace the "end" with "branchConn" at its tail.next

2095                         // when put the "prev" into the branch as the first atom.

2096                         firstTail.next = branchConn;

2097                     }

2098                     prev = branch = new Branch(prev, node, branchConn);

2099                 }

2100             }

2101             if (peek() != '|') {

2102                 return prev;

2103             }

2104             next();

2105         }

2106     }

2107 

2108     @SuppressWarnings("fallthrough")

2109     /**

2110      * Parsing of sequences between alternations.

2111      */

2112     private Node sequence(Node end) {

2113         Node head = null;

2114         Node tail = null;

2115         Node node = null;

2116     LOOP:

2117         for (;;) {

2118             int ch = peek();

2119             switch (ch) {

2120             case '(':

2121                 // Because group handles its own closure,

2122                 // we need to treat it differently

2123                 node = group0();

2124                 // Check for comment or flag group

2125                 if (node == null)

2126                     continue;

2127                 if (head == null)

2128                     head = node;

2129                 else

2130                     tail.next = node;

2131                 // Double return: Tail was returned in root

2132                 tail = root;

2133                 continue;

2134             case '[':

2135                 if (has(CANON_EQ) && !has(LITERAL))

2136                     node = new NFCCharProperty(clazz(true));

2137                 else

2138                     node = newCharProperty(clazz(true));

2139                 break;

2140             case '\\':

2141                 ch = nextEscaped();

2142                 if (ch == 'p' || ch == 'P') {

2143                     boolean oneLetter = true;

2144                     boolean comp = (ch == 'P');

2145                     ch = next(); // Consume { if present

2146                     if (ch != '{') {

2147                         unread();

2148                     } else {

2149                         oneLetter = false;

2150                     }

2151                     // node = newCharProperty(family(oneLetter, comp));

2152                     if (has(CANON_EQ) && !has(LITERAL))

2153                         node = new NFCCharProperty(family(oneLetter, comp));

2154                     else

2155                         node = newCharProperty(family(oneLetter, comp));

2156                 } else {

2157                     unread();

2158                     node = atom();

2159                 }

2160                 break;

2161             case '^':

2162                 next();

2163                 if (has(MULTILINE)) {

2164                     if (has(UNIX_LINES))

2165                         node = new UnixCaret();

2166                     else

2167                         node = new Caret();

2168                 } else {

2169                     node = new Begin();

2170                 }

2171                 break;

2172             case '$':

2173                 next();

2174                 if (has(UNIX_LINES))

2175                     node = new UnixDollar(has(MULTILINE));

2176                 else

2177                     node = new Dollar(has(MULTILINE));

2178                 break;

2179             case '.':

2180                 next();

2181                 if (has(DOTALL)) {

2182                     node = new CharProperty(ALL());

2183                 } else {

2184                     if (has(UNIX_LINES)) {

2185                         node = new CharProperty(UNIXDOT());

2186                     } else {

2187                         node = new CharProperty(DOT());

2188                     }

2189                 }

2190                 break;

2191             case '|':

2192             case ')':

2193                 break LOOP;

2194             case ']': // Now interpreting dangling ] and } as literals

2195             case '}':

2196                 node = atom();

2197                 break;

2198             case '?':

2199             case '*':

2200             case '+':

2201                 next();

2202                 throw error("Dangling meta character '" + ((char)ch) + "'");

2203             case 0:

2204                 if (cursor >= patternLength) {

2205                     break LOOP;

2206                 }

2207                 // Fall through

2208             default:

2209                 node = atom();

2210                 break;

2211             }

2212 

2213             node = closure(node);

2214             /* save the top dot-greedy nodes (.*, .+) as well

2215             if (node instanceof GreedyCharProperty &&

2216                 ((GreedyCharProperty)node).cp instanceof Dot) {

2217                 topClosureNodes.add(node);

2218             }

2219             */

2220             if (head == null) {

2221                 head = tail = node;

2222             } else {

2223                 tail.next = node;

2224                 tail = node;

2225             }

2226         }

2227         if (head == null) {

2228             return end;

2229         }

2230         tail.next = end;

2231         root = tail;      //double return

2232         return head;

2233     }

2234 

2235     @SuppressWarnings("fallthrough")

2236     /**

2237      * Parse and add a new Single or Slice.

2238      */

2239     private Node atom() {

2240         int first = 0;

2241         int prev = -1;

2242         boolean hasSupplementary = false;

2243         int ch = peek();

2244         for (;;) {

2245             switch (ch) {

2246             case '*':

2247             case '+':

2248             case '?':

2249             case '{':

2250                 if (first > 1) {

2251                     cursor = prev;    // Unwind one character

2252                     first--;

2253                 }

2254                 break;

2255             case '$':

2256             case '.':

2257             case '^':

2258             case '(':

2259             case '[':

2260             case '|':

2261             case ')':

2262                 break;

2263             case '\\':

2264                 ch = nextEscaped();

2265                 if (ch == 'p' || ch == 'P') { // Property

2266                     if (first > 0) { // Slice is waiting; handle it first

2267                         unread();

2268                         break;

2269                     } else { // No slice; just return the family node

2270                         boolean comp = (ch == 'P');

2271                         boolean oneLetter = true;

2272                         ch = next(); // Consume { if present

2273                         if (ch != '{')

2274                             unread();

2275                         else

2276                             oneLetter = false;

2277                         if (has(CANON_EQ) && !has(LITERAL))

2278                             return new NFCCharProperty(family(oneLetter, comp));

2279                         else

2280                             return newCharProperty(family(oneLetter, comp));

2281                     }

2282                 }

2283                 unread();

2284                 prev = cursor;

2285                 ch = escape(false, first == 0, false);

2286                 if (ch >= 0) {

2287                     append(ch, first);

2288                     first++;

2289                     if (isSupplementary(ch)) {

2290                         hasSupplementary = true;

2291                     }

2292                     ch = peek();

2293                     continue;

2294                 } else if (first == 0) {

2295                     return root;

2296                 }

2297                 // Unwind meta escape sequence

2298                 cursor = prev;

2299                 break;

2300             case 0:

2301                 if (cursor >= patternLength) {

2302                     break;

2303                 }

2304                 // Fall through

2305             default:

2306                 prev = cursor;

2307                 append(ch, first);

2308                 first++;

2309                 if (isSupplementary(ch)) {

2310                     hasSupplementary = true;

2311                 }

2312                 ch = next();

2313                 continue;

2314             }

2315             break;

2316         }

2317         if (first == 1) {

2318             return newCharProperty(single(buffer[0]));

2319         } else {

2320             return newSlice(buffer, first, hasSupplementary);

2321         }

2322     }

2323 

2324     private void append(int ch, int len) {

2325         if (len >= buffer.length) {

2326             int[] tmp = new int[len+len];

2327             System.arraycopy(buffer, 0, tmp, 0, len);

2328             buffer = tmp;

2329         }

2330         buffer[len] = ch;

2331     }

2332 

2333     /**

2334      * Parses a backref greedily, taking as many numbers as it

2335      * can. The first digit is always treated as a backref, but

2336      * multi digit numbers are only treated as a backref if at

2337      * least that many backrefs exist at this point in the regex.

2338      */

2339     private Node ref(int refNum) {

2340         boolean done = false;

2341         while(!done) {

2342             int ch = peek();

2343             switch(ch) {

2344             case '0':

2345             case '1':

2346             case '2':

2347             case '3':

2348             case '4':

2349             case '5':

2350             case '6':

2351             case '7':

2352             case '8':

2353             case '9':

2354                 int newRefNum = (refNum * 10) + (ch - '0');

2355                 // Add another number if it doesn't make a group

2356                 // that doesn't exist

2357                 if (capturingGroupCount - 1 < newRefNum) {

2358                     done = true;

2359                     break;

2360                 }

2361                 refNum = newRefNum;

2362                 read();

2363                 break;

2364             default:

2365                 done = true;

2366                 break;

2367             }

2368         }

2369         hasGroupRef = true;

2370         if (has(CASE_INSENSITIVE))

2371             return new CIBackRef(refNum, has(UNICODE_CASE));

2372         else

2373             return new BackRef(refNum);

2374     }

2375 

2376     /**

2377      * Parses an escape sequence to determine the actual value that needs

2378      * to be matched.

2379      * If -1 is returned and create was true a new object was added to the tree

2380      * to handle the escape sequence.

2381      * If the returned value is greater than zero, it is the value that

2382      * matches the escape sequence.

2383      */

2384     private int escape(boolean inclass, boolean create, boolean isrange) {

2385         int ch = skip();

2386         switch (ch) {

2387         case '0':

2388             return o();

2389         case '1':

2390         case '2':

2391         case '3':

2392         case '4':

2393         case '5':

2394         case '6':

2395         case '7':

2396         case '8':

2397         case '9':

2398             if (inclass) break;

2399             if (create) {

2400                 root = ref((ch - '0'));

2401             }

2402             return -1;

2403         case 'A':

2404             if (inclass) break;

2405             if (create) root = new Begin();

2406             return -1;

2407         case 'B':

2408             if (inclass) break;

2409             if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));

2410             return -1;

2411         case 'C':

2412             break;

2413         case 'D':

2414             if (create) {

2415                 predicate = has(UNICODE_CHARACTER_CLASS) ?

2416                             CharPredicates.DIGIT() : CharPredicates.ASCII_DIGIT();

2417                 predicate = predicate.negate();

2418                 if (!inclass)

2419                     root = newCharProperty(predicate);

2420             }

2421             return -1;

2422         case 'E':

2423         case 'F':

2424             break;

2425         case 'G':

2426             if (inclass) break;

2427             if (create) root = new LastMatch();

2428             return -1;

2429         case 'H':

2430             if (create) {

2431                 predicate = HorizWS().negate();

2432                 if (!inclass)

2433                     root = newCharProperty(predicate);

2434             }

2435             return -1;

2436         case 'I':

2437         case 'J':

2438         case 'K':

2439         case 'L':

2440         case 'M':

2441             break;

2442         case 'N':

2443             return N();

2444         case 'O':

2445         case 'P':

2446         case 'Q':

2447             break;

2448         case 'R':

2449             if (inclass) break;

2450             if (create) root = new LineEnding();

2451             return -1;

2452         case 'S':

2453             if (create) {

2454                 predicate = has(UNICODE_CHARACTER_CLASS) ?

2455                             CharPredicates.WHITE_SPACE() : CharPredicates.ASCII_SPACE();

2456                 predicate = predicate.negate();

2457                 if (!inclass)

2458                     root = newCharProperty(predicate);

2459             }

2460             return -1;

2461         case 'T':

2462         case 'U':

2463             break;

2464         case 'V':

2465             if (create) {

2466                 predicate = VertWS().negate();

2467                 if (!inclass)

2468                     root = newCharProperty(predicate);

2469             }

2470             return -1;

2471         case 'W':

2472             if (create) {

2473                 predicate = has(UNICODE_CHARACTER_CLASS) ?

2474                             CharPredicates.WORD() : CharPredicates.ASCII_WORD();

2475                 predicate = predicate.negate();

2476                 if (!inclass)

2477                     root = newCharProperty(predicate);

2478             }

2479             return -1;

2480         case 'X':

2481             if (inclass) break;

2482             if (create) {

2483                 root = new XGrapheme();

2484             }

2485             return -1;

2486         case 'Y':

2487             break;

2488         case 'Z':

2489             if (inclass) break;

2490             if (create) {

2491                 if (has(UNIX_LINES))

2492                     root = new UnixDollar(false);

2493                 else

2494                     root = new Dollar(false);

2495             }

2496             return -1;

2497         case 'a':

2498             return '\007';

2499         case 'b':

2500             if (inclass) break;

2501             if (create) {

2502                 if (peek() == '{') {

2503                     if (skip() == 'g') {

2504                         if (read() == '}') {

2505                             root = new GraphemeBound();

2506                             return -1;

2507                         }

2508                         break;  // error missing trailing }

2509                     }

2510                     unread(); unread();

2511                 }

2512                 root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));

2513             }

2514             return -1;

2515         case 'c':

2516             return c();

2517         case 'd':

2518             if (create) {

2519                 predicate = has(UNICODE_CHARACTER_CLASS) ?

2520                             CharPredicates.DIGIT() : CharPredicates.ASCII_DIGIT();

2521                 if (!inclass)

2522                     root = newCharProperty(predicate);

2523             }

2524             return -1;

2525         case 'e':

2526             return '\033';

2527         case 'f':

2528             return '\f';

2529         case 'g':

2530             break;

2531         case 'h':

2532             if (create) {

2533                 predicate = HorizWS();

2534                 if (!inclass)

2535                     root = newCharProperty(predicate);

2536             }

2537             return -1;

2538         case 'i':

2539         case 'j':

2540             break;

2541         case 'k':

2542             if (inclass)

2543                 break;

2544             if (read() != '<')

2545                 throw error("\\k is not followed by '<' for named capturing group");

2546             String name = groupname(read());

2547             if (!namedGroups().containsKey(name))

2548                 throw error("named capturing group <" + name + "> does not exist");

2549             if (create) {

2550                 hasGroupRef = true;

2551                 if (has(CASE_INSENSITIVE))

2552                     root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));

2553                 else

2554                     root = new BackRef(namedGroups().get(name));

2555             }

2556             return -1;

2557         case 'l':

2558         case 'm':

2559             break;

2560         case 'n':

2561             return '\n';

2562         case 'o':

2563         case 'p':

2564         case 'q':

2565             break;

2566         case 'r':

2567             return '\r';

2568         case 's':

2569             if (create) {

2570                 predicate = has(UNICODE_CHARACTER_CLASS) ?

2571                             CharPredicates.WHITE_SPACE() : CharPredicates.ASCII_SPACE();

2572                 if (!inclass)

2573                     root = newCharProperty(predicate);

2574             }

2575             return -1;

2576         case 't':

2577             return '\t';

2578         case 'u':

2579             return u();

2580         case 'v':

2581             // '\v' was implemented as VT/0x0B in releases < 1.8 (though

2582             // undocumented). In JDK8 '\v' is specified as a predefined

2583             // character class for all vertical whitespace characters.

2584             // So [-1, root=VertWS node] pair is returned (instead of a

2585             // single 0x0B). This breaks the range if '\v' is used as

2586             // the start or end value, such as [\v-...] or [...-\v], in

2587             // which a single definite value (0x0B) is expected. For

2588             // compatibility concern '\013'/0x0B is returned if isrange.

2589             if (isrange)

2590                 return '\013';

2591             if (create) {

2592                 predicate = VertWS();

2593                 if (!inclass)

2594                     root = newCharProperty(predicate);

2595             }

2596             return -1;

2597         case 'w':

2598             if (create) {

2599                 predicate = has(UNICODE_CHARACTER_CLASS) ?

2600                             CharPredicates.WORD() : CharPredicates.ASCII_WORD();

2601                 if (!inclass)

2602                     root = newCharProperty(predicate);

2603             }

2604             return -1;

2605         case 'x':

2606             return x();

2607         case 'y':

2608             break;

2609         case 'z':

2610             if (inclass) break;

2611             if (create) root = new End();

2612             return -1;

2613         default:

2614             return ch;

2615         }

2616         throw error("Illegal/unsupported escape sequence");

2617     }

2618 

2619     /**

2620      * Parse a character class, and return the node that matches it.

2621      *

2622      * Consumes a ] on the way out if consume is true. Usually consume

2623      * is true except for the case of [abc&&def] where def is a separate

2624      * right hand node with "understood" brackets.

2625      */

2626     private CharPredicate clazz(boolean consume) {

2627         CharPredicate prev = null;

2628         CharPredicate curr = null;

2629         BitClass bits = new BitClass();

2630         BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];

2631 

2632         boolean isNeg = false;

2633         boolean hasBits = false;

2634         int ch = next();

2635 

2636         // Negates if first char in a class, otherwise literal

2637         if (ch == '^' && temp[cursor-1] == '[') {

2638             ch = next();

2639             isNeg = true;

2640         }

2641         for (;;) {

2642             switch (ch) {

2643                 case '[':

2644                     curr = clazz(true);

2645                     if (prev == null)

2646                         prev = curr;

2647                     else

2648                         prev = prev.union(curr);

2649                     ch = peek();

2650                     continue;

2651                 case '&':

2652                     ch = next();

2653                     if (ch == '&') {

2654                         ch = next();

2655                         CharPredicate right = null;

2656                         while (ch != ']' && ch != '&') {

2657                             if (ch == '[') {

2658                                 if (right == null)

2659                                     right = clazz(true);

2660                                 else

2661                                     right = right.union(clazz(true));

2662                             } else { // abc&&def

2663                                 unread();

2664                                 right = clazz(false);

2665                             }

2666                             ch = peek();

2667                         }

2668                         if (hasBits) {

2669                             // bits used, union has high precedence

2670                             if (prev == null) {

2671                                 prev = curr = bitsP;

2672                             } else {

2673                                 prev = prev.union(bitsP);

2674                             }

2675                             hasBits = false;

2676                         }

2677                         if (right != null)

2678                             curr = right;

2679                         if (prev == null) {

2680                             if (right == null)

2681                                 throw error("Bad class syntax");

2682                             else

2683                                 prev = right;

2684                         } else {

2685                             prev = prev.and(curr);

2686                         }

2687                     } else {

2688                         // treat as a literal &

2689                         unread();

2690                         break;

2691                     }

2692                     continue;

2693                 case 0:

2694                     if (cursor >= patternLength)

2695                         throw error("Unclosed character class");

2696                     break;

2697                 case ']':

2698                     if (prev != null || hasBits) {

2699                         if (consume)

2700                             next();

2701                         if (prev == null)

2702                             prev = bitsP;

2703                         else if (hasBits)

2704                             prev = prev.union(bitsP);

2705                         if (isNeg)

2706                             return prev.negate();

2707                         return prev;

2708                     }

2709                     break;

2710                 default:

2711                     break;

2712             }

2713             curr = range(bits);

2714             if (curr == null) {    // the bits used

2715                 hasBits = true;

2716             } else {

2717                 if (prev == null)

2718                     prev = curr;

2719                 else if (prev != curr)

2720                     prev = prev.union(curr);

2721             }

2722             ch = peek();

2723         }

2724     }

2725 

2726     private CharPredicate bitsOrSingle(BitClass bits, int ch) {

2727         /* Bits can only handle codepoints in [u+0000-u+00ff] range.

2728            Use "single" node instead of bits when dealing with unicode

2729            case folding for codepoints listed below.

2730            (1)Uppercase out of range: u+00ff, u+00b5

2731               toUpperCase(u+00ff) -> u+0178

2732               toUpperCase(u+00b5) -> u+039c

2733            (2)LatinSmallLetterLongS u+17f

2734               toUpperCase(u+017f) -> u+0053

2735            (3)LatinSmallLetterDotlessI u+131

2736               toUpperCase(u+0131) -> u+0049

2737            (4)LatinCapitalLetterIWithDotAbove u+0130

2738               toLowerCase(u+0130) -> u+0069

2739            (5)KelvinSign u+212a

2740               toLowerCase(u+212a) ==> u+006B

2741            (6)AngstromSign u+212b

2742               toLowerCase(u+212b) ==> u+00e5

2743         */

2744         if (ch < 256 &&

2745             !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&

2746               (ch == 0xff || ch == 0xb5 ||

2747                ch == 0x49 || ch == 0x69 ||    //I and i

2748                ch == 0x53 || ch == 0x73 ||    //S and s

2749                ch == 0x4b || ch == 0x6b ||    //K and k

2750                ch == 0xc5 || ch == 0xe5))) {  //A+ring

2751             bits.add(ch, flags0);

2752             return null;

2753         }

2754         return single(ch);

2755     }

2756 

2757     /**

2758      *  Returns a suitably optimized, single character predicate

2759      */

2760     private CharPredicate single(final int ch) {

2761         if (has(CASE_INSENSITIVE)) {

2762             int lower, upper;

2763             if (has(UNICODE_CASE)) {

2764                 upper = Character.toUpperCase(ch);

2765                 lower = Character.toLowerCase(upper);

2766                 // Unicode case insensitive matches

2767                 if (upper != lower)

2768                     return SingleU(lower);

2769             } else if (ASCII.isAscii(ch)) {

2770                 lower = ASCII.toLower(ch);

2771                 upper = ASCII.toUpper(ch);

2772                 // Case insensitive matches a given BMP character

2773                 if (lower != upper)

2774                     return SingleI(lower, upper);

2775             }

2776         }

2777         if (isSupplementary(ch))

2778             return SingleS(ch);

2779         return Single(ch);  // Match a given BMP character

2780     }

2781 

2782     /**

2783      * Parse a single character or a character range in a character class

2784      * and return its representative node.

2785      */

2786     private CharPredicate range(BitClass bits) {

2787         int ch = peek();

2788         if (ch == '\\') {

2789             ch = nextEscaped();

2790             if (ch == 'p' || ch == 'P') { // A property

2791                 boolean comp = (ch == 'P');

2792                 boolean oneLetter = true;

2793                 // Consume { if present

2794                 ch = next();

2795                 if (ch != '{')

2796                     unread();

2797                 else

2798                     oneLetter = false;

2799                 return family(oneLetter, comp);

2800             } else { // ordinary escape

2801                 boolean isrange = temp[cursor+1] == '-';

2802                 unread();

2803                 ch = escape(true, true, isrange);

2804                 if (ch == -1)

2805                     return predicate;

2806             }

2807         } else {

2808             next();

2809         }

2810         if (ch >= 0) {

2811             if (peek() == '-') {

2812                 int endRange = temp[cursor+1];

2813                 if (endRange == '[') {

2814                     return bitsOrSingle(bits, ch);

2815                 }

2816                 if (endRange != ']') {

2817                     next();

2818                     int m = peek();

2819                     if (m == '\\') {

2820                         m = escape(true, false, true);

2821                     } else {

2822                         next();

2823                     }

2824                     if (m < ch) {

2825                         throw error("Illegal character range");

2826                     }

2827                     if (has(CASE_INSENSITIVE)) {

2828                         if (has(UNICODE_CASE))

2829                             return CIRangeU(ch, m);

2830                         return CIRange(ch, m);

2831                     } else {

2832                         return Range(ch, m);

2833                     }

2834                 }

2835             }

2836             return bitsOrSingle(bits, ch);

2837         }

2838         throw error("Unexpected character '"+((char)ch)+"'");

2839     }

2840 

2841     /**

2842      * Parses a Unicode character family and returns its representative node.

2843      */

2844     private CharPredicate family(boolean singleLetter, boolean isComplement) {

2845         next();

2846         String name;

2847         CharPredicate p = null;

2848 

2849         if (singleLetter) {

2850             int c = temp[cursor];

2851             if (!Character.isSupplementaryCodePoint(c)) {

2852                 name = String.valueOf((char)c);

2853             } else {

2854                 name = new String(temp, cursor, 1);

2855             }

2856             read();

2857         } else {

2858             int i = cursor;

2859             mark('}');

2860             while(read() != '}') {

2861             }

2862             mark('\000');

2863             int j = cursor;

2864             if (j > patternLength)

2865                 throw error("Unclosed character family");

2866             if (i + 1 >= j)

2867                 throw error("Empty character family");

2868             name = new String(temp, i, j-i-1);

2869         }

2870 

2871         int i = name.indexOf('=');

2872         if (i != -1) {

2873             // property construct \p{name=value}

2874             String value = name.substring(i + 1);

2875             name = name.substring(0, i).toLowerCase(Locale.ENGLISH);

2876             switch (name) {

2877                 case "sc":

2878                 case "script":

2879                     p = CharPredicates.forUnicodeScript(value);

2880                     break;

2881                 case "blk":

2882                 case "block":

2883                     p = CharPredicates.forUnicodeBlock(value);

2884                     break;

2885                 case "gc":

2886                 case "general_category":

2887                     p = CharPredicates.forProperty(value);

2888                     break;

2889                 default:

2890                     break;

2891             }

2892             if (p == null)

2893                 throw error("Unknown Unicode property {name=<" + name + ">, "

2894                              + "value=<" + value + ">}");

2895 

2896         } else {

2897             if (name.startsWith("In")) {

2898                 // \p{InBlockName}

2899                 p = CharPredicates.forUnicodeBlock(name.substring(2));

2900             } else if (name.startsWith("Is")) {

2901                 // \p{IsGeneralCategory} and \p{IsScriptName}

2902                 name = name.substring(2);

2903                 p = CharPredicates.forUnicodeProperty(name);

2904                 if (p == null)

2905                     p = CharPredicates.forProperty(name);

2906                 if (p == null)

2907                     p = CharPredicates.forUnicodeScript(name);

2908             } else {

2909                 if (has(UNICODE_CHARACTER_CLASS)) {

2910                     p = CharPredicates.forPOSIXName(name);

2911                 }

2912                 if (p == null)

2913                     p = CharPredicates.forProperty(name);

2914             }

2915             if (p == null)

2916                 throw error("Unknown character property name {In/Is" + name + "}");

2917         }

2918         if (isComplement) {

2919             // it might be too expensive to detect if a complement of

2920             // CharProperty can match "certain" supplementary. So just

2921             // go with StartS.

2922             hasSupplementary = true;

2923             p = p.negate();

2924         }

2925         return p;

2926     }

2927 

2928     private CharProperty newCharProperty(CharPredicate p) {

2929         if (p == null)

2930             return null;

2931         if (p instanceof BmpCharPredicate)

2932             return new BmpCharProperty((BmpCharPredicate)p);

2933         else

2934             return new CharProperty(p);

2935     }

2936 

2937     /**

2938      * Parses and returns the name of a "named capturing group", the trailing

2939      * ">" is consumed after parsing.

2940      */

2941     private String groupname(int ch) {

2942         StringBuilder sb = new StringBuilder();

2943         if (!ASCII.isAlpha(ch))

2944             throw error("capturing group name does not start with a Latin letter");

2945         do {

2946             sb.append((char) ch);

2947         } while (ASCII.isAlnum(ch=read()));

2948         if (ch != '>')

2949             throw error("named capturing group is missing trailing '>'");

2950         return sb.toString();

2951     }

2952 

2953     /**

2954      * Parses a group and returns the head node of a set of nodes that process

2955      * the group. Sometimes a double return system is used where the tail is

2956      * returned in root.

2957      */

2958     private Node group0() {

2959         boolean capturingGroup = false;

2960         Node head = null;

2961         Node tail = null;

2962         int save = flags0;

2963         int saveTCNCount = topClosureNodes.size();

2964         root = null;

2965         int ch = next();

2966         if (ch == '?') {

2967             ch = skip();

2968             switch (ch) {

2969             case ':':   //  (?:xxx) pure group

2970                 head = createGroup(true);

2971                 tail = root;

2972                 head.next = expr(tail);

2973                 break;

2974             case '=':   // (?=xxx) and (?!xxx) lookahead

2975             case '!':

2976                 head = createGroup(true);

2977                 tail = root;

2978                 head.next = expr(tail);

2979                 if (ch == '=') {

2980                     head = tail = new Pos(head);

2981                 } else {

2982                     head = tail = new Neg(head);

2983                 }

2984                 break;

2985             case '>':   // (?>xxx)  independent group

2986                 head = createGroup(true);

2987                 tail = root;

2988                 head.next = expr(tail);

2989                 head = tail = new Ques(head, Qtype.INDEPENDENT);

2990                 break;

2991             case '<':   // (?<xxx)  look behind

2992                 ch = read();

2993                 if (ch != '=' && ch != '!') {

2994                     // named captured group

2995                     String name = groupname(ch);

2996                     if (namedGroups().containsKey(name))

2997                         throw error("Named capturing group <" + name

2998                                     + "> is already defined");

2999                     capturingGroup = true;

3000                     head = createGroup(false);

3001                     tail = root;

3002                     namedGroups().put(name, capturingGroupCount-1);

3003                     head.next = expr(tail);

3004                     break;

3005                 }

3006                 int start = cursor;

3007                 head = createGroup(true);

3008                 tail = root;

3009                 head.next = expr(tail);

3010                 tail.next = lookbehindEnd;

3011                 TreeInfo info = new TreeInfo();

3012                 head.study(info);

3013                 if (info.maxValid == false) {

3014                     throw error("Look-behind group does not have "

3015                                 + "an obvious maximum length");

3016                 }

3017                 boolean hasSupplementary = findSupplementary(start, patternLength);

3018                 if (ch == '=') {

3019                     head = tail = (hasSupplementary ?

3020                                    new BehindS(head, info.maxLength,

3021                                                info.minLength) :

3022                                    new Behind(head, info.maxLength,

3023                                               info.minLength));

3024                 } else { // if (ch == '!')

3025                     head = tail = (hasSupplementary ?

3026                                    new NotBehindS(head, info.maxLength,

3027                                                   info.minLength) :

3028                                    new NotBehind(head, info.maxLength,

3029                                                  info.minLength));

3030                 }

3031                 // clear all top-closure-nodes inside lookbehind

3032                 if (saveTCNCount < topClosureNodes.size())

3033                     topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear();

3034                 break;

3035             case '$':

3036             case '@':

3037                 throw error("Unknown group type");

3038             default:    // (?xxx:) inlined match flags

3039                 unread();

3040                 addFlag();

3041                 ch = read();

3042                 if (ch == ')') {

3043                     return null;    // Inline modifier only

3044                 }

3045                 if (ch != ':') {

3046                     throw error("Unknown inline modifier");

3047                 }

3048                 head = createGroup(true);

3049                 tail = root;

3050                 head.next = expr(tail);

3051                 break;

3052             }

3053         } else { // (xxx) a regular group

3054             capturingGroup = true;

3055             head = createGroup(false);

3056             tail = root;

3057             head.next = expr(tail);

3058         }

3059 

3060         accept(')', "Unclosed group");

3061         flags0 = save;

3062 

3063         // Check for quantifiers

3064         Node node = closure(head);

3065         if (node == head) { // No closure

3066             root = tail;

3067             return node;    // Dual return

3068         }

3069         if (head == tail) { // Zero length assertion

3070             root = node;

3071             return node;    // Dual return

3072         }

3073 

3074         // have group closure, clear all inner closure nodes from the

3075         // top list (no backtracking stopper optimization for inner

3076         if (saveTCNCount < topClosureNodes.size())

3077             topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear();

3078 

3079         if (node instanceof Ques) {

3080             Ques ques = (Ques) node;

3081             if (ques.type == Qtype.POSSESSIVE) {

3082                 root = node;

3083                 return node;

3084             }

3085             tail.next = new BranchConn();

3086             tail = tail.next;

3087             if (ques.type == Qtype.GREEDY) {

3088                 head = new Branch(head, null, tail);

3089             } else { // Reluctant quantifier

3090                 head = new Branch(null, head, tail);

3091             }

3092             root = tail;

3093             return head;

3094         } else if (node instanceof Curly) {

3095             Curly curly = (Curly) node;

3096             if (curly.type == Qtype.POSSESSIVE) {

3097                 root = node;

3098                 return node;

3099             }

3100             // Discover if the group is deterministic

3101             TreeInfo info = new TreeInfo();

3102             if (head.study(info)) { // Deterministic

3103                 GroupTail temp = (GroupTail) tail;

3104                 head = root = new GroupCurly(head.next, curly.cmin,

3105                                    curly.cmax, curly.type,

3106                                    ((GroupTail)tail).localIndex,

3107                                    ((GroupTail)tail).groupIndex,

3108                                              capturingGroup);

3109                 return head;

3110             } else { // Non-deterministic

3111                 int temp = ((GroupHead) head).localIndex;

3112                 Loop loop;

3113                 if (curly.type == Qtype.GREEDY) {

3114                     loop = new Loop(this.localCount, temp);

3115                     // add the max_reps greedy to the top-closure-node list

3116                     if (curly.cmax == MAX_REPS)

3117                         topClosureNodes.add(loop);

3118                 } else {  // Reluctant Curly

3119                     loop = new LazyLoop(this.localCount, temp);

3120                 }

3121                 Prolog prolog = new Prolog(loop);

3122                 this.localCount += 1;

3123                 loop.cmin = curly.cmin;

3124                 loop.cmax = curly.cmax;

3125                 loop.body = head;

3126                 tail.next = loop;

3127                 root = loop;

3128                 return prolog; // Dual return

3129             }

3130         }

3131         throw error("Internal logic error");

3132     }

3133 

3134     /**

3135      * Create group head and tail nodes using double return. If the group is

3136      * created with anonymous true then it is a pure group and should not

3137      * affect group counting.

3138      */

3139     private Node createGroup(boolean anonymous) {

3140         int localIndex = localCount++;

3141         int groupIndex = 0;

3142         if (!anonymous)

3143             groupIndex = capturingGroupCount++;

3144         GroupHead head = new GroupHead(localIndex);

3145         root = new GroupTail(localIndex, groupIndex);

3146 

3147         // for debug/print only, head.match does NOT need the "tail" info

3148         head.tail = (GroupTail)root;

3149 

3150         if (!anonymous && groupIndex < 10)

3151             groupNodes[groupIndex] = head;

3152         return head;

3153     }

3154 

3155     @SuppressWarnings("fallthrough")

3156     /**

3157      * Parses inlined match flags and set them appropriately.

3158      */

3159     private void addFlag() {

3160         int ch = peek();

3161         for (;;) {

3162             switch (ch) {

3163             case 'i':

3164                 flags0 |= CASE_INSENSITIVE;

3165                 break;

3166             case 'm':

3167                 flags0 |= MULTILINE;

3168                 break;

3169             case 's':

3170                 flags0 |= DOTALL;

3171                 break;

3172             case 'd':

3173                 flags0 |= UNIX_LINES;

3174                 break;

3175             case 'u':

3176                 flags0 |= UNICODE_CASE;

3177                 break;

3178             case 'c':

3179                 flags0 |= CANON_EQ;

3180                 break;

3181             case 'x':

3182                 flags0 |= COMMENTS;

3183                 break;

3184             case 'U':

3185                 flags0 |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);

3186                 break;

3187             case '-': // subFlag then fall through

3188                 ch = next();

3189                 subFlag();

3190             default:

3191                 return;

3192             }

3193             ch = next();

3194         }

3195     }

3196 

3197     @SuppressWarnings("fallthrough")

3198     /**

3199      * Parses the second part of inlined match flags and turns off

3200      * flags appropriately.

3201      */

3202     private void subFlag() {

3203         int ch = peek();

3204         for (;;) {

3205             switch (ch) {

3206             case 'i':

3207                 flags0 &= ~CASE_INSENSITIVE;

3208                 break;

3209             case 'm':

3210                 flags0 &= ~MULTILINE;

3211                 break;

3212             case 's':

3213                 flags0 &= ~DOTALL;

3214                 break;

3215             case 'd':

3216                 flags0 &= ~UNIX_LINES;

3217                 break;

3218             case 'u':

3219                 flags0 &= ~UNICODE_CASE;

3220                 break;

3221             case 'c':

3222                 flags0 &= ~CANON_EQ;

3223                 break;

3224             case 'x':

3225                 flags0 &= ~COMMENTS;

3226                 break;

3227             case 'U':

3228                 flags0 &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);

3229                 break;

3230             default:

3231                 return;

3232             }

3233             ch = next();

3234         }

3235     }

3236 

3237     static final int MAX_REPS   = 0x7FFFFFFF;

3238 

3239     static enum Qtype {

3240         GREEDY, LAZY, POSSESSIVE, INDEPENDENT

3241     }

3242 

3243     private Node curly(Node prev, int cmin) {

3244         int ch = next();

3245         if (ch == '?') {

3246             next();

3247             return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY);

3248         } else if (ch == '+') {

3249             next();

3250             return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE);

3251         }

3252         if (prev instanceof BmpCharProperty) {

3253             return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);

3254         } else if (prev instanceof CharProperty) {

3255             return new CharPropertyGreedy((CharProperty)prev, cmin);

3256         }

3257         return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY);

3258     }

3259 

3260     /**

3261      * Processes repetition. If the next character peeked is a quantifier

3262      * then new nodes must be appended to handle the repetition.

3263      * Prev could be a single or a group, so it could be a chain of nodes.

3264      */

3265     private Node closure(Node prev) {

3266         Node atom;

3267         int ch = peek();

3268         switch (ch) {

3269         case '?':

3270             ch = next();

3271             if (ch == '?') {

3272                 next();

3273                 return new Ques(prev, Qtype.LAZY);

3274             } else if (ch == '+') {

3275                 next();

3276                 return new Ques(prev, Qtype.POSSESSIVE);

3277             }

3278             return new Ques(prev, Qtype.GREEDY);

3279         case '*':

3280             return curly(prev, 0);

3281         case '+':

3282             return curly(prev, 1);

3283         case '{':

3284             ch = temp[cursor+1];

3285             if (ASCII.isDigit(ch)) {

3286                 skip();

3287                 int cmin = 0;

3288                 do {

3289                     cmin = cmin * 10 + (ch - '0');

3290                 } while (ASCII.isDigit(ch = read()));

3291                 int cmax = cmin;

3292                 if (ch == ',') {

3293                     ch = read();

3294                     cmax = MAX_REPS;

3295                     if (ch != '}') {

3296                         cmax = 0;

3297                         while (ASCII.isDigit(ch)) {

3298                             cmax = cmax * 10 + (ch - '0');

3299                             ch = read();

3300                         }

3301                     }

3302                 }

3303                 if (ch != '}')

3304                     throw error("Unclosed counted closure");

3305                 if (((cmin) | (cmax) | (cmax - cmin)) < 0)

3306                     throw error("Illegal repetition range");

3307                 Curly curly;

3308                 ch = peek();

3309                 if (ch == '?') {

3310                     next();

3311                     curly = new Curly(prev, cmin, cmax, Qtype.LAZY);

3312                 } else if (ch == '+') {

3313                     next();

3314                     curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);

3315                 } else {

3316                     curly = new Curly(prev, cmin, cmax, Qtype.GREEDY);

3317                 }

3318                 return curly;

3319             } else {

3320                 throw error("Illegal repetition");

3321             }

3322         default:

3323             return prev;

3324         }

3325     }

3326 

3327     /**

3328      *  Utility method for parsing control escape sequences.

3329      */

3330     private int c() {

3331         if (cursor < patternLength) {

3332             return read() ^ 64;

3333         }

3334         throw error("Illegal control escape sequence");

3335     }

3336 

3337     /**

3338      *  Utility method for parsing octal escape sequences.

3339      */

3340     private int o() {

3341         int n = read();

3342         if (((n-'0')|('7'-n)) >= 0) {

3343             int m = read();

3344             if (((m-'0')|('7'-m)) >= 0) {

3345                 int o = read();

3346                 if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {

3347                     return (n - '0') * 64 + (m - '0') * 8 + (o - '0');

3348                 }

3349                 unread();

3350                 return (n - '0') * 8 + (m - '0');

3351             }

3352             unread();

3353             return (n - '0');

3354         }

3355         throw error("Illegal octal escape sequence");

3356     }

3357 

3358     /**

3359      *  Utility method for parsing hexadecimal escape sequences.

3360      */

3361     private int x() {

3362         int n = read();

3363         if (ASCII.isHexDigit(n)) {

3364             int m = read();

3365             if (ASCII.isHexDigit(m)) {

3366                 return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);

3367             }

3368         } else if (n == '{' && ASCII.isHexDigit(peek())) {

3369             int ch = 0;

3370             while (ASCII.isHexDigit(n = read())) {

3371                 ch = (ch << 4) + ASCII.toDigit(n);

3372                 if (ch > Character.MAX_CODE_POINT)

3373                     throw error("Hexadecimal codepoint is too big");

3374             }

3375             if (n != '}')

3376                 throw error("Unclosed hexadecimal escape sequence");

3377             return ch;

3378         }

3379         throw error("Illegal hexadecimal escape sequence");

3380     }

3381 

3382     /**

3383      *  Utility method for parsing unicode escape sequences.

3384      */

3385     private int cursor() {

3386         return cursor;

3387     }

3388 

3389     private void setcursor(int pos) {

3390         cursor = pos;

3391     }

3392 

3393     private int uxxxx() {

3394         int n = 0;

3395         for (int i = 0; i < 4; i++) {

3396             int ch = read();

3397             if (!ASCII.isHexDigit(ch)) {

3398                 throw error("Illegal Unicode escape sequence");

3399             }

3400             n = n * 16 + ASCII.toDigit(ch);

3401         }

3402         return n;

3403     }

3404 

3405     private int u() {

3406         int n = uxxxx();

3407         if (Character.isHighSurrogate((char)n)) {

3408             int cur = cursor();

3409             if (read() == '\\' && read() == 'u') {

3410                 int n2 = uxxxx();

3411                 if (Character.isLowSurrogate((char)n2))

3412                     return Character.toCodePoint((char)n, (char)n2);

3413             }

3414             setcursor(cur);

3415         }

3416         return n;

3417     }

3418 

3419     private int N() {

3420         if (read() == '{') {

3421             int i = cursor;

3422             while (read() != '}') {

3423                 if (cursor >= patternLength)

3424                     throw error("Unclosed character name escape sequence");

3425             }

3426             String name = new String(temp, i, cursor - i - 1);

3427             try {

3428                 return Character.codePointOf(name);

3429             } catch (IllegalArgumentException x) {

3430                 throw error("Unknown character name [" + name + "]");

3431             }

3432         }

3433         throw error("Illegal character name escape sequence");

3434     }

3435 

3436     //

3437     // Utility methods for code point support

3438     //

3439     private static final int countChars(CharSequence seq, int index,

3440                                         int lengthInCodePoints) {

3441         // optimization

3442         if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {

3443             assert (index >= 0 && index < seq.length());

3444             return 1;

3445         }

3446         int length = seq.length();

3447         int x = index;

3448         if (lengthInCodePoints >= 0) {

3449             assert (index >= 0 && index < length);

3450             for (int i = 0; x < length && i < lengthInCodePoints; i++) {

3451                 if (Character.isHighSurrogate(seq.charAt(x++))) {

3452                     if (x < length && Character.isLowSurrogate(seq.charAt(x))) {

3453                         x++;

3454                     }

3455                 }

3456             }

3457             return x - index;

3458         }

3459 

3460         assert (index >= 0 && index <= length);

3461         if (index == 0) {

3462             return 0;

3463         }

3464         int len = -lengthInCodePoints;

3465         for (int i = 0; x > 0 && i < len; i++) {

3466             if (Character.isLowSurrogate(seq.charAt(--x))) {

3467                 if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) {

3468                     x--;

3469                 }

3470             }

3471         }

3472         return index - x;

3473     }

3474 

3475     private static final int countCodePoints(CharSequence seq) {

3476         int length = seq.length();

3477         int n = 0;

3478         for (int i = 0; i < length; ) {

3479             n++;

3480             if (Character.isHighSurrogate(seq.charAt(i++))) {

3481                 if (i < length && Character.isLowSurrogate(seq.charAt(i))) {

3482                     i++;

3483                 }

3484             }

3485         }

3486         return n;

3487     }

3488 

3489     /**

3490      *  Creates a bit vector for matching Latin-1 values. A normal BitClass

3491      *  never matches values above Latin-1, and a complemented BitClass always

3492      *  matches values above Latin-1.

3493      */

3494     static final class BitClass extends BmpCharProperty {

3495         final boolean[] bits;

3496         BitClass() {

3497             this(new boolean[256]);

3498         }

3499         private BitClass(boolean[] bits) {

3500             super( ch -> ch < 256 && bits[ch]);

3501             this.bits = bits;

3502         }

3503         BitClass add(int c, int flags) {

3504             assert c >= 0 && c <= 255;

3505             if ((flags & CASE_INSENSITIVE) != 0) {

3506                 if (ASCII.isAscii(c)) {

3507                     bits[ASCII.toUpper(c)] = true;

3508                     bits[ASCII.toLower(c)] = true;

3509                 } else if ((flags & UNICODE_CASE) != 0) {

3510                     bits[Character.toLowerCase(c)] = true;

3511                     bits[Character.toUpperCase(c)] = true;

3512                 }

3513             }

3514             bits[c] = true;

3515             return this;

3516         }

3517     }

3518 

3519     /**

3520      *  Utility method for creating a string slice matcher.

3521      */

3522     private Node newSlice(int[] buf, int count, boolean hasSupplementary) {

3523         int[] tmp = new int[count];

3524         if (has(CASE_INSENSITIVE)) {

3525             if (has(UNICODE_CASE)) {

3526                 for (int i = 0; i < count; i++) {

3527                     tmp[i] = Character.toLowerCase(

3528                                  Character.toUpperCase(buf[i]));

3529                 }

3530                 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);

3531             }

3532             for (int i = 0; i < count; i++) {

3533                 tmp[i] = ASCII.toLower(buf[i]);

3534             }

3535             return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);

3536         }

3537         for (int i = 0; i < count; i++) {

3538             tmp[i] = buf[i];

3539         }

3540         return hasSupplementary ? new SliceS(tmp) : new Slice(tmp);

3541     }

3542 

3543     /**

3544      * The following classes are the building components of the object

3545      * tree that represents a compiled regular expression. The object tree

3546      * is made of individual elements that handle constructs in the Pattern.

3547      * Each type of object knows how to match its equivalent construct with

3548      * the match() method.

3549      */

3550 

3551     /**

3552      * Base class for all node classes. Subclasses should override the match()

3553      * method as appropriate. This class is an accepting node, so its match()

3554      * always returns true.

3555      */

3556     static class Node extends Object {

3557         Node next;

3558         Node() {

3559             next = Pattern.accept;

3560         }

3561         /**

3562          * This method implements the classic accept node.

3563          */

3564         boolean match(Matcher matcher, int i, CharSequence seq) {

3565             matcher.last = i;

3566             matcher.groups[0] = matcher.first;

3567             matcher.groups[1] = matcher.last;

3568             return true;

3569         }

3570         /**

3571          * This method is good for all zero length assertions.

3572          */

3573         boolean study(TreeInfo info) {

3574             if (next != null) {

3575                 return next.study(info);

3576             } else {

3577                 return info.deterministic;

3578             }

3579         }

3580     }

3581 

3582     static class LastNode extends Node {

3583         /**

3584          * This method implements the classic accept node with

3585          * the addition of a check to see if the match occurred

3586          * using all of the input.

3587          */

3588         boolean match(Matcher matcher, int i, CharSequence seq) {

3589             if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to)

3590                 return false;

3591             matcher.last = i;

3592             matcher.groups[0] = matcher.first;

3593             matcher.groups[1] = matcher.last;

3594             return true;

3595         }

3596     }

3597 

3598     /**

3599      * Used for REs that can start anywhere within the input string.

3600      * This basically tries to match repeatedly at each spot in the

3601      * input string, moving forward after each try. An anchored search

3602      * or a BnM will bypass this node completely.

3603      */

3604     static class Start extends Node {

3605         int minLength;

3606         Start(Node node) {

3607             this.next = node;

3608             TreeInfo info = new TreeInfo();

3609             next.study(info);

3610             minLength = info.minLength;

3611         }

3612         boolean match(Matcher matcher, int i, CharSequence seq) {

3613             if (i > matcher.to - minLength) {

3614                 matcher.hitEnd = true;

3615                 return false;

3616             }

3617             int guard = matcher.to - minLength;

3618             for (; i <= guard; i++) {

3619                 if (next.match(matcher, i, seq)) {

3620                     matcher.first = i;

3621                     matcher.groups[0] = matcher.first;

3622                     matcher.groups[1] = matcher.last;

3623                     return true;

3624                 }

3625             }

3626             matcher.hitEnd = true;

3627             return false;

3628         }

3629         boolean study(TreeInfo info) {

3630             next.study(info);

3631             info.maxValid = false;

3632             info.deterministic = false;

3633             return false;

3634         }

3635     }

3636 

3637     /*

3638      * StartS supports supplementary characters, including unpaired surrogates.

3639      */

3640     static final class StartS extends Start {

3641         StartS(Node node) {

3642             super(node);

3643         }

3644         boolean match(Matcher matcher, int i, CharSequence seq) {

3645             if (i > matcher.to - minLength) {

3646                 matcher.hitEnd = true;

3647                 return false;

3648             }

3649             int guard = matcher.to - minLength;

3650             while (i <= guard) {

3651                 //if ((ret = next.match(matcher, i, seq)) || i == guard)

3652                 if (next.match(matcher, i, seq)) {

3653                     matcher.first = i;

3654                     matcher.groups[0] = matcher.first;

3655                     matcher.groups[1] = matcher.last;

3656                     return true;

3657                 }

3658                 if (i == guard)

3659                     break;

3660                 // Optimization to move to the next character. This is

3661                 // faster than countChars(seq, i, 1).

3662                 if (Character.isHighSurrogate(seq.charAt(i++))) {

3663                     if (i < seq.length() &&

3664                         Character.isLowSurrogate(seq.charAt(i))) {

3665                         i++;

3666                     }

3667                 }

3668             }

3669             matcher.hitEnd = true;

3670             return false;

3671         }

3672     }

3673 

3674     /**

3675      * Node to anchor at the beginning of input. This object implements the

3676      * match for a \A sequence, and the caret anchor will use this if not in

3677      * multiline mode.

3678      */

3679     static final class Begin extends Node {

3680         boolean match(Matcher matcher, int i, CharSequence seq) {

3681             int fromIndex = (matcher.anchoringBounds) ?

3682                 matcher.from : 0;

3683             if (i == fromIndex && next.match(matcher, i, seq)) {

3684                 matcher.first = i;

3685                 matcher.groups[0] = i;

3686                 matcher.groups[1] = matcher.last;

3687                 return true;

3688             } else {

3689                 return false;

3690             }

3691         }

3692     }

3693 

3694     /**

3695      * Node to anchor at the end of input. This is the absolute end, so this

3696      * should not match at the last newline before the end as $ will.

3697      */

3698     static final class End extends Node {

3699         boolean match(Matcher matcher, int i, CharSequence seq) {

3700             int endIndex = (matcher.anchoringBounds) ?

3701                 matcher.to : matcher.getTextLength();

3702             if (i == endIndex) {

3703                 matcher.hitEnd = true;

3704                 return next.match(matcher, i, seq);

3705             }

3706             return false;

3707         }

3708     }

3709 

3710     /**

3711      * Node to anchor at the beginning of a line. This is essentially the

3712      * object to match for the multiline ^.

3713      */

3714     static final class Caret extends Node {

3715         boolean match(Matcher matcher, int i, CharSequence seq) {

3716             int startIndex = matcher.from;

3717             int endIndex = matcher.to;

3718             if (!matcher.anchoringBounds) {

3719                 startIndex = 0;

3720                 endIndex = matcher.getTextLength();

3721             }

3722             // Perl does not match ^ at end of input even after newline

3723             if (i == endIndex) {

3724                 matcher.hitEnd = true;

3725                 return false;

3726             }

3727             if (i > startIndex) {

3728                 char ch = seq.charAt(i-1);

3729                 if (ch != '\n' && ch != '\r'

3730                     && (ch|1) != '\u2029'

3731                     && ch != '\u0085' ) {

3732                     return false;

3733                 }

3734                 // Should treat /r/n as one newline

3735                 if (ch == '\r' && seq.charAt(i) == '\n')

3736                     return false;

3737             }

3738             return next.match(matcher, i, seq);

3739         }

3740     }

3741 

3742     /**

3743      * Node to anchor at the beginning of a line when in unixdot mode.

3744      */

3745     static final class UnixCaret extends Node {

3746         boolean match(Matcher matcher, int i, CharSequence seq) {

3747             int startIndex = matcher.from;

3748             int endIndex = matcher.to;

3749             if (!matcher.anchoringBounds) {

3750                 startIndex = 0;

3751                 endIndex = matcher.getTextLength();

3752             }

3753             // Perl does not match ^ at end of input even after newline

3754             if (i == endIndex) {

3755                 matcher.hitEnd = true;

3756                 return false;

3757             }

3758             if (i > startIndex) {

3759                 char ch = seq.charAt(i-1);

3760                 if (ch != '\n') {

3761                     return false;

3762                 }

3763             }

3764             return next.match(matcher, i, seq);

3765         }

3766     }

3767 

3768     /**

3769      * Node to match the location where the last match ended.

3770      * This is used for the \G construct.

3771      */

3772     static final class LastMatch extends Node {

3773         boolean match(Matcher matcher, int i, CharSequence seq) {

3774             if (i != matcher.oldLast)

3775                 return false;

3776             return next.match(matcher, i, seq);

3777         }

3778     }

3779 

3780     /**

3781      * Node to anchor at the end of a line or the end of input based on the

3782      * multiline mode.

3783      *

3784      * When not in multiline mode, the $ can only match at the very end

3785      * of the input, unless the input ends in a line terminator in which

3786      * it matches right before the last line terminator.

3787      *

3788      * Note that \r\n is considered an atomic line terminator.

3789      *

3790      * Like ^ the $ operator matches at a position, it does not match the

3791      * line terminators themselves.

3792      */

3793     static final class Dollar extends Node {

3794         boolean multiline;

3795         Dollar(boolean mul) {

3796             multiline = mul;

3797         }

3798         boolean match(Matcher matcher, int i, CharSequence seq) {

3799             int endIndex = (matcher.anchoringBounds) ?

3800                 matcher.to : matcher.getTextLength();

3801             if (!multiline) {

3802                 if (i < endIndex - 2)

3803                     return false;

3804                 if (i == endIndex - 2) {

3805                     char ch = seq.charAt(i);

3806                     if (ch != '\r')

3807                         return false;

3808                     ch = seq.charAt(i + 1);

3809                     if (ch != '\n')

3810                         return false;

3811                 }

3812             }

3813             // Matches before any line terminator; also matches at the

3814             // end of input

3815             // Before line terminator:

3816             // If multiline, we match here no matter what

3817             // If not multiline, fall through so that the end

3818             // is marked as hit; this must be a /r/n or a /n

3819             // at the very end so the end was hit; more input

3820             // could make this not match here

3821             if (i < endIndex) {

3822                 char ch = seq.charAt(i);

3823                  if (ch == '\n') {

3824                      // No match between \r\n

3825                      if (i > 0 && seq.charAt(i-1) == '\r')

3826                          return false;

3827                      if (multiline)

3828                          return next.match(matcher, i, seq);

3829                  } else if (ch == '\r' || ch == '\u0085' ||

3830                             (ch|1) == '\u2029') {

3831                      if (multiline)

3832                          return next.match(matcher, i, seq);

3833                  } else { // No line terminator, no match

3834                      return false;

3835                  }

3836             }

3837             // Matched at current end so hit end

3838             matcher.hitEnd = true;

3839             // If a $ matches because of end of input, then more input

3840             // could cause it to fail!

3841             matcher.requireEnd = true;

3842             return next.match(matcher, i, seq);

3843         }

3844         boolean study(TreeInfo info) {

3845             next.study(info);

3846             return info.deterministic;

3847         }

3848     }

3849 

3850     /**

3851      * Node to anchor at the end of a line or the end of input based on the

3852      * multiline mode when in unix lines mode.

3853      */

3854     static final class UnixDollar extends Node {

3855         boolean multiline;

3856         UnixDollar(boolean mul) {

3857             multiline = mul;

3858         }

3859         boolean match(Matcher matcher, int i, CharSequence seq) {

3860             int endIndex = (matcher.anchoringBounds) ?

3861                 matcher.to : matcher.getTextLength();

3862             if (i < endIndex) {

3863                 char ch = seq.charAt(i);

3864                 if (ch == '\n') {

3865                     // If not multiline, then only possible to

3866                     // match at very end or one before end

3867                     if (multiline == false && i != endIndex - 1)

3868                         return false;

3869                     // If multiline return next.match without setting

3870                     // matcher.hitEnd

3871                     if (multiline)

3872                         return next.match(matcher, i, seq);

3873                 } else {

3874                     return false;

3875                 }

3876             }

3877             // Matching because at the end or 1 before the end;

3878             // more input could change this so set hitEnd

3879             matcher.hitEnd = true;

3880             // If a $ matches because of end of input, then more input

3881             // could cause it to fail!

3882             matcher.requireEnd = true;

3883             return next.match(matcher, i, seq);

3884         }

3885         boolean study(TreeInfo info) {

3886             next.study(info);

3887             return info.deterministic;

3888         }

3889     }

3890 

3891     /**

3892      * Node class that matches a Unicode line ending '\R'

3893      */

3894     static final class LineEnding extends Node {

3895         boolean match(Matcher matcher, int i, CharSequence seq) {

3896             // (u+000Du+000A|[u+000Au+000Bu+000Cu+000Du+0085u+2028u+2029])

3897             if (i < matcher.to) {

3898                 int ch = seq.charAt(i);

3899                 if (ch == 0x0A || ch == 0x0B || ch == 0x0C ||

3900                     ch == 0x85 || ch == 0x2028 || ch == 0x2029)

3901                     return next.match(matcher, i + 1, seq);

3902                 if (ch == 0x0D) {

3903                     i++;

3904                     if (i < matcher.to) {

3905                         if (seq.charAt(i) == 0x0A &&

3906                             next.match(matcher, i + 1, seq)) {

3907                             return true;

3908                         }

3909                     } else {

3910                         matcher.hitEnd = true;

3911                     }

3912                     return next.match(matcher, i, seq);

3913                 }

3914             } else {

3915                 matcher.hitEnd = true;

3916             }

3917             return false;

3918         }

3919         boolean study(TreeInfo info) {

3920             info.minLength++;

3921             info.maxLength += 2;

3922             return next.study(info);

3923         }

3924     }

3925 

3926     /**

3927      * Abstract node class to match one character satisfying some

3928      * boolean property.

3929      */

3930     static class CharProperty extends Node {

3931         CharPredicate predicate;

3932 

3933         CharProperty (CharPredicate predicate) {

3934             this.predicate = predicate;

3935         }

3936         boolean match(Matcher matcher, int i, CharSequence seq) {

3937             if (i < matcher.to) {

3938                 int ch = Character.codePointAt(seq, i);

3939                 return predicate.is(ch) &&

3940                        next.match(matcher, i + Character.charCount(ch), seq);

3941             } else {

3942                 matcher.hitEnd = true;

3943                 return false;

3944             }

3945         }

3946         boolean study(TreeInfo info) {

3947             info.minLength++;

3948             info.maxLength++;

3949             return next.study(info);

3950         }

3951     }

3952 

3953     /**

3954      * Optimized version of CharProperty that works only for

3955      * properties never satisfied by Supplementary characters.

3956      */

3957     private static class BmpCharProperty extends CharProperty {

3958         BmpCharProperty (BmpCharPredicate predicate) {

3959             super(predicate);

3960         }

3961         boolean match(Matcher matcher, int i, CharSequence seq) {

3962             if (i < matcher.to) {

3963                 return predicate.is(seq.charAt(i)) &&

3964                        next.match(matcher, i + 1, seq);

3965             } else {

3966                 matcher.hitEnd = true;

3967                 return false;

3968             }

3969         }

3970     }

3971 

3972     private static class NFCCharProperty extends Node {

3973         CharPredicate predicate;

3974         NFCCharProperty (CharPredicate predicate) {

3975             this.predicate = predicate;

3976         }

3977 

3978         boolean match(Matcher matcher, int i, CharSequence seq) {

3979             if (i < matcher.to) {

3980                 int ch0 = Character.codePointAt(seq, i);

3981                 int n = Character.charCount(ch0);

3982                 int j = i + n;

3983                 while (j < matcher.to) {

3984                     int ch1 = Character.codePointAt(seq, j);

3985                     if (Grapheme.isBoundary(ch0, ch1))

3986                         break;

3987                     ch0 = ch1;

3988                     j += Character.charCount(ch1);

3989                 }

3990                 if (i + n == j) {    // single, assume nfc cp

3991                     if (predicate.is(ch0))

3992                         return next.match(matcher, j, seq);

3993                 } else {

3994                     while (i + n < j) {

3995                         String nfc = Normalizer.normalize(

3996                             seq.toString().substring(i, j), Normalizer.Form.NFC);

3997                         if (nfc.codePointCount(0, nfc.length()) == 1) {

3998                             if (predicate.is(nfc.codePointAt(0)) &&

3999                                 next.match(matcher, j, seq)) {

4000                                 return true;

4001                             }

4002                         }

4003 

4004                         ch0 = Character.codePointBefore(seq, j);

4005                         j -= Character.charCount(ch0);

4006                     }

4007                 }

4008                 if (j < matcher.to)

4009                     return false;

4010             }

4011             matcher.hitEnd = true;

4012             return false;

4013         }

4014 

4015         boolean study(TreeInfo info) {

4016             info.minLength++;

4017             info.deterministic = false;

4018             return next.study(info);

4019         }

4020     }

4021 

4022     /**

4023      * Node class that matches an unicode extended grapheme cluster

4024      */

4025     static class XGrapheme extends Node {

4026         boolean match(Matcher matcher, int i, CharSequence seq) {

4027             if (i < matcher.to) {

4028                 int ch0 = Character.codePointAt(seq, i);

4029                     i += Character.charCount(ch0);

4030                 while (i < matcher.to) {

4031                     int ch1 = Character.codePointAt(seq, i);

4032                     if (Grapheme.isBoundary(ch0, ch1))

4033                         break;

4034                     ch0 = ch1;

4035                     i += Character.charCount(ch1);

4036                 }

4037                 return next.match(matcher, i, seq);

4038             }

4039             matcher.hitEnd = true;

4040             return false;

4041         }

4042 

4043         boolean study(TreeInfo info) {

4044             info.minLength++;

4045             info.deterministic = false;

4046             return next.study(info);

4047         }

4048     }

4049 

4050     /**

4051      * Node class that handles grapheme boundaries

4052      */

4053     static class GraphemeBound extends Node {

4054         boolean match(Matcher matcher, int i, CharSequence seq) {

4055             int startIndex = matcher.from;

4056             int endIndex = matcher.to;

4057             if (matcher.transparentBounds) {

4058                 startIndex = 0;

4059                 endIndex = matcher.getTextLength();

4060             }

4061             if (i == startIndex) {

4062                 return next.match(matcher, i, seq);

4063             }

4064             if (i < endIndex) {

4065                 if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||

4066                     !Grapheme.isBoundary(Character.codePointBefore(seq, i),

4067                                          Character.codePointAt(seq, i))) {

4068                     return false;

4069                 }

4070             } else {

4071                 matcher.hitEnd = true;

4072                 matcher.requireEnd = true;

4073             }

4074             return next.match(matcher, i, seq);

4075         }

4076     }

4077 

4078     /**

4079      * Base class for all Slice nodes

4080      */

4081     static class SliceNode extends Node {

4082         int[] buffer;

4083         SliceNode(int[] buf) {

4084             buffer = buf;

4085         }

4086         boolean study(TreeInfo info) {

4087             info.minLength += buffer.length;

4088             info.maxLength += buffer.length;

4089             return next.study(info);

4090         }

4091     }

4092 

4093     /**

4094      * Node class for a case sensitive/BMP-only sequence of literal

4095      * characters.

4096      */

4097     static class Slice extends SliceNode {

4098         Slice(int[] buf) {

4099             super(buf);

4100         }

4101         boolean match(Matcher matcher, int i, CharSequence seq) {

4102             int[] buf = buffer;

4103             int len = buf.length;

4104             for (int j=0; j<len; j++) {

4105                 if ((i+j) >= matcher.to) {

4106                     matcher.hitEnd = true;

4107                     return false;

4108                 }

4109                 if (buf[j] != seq.charAt(i+j))

4110                     return false;

4111             }

4112             return next.match(matcher, i+len, seq);

4113         }

4114     }

4115 

4116     /**

4117      * Node class for a case_insensitive/BMP-only sequence of literal

4118      * characters.

4119      */

4120     static class SliceI extends SliceNode {

4121         SliceI(int[] buf) {

4122             super(buf);

4123         }

4124         boolean match(Matcher matcher, int i, CharSequence seq) {

4125             int[] buf = buffer;

4126             int len = buf.length;

4127             for (int j=0; j<len; j++) {

4128                 if ((i+j) >= matcher.to) {

4129                     matcher.hitEnd = true;

4130                     return false;

4131                 }

4132                 int c = seq.charAt(i+j);

4133                 if (buf[j] != c &&

4134                     buf[j] != ASCII.toLower(c))

4135                     return false;

4136             }

4137             return next.match(matcher, i+len, seq);

4138         }

4139     }

4140 

4141     /**

4142      * Node class for a unicode_case_insensitive/BMP-only sequence of

4143      * literal characters. Uses unicode case folding.

4144      */

4145     static final class SliceU extends SliceNode {

4146         SliceU(int[] buf) {

4147             super(buf);

4148         }

4149         boolean match(Matcher matcher, int i, CharSequence seq) {

4150             int[] buf = buffer;

4151             int len = buf.length;

4152             for (int j=0; j<len; j++) {

4153                 if ((i+j) >= matcher.to) {

4154                     matcher.hitEnd = true;

4155                     return false;

4156                 }

4157                 int c = seq.charAt(i+j);

4158                 if (buf[j] != c &&

4159                     buf[j] != Character.toLowerCase(Character.toUpperCase(c)))

4160                     return false;

4161             }

4162             return next.match(matcher, i+len, seq);

4163         }

4164     }

4165 

4166     /**

4167      * Node class for a case sensitive sequence of literal characters

4168      * including supplementary characters.

4169      */

4170     static final class SliceS extends Slice {

4171         SliceS(int[] buf) {

4172             super(buf);

4173         }

4174         boolean match(Matcher matcher, int i, CharSequence seq) {

4175             int[] buf = buffer;

4176             int x = i;

4177             for (int j = 0; j < buf.length; j++) {

4178                 if (x >= matcher.to) {

4179                     matcher.hitEnd = true;

4180                     return false;

4181                 }

4182                 int c = Character.codePointAt(seq, x);

4183                 if (buf[j] != c)

4184                     return false;

4185                 x += Character.charCount(c);

4186                 if (x > matcher.to) {

4187                     matcher.hitEnd = true;

4188                     return false;

4189                 }

4190             }

4191             return next.match(matcher, x, seq);

4192         }

4193     }

4194 

4195     /**

4196      * Node class for a case insensitive sequence of literal characters

4197      * including supplementary characters.

4198      */

4199     static class SliceIS extends SliceNode {

4200         SliceIS(int[] buf) {

4201             super(buf);

4202         }

4203         int toLower(int c) {

4204             return ASCII.toLower(c);

4205         }

4206         boolean match(Matcher matcher, int i, CharSequence seq) {

4207             int[] buf = buffer;

4208             int x = i;

4209             for (int j = 0; j < buf.length; j++) {

4210                 if (x >= matcher.to) {

4211                     matcher.hitEnd = true;

4212                     return false;

4213                 }

4214                 int c = Character.codePointAt(seq, x);

4215                 if (buf[j] != c && buf[j] != toLower(c))

4216                     return false;

4217                 x += Character.charCount(c);

4218                 if (x > matcher.to) {

4219                     matcher.hitEnd = true;

4220                     return false;

4221                 }

4222             }

4223             return next.match(matcher, x, seq);

4224         }

4225     }

4226 

4227     /**

4228      * Node class for a case insensitive sequence of literal characters.

4229      * Uses unicode case folding.

4230      */

4231     static final class SliceUS extends SliceIS {

4232         SliceUS(int[] buf) {

4233             super(buf);

4234         }

4235         int toLower(int c) {

4236             return Character.toLowerCase(Character.toUpperCase(c));

4237         }

4238     }

4239 

4240     /**

4241      * The 0 or 1 quantifier. This one class implements all three types.

4242      */

4243     static final class Ques extends Node {

4244         Node atom;

4245         Qtype type;

4246         Ques(Node node, Qtype type) {

4247             this.atom = node;

4248             this.type = type;

4249         }

4250         boolean match(Matcher matcher, int i, CharSequence seq) {

4251             switch (type) {

4252             case GREEDY:

4253                 return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))

4254                     || next.match(matcher, i, seq);

4255             case LAZY:

4256                 return next.match(matcher, i, seq)

4257                     || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));

4258             case POSSESSIVE:

4259                 if (atom.match(matcher, i, seq)) i = matcher.last;

4260                 return next.match(matcher, i, seq);

4261             default:

4262                 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);

4263             }

4264         }

4265         boolean study(TreeInfo info) {

4266             if (type != Qtype.INDEPENDENT) {

4267                 int minL = info.minLength;

4268                 atom.study(info);

4269                 info.minLength = minL;

4270                 info.deterministic = false;

4271                 return next.study(info);

4272             } else {

4273                 atom.study(info);

4274                 return next.study(info);

4275             }

4276         }

4277     }

4278 

4279     /**

4280      * Handles the greedy style repetition with the minimum either be

4281      * 0 or 1 and the maximum be MAX_REPS, for * and + quantifier.

4282      */

4283     static class CharPropertyGreedy extends Node {

4284         final CharPredicate predicate;

4285         final int cmin;

4286 

4287         CharPropertyGreedy(CharProperty cp, int cmin) {

4288             this.predicate = cp.predicate;

4289             this.cmin = cmin;

4290         }

4291         boolean match(Matcher matcher, int i,  CharSequence seq) {

4292             int n = 0;

4293             int to = matcher.to;

4294             // greedy, all the way down

4295             while (i < to) {

4296                 int ch = Character.codePointAt(seq, i);

4297                 if (!predicate.is(ch))

4298                    break;

4299                 i += Character.charCount(ch);

4300                 n++;

4301             }

4302             if (i >= to) {

4303                 matcher.hitEnd = true;

4304             }

4305             while (n >= cmin) {

4306                 if (next.match(matcher, i, seq))

4307                     return true;

4308                 if (n == cmin)

4309                     return false;

4310                  // backing off if match fails

4311                 int ch = Character.codePointBefore(seq, i);

4312                 i -= Character.charCount(ch);

4313                 n--;

4314             }

4315             return false;

4316         }

4317 

4318         boolean study(TreeInfo info) {

4319             info.minLength += cmin;

4320             if (info.maxValid) {

4321                 info.maxLength += MAX_REPS;

4322             }

4323             info.deterministic = false;

4324             return next.study(info);

4325         }

4326     }

4327 

4328     static final class BmpCharPropertyGreedy extends CharPropertyGreedy {

4329 

4330         BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) {

4331             super(bcp, cmin);

4332         }

4333 

4334         boolean match(Matcher matcher, int i,  CharSequence seq) {

4335             int n = 0;

4336             int to = matcher.to;

4337             while (i < to && predicate.is(seq.charAt(i))) {

4338                 i++; n++;

4339             }

4340             if (i >= to) {

4341                 matcher.hitEnd = true;

4342             }

4343             while (n >= cmin) {

4344                 if (next.match(matcher, i, seq))

4345                     return true;

4346                 i--; n--;  // backing off if match fails

4347             }

4348             return false;

4349         }

4350     }

4351 

4352     /**

4353      * Handles the curly-brace style repetition with a specified minimum and

4354      * maximum occurrences. The * quantifier is handled as a special case.

4355      * This class handles the three types.

4356      */

4357     static final class Curly extends Node {

4358         Node atom;

4359         Qtype type;

4360         int cmin;

4361         int cmax;

4362 

4363         Curly(Node node, int cmin, int cmax, Qtype type) {

4364             this.atom = node;

4365             this.type = type;

4366             this.cmin = cmin;

4367             this.cmax = cmax;

4368         }

4369         boolean match(Matcher matcher, int i, CharSequence seq) {

4370             int j;

4371             for (j = 0; j < cmin; j++) {

4372                 if (atom.match(matcher, i, seq)) {

4373                     i = matcher.last;

4374                     continue;

4375                 }

4376                 return false;

4377             }

4378             if (type == Qtype.GREEDY)

4379                 return match0(matcher, i, j, seq);

4380             else if (type == Qtype.LAZY)

4381                 return match1(matcher, i, j, seq);

4382             else

4383                 return match2(matcher, i, j, seq);

4384         }

4385         // Greedy match.

4386         // i is the index to start matching at

4387         // j is the number of atoms that have matched

4388         boolean match0(Matcher matcher, int i, int j, CharSequence seq) {

4389             if (j >= cmax) {

4390                 // We have matched the maximum... continue with the rest of

4391                 // the regular expression

4392                 return next.match(matcher, i, seq);

4393             }

4394             int backLimit = j;

4395             while (atom.match(matcher, i, seq)) {

4396                 // k is the length of this match

4397                 int k = matcher.last - i;

4398                 if (k == 0) // Zero length match

4399                     break;

4400                 // Move up index and number matched

4401                 i = matcher.last;

4402                 j++;

4403                 // We are greedy so match as many as we can

4404                 while (j < cmax) {

4405                     if (!atom.match(matcher, i, seq))

4406                         break;

4407                     if (i + k != matcher.last) {

4408                         if (match0(matcher, matcher.last, j+1, seq))

4409                             return true;

4410                         break;

4411                     }

4412                     i += k;

4413                     j++;

4414                 }

4415                 // Handle backing off if match fails

4416                 while (j >= backLimit) {

4417                    if (next.match(matcher, i, seq))

4418                         return true;

4419                     i -= k;

4420                     j--;

4421                 }

4422                 return false;

4423             }

4424             return next.match(matcher, i, seq);

4425         }

4426         // Reluctant match. At this point, the minimum has been satisfied.

4427         // i is the index to start matching at

4428         // j is the number of atoms that have matched

4429         boolean match1(Matcher matcher, int i, int j, CharSequence seq) {

4430             for (;;) {

4431                 // Try finishing match without consuming any more

4432                 if (next.match(matcher, i, seq))

4433                     return true;

4434                 // At the maximum, no match found

4435                 if (j >= cmax)

4436                     return false;

4437                 // Okay, must try one more atom

4438                 if (!atom.match(matcher, i, seq))

4439                     return false;

4440                 // If we haven't moved forward then must break out

4441                 if (i == matcher.last)

4442                     return false;

4443                 // Move up index and number matched

4444                 i = matcher.last;

4445                 j++;

4446             }

4447         }

4448         boolean match2(Matcher matcher, int i, int j, CharSequence seq) {

4449             for (; j < cmax; j++) {

4450                 if (!atom.match(matcher, i, seq))

4451                     break;

4452                 if (i == matcher.last)

4453                     break;

4454                 i = matcher.last;

4455             }

4456             return next.match(matcher, i, seq);

4457         }

4458         boolean study(TreeInfo info) {

4459             // Save original info

4460             int minL = info.minLength;

4461             int maxL = info.maxLength;

4462             boolean maxV = info.maxValid;

4463             boolean detm = info.deterministic;

4464             info.reset();

4465 

4466             atom.study(info);

4467 

4468             int temp = info.minLength * cmin + minL;

4469             if (temp < minL) {

4470                 temp = 0xFFFFFFF; // arbitrary large number

4471             }

4472             info.minLength = temp;

4473 

4474             if (maxV & info.maxValid) {

4475                 temp = info.maxLength * cmax + maxL;

4476                 info.maxLength = temp;

4477                 if (temp < maxL) {

4478                     info.maxValid = false;

4479                 }

4480             } else {

4481                 info.maxValid = false;

4482             }

4483 

4484             if (info.deterministic && cmin == cmax)

4485                 info.deterministic = detm;

4486             else

4487                 info.deterministic = false;

4488             return next.study(info);

4489         }

4490     }

4491 

4492     /**

4493      * Handles the curly-brace style repetition with a specified minimum and

4494      * maximum occurrences in deterministic cases. This is an iterative

4495      * optimization over the Prolog and Loop system which would handle this

4496      * in a recursive way. The * quantifier is handled as a special case.

4497      * If capture is true then this class saves group settings and ensures

4498      * that groups are unset when backing off of a group match.

4499      */

4500     static final class GroupCurly extends Node {

4501         Node atom;

4502         Qtype type;

4503         int cmin;

4504         int cmax;

4505         int localIndex;

4506         int groupIndex;

4507         boolean capture;

4508 

4509         GroupCurly(Node node, int cmin, int cmax, Qtype type, int local,

4510                    int group, boolean capture) {

4511             this.atom = node;

4512             this.type = type;

4513             this.cmin = cmin;

4514             this.cmax = cmax;

4515             this.localIndex = local;

4516             this.groupIndex = group;

4517             this.capture = capture;

4518         }

4519         boolean match(Matcher matcher, int i, CharSequence seq) {

4520             int[] groups = matcher.groups;

4521             int[] locals = matcher.locals;

4522             int save0 = locals[localIndex];

4523             int save1 = 0;

4524             int save2 = 0;

4525 

4526             if (capture) {

4527                 save1 = groups[groupIndex];

4528                 save2 = groups[groupIndex+1];

4529             }

4530 

4531             // Notify GroupTail there is no need to setup group info

4532             // because it will be set here

4533             locals[localIndex] = -1;

4534 

4535             boolean ret = true;

4536             for (int j = 0; j < cmin; j++) {

4537                 if (atom.match(matcher, i, seq)) {

4538                     if (capture) {

4539                         groups[groupIndex] = i;

4540                         groups[groupIndex+1] = matcher.last;

4541                     }

4542                     i = matcher.last;

4543                 } else {

4544                     ret = false;

4545                     break;

4546                 }

4547             }

4548             if (ret) {

4549                 if (type == Qtype.GREEDY) {

4550                     ret = match0(matcher, i, cmin, seq);

4551                 } else if (type == Qtype.LAZY) {

4552                     ret = match1(matcher, i, cmin, seq);

4553                 } else {

4554                     ret = match2(matcher, i, cmin, seq);

4555                 }

4556             }

4557             if (!ret) {

4558                 locals[localIndex] = save0;

4559                 if (capture) {

4560                     groups[groupIndex] = save1;

4561                     groups[groupIndex+1] = save2;

4562                 }

4563             }

4564             return ret;

4565         }

4566         // Aggressive group match

4567         boolean match0(Matcher matcher, int i, int j, CharSequence seq) {

4568             // don't back off passing the starting "j"

4569             int min = j;

4570             int[] groups = matcher.groups;

4571             int save0 = 0;

4572             int save1 = 0;

4573             if (capture) {

4574                 save0 = groups[groupIndex];

4575                 save1 = groups[groupIndex+1];

4576             }

4577             for (;;) {

4578                 if (j >= cmax)

4579                     break;

4580                 if (!atom.match(matcher, i, seq))

4581                     break;

4582                 int k = matcher.last - i;

4583                 if (k <= 0) {

4584                     if (capture) {

4585                         groups[groupIndex] = i;

4586                         groups[groupIndex+1] = i + k;

4587                     }

4588                     i = i + k;

4589                     break;

4590                 }

4591                 for (;;) {

4592                     if (capture) {

4593                         groups[groupIndex] = i;

4594                         groups[groupIndex+1] = i + k;

4595                     }

4596                     i = i + k;

4597                     if (++j >= cmax)

4598                         break;

4599                     if (!atom.match(matcher, i, seq))

4600                         break;

4601                     if (i + k != matcher.last) {

4602                         if (match0(matcher, i, j, seq))

4603                             return true;

4604                         break;

4605                     }

4606                 }

4607                 while (j > min) {

4608                     if (next.match(matcher, i, seq)) {

4609                         if (capture) {

4610                             groups[groupIndex+1] = i;

4611                             groups[groupIndex] = i - k;

4612                         }

4613                         return true;

4614                     }

4615                     // backing off

4616                     i = i - k;

4617                     if (capture) {

4618                         groups[groupIndex+1] = i;

4619                         groups[groupIndex] = i - k;

4620                     }

4621                     j--;

4622 

4623                 }

4624                 break;

4625             }

4626             if (capture) {

4627                 groups[groupIndex] = save0;

4628                 groups[groupIndex+1] = save1;

4629             }

4630             return next.match(matcher, i, seq);

4631         }

4632         // Reluctant matching

4633         boolean match1(Matcher matcher, int i, int j, CharSequence seq) {

4634             for (;;) {

4635                 if (next.match(matcher, i, seq))

4636                     return true;

4637                 if (j >= cmax)

4638                     return false;

4639                 if (!atom.match(matcher, i, seq))

4640                     return false;

4641                 if (i == matcher.last)

4642                     return false;

4643                 if (capture) {

4644                     matcher.groups[groupIndex] = i;

4645                     matcher.groups[groupIndex+1] = matcher.last;

4646                 }

4647                 i = matcher.last;

4648                 j++;

4649             }

4650         }

4651         // Possessive matching

4652         boolean match2(Matcher matcher, int i, int j, CharSequence seq) {

4653             for (; j < cmax; j++) {

4654                 if (!atom.match(matcher, i, seq)) {

4655                     break;

4656                 }

4657                 if (capture) {

4658                     matcher.groups[groupIndex] = i;

4659                     matcher.groups[groupIndex+1] = matcher.last;

4660                 }

4661                 if (i == matcher.last) {

4662                     break;

4663                 }

4664                 i = matcher.last;

4665             }

4666             return next.match(matcher, i, seq);

4667         }

4668         boolean study(TreeInfo info) {

4669             // Save original info

4670             int minL = info.minLength;

4671             int maxL = info.maxLength;

4672             boolean maxV = info.maxValid;

4673             boolean detm = info.deterministic;

4674             info.reset();

4675 

4676             atom.study(info);

4677 

4678             int temp = info.minLength * cmin + minL;

4679             if (temp < minL) {

4680                 temp = 0xFFFFFFF; // Arbitrary large number

4681             }

4682             info.minLength = temp;

4683 

4684             if (maxV & info.maxValid) {

4685                 temp = info.maxLength * cmax + maxL;

4686                 info.maxLength = temp;

4687                 if (temp < maxL) {

4688                     info.maxValid = false;

4689                 }

4690             } else {

4691                 info.maxValid = false;

4692             }

4693 

4694             if (info.deterministic && cmin == cmax) {

4695                 info.deterministic = detm;

4696             } else {

4697                 info.deterministic = false;

4698             }

4699             return next.study(info);

4700         }

4701     }

4702 

4703     /**

4704      * A Guard node at the end of each atom node in a Branch. It

4705      * serves the purpose of chaining the "match" operation to

4706      * "next" but not the "study", so we can collect the TreeInfo

4707      * of each atom node without including the TreeInfo of the

4708      * "next".

4709      */

4710     static final class BranchConn extends Node {

4711         BranchConn() {};

4712         boolean match(Matcher matcher, int i, CharSequence seq) {

4713             return next.match(matcher, i, seq);

4714         }

4715         boolean study(TreeInfo info) {

4716             return info.deterministic;

4717         }

4718     }

4719 

4720     /**

4721      * Handles the branching of alternations. Note this is also used for

4722      * the ? quantifier to branch between the case where it matches once

4723      * and where it does not occur.

4724      */

4725     static final class Branch extends Node {

4726         Node[] atoms = new Node[2];

4727         int size = 2;

4728         Node conn;

4729         Branch(Node first, Node second, Node branchConn) {

4730             conn = branchConn;

4731             atoms[0] = first;

4732             atoms[1] = second;

4733         }

4734 

4735         void add(Node node) {

4736             if (size >= atoms.length) {

4737                 Node[] tmp = new Node[atoms.length*2];

4738                 System.arraycopy(atoms, 0, tmp, 0, atoms.length);

4739                 atoms = tmp;

4740             }

4741             atoms[size++] = node;

4742         }

4743 

4744         boolean match(Matcher matcher, int i, CharSequence seq) {

4745             for (int n = 0; n < size; n++) {

4746                 if (atoms[n] == null) {

4747                     if (conn.next.match(matcher, i, seq))

4748                         return true;

4749                 } else if (atoms[n].match(matcher, i, seq)) {

4750                     return true;

4751                 }

4752             }

4753             return false;

4754         }

4755 

4756         boolean study(TreeInfo info) {

4757             int minL = info.minLength;

4758             int maxL = info.maxLength;

4759             boolean maxV = info.maxValid;

4760 

4761             int minL2 = Integer.MAX_VALUE; //arbitrary large enough num

4762             int maxL2 = -1;

4763             for (int n = 0; n < size; n++) {

4764                 info.reset();

4765                 if (atoms[n] != null)

4766                     atoms[n].study(info);

4767                 minL2 = Math.min(minL2, info.minLength);

4768                 maxL2 = Math.max(maxL2, info.maxLength);

4769                 maxV = (maxV & info.maxValid);

4770             }

4771 

4772             minL += minL2;

4773             maxL += maxL2;

4774 

4775             info.reset();

4776             conn.next.study(info);

4777 

4778             info.minLength += minL;

4779             info.maxLength += maxL;

4780             info.maxValid &= maxV;

4781             info.deterministic = false;

4782             return false;

4783         }

4784     }

4785 

4786     /**

4787      * The GroupHead saves the location where the group begins in the locals

4788      * and restores them when the match is done.

4789      *

4790      * The matchRef is used when a reference to this group is accessed later

4791      * in the expression. The locals will have a negative value in them to

4792      * indicate that we do not want to unset the group if the reference

4793      * doesn't match.

4794      */

4795     static final class GroupHead extends Node {

4796         int localIndex;

4797         GroupTail tail;    // for debug/print only, match does not need to know

4798         GroupHead(int localCount) {

4799             localIndex = localCount;

4800         }

4801         boolean match(Matcher matcher, int i, CharSequence seq) {

4802             int save = matcher.locals[localIndex];

4803             matcher.locals[localIndex] = i;

4804             boolean ret = next.match(matcher, i, seq);

4805             matcher.locals[localIndex] = save;

4806             return ret;

4807         }

4808         boolean matchRef(Matcher matcher, int i, CharSequence seq) {

4809             int save = matcher.locals[localIndex];

4810             matcher.locals[localIndex] = ~i; // HACK

4811             boolean ret = next.match(matcher, i, seq);

4812             matcher.locals[localIndex] = save;

4813             return ret;

4814         }

4815     }

4816 

4817     /**

4818      * Recursive reference to a group in the regular expression. It calls

4819      * matchRef because if the reference fails to match we would not unset

4820      * the group.

4821      */

4822     static final class GroupRef extends Node {

4823         GroupHead head;

4824         GroupRef(GroupHead head) {

4825             this.head = head;

4826         }

4827         boolean match(Matcher matcher, int i, CharSequence seq) {

4828             return head.matchRef(matcher, i, seq)

4829                 && next.match(matcher, matcher.last, seq);

4830         }

4831         boolean study(TreeInfo info) {

4832             info.maxValid = false;

4833             info.deterministic = false;

4834             return next.study(info);

4835         }

4836     }

4837 

4838     /**

4839      * The GroupTail handles the setting of group beginning and ending

4840      * locations when groups are successfully matched. It must also be able to

4841      * unset groups that have to be backed off of.

4842      *

4843      * The GroupTail node is also used when a previous group is referenced,

4844      * and in that case no group information needs to be set.

4845      */

4846     static final class GroupTail extends Node {

4847         int localIndex;

4848         int groupIndex;

4849         GroupTail(int localCount, int groupCount) {

4850             localIndex = localCount;

4851             groupIndex = groupCount + groupCount;

4852         }

4853         boolean match(Matcher matcher, int i, CharSequence seq) {

4854             int tmp = matcher.locals[localIndex];

4855             if (tmp >= 0) { // This is the normal group case.

4856                 // Save the group so we can unset it if it

4857                 // backs off of a match.

4858                 int groupStart = matcher.groups[groupIndex];

4859                 int groupEnd = matcher.groups[groupIndex+1];

4860 

4861                 matcher.groups[groupIndex] = tmp;

4862                 matcher.groups[groupIndex+1] = i;

4863                 if (next.match(matcher, i, seq)) {

4864                     return true;

4865                 }

4866                 matcher.groups[groupIndex] = groupStart;

4867                 matcher.groups[groupIndex+1] = groupEnd;

4868                 return false;

4869             } else {

4870                 // This is a group reference case. We don't need to save any

4871                 // group info because it isn't really a group.

4872                 matcher.last = i;

4873                 return true;

4874             }

4875         }

4876     }

4877 

4878     /**

4879      * This sets up a loop to handle a recursive quantifier structure.

4880      */

4881     static final class Prolog extends Node {

4882         Loop loop;

4883         Prolog(Loop loop) {

4884             this.loop = loop;

4885         }

4886         boolean match(Matcher matcher, int i, CharSequence seq) {

4887             return loop.matchInit(matcher, i, seq);

4888         }

4889         boolean study(TreeInfo info) {

4890             return loop.study(info);

4891         }

4892     }

4893 

4894     /**

4895      * Handles the repetition count for a greedy Curly. The matchInit

4896      * is called from the Prolog to save the index of where the group

4897      * beginning is stored. A zero length group check occurs in the

4898      * normal match but is skipped in the matchInit.

4899      */

4900     static class Loop extends Node {

4901         Node body;

4902         int countIndex; // local count index in matcher locals

4903         int beginIndex; // group beginning index

4904         int cmin, cmax;

4905         int posIndex;

4906         Loop(int countIndex, int beginIndex) {

4907             this.countIndex = countIndex;

4908             this.beginIndex = beginIndex;

4909             this.posIndex = -1;

4910         }

4911         boolean match(Matcher matcher, int i, CharSequence seq) {

4912             // Avoid infinite loop in zero-length case.

4913             if (i > matcher.locals[beginIndex]) {

4914                 int count = matcher.locals[countIndex];

4915 

4916                 // This block is for before we reach the minimum

4917                 // iterations required for the loop to match

4918                 if (count < cmin) {

4919                     matcher.locals[countIndex] = count + 1;

4920                     boolean b = body.match(matcher, i, seq);

4921                     // If match failed we must backtrack, so

4922                     // the loop count should NOT be incremented

4923                     if (!b)

4924                         matcher.locals[countIndex] = count;

4925                     // Return success or failure since we are under

4926                     // minimum

4927                     return b;

4928                 }

4929                 // This block is for after we have the minimum

4930                 // iterations required for the loop to match

4931                 if (count < cmax) {

4932                     // Let's check if we have already tried and failed

4933                     // at this starting position "i" in the past.

4934                     // If yes, then just return false wihtout trying

4935                     // again, to stop the exponential backtracking.

4936                     if (posIndex != -1 &&

4937                         matcher.localsPos[posIndex].contains(i)) {

4938                         return next.match(matcher, i, seq);

4939                     }

4940                     matcher.locals[countIndex] = count + 1;

4941                     boolean b = body.match(matcher, i, seq);

4942                     // If match failed we must backtrack, so

4943                     // the loop count should NOT be incremented

4944                     if (b)

4945                         return true;

4946                     matcher.locals[countIndex] = count;

4947                     // save the failed position

4948                     if (posIndex != -1) {

4949                         matcher.localsPos[posIndex].add(i);

4950                     }

4951                 }

4952             }

4953             return next.match(matcher, i, seq);

4954         }

4955         boolean matchInit(Matcher matcher, int i, CharSequence seq) {

4956             int save = matcher.locals[countIndex];

4957             boolean ret = false;

4958             if (posIndex != -1 && matcher.localsPos[posIndex] == null) {

4959                 matcher.localsPos[posIndex] = new IntHashSet();

4960             }

4961             if (0 < cmin) {

4962                 matcher.locals[countIndex] = 1;

4963                 ret = body.match(matcher, i, seq);

4964             } else if (0 < cmax) {

4965                 matcher.locals[countIndex] = 1;

4966                 ret = body.match(matcher, i, seq);

4967                 if (ret == false)

4968                     ret = next.match(matcher, i, seq);

4969             } else {

4970                 ret = next.match(matcher, i, seq);

4971             }

4972             matcher.locals[countIndex] = save;

4973             return ret;

4974         }

4975         boolean study(TreeInfo info) {

4976             info.maxValid = false;

4977             info.deterministic = false;

4978             return false;

4979         }

4980     }

4981 

4982     /**

4983      * Handles the repetition count for a reluctant Curly. The matchInit

4984      * is called from the Prolog to save the index of where the group

4985      * beginning is stored. A zero length group check occurs in the

4986      * normal match but is skipped in the matchInit.

4987      */

4988     static final class LazyLoop extends Loop {

4989         LazyLoop(int countIndex, int beginIndex) {

4990             super(countIndex, beginIndex);

4991         }

4992         boolean match(Matcher matcher, int i, CharSequence seq) {

4993             // Check for zero length group

4994             if (i > matcher.locals[beginIndex]) {

4995                 int count = matcher.locals[countIndex];

4996                 if (count < cmin) {

4997                     matcher.locals[countIndex] = count + 1;

4998                     boolean result = body.match(matcher, i, seq);

4999                     // If match failed we must backtrack, so

5000                     // the loop count should NOT be incremented

5001                     if (!result)

5002                         matcher.locals[countIndex] = count;

5003                     return result;

5004                 }

5005                 if (next.match(matcher, i, seq))

5006                     return true;

5007                 if (count < cmax) {

5008                     matcher.locals[countIndex] = count + 1;

5009                     boolean result = body.match(matcher, i, seq);

5010                     // If match failed we must backtrack, so

5011                     // the loop count should NOT be incremented

5012                     if (!result)

5013                         matcher.locals[countIndex] = count;

5014                     return result;

5015                 }

5016                 return false;

5017             }

5018             return next.match(matcher, i, seq);

5019         }

5020         boolean matchInit(Matcher matcher, int i, CharSequence seq) {

5021             int save = matcher.locals[countIndex];

5022             boolean ret = false;

5023             if (0 < cmin) {

5024                 matcher.locals[countIndex] = 1;

5025                 ret = body.match(matcher, i, seq);

5026             } else if (next.match(matcher, i, seq)) {

5027                 ret = true;

5028             } else if (0 < cmax) {

5029                 matcher.locals[countIndex] = 1;

5030                 ret = body.match(matcher, i, seq);

5031             }

5032             matcher.locals[countIndex] = save;

5033             return ret;

5034         }

5035         boolean study(TreeInfo info) {

5036             info.maxValid = false;

5037             info.deterministic = false;

5038             return false;

5039         }

5040     }

5041 

5042     /**

5043      * Refers to a group in the regular expression. Attempts to match

5044      * whatever the group referred to last matched.

5045      */

5046     static class BackRef extends Node {

5047         int groupIndex;

5048         BackRef(int groupCount) {

5049             super();

5050             groupIndex = groupCount + groupCount;

5051         }

5052         boolean match(Matcher matcher, int i, CharSequence seq) {

5053             int j = matcher.groups[groupIndex];

5054             int k = matcher.groups[groupIndex+1];

5055 

5056             int groupSize = k - j;

5057             // If the referenced group didn't match, neither can this

5058             if (j < 0)

5059                 return false;

5060 

5061             // If there isn't enough input left no match

5062             if (i + groupSize > matcher.to) {

5063                 matcher.hitEnd = true;

5064                 return false;

5065             }

5066             // Check each new char to make sure it matches what the group

5067             // referenced matched last time around

5068             for (int index=0; index<groupSize; index++)

5069                 if (seq.charAt(i+index) != seq.charAt(j+index))

5070                     return false;

5071 

5072             return next.match(matcher, i+groupSize, seq);

5073         }

5074         boolean study(TreeInfo info) {

5075             info.maxValid = false;

5076             return next.study(info);

5077         }

5078     }

5079 

5080     static class CIBackRef extends Node {

5081         int groupIndex;

5082         boolean doUnicodeCase;

5083         CIBackRef(int groupCount, boolean doUnicodeCase) {

5084             super();

5085             groupIndex = groupCount + groupCount;

5086             this.doUnicodeCase = doUnicodeCase;

5087         }

5088         boolean match(Matcher matcher, int i, CharSequence seq) {

5089             int j = matcher.groups[groupIndex];

5090             int k = matcher.groups[groupIndex+1];

5091 

5092             int groupSize = k - j;

5093 

5094             // If the referenced group didn't match, neither can this

5095             if (j < 0)

5096                 return false;

5097 

5098             // If there isn't enough input left no match

5099             if (i + groupSize > matcher.to) {

5100                 matcher.hitEnd = true;

5101                 return false;

5102             }

5103 

5104             // Check each new char to make sure it matches what the group

5105             // referenced matched last time around

5106             int x = i;

5107             for (int index=0; index<groupSize; index++) {

5108                 int c1 = Character.codePointAt(seq, x);

5109                 int c2 = Character.codePointAt(seq, j);

5110                 if (c1 != c2) {

5111                     if (doUnicodeCase) {

5112                         int cc1 = Character.toUpperCase(c1);

5113                         int cc2 = Character.toUpperCase(c2);

5114                         if (cc1 != cc2 &&

5115                             Character.toLowerCase(cc1) !=

5116                             Character.toLowerCase(cc2))

5117                             return false;

5118                     } else {

5119                         if (ASCII.toLower(c1) != ASCII.toLower(c2))

5120                             return false;

5121                     }

5122                 }

5123                 x += Character.charCount(c1);

5124                 j += Character.charCount(c2);

5125             }

5126 

5127             return next.match(matcher, i+groupSize, seq);

5128         }

5129         boolean study(TreeInfo info) {

5130             info.maxValid = false;

5131             return next.study(info);

5132         }

5133     }

5134 

5135     /**

5136      * Searches until the next instance of its atom. This is useful for

5137      * finding the atom efficiently without passing an instance of it

5138      * (greedy problem) and without a lot of wasted search time (reluctant

5139      * problem).

5140      */

5141     static final class First extends Node {

5142         Node atom;

5143         First(Node node) {

5144             this.atom = BnM.optimize(node);

5145         }

5146         boolean match(Matcher matcher, int i, CharSequence seq) {

5147             if (atom instanceof BnM) {

5148                 return atom.match(matcher, i, seq)

5149                     && next.match(matcher, matcher.last, seq);

5150             }

5151             for (;;) {

5152                 if (i > matcher.to) {

5153                     matcher.hitEnd = true;

5154                     return false;

5155                 }

5156                 if (atom.match(matcher, i, seq)) {

5157                     return next.match(matcher, matcher.last, seq);

5158                 }

5159                 i += countChars(seq, i, 1);

5160                 matcher.first++;

5161             }

5162         }

5163         boolean study(TreeInfo info) {

5164             atom.study(info);

5165             info.maxValid = false;

5166             info.deterministic = false;

5167             return next.study(info);

5168         }

5169     }

5170 

5171     static final class Conditional extends Node {

5172         Node cond, yes, not;

5173         Conditional(Node cond, Node yes, Node not) {

5174             this.cond = cond;

5175             this.yes = yes;

5176             this.not = not;

5177         }

5178         boolean match(Matcher matcher, int i, CharSequence seq) {

5179             if (cond.match(matcher, i, seq)) {

5180                 return yes.match(matcher, i, seq);

5181             } else {

5182                 return not.match(matcher, i, seq);

5183             }

5184         }

5185         boolean study(TreeInfo info) {

5186             int minL = info.minLength;

5187             int maxL = info.maxLength;

5188             boolean maxV = info.maxValid;

5189             info.reset();

5190             yes.study(info);

5191 

5192             int minL2 = info.minLength;

5193             int maxL2 = info.maxLength;

5194             boolean maxV2 = info.maxValid;

5195             info.reset();

5196             not.study(info);

5197 

5198             info.minLength = minL + Math.min(minL2, info.minLength);

5199             info.maxLength = maxL + Math.max(maxL2, info.maxLength);

5200             info.maxValid = (maxV & maxV2 & info.maxValid);

5201             info.deterministic = false;

5202             return next.study(info);

5203         }

5204     }

5205 

5206     /**

5207      * Zero width positive lookahead.

5208      */

5209     static final class Pos extends Node {

5210         Node cond;

5211         Pos(Node cond) {

5212             this.cond = cond;

5213         }

5214         boolean match(Matcher matcher, int i, CharSequence seq) {

5215             int savedTo = matcher.to;

5216             boolean conditionMatched = false;

5217 

5218             // Relax transparent region boundaries for lookahead

5219             if (matcher.transparentBounds)

5220                 matcher.to = matcher.getTextLength();

5221             try {

5222                 conditionMatched = cond.match(matcher, i, seq);

5223             } finally {

5224                 // Reinstate region boundaries

5225                 matcher.to = savedTo;

5226             }

5227             return conditionMatched && next.match(matcher, i, seq);

5228         }

5229     }

5230 

5231     /**

5232      * Zero width negative lookahead.

5233      */

5234     static final class Neg extends Node {

5235         Node cond;

5236         Neg(Node cond) {

5237             this.cond = cond;

5238         }

5239         boolean match(Matcher matcher, int i, CharSequence seq) {

5240             int savedTo = matcher.to;

5241             boolean conditionMatched = false;

5242 

5243             // Relax transparent region boundaries for lookahead

5244             if (matcher.transparentBounds)

5245                 matcher.to = matcher.getTextLength();

5246             try {

5247                 if (i < matcher.to) {

5248                     conditionMatched = !cond.match(matcher, i, seq);

5249                 } else {

5250                     // If a negative lookahead succeeds then more input

5251                     // could cause it to fail!

5252                     matcher.requireEnd = true;

5253                     conditionMatched = !cond.match(matcher, i, seq);

5254                 }

5255             } finally {

5256                 // Reinstate region boundaries

5257                 matcher.to = savedTo;

5258             }

5259             return conditionMatched && next.match(matcher, i, seq);

5260         }

5261     }

5262 

5263     /**

5264      * For use with lookbehinds; matches the position where the lookbehind

5265      * was encountered.

5266      */

5267     static Node lookbehindEnd = new Node() {

5268         boolean match(Matcher matcher, int i, CharSequence seq) {

5269             return i == matcher.lookbehindTo;

5270         }

5271     };

5272 

5273     /**

5274      * Zero width positive lookbehind.

5275      */

5276     static class Behind extends Node {

5277         Node cond;

5278         int rmax, rmin;

5279         Behind(Node cond, int rmax, int rmin) {

5280             this.cond = cond;

5281             this.rmax = rmax;

5282             this.rmin = rmin;

5283         }

5284 

5285         boolean match(Matcher matcher, int i, CharSequence seq) {

5286             int savedFrom = matcher.from;

5287             boolean conditionMatched = false;

5288             int startIndex = (!matcher.transparentBounds) ?

5289                              matcher.from : 0;

5290             int from = Math.max(i - rmax, startIndex);

5291             // Set end boundary

5292             int savedLBT = matcher.lookbehindTo;

5293             matcher.lookbehindTo = i;

5294             // Relax transparent region boundaries for lookbehind

5295             if (matcher.transparentBounds)

5296                 matcher.from = 0;

5297             for (int j = i - rmin; !conditionMatched && j >= from; j--) {

5298                 conditionMatched = cond.match(matcher, j, seq);

5299             }

5300             matcher.from = savedFrom;

5301             matcher.lookbehindTo = savedLBT;

5302             return conditionMatched && next.match(matcher, i, seq);

5303         }

5304     }

5305 

5306     /**

5307      * Zero width positive lookbehind, including supplementary

5308      * characters or unpaired surrogates.

5309      */

5310     static final class BehindS extends Behind {

5311         BehindS(Node cond, int rmax, int rmin) {

5312             super(cond, rmax, rmin);

5313         }

5314         boolean match(Matcher matcher, int i, CharSequence seq) {

5315             int rmaxChars = countChars(seq, i, -rmax);

5316             int rminChars = countChars(seq, i, -rmin);

5317             int savedFrom = matcher.from;

5318             int startIndex = (!matcher.transparentBounds) ?

5319                              matcher.from : 0;

5320             boolean conditionMatched = false;

5321             int from = Math.max(i - rmaxChars, startIndex);

5322             // Set end boundary

5323             int savedLBT = matcher.lookbehindTo;

5324             matcher.lookbehindTo = i;

5325             // Relax transparent region boundaries for lookbehind

5326             if (matcher.transparentBounds)

5327                 matcher.from = 0;

5328 

5329             for (int j = i - rminChars;

5330                  !conditionMatched && j >= from;

5331                  j -= j>from ? countChars(seq, j, -1) : 1) {

5332                 conditionMatched = cond.match(matcher, j, seq);

5333             }

5334             matcher.from = savedFrom;

5335             matcher.lookbehindTo = savedLBT;

5336             return conditionMatched && next.match(matcher, i, seq);

5337         }

5338     }

5339 

5340     /**

5341      * Zero width negative lookbehind.

5342      */

5343     static class NotBehind extends Node {

5344         Node cond;

5345         int rmax, rmin;

5346         NotBehind(Node cond, int rmax, int rmin) {

5347             this.cond = cond;

5348             this.rmax = rmax;

5349             this.rmin = rmin;

5350         }

5351 

5352         boolean match(Matcher matcher, int i, CharSequence seq) {

5353             int savedLBT = matcher.lookbehindTo;

5354             int savedFrom = matcher.from;

5355             boolean conditionMatched = false;

5356             int startIndex = (!matcher.transparentBounds) ?

5357                              matcher.from : 0;

5358             int from = Math.max(i - rmax, startIndex);

5359             matcher.lookbehindTo = i;

5360             // Relax transparent region boundaries for lookbehind

5361             if (matcher.transparentBounds)

5362                 matcher.from = 0;

5363             for (int j = i - rmin; !conditionMatched && j >= from; j--) {

5364                 conditionMatched = cond.match(matcher, j, seq);

5365             }

5366             // Reinstate region boundaries

5367             matcher.from = savedFrom;

5368             matcher.lookbehindTo = savedLBT;

5369             return !conditionMatched && next.match(matcher, i, seq);

5370         }

5371     }

5372 

5373     /**

5374      * Zero width negative lookbehind, including supplementary

5375      * characters or unpaired surrogates.

5376      */

5377     static final class NotBehindS extends NotBehind {

5378         NotBehindS(Node cond, int rmax, int rmin) {

5379             super(cond, rmax, rmin);

5380         }

5381         boolean match(Matcher matcher, int i, CharSequence seq) {

5382             int rmaxChars = countChars(seq, i, -rmax);

5383             int rminChars = countChars(seq, i, -rmin);

5384             int savedFrom = matcher.from;

5385             int savedLBT = matcher.lookbehindTo;

5386             boolean conditionMatched = false;

5387             int startIndex = (!matcher.transparentBounds) ?

5388                              matcher.from : 0;

5389             int from = Math.max(i - rmaxChars, startIndex);

5390             matcher.lookbehindTo = i;

5391             // Relax transparent region boundaries for lookbehind

5392             if (matcher.transparentBounds)

5393                 matcher.from = 0;

5394             for (int j = i - rminChars;

5395                  !conditionMatched && j >= from;

5396                  j -= j>from ? countChars(seq, j, -1) : 1) {

5397                 conditionMatched = cond.match(matcher, j, seq);

5398             }

5399             //Reinstate region boundaries

5400             matcher.from = savedFrom;

5401             matcher.lookbehindTo = savedLBT;

5402             return !conditionMatched && next.match(matcher, i, seq);

5403         }

5404     }

5405 

5406     /**

5407      * Handles word boundaries. Includes a field to allow this one class to

5408      * deal with the different types of word boundaries we can match. The word

5409      * characters include underscores, letters, and digits. Non spacing marks

5410      * can are also part of a word if they have a base character, otherwise

5411      * they are ignored for purposes of finding word boundaries.

5412      */

5413     static final class Bound extends Node {

5414         static int LEFT = 0x1;

5415         static int RIGHT= 0x2;

5416         static int BOTH = 0x3;

5417         static int NONE = 0x4;

5418         int type;

5419         boolean useUWORD;

5420         Bound(int n, boolean useUWORD) {

5421             type = n;

5422             this.useUWORD = useUWORD;

5423         }

5424 

5425         boolean isWord(int ch) {

5426             return useUWORD ? CharPredicates.WORD().is(ch)

5427                             : (ch == '_' || Character.isLetterOrDigit(ch));

5428         }

5429 

5430         int check(Matcher matcher, int i, CharSequence seq) {

5431             int ch;

5432             boolean left = false;

5433             int startIndex = matcher.from;

5434             int endIndex = matcher.to;

5435             if (matcher.transparentBounds) {

5436                 startIndex = 0;

5437                 endIndex = matcher.getTextLength();

5438             }

5439             if (i > startIndex) {

5440                 ch = Character.codePointBefore(seq, i);

5441                 left = (isWord(ch) ||

5442                     ((Character.getType(ch) == Character.NON_SPACING_MARK)

5443                      && hasBaseCharacter(matcher, i-1, seq)));

5444             }

5445             boolean right = false;

5446             if (i < endIndex) {

5447                 ch = Character.codePointAt(seq, i);

5448                 right = (isWord(ch) ||

5449                     ((Character.getType(ch) == Character.NON_SPACING_MARK)

5450                      && hasBaseCharacter(matcher, i, seq)));

5451             } else {

5452                 // Tried to access char past the end

5453                 matcher.hitEnd = true;

5454                 // The addition of another char could wreck a boundary

5455                 matcher.requireEnd = true;

5456             }

5457             return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);

5458         }

5459         boolean match(Matcher matcher, int i, CharSequence seq) {

5460             return (check(matcher, i, seq) & type) > 0

5461                 && next.match(matcher, i, seq);

5462         }

5463     }

5464 

5465     /**

5466      * Non spacing marks only count as word characters in bounds calculations

5467      * if they have a base character.

5468      */

5469     private static boolean hasBaseCharacter(Matcher matcher, int i,

5470                                             CharSequence seq)

5471     {

5472         int start = (!matcher.transparentBounds) ?

5473             matcher.from : 0;

5474         for (int x=i; x >= start; x--) {

5475             int ch = Character.codePointAt(seq, x);

5476             if (Character.isLetterOrDigit(ch))

5477                 return true;

5478             if (Character.getType(ch) == Character.NON_SPACING_MARK)

5479                 continue;

5480             return false;

5481         }

5482         return false;

5483     }

5484 

5485     /**

5486      * Attempts to match a slice in the input using the Boyer-Moore string

5487      * matching algorithm. The algorithm is based on the idea that the

5488      * pattern can be shifted farther ahead in the search text if it is

5489      * matched right to left.

5490      * <p>

5491      * The pattern is compared to the input one character at a time, from

5492      * the rightmost character in the pattern to the left. If the characters

5493      * all match the pattern has been found. If a character does not match,

5494      * the pattern is shifted right a distance that is the maximum of two

5495      * functions, the bad character shift and the good suffix shift. This

5496      * shift moves the attempted match position through the input more

5497      * quickly than a naive one position at a time check.

5498      * <p>

5499      * The bad character shift is based on the character from the text that

5500      * did not match. If the character does not appear in the pattern, the

5501      * pattern can be shifted completely beyond the bad character. If the

5502      * character does occur in the pattern, the pattern can be shifted to

5503      * line the pattern up with the next occurrence of that character.

5504      * <p>

5505      * The good suffix shift is based on the idea that some subset on the right

5506      * side of the pattern has matched. When a bad character is found, the

5507      * pattern can be shifted right by the pattern length if the subset does

5508      * not occur again in pattern, or by the amount of distance to the

5509      * next occurrence of the subset in the pattern.

5510      *

5511      * Boyer-Moore search methods adapted from code by Amy Yu.

5512      */

5513     static class BnM extends Node {

5514         int[] buffer;

5515         int[] lastOcc;

5516         int[] optoSft;

5517 

5518         /**

5519          * Pre calculates arrays needed to generate the bad character

5520          * shift and the good suffix shift. Only the last seven bits

5521          * are used to see if chars match; This keeps the tables small

5522          * and covers the heavily used ASCII range, but occasionally

5523          * results in an aliased match for the bad character shift.

5524          */

5525         static Node optimize(Node node) {

5526             if (!(node instanceof Slice)) {

5527                 return node;

5528             }

5529 

5530             int[] src = ((Slice) node).buffer;

5531             int patternLength = src.length;

5532             // The BM algorithm requires a bit of overhead;

5533             // If the pattern is short don't use it, since

5534             // a shift larger than the pattern length cannot

5535             // be used anyway.

5536             if (patternLength < 4) {

5537                 return node;

5538             }

5539             int i, j, k;

5540             int[] lastOcc = new int[128];

5541             int[] optoSft = new int[patternLength];

5542             // Precalculate part of the bad character shift

5543             // It is a table for where in the pattern each

5544             // lower 7-bit value occurs

5545             for (i = 0; i < patternLength; i++) {

5546                 lastOcc[src[i]&0x7F] = i + 1;

5547             }

5548             // Precalculate the good suffix shift

5549             // i is the shift amount being considered

5550 NEXT:       for (i = patternLength; i > 0; i--) {

5551                 // j is the beginning index of suffix being considered

5552                 for (j = patternLength - 1; j >= i; j--) {

5553                     // Testing for good suffix

5554                     if (src[j] == src[j-i]) {

5555                         // src[j..len] is a good suffix

5556                         optoSft[j-1] = i;

5557                     } else {

5558                         // No match. The array has already been

5559                         // filled up with correct values before.

5560                         continue NEXT;

5561                     }

5562                 }

5563                 // This fills up the remaining of optoSft

5564                 // any suffix can not have larger shift amount

5565                 // then its sub-suffix. Why???

5566                 while (j > 0) {

5567                     optoSft[--j] = i;

5568                 }

5569             }

5570             // Set the guard value because of unicode compression

5571             optoSft[patternLength-1] = 1;

5572             if (node instanceof SliceS)

5573                 return new BnMS(src, lastOcc, optoSft, node.next);

5574             return new BnM(src, lastOcc, optoSft, node.next);

5575         }

5576         BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) {

5577             this.buffer = src;

5578             this.lastOcc = lastOcc;

5579             this.optoSft = optoSft;

5580             this.next = next;

5581         }

5582         boolean match(Matcher matcher, int i, CharSequence seq) {

5583             int[] src = buffer;

5584             int patternLength = src.length;

5585             int last = matcher.to - patternLength;

5586 

5587             // Loop over all possible match positions in text

5588 NEXT:       while (i <= last) {

5589                 // Loop over pattern from right to left

5590                 for (int j = patternLength - 1; j >= 0; j--) {

5591                     int ch = seq.charAt(i+j);

5592                     if (ch != src[j]) {

5593                         // Shift search to the right by the maximum of the

5594                         // bad character shift and the good suffix shift

5595                         i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]);

5596                         continue NEXT;

5597                     }

5598                 }

5599                 // Entire pattern matched starting at i

5600                 matcher.first = i;

5601                 boolean ret = next.match(matcher, i + patternLength, seq);

5602                 if (ret) {

5603                     matcher.first = i;

5604                     matcher.groups[0] = matcher.first;

5605                     matcher.groups[1] = matcher.last;

5606                     return true;

5607                 }

5608                 i++;

5609             }

5610             // BnM is only used as the leading node in the unanchored case,

5611             // and it replaced its Start() which always searches to the end

5612             // if it doesn't find what it's looking for, so hitEnd is true.

5613             matcher.hitEnd = true;

5614             return false;

5615         }

5616         boolean study(TreeInfo info) {

5617             info.minLength += buffer.length;

5618             info.maxValid = false;

5619             return next.study(info);

5620         }

5621     }

5622 

5623     /**

5624      * Supplementary support version of BnM(). Unpaired surrogates are

5625      * also handled by this class.

5626      */

5627     static final class BnMS extends BnM {

5628         int lengthInChars;

5629 

5630         BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) {

5631             super(src, lastOcc, optoSft, next);

5632             for (int cp : buffer) {

5633                 lengthInChars += Character.charCount(cp);

5634             }

5635         }

5636         boolean match(Matcher matcher, int i, CharSequence seq) {

5637             int[] src = buffer;

5638             int patternLength = src.length;

5639             int last = matcher.to - lengthInChars;

5640 

5641             // Loop over all possible match positions in text

5642 NEXT:       while (i <= last) {

5643                 // Loop over pattern from right to left

5644                 int ch;

5645                 for (int j = countChars(seq, i, patternLength), x = patternLength - 1;

5646                      j > 0; j -= Character.charCount(ch), x--) {

5647                     ch = Character.codePointBefore(seq, i+j);

5648                     if (ch != src[x]) {

5649                         // Shift search to the right by the maximum of the

5650                         // bad character shift and the good suffix shift

5651                         int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]);

5652                         i += countChars(seq, i, n);

5653                         continue NEXT;

5654                     }

5655                 }

5656                 // Entire pattern matched starting at i

5657                 matcher.first = i;

5658                 boolean ret = next.match(matcher, i + lengthInChars, seq);

5659                 if (ret) {

5660                     matcher.first = i;

5661                     matcher.groups[0] = matcher.first;

5662                     matcher.groups[1] = matcher.last;

5663                     return true;

5664                 }

5665                 i += countChars(seq, i, 1);

5666             }

5667             matcher.hitEnd = true;

5668             return false;

5669         }

5670     }

5671 

/**
 * A test on a single code point, with combinators for building compound
 * tests.
 */
@FunctionalInterface
interface CharPredicate {
    /** Returns true if the code point {@code ch} satisfies this predicate. */
    boolean is(int ch);

    /** Returns a predicate that holds when both this and {@code p} hold. */
    default CharPredicate and(CharPredicate p) {
        return ch -> is(ch) && p.is(ch);
    }

    /** Returns a predicate that holds when this or {@code p} holds. */
    default CharPredicate union(CharPredicate p) {
        return ch -> is(ch) || p.is(ch);
    }

    /** Returns a predicate that holds when this, {@code p1} or {@code p2} holds. */
    default CharPredicate union(CharPredicate p1,
                                CharPredicate p2 ) {
        return ch -> is(ch) || p1.is(ch) || p2.is(ch);
    }

    /** Returns a predicate that holds exactly when this one does not. */
    default CharPredicate negate() {
        return ch -> !is(ch);
    }
}

/**
 * Marker sub-interface of {@link CharPredicate}. Combining two
 * {@code BmpCharPredicate}s with {@code and} or {@code union} yields
 * another {@code BmpCharPredicate}; combining with a plain
 * {@code CharPredicate} yields a plain one.
 * NOTE(review): presumably marks predicates that can only accept BMP
 * code points -- confirm against the users of this type.
 */
interface BmpCharPredicate extends CharPredicate {

    @Override
    default CharPredicate and(CharPredicate p) {
        if (p instanceof BmpCharPredicate)
            return (BmpCharPredicate)(ch -> is(ch) && p.is(ch));
        return ch -> is(ch) && p.is(ch);
    }

    @Override
    default CharPredicate union(CharPredicate p) {
        if (p instanceof BmpCharPredicate)
            return (BmpCharPredicate)(ch -> is(ch) || p.is(ch));
        return ch -> is(ch) || p.is(ch);
    }

    /**
     * Returns a predicate that holds when at least one of the given
     * predicates holds; with no predicates it never holds. The result is
     * a {@code BmpCharPredicate} when every argument is one.
     * <p>
     * Invoked through the type name with exactly one or two arguments,
     * overload resolution selects the inherited instance methods of the
     * same name instead, which does not compile; pass an explicit array
     * in that case.
     *
     * @param predicates the alternatives to combine
     * @return the union of {@code predicates}
     */
    static CharPredicate union(CharPredicate... predicates) {
        // True union: succeed as soon as any alternative accepts ch.
        CharPredicate any = ch -> {
            for (CharPredicate p : predicates) {
                if (p.is(ch))
                    return true;
            }
            return false;
        };
        for (CharPredicate p : predicates) {
            if (! (p instanceof BmpCharPredicate))
                return any;
        }
        // A lambda created as a plain CharPredicate cannot be cast to
        // BmpCharPredicate, so wrap it in a new instance of that type.
        BmpCharPredicate bmpAny = any::is;
        return bmpAny;
    }
}

5718 

5719     /**

5720      * matches a Perl vertical whitespace

5721      */

5722     static BmpCharPredicate VertWS() {

5723         return cp -> (cp >= 0x0A && cp <= 0x0D) ||

5724             cp == 0x85 || cp == 0x2028 || cp == 0x2029;

5725     }

5726 

5727     /**

5728      * matches a Perl horizontal whitespace

5729      */

5730     static BmpCharPredicate HorizWS() {

5731         return cp ->

5732             cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 ||

5733             cp == 0x180e || cp >= 0x2000 && cp <= 0x200a ||  cp == 0x202f ||

5734             cp == 0x205f || cp == 0x3000;

5735     }

5736 

5737     /**

5738      *  for the Unicode category ALL and the dot metacharacter when

5739      *  in dotall mode.

5740      */

5741     static CharPredicate ALL() {

5742         return ch -> true;

5743     }

5744 

5745     /**

5746      * for the dot metacharacter when dotall is not enabled.

5747      */

5748     static CharPredicate DOT() {

5749         return ch ->

5750             (ch != '\n' && ch != '\r'

5751             && (ch|1) != '\u2029'

5752             && ch != '\u0085');

5753     }

5754 

5755     /**

5756      *  the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled.

5757      */

5758     static CharPredicate UNIXDOT() {

5759         return ch ->  ch != '\n';

5760     }

5761 

5762     /**

5763      * Indicate that matches a Supplementary Unicode character

5764      */

5765     static CharPredicate SingleS(int c) {

5766         return ch -> ch == c;

5767     }

5768 

5769     /**

5770      * A bmp/optimized predicate of single

5771      */

5772     static BmpCharPredicate Single(int c) {

5773         return ch -> ch == c;

5774     }

5775 

5776     /**

5777      * Case insensitive matches a given BMP character

5778      */

5779     static BmpCharPredicate SingleI(int lower, int upper) {

5780         return ch -> ch == lower || ch == upper;

5781     }

5782 

5783     /**

5784      * Unicode case insensitive matches a given Unicode character

5785      */

5786     static CharPredicate SingleU(int lower) {

5787         return ch -> lower == ch ||

5788                      lower == Character.toLowerCase(Character.toUpperCase(ch));

5789     }

5790 

5791     private static boolean inRange(int lower, int ch, int upper) {

5792         return lower <= ch && ch <= upper;

5793     }

5794 

5795     /**

5796      * Charactrs within a explicit value range

5797      */

5798     static CharPredicate Range(int lower, int upper) {

5799         if (upper < Character.MIN_HIGH_SURROGATE ||

5800             lower > Character.MAX_HIGH_SURROGATE &&

5801             upper < Character.MIN_SUPPLEMENTARY_CODE_POINT)

5802             return (BmpCharPredicate)(ch -> inRange(lower, ch, upper));

5803         return ch -> inRange(lower, ch, upper);

5804     }

5805 

5806    /**

5807     * Charactrs within a explicit value range in a case insensitive manner.

5808     */

5809     static CharPredicate CIRange(int lower, int upper) {

5810         return ch -> inRange(lower, ch, upper) ||

5811                      ASCII.isAscii(ch) &&

5812                      (inRange(lower, ASCII.toUpper(ch), upper) ||

5813                       inRange(lower, ASCII.toLower(ch), upper));

5814     }

5815 

5816     static CharPredicate CIRangeU(int lower, int upper) {

5817         return ch -> {

5818             if (inRange(lower, ch, upper))

5819                 return true;

5820             int up = Character.toUpperCase(ch);

5821             return inRange(lower, up, upper) ||

5822                    inRange(lower, Character.toLowerCase(up), upper);

5823         };

5824     }

5825 

5826     /**

5827      *  This must be the very first initializer.

5828      */

5829     static final Node accept = new Node();

5830 

5831     static final Node lastAccept = new LastNode();

5832 

5833     /**

5834      * Creates a predicate that tests if this pattern is found in a given input

5835      * string.

5836      *

5837      * @apiNote

5838      * This method creates a predicate that behaves as if it creates a matcher

5839      * from the input sequence and then calls {@code find}, for example a

5840      * predicate of the form:

5841      * <pre>{@code

5842      *   s -> matcher(s).find();

5843      * }</pre>

5844      *

5845      * @return  The predicate which can be used for finding a match on a

5846      *          subsequence of a string

5847      * @since   1.8

5848      * @see     Matcher#find

5849      */

5850     public Predicate<String> asPredicate() {

5851         return s -> matcher(s).find();

5852     }

5853 

5854     /**

5855      * Creates a predicate that tests if this pattern matches a given input string.

5856      *

5857      * @apiNote

5858      * This method creates a predicate that behaves as if it creates a matcher

5859      * from the input sequence and then calls {@code matches}, for example a

5860      * predicate of the form:

5861      * <pre>{@code

5862      *   s -> matcher(s).matches();

5863      * }</pre>

5864      *

5865      * @return  The predicate which can be used for matching an input string

5866      *          against this pattern.

5867      * @since   11

5868      * @see     Matcher#matches

5869      */

5870     public Predicate<String> asMatchPredicate() {

5871         return s -> matcher(s).matches();

5872     }

5873 

5874     /**

5875      * Creates a stream from the given input sequence around matches of this

5876      * pattern.

5877      *

5878      * <p> The stream returned by this method contains each substring of the

5879      * input sequence that is terminated by another subsequence that matches

5880      * this pattern or is terminated by the end of the input sequence.  The

5881      * substrings in the stream are in the order in which they occur in the

5882      * input. Trailing empty strings will be discarded and not encountered in

5883      * the stream.

5884      *

5885      * <p> If this pattern does not match any subsequence of the input then

5886      * the resulting stream has just one element, namely the input sequence in

5887      * string form.

5888      *

5889      * <p> When there is a positive-width match at the beginning of the input

5890      * sequence then an empty leading substring is included at the beginning

5891      * of the stream. A zero-width match at the beginning however never produces

5892      * such empty leading substring.

5893      *

5894      * <p> If the input sequence is mutable, it must remain constant during the

5895      * execution of the terminal stream operation.  Otherwise, the result of the

5896      * terminal stream operation is undefined.

5897      *

5898      * @param   input

5899      *          The character sequence to be split

5900      *

5901      * @return  The stream of strings computed by splitting the input

5902      *          around matches of this pattern

5903      * @see     #split(CharSequence)

5904      * @since   1.8

5905      */

5906     public Stream<String> splitAsStream(final CharSequence input) {

5907         class MatcherIterator implements Iterator<String> {

5908             private Matcher matcher;

5909             // The start position of the next sub-sequence of input

5910             // when current == input.length there are no more elements

5911             private int current;

5912             // null if the next element, if any, needs to obtained

5913             private String nextElement;

5914             // > 0 if there are N next empty elements

5915             private int emptyElementCount;

5916 

5917             public String next() {

5918                 if (!hasNext())

5919                     throw new NoSuchElementException();

5920 

5921                 if (emptyElementCount == 0) {

5922                     String n = nextElement;

5923                     nextElement = null;

5924                     return n;

5925                 } else {

5926                     emptyElementCount--;

5927                     return "";

5928                 }

5929             }

5930 

5931             public boolean hasNext() {

5932                 if (matcher == null) {

5933                     matcher = matcher(input);

5934                     // If the input is an empty string then the result can only be a

5935                     // stream of the input.  Induce that by setting the empty

5936                     // element count to 1

5937                     emptyElementCount = input.length() == 0 ? 1 : 0;

5938                 }

5939                 if (nextElement != null || emptyElementCount > 0)

5940                     return true;

5941 

5942                 if (current == input.length())

5943                     return false;

5944 

5945                 // Consume the next matching element

5946                 // Count sequence of matching empty elements

5947                 while (matcher.find()) {

5948                     nextElement = input.subSequence(current, matcher.start()).toString();

5949                     current = matcher.end();

5950                     if (!nextElement.isEmpty()) {

5951                         return true;

5952                     } else if (current > 0) { // no empty leading substring for zero-width

5953                                               // match at the beginning of the input

5954                         emptyElementCount++;

5955                     }

5956                 }

5957 

5958                 // Consume last matching element

5959                 nextElement = input.subSequence(current, input.length()).toString();

5960                 current = input.length();

5961                 if (!nextElement.isEmpty()) {

5962                     return true;

5963                 } else {

5964                     // Ignore a terminal sequence of matching empty elements

5965                     emptyElementCount = 0;

5966                     nextElement = null;

5967                     return false;

5968                 }

5969             }

5970         }

5971         return StreamSupport.stream(Spliterators.spliteratorUnknownSize(

5972                 new MatcherIterator(), Spliterator.ORDERED | Spliterator.NONNULL), false);

5973     }

5974 }

5975