1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.tomcat.util.buf;
18
19 import java.nio.ByteBuffer;
20 import java.nio.CharBuffer;
21 import java.nio.charset.CharsetDecoder;
22 import java.nio.charset.CoderResult;
23 import java.nio.charset.StandardCharsets;
24
/**
 * Decodes UTF-8 encoded bytes to characters. Extracted from Apache Harmony and
 * modified to reject code points from U+D800 to U+DFFF as per RFC3629. The
 * standard Java decoder does not reject these. It has also been modified to
 * reject code points greater than U+10FFFF which the standard Java decoder
 * rejects but the harmony one does not.
 * <p>
 * Whenever a decode call returns, the input buffer position is left on the
 * first byte of the first sequence that has not been fully decoded, so that
 * the lengths reported by error results and any subsequent call are measured
 * from a sequence boundary.
 */
public class Utf8Decoder extends CharsetDecoder {

    // The next table contains information about UTF-8 charset and
    // correspondence of 1st byte to the length of sequence
    // For information please visit http://www.ietf.org/rfc/rfc3629.txt
    //
    // Please note, o means 0, actually.
    // -------------------------------------------------------------------
    // 0         1         2         3          Value
    // -------------------------------------------------------------------
    // oxxxxxxx                                 00000000 00000000 0xxxxxxx
    // 11oyyyyy  1oxxxxxx                       00000000 00000yyy yyxxxxxx
    // 111ozzzz  1oyyyyyy  1oxxxxxx             00000000 zzzzyyyy yyxxxxxx
    // 1111ouuu  1ouuzzzz  1oyyyyyy  1oxxxxxx   000uuuuu zzzzyyyy yyxxxxxx
    //
    // Indexed by (lead byte & 0x7F); the value is the number of continuation
    // bytes that must follow, or -1 if the byte can not start a sequence.
    private static final int[] REMAINING_BYTES = {
            // 1owwwwww
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            // 11oyyyyy
            -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            // 111ozzzz
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            // 1111ouuu
            3, 3, 3, 3, 3, -1, -1, -1,
            // > 11110111
            -1, -1, -1, -1, -1, -1, -1, -1};

    // Indexed by the number of continuation bytes; the value is subtracted
    // from the accumulated bytes to strip the marker bits left in place by the
    // lead byte (after masking with 0x7F) and by each continuation byte.
    private static final int[] REMAINING_NUMBERS = {0, // 0 1 2 3
            4224, // (01o00000b << 6)+(1o000000b)
            401536, // (011o0000b << 12)+(1o000000b << 6)+(1o000000b)
            29892736 // (0111o000b << 18)+(1o000000b << 12)+(1o000000b <<
                     // 6)+(1o000000b)
    };

    // Indexed by the number of continuation bytes; the smallest code point
    // that legitimately needs a sequence of that length. Anything lower is an
    // overlong encoding.
    private static final int[] LOWER_ENCODING_LIMIT = {-1, 0x80, 0x800, 0x10000};


    /**
     * Creates a decoder for the UTF-8 charset that reports an average and a
     * maximum of one char produced per input byte.
     */
    public Utf8Decoder() {
        super(StandardCharsets.UTF_8, 1.0f, 1.0f);
    }


    /**
     * Decodes as many bytes as possible from {@code in} into {@code out}.
     * Uses direct array access when both buffers expose a backing array and
     * falls back to relative buffer get/put operations otherwise.
     *
     * @param in  The input byte buffer
     * @param out The output character buffer
     *
     * @return A coder result describing why decoding stopped
     */
    @Override
    protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
        if (in.hasArray() && out.hasArray()) {
            return decodeHasArray(in, out);
        }
        return decodeNotHasArray(in, out);
    }


    /**
     * Decoding loop used when at least one buffer has no accessible backing
     * array.
     *
     * @param in  The input byte buffer
     * @param out The output character buffer
     *
     * @return A coder result describing why decoding stopped
     */
    private CoderResult decodeNotHasArray(ByteBuffer in, CharBuffer out) {
        int outRemaining = out.remaining();
        // pos always refers to the first byte of the sequence currently being
        // decoded. It is only advanced once the decoded character(s) have been
        // written so that every early return below leaves the input
        // positioned on a sequence boundary.
        int pos = in.position();
        final int limit = in.limit();
        try {
            while (pos < limit) {
                if (outRemaining == 0) {
                    return CoderResult.OVERFLOW;
                }
                int jchar = in.get();
                int tail = 0;
                if (jchar < 0) {
                    jchar = jchar & 0x7F;
                    tail = REMAINING_BYTES[jchar];
                    if (tail == -1) {
                        return CoderResult.malformedForLength(1);
                    }
                    if (limit - pos < 1 + tail) {
                        // No early test for invalid sequences here as peeking
                        // at the next byte is harder
                        return CoderResult.UNDERFLOW;
                    }
                    for (int i = 0; i < tail; i++) {
                        int nextByte = in.get() & 0xFF;
                        if ((nextByte & 0xC0) != 0x80) {
                            return CoderResult.malformedForLength(1 + i);
                        }
                        jchar = (jchar << 6) + nextByte;
                    }
                    jchar -= REMAINING_NUMBERS[tail];
                    if (jchar < LOWER_ENCODING_LIMIT[tail]) {
                        // Should have been encoded in fewer octets
                        return CoderResult.malformedForLength(1);
                    }
                }
                // Apache Tomcat added test
                if (jchar >= 0xD800 && jchar <= 0xDFFF) {
                    return CoderResult.unmappableForLength(3);
                }
                // Apache Tomcat added test
                if (jchar > 0x10FFFF) {
                    return CoderResult.unmappableForLength(4);
                }
                if (jchar <= 0xffff) {
                    out.put((char) jchar);
                    outRemaining--;
                } else {
                    if (outRemaining < 2) {
                        // No room for the surrogate pair. pos still refers to
                        // the lead byte so the sequence is decoded again once
                        // the caller has made more space available.
                        return CoderResult.OVERFLOW;
                    }
                    out.put((char) ((jchar >> 0xA) + 0xD7C0));
                    out.put((char) ((jchar & 0x3FF) + 0xDC00));
                    outRemaining -= 2;
                }
                // Sequence fully consumed: lead byte plus continuation bytes
                pos += 1 + tail;
            }
            return CoderResult.UNDERFLOW;
        } finally {
            // The relative gets above moved the buffer position; restore it
            // to the last sequence boundary reached.
            in.position(pos);
        }
    }


    /**
     * Decoding loop used when both buffers expose a backing array. Invalid
     * sequences are detected as early as possible, including when only part
     * of the sequence is available.
     *
     * @param in  The input byte buffer
     * @param out The output character buffer
     *
     * @return A coder result describing why decoding stopped
     */
    private CoderResult decodeHasArray(ByteBuffer in, CharBuffer out) {
        int outRemaining = out.remaining();
        final byte[] bArr = in.array();
        final char[] cArr = out.array();
        final int inIndexLimit = in.limit() + in.arrayOffset();
        int inIndex = in.position() + in.arrayOffset();
        int outIndex = out.position() + out.arrayOffset();
        // if someone would change the limit in process,
        // he would face consequences
        for (; inIndex < inIndexLimit && outRemaining > 0; inIndex++) {
            int jchar = bArr[inIndex];
            int tail = 0;
            if (jchar < 0) {
                jchar = jchar & 0x7F;
                // If first byte is invalid, tail will be set to -1
                tail = REMAINING_BYTES[jchar];
                if (tail == -1) {
                    return finish(in, inIndex, out, outIndex, CoderResult.malformedForLength(1));
                }
                // Additional checks to detect invalid sequences ASAP
                // Checks derived from Unicode 6.2, Chapter 3, Table 3-7
                int tailAvailable = inIndexLimit - inIndex - 1;
                // Check 2nd byte if present
                if (tailAvailable > 0 && !isValidSecondByte(jchar, bArr[inIndex + 1] & 0xFF)) {
                    return finish(in, inIndex, out, outIndex, CoderResult.malformedForLength(1));
                }
                // Check third byte if present and expected
                if (tailAvailable > 1 && tail > 1 && (bArr[inIndex + 2] & 0xC0) != 0x80) {
                    return finish(in, inIndex, out, outIndex, CoderResult.malformedForLength(2));
                }
                // Check fourth byte if present and expected
                if (tailAvailable > 2 && tail > 2 && (bArr[inIndex + 3] & 0xC0) != 0x80) {
                    return finish(in, inIndex, out, outIndex, CoderResult.malformedForLength(3));
                }
                if (tailAvailable < tail) {
                    // Incomplete sequence. Leave it in the input for later.
                    break;
                }
                for (int i = 0; i < tail; i++) {
                    int nextByte = bArr[inIndex + i + 1] & 0xFF;
                    if ((nextByte & 0xC0) != 0x80) {
                        return finish(in, inIndex, out, outIndex, CoderResult.malformedForLength(1 + i));
                    }
                    jchar = (jchar << 6) + nextByte;
                }
                jchar -= REMAINING_NUMBERS[tail];
                if (jchar < LOWER_ENCODING_LIMIT[tail]) {
                    // Should have been encoded in fewer octets
                    return finish(in, inIndex, out, outIndex, CoderResult.malformedForLength(1));
                }
                // inIndex now refers to the final byte of the sequence
                inIndex += tail;
            }
            // Apache Tomcat added tests. The second byte checks above already
            // reject these ranges as malformed so these are a safety net. If
            // ever reached, report from the lead byte with positions updated.
            if (jchar >= 0xD800 && jchar <= 0xDFFF) {
                return finish(in, inIndex - tail, out, outIndex, CoderResult.unmappableForLength(3));
            }
            if (jchar > 0x10FFFF) {
                return finish(in, inIndex - tail, out, outIndex, CoderResult.unmappableForLength(4));
            }
            if (jchar <= 0xffff) {
                cArr[outIndex++] = (char) jchar;
                outRemaining--;
            } else {
                if (outRemaining < 2) {
                    // Encoded with 4 bytes. inIndex currently points
                    // to the final byte. Move it back to first byte.
                    return finish(in, inIndex - tail, out, outIndex, CoderResult.OVERFLOW);
                }
                cArr[outIndex++] = (char) ((jchar >> 0xA) + 0xD7C0);
                cArr[outIndex++] = (char) ((jchar & 0x3FF) + 0xDC00);
                outRemaining -= 2;
            }
        }
        return finish(in, inIndex, out, outIndex,
                (outRemaining == 0 && inIndex < inIndexLimit) ? CoderResult.OVERFLOW : CoderResult.UNDERFLOW);
    }


    /**
     * Tests the second byte of a multi-byte sequence against the range
     * permitted for the given lead byte (Unicode 6.2, Chapter 3, Table 3-7).
     *
     * @param lead   The lead byte masked with 0x7F. Must be a value for which
     *               the sequence length table holds a length greater than zero
     * @param second The second byte as an unsigned value (0..255)
     *
     * @return {@code true} if the second byte is permitted after the lead byte
     */
    private static boolean isValidSecondByte(int lead, int second) {
        switch (lead) {
            case 0x60:
                // First byte E0, second byte A0..BF
                return (second & 0xE0) == 0xA0;
            case 0x6D:
                // First byte ED, second byte 80..9F
                return (second & 0xE0) == 0x80;
            case 0x70:
                // First byte F0, second byte 90..BF
                return second >= 0x90 && second <= 0xBF;
            case 0x74:
                // First byte F4, second byte 80..8F
                return (second & 0xF0) == 0x80;
            default:
                // First byte C2..DF, E1..EC, EE..EF or F1..F3,
                // second byte 80..BF
                return (second & 0xC0) == 0x80;
        }
    }


    /**
     * Publishes the array indexes reached by the array based decoding loop
     * back to the buffers and passes through the result to return.
     *
     * @param in       The input byte buffer
     * @param inIndex  Index into the input backing array of the next byte to
     *                 be read
     * @param out      The output character buffer
     * @param outIndex Index into the output backing array of the next char to
     *                 be written
     * @param result   The result to return
     *
     * @return {@code result}
     */
    private static CoderResult finish(ByteBuffer in, int inIndex, CharBuffer out, int outIndex,
            CoderResult result) {
        in.position(inIndex - in.arrayOffset());
        out.position(outIndex - out.arrayOffset());
        return result;
    }
}
300