1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.tomcat.util.buf;
18
19 import java.io.IOException;
20 import java.nio.charset.StandardCharsets;
21 import java.util.BitSet;
22
23 /**
24 * Efficient implementation of a UTF-8 encoder.
25 * This class is not thread safe - you need one encoder per thread.
26 * The encoder will save and recycle the internal objects, avoiding
27 * garbage.
28 *
29 * You can add extra characters that you want preserved, for example
30 * while encoding a URL you can add "/".
31 *
32 * @author Costin Manolache
33 */
34 public final class UEncoder {
35
36 public enum SafeCharsSet {
37 WITH_SLASH("/"), DEFAULT("");
38 private final BitSet safeChars;
39
40 private BitSet getSafeChars() {
41 return this.safeChars;
42 }
43
44 private SafeCharsSet(String additionalSafeChars) {
45 safeChars = initialSafeChars();
46 for (char c : additionalSafeChars.toCharArray()) {
47 safeChars.set(c);
48 }
49 }
50 }
51
52 // Not static - the set may differ ( it's better than adding
53 // an extra check for "/", "+", etc
54 private BitSet safeChars=null;
55 private C2BConverter c2b=null;
56 private ByteChunk bb=null;
57 private CharChunk cb=null;
58 private CharChunk output=null;
59
60 /**
61 * Create a UEncoder with an unmodifiable safe character set.
62 *
63 * @param safeCharsSet safe characters for this encoder
64 */
65 public UEncoder(SafeCharsSet safeCharsSet) {
66 this.safeChars = safeCharsSet.getSafeChars();
67 }
68
69 /**
70 * URL Encode string, using a specified encoding.
71 *
72 * @param s string to be encoded
73 * @param start the beginning index, inclusive
74 * @param end the ending index, exclusive
75 *
76 * @return A new CharChunk contained the URL encoded string
77 *
78 * @throws IOException If an I/O error occurs
79 */
80 public CharChunk encodeURL(String s, int start, int end)
81 throws IOException {
82 if (c2b == null) {
83 bb = new ByteChunk(8); // small enough.
84 cb = new CharChunk(2); // small enough.
85 output = new CharChunk(64); // small enough.
86 c2b = new C2BConverter(StandardCharsets.UTF_8);
87 } else {
88 bb.recycle();
89 cb.recycle();
90 output.recycle();
91 }
92
93 for (int i = start; i < end; i++) {
94 char c = s.charAt(i);
95 if (safeChars.get(c)) {
96 output.append(c);
97 } else {
98 cb.append(c);
99 c2b.convert(cb, bb);
100
101 // "surrogate" - UTF is _not_ 16 bit, but 21 !!!!
102 // ( while UCS is 31 ). Amazing...
103 if (c >= 0xD800 && c <= 0xDBFF) {
104 if ((i+1) < end) {
105 char d = s.charAt(i+1);
106 if (d >= 0xDC00 && d <= 0xDFFF) {
107 cb.append(d);
108 c2b.convert(cb, bb);
109 i++;
110 }
111 }
112 }
113
114 urlEncode(output, bb);
115 cb.recycle();
116 bb.recycle();
117 }
118 }
119
120 return output;
121 }
122
123 protected void urlEncode(CharChunk out, ByteChunk bb)
124 throws IOException {
125 byte[] bytes = bb.getBuffer();
126 for (int j = bb.getStart(); j < bb.getEnd(); j++) {
127 out.append('%');
128 char ch = Character.forDigit((bytes[j] >> 4) & 0xF, 16);
129 out.append(ch);
130 ch = Character.forDigit(bytes[j] & 0xF, 16);
131 out.append(ch);
132 }
133 }
134
135 // -------------------- Internal implementation --------------------
136
137 private static BitSet initialSafeChars() {
138 BitSet initialSafeChars=new BitSet(128);
139 int i;
140 for (i = 'a'; i <= 'z'; i++) {
141 initialSafeChars.set(i);
142 }
143 for (i = 'A'; i <= 'Z'; i++) {
144 initialSafeChars.set(i);
145 }
146 for (i = '0'; i <= '9'; i++) {
147 initialSafeChars.set(i);
148 }
149 //safe
150 initialSafeChars.set('$');
151 initialSafeChars.set('-');
152 initialSafeChars.set('_');
153 initialSafeChars.set('.');
154
155 // Dangerous: someone may treat this as " "
156 // RFC1738 does allow it, it's not reserved
157 // initialSafeChars.set('+');
158 //extra
159 initialSafeChars.set('!');
160 initialSafeChars.set('*');
161 initialSafeChars.set('\'');
162 initialSafeChars.set('(');
163 initialSafeChars.set(')');
164 initialSafeChars.set(',');
165 return initialSafeChars;
166 }
167 }
168