File Source: stringutils.java
1 /*
2 * Copyright (c) 2003-2006, Simon Brown
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * - Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
14 * distribution.
15 *
16 * - Neither the name of Pebble nor the names of its contributors may
17 * be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32 package net.sourceforge.pebble.util;
33
34 import java.util.*;
35 import java.util.regex.Matcher;
36 import java.util.regex.Pattern;
37
38 /**
39 * A collection of utility methods for manipulating strings.
40 *
41 * @author Simon Brown
42 */
/*
P/P * Method: void net.sourceforge.pebble.util.StringUtils()
*/
43 public final class StringUtils {
44
/*
P/P * Method: net.sourceforge.pebble.util.StringUtils__static_init
*
* Postconditions:
* init'ed(BR_TAG_PATTERN)
* init'ed(CLOSING_A_TAG_PATTERN)
* init'ed(CLOSING_BLOCKQUOTE_TAG_PATTERN)
* init'ed(CLOSING_B_TAG_PATTERN)
* init'ed(CLOSING_EM_TAG_PATTERN)
* init'ed(CLOSING_I_TAG_PATTERN)
* init'ed(CLOSING_LI_TAG_PATTERN)
* init'ed(CLOSING_OL_TAG_PATTERN)
* init'ed(CLOSING_PRE_TAG_PATTERN)
* init'ed(CLOSING_P_TAG_PATTERN)
* ...
*/
45 private static final Pattern OPENING_B_TAG_PATTERN = Pattern.compile("<b>", Pattern.CASE_INSENSITIVE);
46 private static final Pattern CLOSING_B_TAG_PATTERN = Pattern.compile("</b>", Pattern.CASE_INSENSITIVE);
47 private static final Pattern OPENING_STRONG_TAG_PATTERN = Pattern.compile("<strong>", Pattern.CASE_INSENSITIVE);
48 private static final Pattern CLOSING_STRONG_TAG_PATTERN = Pattern.compile("</strong>", Pattern.CASE_INSENSITIVE);
49 private static final Pattern OPENING_I_TAG_PATTERN = Pattern.compile("<i>", Pattern.CASE_INSENSITIVE);
50 private static final Pattern CLOSING_I_TAG_PATTERN = Pattern.compile("</i>", Pattern.CASE_INSENSITIVE);
51 private static final Pattern OPENING_EM_TAG_PATTERN = Pattern.compile("<em>", Pattern.CASE_INSENSITIVE);
52 private static final Pattern CLOSING_EM_TAG_PATTERN = Pattern.compile("</em>", Pattern.CASE_INSENSITIVE);
53 private static final Pattern OPENING_BLOCKQUOTE_TAG_PATTERN = Pattern.compile("<blockquote>", Pattern.CASE_INSENSITIVE);
54 private static final Pattern CLOSING_BLOCKQUOTE_TAG_PATTERN = Pattern.compile("</blockquote>", Pattern.CASE_INSENSITIVE);
55 private static final Pattern BR_TAG_PATTERN = Pattern.compile("<br */*>", Pattern.CASE_INSENSITIVE);
56 private static final Pattern OPENING_P_TAG_PATTERN = Pattern.compile("<p>", Pattern.CASE_INSENSITIVE);
57 private static final Pattern CLOSING_P_TAG_PATTERN = Pattern.compile("</p>", Pattern.CASE_INSENSITIVE);
58 private static final Pattern OPENING_PRE_TAG_PATTERN = Pattern.compile("<pre>", Pattern.CASE_INSENSITIVE);
59 private static final Pattern CLOSING_PRE_TAG_PATTERN = Pattern.compile("</pre>", Pattern.CASE_INSENSITIVE);
60 private static final Pattern OPENING_UL_TAG_PATTERN = Pattern.compile("<ul>", Pattern.CASE_INSENSITIVE);
61 private static final Pattern CLOSING_UL_TAG_PATTERN = Pattern.compile("</ul>", Pattern.CASE_INSENSITIVE);
62 private static final Pattern OPENING_OL_TAG_PATTERN = Pattern.compile("<ol>", Pattern.CASE_INSENSITIVE);
63 private static final Pattern CLOSING_OL_TAG_PATTERN = Pattern.compile("</ol>", Pattern.CASE_INSENSITIVE);
64 private static final Pattern OPENING_LI_TAG_PATTERN = Pattern.compile("<li>", Pattern.CASE_INSENSITIVE);
65 private static final Pattern CLOSING_LI_TAG_PATTERN = Pattern.compile("</li>", Pattern.CASE_INSENSITIVE);
66 private static final Pattern CLOSING_A_TAG_PATTERN = Pattern.compile("</a>", Pattern.CASE_INSENSITIVE);
67 private static final Pattern OPENING_A_TAG_PATTERN = Pattern.compile("<a href=.*?>", Pattern.CASE_INSENSITIVE);
68 private static final Pattern OPENING_SUP_TAG_PATTERN = Pattern.compile("<sup>", Pattern.CASE_INSENSITIVE);
69 private static final Pattern CLOSING_SUP_TAG_PATTERN = Pattern.compile("</sup>", Pattern.CASE_INSENSITIVE);
70 private static final Pattern OPENING_SUB_TAG_PATTERN = Pattern.compile("<sub>", Pattern.CASE_INSENSITIVE);
71 private static final Pattern CLOSING_SUB_TAG_PATTERN = Pattern.compile("</sub>", Pattern.CASE_INSENSITIVE);
72
73 public static final int MAX_CONTENT_LENGTH = 255;
74 public static final int MAX_WORD_LENGTH = 20;
75 public static final int MAX_NUM_OF_POSTS = 5;
76
77
// The 248 HTML 4 named character entities, keyed by entity reference
// ("&name;") and mapped to the character each one denotes. The four markup
// entities (quot, amp, lt, gt) are not in the table.
// NOTE(review): the keys were reconstructed from the code points, because the
// listing this was recovered from had decoded them; confirm against the
// original source file.
private static final Map<String,String> htmlEntities = new HashMap<String,String>();
// URL prefixes accepted by validateUrl().
private static final Collection<String> allowedSchemes = new ArrayList<String>();

static {
  // Latin-1 symbols and letters
  htmlEntities.put("&nbsp;", "\u00A0");
  htmlEntities.put("&iexcl;", "\u00A1");
  htmlEntities.put("&cent;", "\u00A2");
  htmlEntities.put("&pound;", "\u00A3");
  htmlEntities.put("&curren;", "\u00A4");
  htmlEntities.put("&yen;", "\u00A5");
  htmlEntities.put("&brvbar;", "\u00A6");
  htmlEntities.put("&sect;", "\u00A7");
  htmlEntities.put("&uml;", "\u00A8");
  htmlEntities.put("&copy;", "\u00A9");
  htmlEntities.put("&ordf;", "\u00AA");
  htmlEntities.put("&laquo;", "\u00AB");
  htmlEntities.put("&not;", "\u00AC");
  htmlEntities.put("&shy;", "\u00AD");
  htmlEntities.put("&reg;", "\u00AE");
  htmlEntities.put("&macr;", "\u00AF");
  htmlEntities.put("&deg;", "\u00B0");
  htmlEntities.put("&plusmn;", "\u00B1");
  htmlEntities.put("&sup2;", "\u00B2");
  htmlEntities.put("&sup3;", "\u00B3");
  htmlEntities.put("&acute;", "\u00B4");
  htmlEntities.put("&micro;", "\u00B5");
  htmlEntities.put("&para;", "\u00B6");
  htmlEntities.put("&middot;", "\u00B7");
  htmlEntities.put("&cedil;", "\u00B8");
  htmlEntities.put("&sup1;", "\u00B9");
  htmlEntities.put("&ordm;", "\u00BA");
  htmlEntities.put("&raquo;", "\u00BB");
  htmlEntities.put("&frac14;", "\u00BC");
  htmlEntities.put("&frac12;", "\u00BD");
  htmlEntities.put("&frac34;", "\u00BE");
  htmlEntities.put("&iquest;", "\u00BF");
  htmlEntities.put("&Agrave;", "\u00C0");
  htmlEntities.put("&Aacute;", "\u00C1");
  htmlEntities.put("&Acirc;", "\u00C2");
  htmlEntities.put("&Atilde;", "\u00C3");
  htmlEntities.put("&Auml;", "\u00C4");
  htmlEntities.put("&Aring;", "\u00C5");
  htmlEntities.put("&AElig;", "\u00C6");
  htmlEntities.put("&Ccedil;", "\u00C7");
  htmlEntities.put("&Egrave;", "\u00C8");
  htmlEntities.put("&Eacute;", "\u00C9");
  htmlEntities.put("&Ecirc;", "\u00CA");
  htmlEntities.put("&Euml;", "\u00CB");
  htmlEntities.put("&Igrave;", "\u00CC");
  htmlEntities.put("&Iacute;", "\u00CD");
  htmlEntities.put("&Icirc;", "\u00CE");
  htmlEntities.put("&Iuml;", "\u00CF");
  htmlEntities.put("&ETH;", "\u00D0");
  htmlEntities.put("&Ntilde;", "\u00D1");
  htmlEntities.put("&Ograve;", "\u00D2");
  htmlEntities.put("&Oacute;", "\u00D3");
  htmlEntities.put("&Ocirc;", "\u00D4");
  htmlEntities.put("&Otilde;", "\u00D5");
  htmlEntities.put("&Ouml;", "\u00D6");
  htmlEntities.put("&times;", "\u00D7");
  htmlEntities.put("&Oslash;", "\u00D8");
  htmlEntities.put("&Ugrave;", "\u00D9");
  htmlEntities.put("&Uacute;", "\u00DA");
  htmlEntities.put("&Ucirc;", "\u00DB");
  htmlEntities.put("&Uuml;", "\u00DC");
  htmlEntities.put("&Yacute;", "\u00DD");
  htmlEntities.put("&THORN;", "\u00DE");
  htmlEntities.put("&szlig;", "\u00DF");
  htmlEntities.put("&agrave;", "\u00E0");
  htmlEntities.put("&aacute;", "\u00E1");
  htmlEntities.put("&acirc;", "\u00E2");
  htmlEntities.put("&atilde;", "\u00E3");
  htmlEntities.put("&auml;", "\u00E4");
  htmlEntities.put("&aring;", "\u00E5");
  htmlEntities.put("&aelig;", "\u00E6");
  htmlEntities.put("&ccedil;", "\u00E7");
  htmlEntities.put("&egrave;", "\u00E8");
  htmlEntities.put("&eacute;", "\u00E9");
  htmlEntities.put("&ecirc;", "\u00EA");
  htmlEntities.put("&euml;", "\u00EB");
  htmlEntities.put("&igrave;", "\u00EC");
  htmlEntities.put("&iacute;", "\u00ED");
  htmlEntities.put("&icirc;", "\u00EE");
  htmlEntities.put("&iuml;", "\u00EF");
  htmlEntities.put("&eth;", "\u00F0");
  htmlEntities.put("&ntilde;", "\u00F1");
  htmlEntities.put("&ograve;", "\u00F2");
  htmlEntities.put("&oacute;", "\u00F3");
  htmlEntities.put("&ocirc;", "\u00F4");
  htmlEntities.put("&otilde;", "\u00F5");
  htmlEntities.put("&ouml;", "\u00F6");
  htmlEntities.put("&divide;", "\u00F7");
  htmlEntities.put("&oslash;", "\u00F8");
  htmlEntities.put("&ugrave;", "\u00F9");
  htmlEntities.put("&uacute;", "\u00FA");
  htmlEntities.put("&ucirc;", "\u00FB");
  htmlEntities.put("&uuml;", "\u00FC");
  htmlEntities.put("&yacute;", "\u00FD");
  htmlEntities.put("&thorn;", "\u00FE");
  htmlEntities.put("&yuml;", "\u00FF");

  // Latin Extended and spacing modifier letters
  htmlEntities.put("&OElig;", "\u0152");
  htmlEntities.put("&oelig;", "\u0153");
  htmlEntities.put("&Scaron;", "\u0160");
  htmlEntities.put("&scaron;", "\u0161");
  htmlEntities.put("&Yuml;", "\u0178");
  htmlEntities.put("&fnof;", "\u0192");
  htmlEntities.put("&circ;", "\u02C6");
  htmlEntities.put("&tilde;", "\u02DC");

  // Greek
  htmlEntities.put("&Alpha;", "\u0391");
  htmlEntities.put("&Beta;", "\u0392");
  htmlEntities.put("&Gamma;", "\u0393");
  htmlEntities.put("&Delta;", "\u0394");
  htmlEntities.put("&Epsilon;", "\u0395");
  htmlEntities.put("&Zeta;", "\u0396");
  htmlEntities.put("&Eta;", "\u0397");
  htmlEntities.put("&Theta;", "\u0398");
  htmlEntities.put("&Iota;", "\u0399");
  htmlEntities.put("&Kappa;", "\u039A");
  htmlEntities.put("&Lambda;", "\u039B");
  htmlEntities.put("&Mu;", "\u039C");
  htmlEntities.put("&Nu;", "\u039D");
  htmlEntities.put("&Xi;", "\u039E");
  htmlEntities.put("&Omicron;", "\u039F");
  htmlEntities.put("&Pi;", "\u03A0");
  htmlEntities.put("&Rho;", "\u03A1");
  htmlEntities.put("&Sigma;", "\u03A3");
  htmlEntities.put("&Tau;", "\u03A4");
  htmlEntities.put("&Upsilon;", "\u03A5");
  htmlEntities.put("&Phi;", "\u03A6");
  htmlEntities.put("&Chi;", "\u03A7");
  htmlEntities.put("&Psi;", "\u03A8");
  htmlEntities.put("&Omega;", "\u03A9");
  htmlEntities.put("&alpha;", "\u03B1");
  htmlEntities.put("&beta;", "\u03B2");
  htmlEntities.put("&gamma;", "\u03B3");
  htmlEntities.put("&delta;", "\u03B4");
  htmlEntities.put("&epsilon;", "\u03B5");
  htmlEntities.put("&zeta;", "\u03B6");
  htmlEntities.put("&eta;", "\u03B7");
  htmlEntities.put("&theta;", "\u03B8");
  htmlEntities.put("&iota;", "\u03B9");
  htmlEntities.put("&kappa;", "\u03BA");
  htmlEntities.put("&lambda;", "\u03BB");
  htmlEntities.put("&mu;", "\u03BC");
  htmlEntities.put("&nu;", "\u03BD");
  htmlEntities.put("&xi;", "\u03BE");
  htmlEntities.put("&omicron;", "\u03BF");
  htmlEntities.put("&pi;", "\u03C0");
  htmlEntities.put("&rho;", "\u03C1");
  htmlEntities.put("&sigmaf;", "\u03C2");
  htmlEntities.put("&sigma;", "\u03C3");
  htmlEntities.put("&tau;", "\u03C4");
  htmlEntities.put("&upsilon;", "\u03C5");
  htmlEntities.put("&phi;", "\u03C6");
  htmlEntities.put("&chi;", "\u03C7");
  htmlEntities.put("&psi;", "\u03C8");
  htmlEntities.put("&omega;", "\u03C9");
  htmlEntities.put("&thetasym;", "\u03D1");
  htmlEntities.put("&upsih;", "\u03D2");
  htmlEntities.put("&piv;", "\u03D6");

  // General punctuation and currency
  htmlEntities.put("&ensp;", "\u2002");
  htmlEntities.put("&emsp;", "\u2003");
  htmlEntities.put("&thinsp;", "\u2009");
  htmlEntities.put("&zwnj;", "\u200C");
  htmlEntities.put("&zwj;", "\u200D");
  htmlEntities.put("&lrm;", "\u200E");
  htmlEntities.put("&rlm;", "\u200F");
  htmlEntities.put("&ndash;", "\u2013");
  htmlEntities.put("&mdash;", "\u2014");
  htmlEntities.put("&lsquo;", "\u2018");
  htmlEntities.put("&rsquo;", "\u2019");
  htmlEntities.put("&sbquo;", "\u201A");
  htmlEntities.put("&ldquo;", "\u201C");
  htmlEntities.put("&rdquo;", "\u201D");
  htmlEntities.put("&bdquo;", "\u201E");
  htmlEntities.put("&dagger;", "\u2020");
  htmlEntities.put("&Dagger;", "\u2021");
  htmlEntities.put("&bull;", "\u2022");
  htmlEntities.put("&hellip;", "\u2026");
  htmlEntities.put("&permil;", "\u2030");
  htmlEntities.put("&prime;", "\u2032");
  htmlEntities.put("&Prime;", "\u2033");
  htmlEntities.put("&lsaquo;", "\u2039");
  htmlEntities.put("&rsaquo;", "\u203A");
  htmlEntities.put("&oline;", "\u203E");
  htmlEntities.put("&frasl;", "\u2044");
  htmlEntities.put("&euro;", "\u20AC");

  // Letterlike symbols and arrows
  htmlEntities.put("&image;", "\u2111");
  htmlEntities.put("&weierp;", "\u2118");
  htmlEntities.put("&real;", "\u211C");
  htmlEntities.put("&trade;", "\u2122");
  htmlEntities.put("&alefsym;", "\u2135");
  htmlEntities.put("&larr;", "\u2190");
  htmlEntities.put("&uarr;", "\u2191");
  htmlEntities.put("&rarr;", "\u2192");
  htmlEntities.put("&darr;", "\u2193");
  htmlEntities.put("&harr;", "\u2194");
  htmlEntities.put("&crarr;", "\u21B5");
  htmlEntities.put("&lArr;", "\u21D0");
  htmlEntities.put("&uArr;", "\u21D1");
  htmlEntities.put("&rArr;", "\u21D2");
  htmlEntities.put("&dArr;", "\u21D3");
  htmlEntities.put("&hArr;", "\u21D4");

  // Mathematical operators
  htmlEntities.put("&forall;", "\u2200");
  htmlEntities.put("&part;", "\u2202");
  htmlEntities.put("&exist;", "\u2203");
  htmlEntities.put("&empty;", "\u2205");
  htmlEntities.put("&nabla;", "\u2207");
  htmlEntities.put("&isin;", "\u2208");
  htmlEntities.put("&notin;", "\u2209");
  htmlEntities.put("&ni;", "\u220B");
  htmlEntities.put("&prod;", "\u220F");
  htmlEntities.put("&sum;", "\u2211");
  htmlEntities.put("&minus;", "\u2212");
  htmlEntities.put("&lowast;", "\u2217");
  htmlEntities.put("&radic;", "\u221A");
  htmlEntities.put("&prop;", "\u221D");
  htmlEntities.put("&infin;", "\u221E");
  htmlEntities.put("&ang;", "\u2220");
  htmlEntities.put("&and;", "\u2227");
  htmlEntities.put("&or;", "\u2228");
  htmlEntities.put("&cap;", "\u2229");
  htmlEntities.put("&cup;", "\u222A");
  htmlEntities.put("&int;", "\u222B");
  htmlEntities.put("&there4;", "\u2234");
  htmlEntities.put("&sim;", "\u223C");
  htmlEntities.put("&cong;", "\u2245");
  htmlEntities.put("&asymp;", "\u2248");
  htmlEntities.put("&ne;", "\u2260");
  htmlEntities.put("&equiv;", "\u2261");
  htmlEntities.put("&le;", "\u2264");
  htmlEntities.put("&ge;", "\u2265");
  htmlEntities.put("&sub;", "\u2282");
  htmlEntities.put("&sup;", "\u2283");
  htmlEntities.put("&nsub;", "\u2284");
  htmlEntities.put("&sube;", "\u2286");
  htmlEntities.put("&supe;", "\u2287");
  htmlEntities.put("&oplus;", "\u2295");
  htmlEntities.put("&otimes;", "\u2297");
  htmlEntities.put("&perp;", "\u22A5");
  htmlEntities.put("&sdot;", "\u22C5");

  // Miscellaneous technical, geometric shapes and card suits
  htmlEntities.put("&lceil;", "\u2308");
  htmlEntities.put("&rceil;", "\u2309");
  htmlEntities.put("&lfloor;", "\u230A");
  htmlEntities.put("&rfloor;", "\u230B");
  htmlEntities.put("&lang;", "\u2329");
  htmlEntities.put("&rang;", "\u232A");
  htmlEntities.put("&loz;", "\u25CA");
  htmlEntities.put("&spades;", "\u2660");
  htmlEntities.put("&clubs;", "\u2663");
  htmlEntities.put("&hearts;", "\u2665");
  htmlEntities.put("&diams;", "\u2666");

  // Whitelist consulted by validateUrl(); anything else is rejected.
  allowedSchemes.add("https://");
  allowedSchemes.add("http://");
  allowedSchemes.add("ftp://");
  allowedSchemes.add("mailto:");
}
337
338
339 /**
340 * Filters out characters that have meaning within JSP and HTML, and
341 * replaces them with "escaped" versions.
342 *
343 * @param s the String to filter
344 * @return the filtered String
345 */
346 public static String transformHTML(String s) {
347
/*
P/P * Method: String transformHTML(String)
*
* Postconditions:
* init'ed(return_value)
*
* Test Vectors:
* s: Inverse{null}, Addr_Set{null}
* java.lang.String:charAt(...)@357: {34}, {38}, {60}, {62}, {0..33, 35..37, 39..59, 61, 63..216-1}
*/
348 if (s == null) {
349 return null;
350 }
351
352 StringBuffer buf = new StringBuffer(s.length());
353
354 // loop through every character and replace if necessary
355 int length = s.length();
356 for (int i = 0; i < length; i++) {
357 switch (s.charAt(i)) {
358 case '<':
359 buf.append("<");
360 break;
361 case '>':
362 buf.append(">");
363 break;
364 case '&':
365 buf.append("&");
366 break;
367 case '\"':
368 buf.append(""");
369 break;
370 default :
371 buf.append(s.charAt(i));
372 }
373 }
374
375 return buf.toString();
376 }
377
378 /**
379 * Transforms the given String into a subset of HTML displayable on a web
380 * page. The subset includes <b>, <i>, <p>, <br>,
381 * <pre> and <a href> (and their corresponding end tags).
382 *
383 * @param s the String to transform
384 * @return the transformed String
385 */
386 public static String transformToHTMLSubset(String s) {
387
/*
P/P * Method: String transformToHTMLSubset(String)
*
* Presumptions:
* java.lang.String:indexOf(...)@430 + java.lang.String:length(...)@432 in -231..232-1
* java.lang.String:indexOf(...)@436 + java.lang.String:length(...)@438 in -231..232-1
* java.lang.String:indexOf(...)@436 + java.lang.String:length(...)@439 in -231..232-1
* java.util.regex.Matcher:replaceAll(...)@462 != null
* java.util.regex.Pattern:compile(...)@45 != null
* ...
*
* Postconditions:
* init'ed(return_value)
*
* Test Vectors:
* s: Inverse{null}, Addr_Set{null}
* java.lang.String:indexOf(...)@430: {-231..-1}, {0..232-1}
* java.lang.String:indexOf(...)@436: {-231..-1}, {0..232-1}
* java.util.regex.Matcher:find(...)@425: {1}, {0}
*/
388 if (s == null) {
389 return null;
390 }
391
392 s = replace(s, OPENING_B_TAG_PATTERN, "<b>");
393 s = replace(s, CLOSING_B_TAG_PATTERN, "</b>");
394 s = replace(s, OPENING_STRONG_TAG_PATTERN, "<strong>");
395 s = replace(s, CLOSING_STRONG_TAG_PATTERN, "</strong>");
396 s = replace(s, OPENING_I_TAG_PATTERN, "<i>");
397 s = replace(s, CLOSING_I_TAG_PATTERN, "</i>");
398 s = replace(s, OPENING_EM_TAG_PATTERN, "<em>");
399 s = replace(s, CLOSING_EM_TAG_PATTERN, "</em>");
400 s = replace(s, OPENING_BLOCKQUOTE_TAG_PATTERN, "<blockquote>");
401 s = replace(s, CLOSING_BLOCKQUOTE_TAG_PATTERN, "</blockquote>");
402 s = replace(s, BR_TAG_PATTERN, "<br />");
403 s = replace(s, OPENING_P_TAG_PATTERN, "<p>");
404 s = replace(s, CLOSING_P_TAG_PATTERN, "</p>");
405 s = replace(s, OPENING_PRE_TAG_PATTERN, "<pre>");
406 s = replace(s, CLOSING_PRE_TAG_PATTERN, "</pre>");
407 s = replace(s, OPENING_UL_TAG_PATTERN, "<ul>");
408 s = replace(s, CLOSING_UL_TAG_PATTERN, "</ul>");
409 s = replace(s, OPENING_OL_TAG_PATTERN, "<ol>");
410 s = replace(s, CLOSING_OL_TAG_PATTERN, "</ol>");
411 s = replace(s, OPENING_LI_TAG_PATTERN, "<li>");
412 s = replace(s, CLOSING_LI_TAG_PATTERN, "</li>");
413 s = replace(s, OPENING_SUP_TAG_PATTERN, "<sup>");
414 s = replace(s, CLOSING_SUP_TAG_PATTERN, "</sup>");
415 s = replace(s, OPENING_SUB_TAG_PATTERN, "<sub>");
416 s = replace(s, CLOSING_SUB_TAG_PATTERN, "</sub>");
417
418 // HTTP links - remove all attributes other than href
419 s = replace(s, CLOSING_A_TAG_PATTERN, "</a>");
420 Matcher m = OPENING_A_TAG_PATTERN.matcher(s);
421 // Use a single buffer for efficiency
422 StringBuffer buffer = new StringBuffer();
423 // The position in the original string that we are up to
424 int position = 0;
425 while (m.find()) {
426 int start = m.start();
427 int end = m.end();
428 buffer.append(s.subSequence(position, start)).append("<a href=");
429 String link = s.substring(start, end);
430 int startOfHrefIndex = link.indexOf("href="");
431 if (startOfHrefIndex > -1) {
432 int startOfHrefValue = startOfHrefIndex + "href="".length();
433 int endOfHrefIndex = link.indexOf(""", startOfHrefValue);
434 buffer.append("\"").append(validateUrl(link.substring(startOfHrefValue, endOfHrefIndex))).append("\"");
435 } else {
436 startOfHrefIndex = link.indexOf("href='");
437 if (startOfHrefIndex > -1) {
438 int startOfHrefValue = startOfHrefIndex + "href='".length();
439 int endOfHrefIndex = link.indexOf("'", startOfHrefIndex+"href='".length());
440 buffer.append("'").append(validateUrl(link.substring(startOfHrefValue, endOfHrefIndex))).append("'");
441 }
442 }
443 buffer.append(">");
444 position = end;
445 }
446 // If position is still 0 there were no matches, so don't do anything
447 if (position > 0) {
448 buffer.append(s.subSequence(position, s.length()));
449 s = buffer.toString();
450 }
451
452 // escaped angle brackets and other allowed entities
453 s = s.replaceAll("<", "<");
454 s = s.replaceAll(">", ">");
455 s = s.replaceAll("&([#a-zA-Z0-9]{1,}?);", "&$1;");
456
457 return s;
458 }
459
460 private static String replace(String string, Pattern pattern, String replacement) {
/*
P/P * Method: String replace(String, Pattern, String)
*
* Preconditions:
* pattern != null
*
* Presumptions:
* java.util.regex.Pattern:matcher(...)@461 != null
*
* Postconditions:
* init'ed(return_value)
*/
461 Matcher m = pattern.matcher(string);
462 return m.replaceAll(replacement);
463 }
464
465 /**
466 * Filters out newline characters.
467 *
468 * @param s the String to filter
469 * @return the filtered String
470 */
471 public static String filterNewlines(String s) {
472
/*
P/P * Method: String filterNewlines(String)
*
* Postconditions:
* init'ed(return_value)
*
* Test Vectors:
* s: Inverse{null}, Addr_Set{null}
*/
473 if (s == null) {
474 return null;
475 }
476
477 StringBuffer buf = new StringBuffer(s.length());
478
479 // loop through every character and replace if necessary
480 int length = s.length();
481 for (int i = 0; i < length; i++) {
482 switch (s.charAt(i)) {
483 case '\r':
484 break;
485 default :
486 buf.append(s.charAt(i));
487 }
488 }
489
490 return buf.toString();
491 }
492
493 /**
494 * Filters out all HTML tags.
495 *
496 * @param s the String to filter
497 * @return the filtered String
498 */
499 public static String filterHTML(String s) {
/*
P/P * Method: String filterHTML(String)
*
* Postconditions:
* init'ed(return_value)
*
* Test Vectors:
* s: Inverse{null}, Addr_Set{null}
*/
500 if (s == null) {
501 return null;
502 }
503
504 s = s.replaceAll("<", "");
505 s = s.replaceAll(">", "");
506 s = s.replaceAll(" ", "");
507 s = s.replaceAll("(?s)<!--.*?-->", "");
508 return s.replaceAll("(?s)<.*?>", "");
509 }
510
511 public static String truncate(String s) {
/*
P/P * Method: String truncate(String)
*
* Postconditions:
* return_value != null
*/
512 return truncate(s, MAX_CONTENT_LENGTH);
513 }
514
515 public static String truncate(String s, int maxLength) {
/*
P/P * Method: String truncate(String, int)
*
* Presumptions:
* words.length@524 <= 232-1
* words[i]@524 != null
*
* Postconditions:
* return_value != null
*
* Test Vectors:
* java.lang.String:length(...)@530: {0..20}, {21..232-1}
*/
516 String content = StringUtils.filterHTML(s);
517
518 // then truncate, if necessary
519 if (content == null) {
520 return "";
521 } else {
522 StringBuffer buf = new StringBuffer();
523
524 String words[] = content.split("\\s");
525 for (int i = 0; i < words.length; i++) {
526 if (buf.length() + words[i].length() > maxLength) {
527 // truncate here
528 buf.append("...");
529 return buf.toString();
530 } else if (words[i].length() > MAX_WORD_LENGTH) {
531 // truncate here
532 buf.append(words[i].substring(0, MAX_WORD_LENGTH));
533 buf.append("...");
534 return buf.toString();
535 } else {
536 buf.append(words[i]);
537 if ((i+1) < words.length) {
538 buf.append(" ");
539 }
540 }
541 }
542
543 return buf.toString();
544 }
545 }
546
547 public static String stripScriptTags(String html) {
/*
P/P * Method: String stripScriptTags(String)
*
* Postconditions:
* init'ed(return_value)
*
* Test Vectors:
* html: Inverse{null}, Addr_Set{null}
*/
548 if (html == null) {
549 return html;
550 }
551
552 html = html.replaceAll("<script.*?>.*?</script.*?>", "");
553 html = html.replaceAll("<script.*?/>", "");
554 return html;
555 }
556
557
558 public static String unescapeHTMLEntities(String source) {
/*
P/P * Method: String unescapeHTMLEntities(String)
*
* Preconditions:
* (soft) source != null
*
* Presumptions:
* java.util.Map:keySet(...)@559 != null
*
* Postconditions:
* return_value != null
*
* Test Vectors:
* java.util.Iterator:hasNext(...)@561: {1}, {0}
*/
559 Iterator<String> it = htmlEntities.keySet().iterator();
560
561 while(it.hasNext()) {
562
563 String key = it.next();
564 String val = htmlEntities.get(key);
565 source = source.replaceAll(key, val);
566 }
567 return source;
568 }
569
570 public static String validateUrl(String url) {
571 // whitelist, don't blacklist.
/*
P/P * Method: String validateUrl(String)
*
* Preconditions:
* (soft) url != null
*
* Postconditions:
* return_value == One-of{url, &""}
* (soft) return_value != null
*
* Test Vectors:
* java.lang.String:startsWith(...)@573: {0}, {1}
* java.util.Iterator:hasNext(...)@572: {1}, {0}
*/
572 for (String scheme : allowedSchemes) {
573 if (url.startsWith(scheme)) {
574 return url;
575 }
576 }
577 return "";
578 }
579
580 }
SofCheck Inspector Build Version : 2.22510
| stringutils.java |
2010-Jun-25 19:40:32 |
| stringutils.class |
2010-Jul-19 20:23:38 |