File Source: stringutils.java

     1  /*
     2   * Copyright (c) 2003-2006, Simon Brown
     3   * All rights reserved.
     4   *
     5   * Redistribution and use in source and binary forms, with or without
     6   * modification, are permitted provided that the following conditions are met:
     7   *
     8   *   - Redistributions of source code must retain the above copyright
     9   *     notice, this list of conditions and the following disclaimer.
    10   *
    11   *   - Redistributions in binary form must reproduce the above copyright
    12   *     notice, this list of conditions and the following disclaimer in
    13   *     the documentation and/or other materials provided with the
    14   *     distribution.
    15   *
    16   *   - Neither the name of Pebble nor the names of its contributors may
    17   *     be used to endorse or promote products derived from this software
    18   *     without specific prior written permission.
    19   *
    20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
    24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    30   * POSSIBILITY OF SUCH DAMAGE.
    31   */
    32  package net.sourceforge.pebble.util;
    33  
    34  import java.util.*;
    35  import java.util.regex.Matcher;
    36  import java.util.regex.Pattern;
    37  
    38  /**
    39   * A collection of utility methods for manipulating strings.
    40   *
    41   * @author    Simon Brown
    42   */
         /* 
    P/P   *  Method: void net.sourceforge.pebble.util.StringUtils()
          */
    43  public final class StringUtils {
    44  
           /* 
    P/P     *  Method: net.sourceforge.pebble.util.StringUtils__static_init
            * 
            *  Postconditions:
            *    init'ed(BR_TAG_PATTERN)
            *    init'ed(CLOSING_A_TAG_PATTERN)
            *    init'ed(CLOSING_BLOCKQUOTE_TAG_PATTERN)
            *    init'ed(CLOSING_B_TAG_PATTERN)
            *    init'ed(CLOSING_EM_TAG_PATTERN)
            *    init'ed(CLOSING_I_TAG_PATTERN)
            *    init'ed(CLOSING_LI_TAG_PATTERN)
            *    init'ed(CLOSING_OL_TAG_PATTERN)
            *    init'ed(CLOSING_PRE_TAG_PATTERN)
            *    init'ed(CLOSING_P_TAG_PATTERN)
            *    ...
            */
    45    private static final Pattern OPENING_B_TAG_PATTERN = Pattern.compile("<b>", Pattern.CASE_INSENSITIVE);
    46    private static final Pattern CLOSING_B_TAG_PATTERN = Pattern.compile("</b>", Pattern.CASE_INSENSITIVE);
    47    private static final Pattern OPENING_STRONG_TAG_PATTERN = Pattern.compile("<strong>", Pattern.CASE_INSENSITIVE);
    48    private static final Pattern CLOSING_STRONG_TAG_PATTERN = Pattern.compile("</strong>", Pattern.CASE_INSENSITIVE);
    49    private static final Pattern OPENING_I_TAG_PATTERN = Pattern.compile("<i>", Pattern.CASE_INSENSITIVE);
    50    private static final Pattern CLOSING_I_TAG_PATTERN = Pattern.compile("</i>", Pattern.CASE_INSENSITIVE);
    51    private static final Pattern OPENING_EM_TAG_PATTERN = Pattern.compile("<em>", Pattern.CASE_INSENSITIVE);
    52    private static final Pattern CLOSING_EM_TAG_PATTERN = Pattern.compile("</em>", Pattern.CASE_INSENSITIVE);
    53    private static final Pattern OPENING_BLOCKQUOTE_TAG_PATTERN = Pattern.compile("<blockquote>", Pattern.CASE_INSENSITIVE);
    54    private static final Pattern CLOSING_BLOCKQUOTE_TAG_PATTERN = Pattern.compile("</blockquote>", Pattern.CASE_INSENSITIVE);
    55    private static final Pattern BR_TAG_PATTERN = Pattern.compile("<br */*>", Pattern.CASE_INSENSITIVE);
    56    private static final Pattern OPENING_P_TAG_PATTERN = Pattern.compile("<p>", Pattern.CASE_INSENSITIVE);
    57    private static final Pattern CLOSING_P_TAG_PATTERN = Pattern.compile("</p>", Pattern.CASE_INSENSITIVE);
    58    private static final Pattern OPENING_PRE_TAG_PATTERN = Pattern.compile("<pre>", Pattern.CASE_INSENSITIVE);
    59    private static final Pattern CLOSING_PRE_TAG_PATTERN = Pattern.compile("</pre>", Pattern.CASE_INSENSITIVE);
    60    private static final Pattern OPENING_UL_TAG_PATTERN = Pattern.compile("<ul>", Pattern.CASE_INSENSITIVE);
    61    private static final Pattern CLOSING_UL_TAG_PATTERN = Pattern.compile("</ul>", Pattern.CASE_INSENSITIVE);
    62    private static final Pattern OPENING_OL_TAG_PATTERN = Pattern.compile("<ol>", Pattern.CASE_INSENSITIVE);
    63    private static final Pattern CLOSING_OL_TAG_PATTERN = Pattern.compile("</ol>", Pattern.CASE_INSENSITIVE);
    64    private static final Pattern OPENING_LI_TAG_PATTERN = Pattern.compile("<li>", Pattern.CASE_INSENSITIVE);
    65    private static final Pattern CLOSING_LI_TAG_PATTERN = Pattern.compile("</li>", Pattern.CASE_INSENSITIVE);
    66    private static final Pattern CLOSING_A_TAG_PATTERN = Pattern.compile("</a>", Pattern.CASE_INSENSITIVE);
    67    private static final Pattern OPENING_A_TAG_PATTERN = Pattern.compile("<a href=.*?>", Pattern.CASE_INSENSITIVE);
    68    private static final Pattern OPENING_SUP_TAG_PATTERN = Pattern.compile("<sup>", Pattern.CASE_INSENSITIVE);
    69    private static final Pattern CLOSING_SUP_TAG_PATTERN = Pattern.compile("</sup>", Pattern.CASE_INSENSITIVE);
    70    private static final Pattern OPENING_SUB_TAG_PATTERN = Pattern.compile("<sub>", Pattern.CASE_INSENSITIVE);
    71    private static final Pattern CLOSING_SUB_TAG_PATTERN = Pattern.compile("</sub>", Pattern.CASE_INSENSITIVE);
    72  
    73    public static final int MAX_CONTENT_LENGTH = 255; // default maxLength used by truncate(String)
    74    public static final int MAX_WORD_LENGTH = 20; // truncate(String, int) cuts any longer word to this many characters and stops
    75    public static final int MAX_NUM_OF_POSTS = 5; // NOTE(review): not referenced anywhere in this file; presumably read by callers elsewhere - confirm
    76  
    77    
    78    //HTML4 248 named entities: maps an entity reference (e.g. "&nbsp;") to the character it stands for.
          // Filled once by the static initializer below and read by unescapeHTMLEntities.
    79    private final static Map<String,String> htmlEntities = new HashMap<String,String>();
          // URL prefixes that validateUrl accepts; also filled by the static initializer below.
    80    private final static Collection<String> allowedSchemes = new ArrayList<String>();
    81  
    82    static {
        	// Entities whose characters lie in U+00A0..U+00FF come first.
        	// Note: &quot;, &amp;, &lt; and &gt; are not part of this table.
    83  	htmlEntities.put("&nbsp;", "\u00A0");
    84  	htmlEntities.put("&iexcl;", "\u00A1");
    85  	htmlEntities.put("&cent;", "\u00A2");
    86  	htmlEntities.put("&pound;", "\u00A3");
    87  	htmlEntities.put("&curren;", "\u00A4");
    88  	htmlEntities.put("&yen;", "\u00A5");
    89  	htmlEntities.put("&brvbar;", "\u00A6");
    90  	htmlEntities.put("&sect;", "\u00A7");
    91  	htmlEntities.put("&uml;", "\u00A8");
    92  	htmlEntities.put("&copy;", "\u00A9");
    93  	htmlEntities.put("&ordf;", "\u00AA");
    94  	htmlEntities.put("&laquo;", "\u00AB");
    95  	htmlEntities.put("&not;", "\u00AC");
    96  	htmlEntities.put("&shy;", "\u00AD");
    97  	htmlEntities.put("&reg;", "\u00AE");
    98  	htmlEntities.put("&macr;", "\u00AF");
    99  	htmlEntities.put("&deg;", "\u00B0");
   100  	htmlEntities.put("&plusmn;", "\u00B1");
   101  	htmlEntities.put("&sup2;", "\u00B2");
   102  	htmlEntities.put("&sup3;", "\u00B3");
   103  	htmlEntities.put("&acute;", "\u00B4");
   104  	htmlEntities.put("&micro;", "\u00B5");
   105  	htmlEntities.put("&para;", "\u00B6");
   106  	htmlEntities.put("&middot;", "\u00B7");
   107  	htmlEntities.put("&cedil;", "\u00B8");
   108  	htmlEntities.put("&sup1;", "\u00B9");
   109  	htmlEntities.put("&ordm;", "\u00BA");
   110  	htmlEntities.put("&raquo;", "\u00BB");
   111  	htmlEntities.put("&frac14;", "\u00BC");
   112  	htmlEntities.put("&frac12;", "\u00BD");
   113  	htmlEntities.put("&frac34;", "\u00BE");
   114  	htmlEntities.put("&iquest;", "\u00BF");
   115  	htmlEntities.put("&Agrave;", "\u00C0");
   116  	htmlEntities.put("&Aacute;", "\u00C1");
   117  	htmlEntities.put("&Acirc;", "\u00C2");
   118  	htmlEntities.put("&Atilde;", "\u00C3");
   119  	htmlEntities.put("&Auml;", "\u00C4");
   120  	htmlEntities.put("&Aring;", "\u00C5");
   121  	htmlEntities.put("&AElig;", "\u00C6");
   122  	htmlEntities.put("&Ccedil;", "\u00C7");
   123  	htmlEntities.put("&Egrave;", "\u00C8");
   124  	htmlEntities.put("&Eacute;", "\u00C9");
   125  	htmlEntities.put("&Ecirc;", "\u00CA");
   126  	htmlEntities.put("&Euml;", "\u00CB");
   127  	htmlEntities.put("&Igrave;", "\u00CC");
   128  	htmlEntities.put("&Iacute;", "\u00CD");
   129  	htmlEntities.put("&Icirc;", "\u00CE");
   130  	htmlEntities.put("&Iuml;", "\u00CF");
   131  	htmlEntities.put("&ETH;", "\u00D0");
   132  	htmlEntities.put("&Ntilde;", "\u00D1");
   133  	htmlEntities.put("&Ograve;", "\u00D2");
   134  	htmlEntities.put("&Oacute;", "\u00D3");
   135  	htmlEntities.put("&Ocirc;", "\u00D4");
   136  	htmlEntities.put("&Otilde;", "\u00D5");
   137  	htmlEntities.put("&Ouml;", "\u00D6");
   138  	htmlEntities.put("&times;", "\u00D7");
   139  	htmlEntities.put("&Oslash;", "\u00D8");
   140  	htmlEntities.put("&Ugrave;", "\u00D9");
   141  	htmlEntities.put("&Uacute;", "\u00DA");
   142  	htmlEntities.put("&Ucirc;", "\u00DB");
   143  	htmlEntities.put("&Uuml;", "\u00DC");
   144  	htmlEntities.put("&Yacute;", "\u00DD");
   145  	htmlEntities.put("&THORN;", "\u00DE");
   146  	htmlEntities.put("&szlig;", "\u00DF");
   147  	htmlEntities.put("&agrave;", "\u00E0");
   148  	htmlEntities.put("&aacute;", "\u00E1");
   149  	htmlEntities.put("&acirc;", "\u00E2");
   150  	htmlEntities.put("&atilde;", "\u00E3");
   151  	htmlEntities.put("&auml;", "\u00E4");
   152  	htmlEntities.put("&aring;", "\u00E5");
   153  	htmlEntities.put("&aelig;", "\u00E6");
   154  	htmlEntities.put("&ccedil;", "\u00E7");
   155  	htmlEntities.put("&egrave;", "\u00E8");
   156  	htmlEntities.put("&eacute;", "\u00E9");
   157  	htmlEntities.put("&ecirc;", "\u00EA");
   158  	htmlEntities.put("&euml;", "\u00EB");
   159  	htmlEntities.put("&igrave;", "\u00EC");
   160  	htmlEntities.put("&iacute;", "\u00ED");
   161  	htmlEntities.put("&icirc;", "\u00EE");
   162  	htmlEntities.put("&iuml;", "\u00EF");
   163  	htmlEntities.put("&eth;", "\u00F0");
   164  	htmlEntities.put("&ntilde;", "\u00F1");
   165  	htmlEntities.put("&ograve;", "\u00F2");
   166  	htmlEntities.put("&oacute;", "\u00F3");
   167  	htmlEntities.put("&ocirc;", "\u00F4");
   168  	htmlEntities.put("&otilde;", "\u00F5");
   169  	htmlEntities.put("&ouml;", "\u00F6");
   170  	htmlEntities.put("&divide;", "\u00F7");
   171  	htmlEntities.put("&oslash;", "\u00F8");
   172  	htmlEntities.put("&ugrave;", "\u00F9");
   173  	htmlEntities.put("&uacute;", "\u00FA");
   174  	htmlEntities.put("&ucirc;", "\u00FB");
   175  	htmlEntities.put("&uuml;", "\u00FC");
   176  	htmlEntities.put("&yacute;", "\u00FD");
   177  	htmlEntities.put("&thorn;", "\u00FE");
   178  	htmlEntities.put("&yuml;", "\u00FF");
        	// Remaining entities map to code points above U+00FF.
   179  	htmlEntities.put("&OElig;", "\u0152");
   180  	htmlEntities.put("&oelig;", "\u0153");
   181  	htmlEntities.put("&Scaron;", "\u0160");
   182  	htmlEntities.put("&scaron;", "\u0161");
   183  	htmlEntities.put("&Yuml;", "\u0178");
   184  	htmlEntities.put("&fnof;", "\u0192");
   185  	htmlEntities.put("&circ;", "\u02C6");
   186  	htmlEntities.put("&tilde;", "\u02DC");
   187  	htmlEntities.put("&Alpha;", "\u0391");
   188  	htmlEntities.put("&Beta;", "\u0392");
   189  	htmlEntities.put("&Gamma;", "\u0393");
   190  	htmlEntities.put("&Delta;", "\u0394");
   191  	htmlEntities.put("&Epsilon;", "\u0395");
   192  	htmlEntities.put("&Zeta;", "\u0396");
   193  	htmlEntities.put("&Eta;", "\u0397");
   194  	htmlEntities.put("&Theta;", "\u0398");
   195  	htmlEntities.put("&Iota;", "\u0399");
   196  	htmlEntities.put("&Kappa;", "\u039A");
   197  	htmlEntities.put("&Lambda;", "\u039B");
   198  	htmlEntities.put("&Mu;", "\u039C");
   199  	htmlEntities.put("&Nu;", "\u039D");
   200  	htmlEntities.put("&Xi;", "\u039E");
   201  	htmlEntities.put("&Omicron;", "\u039F");
   202  	htmlEntities.put("&Pi;", "\u03A0");
   203  	htmlEntities.put("&Rho;", "\u03A1");
   204  	htmlEntities.put("&Sigma;", "\u03A3");
   205  	htmlEntities.put("&Tau;", "\u03A4");
   206  	htmlEntities.put("&Upsilon;", "\u03A5");
   207  	htmlEntities.put("&Phi;", "\u03A6");
   208  	htmlEntities.put("&Chi;", "\u03A7");
   209  	htmlEntities.put("&Psi;", "\u03A8");
   210  	htmlEntities.put("&Omega;", "\u03A9");
   211  	htmlEntities.put("&alpha;", "\u03B1");
   212  	htmlEntities.put("&beta;", "\u03B2");
   213  	htmlEntities.put("&gamma;", "\u03B3");
   214  	htmlEntities.put("&delta;", "\u03B4");
   215  	htmlEntities.put("&epsilon;", "\u03B5");
   216  	htmlEntities.put("&zeta;", "\u03B6");
   217  	htmlEntities.put("&eta;", "\u03B7");
   218  	htmlEntities.put("&theta;", "\u03B8");
   219  	htmlEntities.put("&iota;", "\u03B9");
   220  	htmlEntities.put("&kappa;", "\u03BA");
   221  	htmlEntities.put("&lambda;", "\u03BB");
   222  	htmlEntities.put("&mu;", "\u03BC");
   223  	htmlEntities.put("&nu;", "\u03BD");
   224  	htmlEntities.put("&xi;", "\u03BE");
   225  	htmlEntities.put("&omicron;", "\u03BF");
   226  	htmlEntities.put("&pi;", "\u03C0");
   227  	htmlEntities.put("&rho;", "\u03C1");
   228  	htmlEntities.put("&sigmaf;", "\u03C2");
   229  	htmlEntities.put("&sigma;", "\u03C3");
   230  	htmlEntities.put("&tau;", "\u03C4");
   231  	htmlEntities.put("&upsilon;", "\u03C5");
   232  	htmlEntities.put("&phi;", "\u03C6");
   233  	htmlEntities.put("&chi;", "\u03C7");
   234  	htmlEntities.put("&psi;", "\u03C8");
   235  	htmlEntities.put("&omega;", "\u03C9");
   236  	htmlEntities.put("&thetasym;", "\u03D1");
   237  	htmlEntities.put("&upsih;", "\u03D2");
   238  	htmlEntities.put("&piv;", "\u03D6");
   239  	htmlEntities.put("&ensp;", "\u2002");
   240  	htmlEntities.put("&emsp;", "\u2003");
   241  	htmlEntities.put("&thinsp;", "\u2009");
   242  	htmlEntities.put("&zwnj;", "\u200C");
   243  	htmlEntities.put("&zwj;", "\u200D");
   244  	htmlEntities.put("&lrm;", "\u200E");
   245  	htmlEntities.put("&rlm;", "\u200F");
   246  	htmlEntities.put("&ndash;", "\u2013");
   247  	htmlEntities.put("&mdash;", "\u2014");
   248  	htmlEntities.put("&lsquo;", "\u2018");
   249  	htmlEntities.put("&rsquo;", "\u2019");
   250  	htmlEntities.put("&sbquo;", "\u201A");
   251  	htmlEntities.put("&ldquo;", "\u201C");
   252  	htmlEntities.put("&rdquo;", "\u201D");
   253  	htmlEntities.put("&bdquo;", "\u201E");
   254  	htmlEntities.put("&dagger;", "\u2020");
   255  	htmlEntities.put("&Dagger;", "\u2021");
   256  	htmlEntities.put("&bull;", "\u2022");
   257  	htmlEntities.put("&hellip;", "\u2026");
   258  	htmlEntities.put("&permil;", "\u2030");
   259  	htmlEntities.put("&prime;", "\u2032");
   260  	htmlEntities.put("&Prime;", "\u2033");
   261  	htmlEntities.put("&lsaquo;", "\u2039");
   262  	htmlEntities.put("&rsaquo;", "\u203A");
   263  	htmlEntities.put("&oline;", "\u203E");
   264  	htmlEntities.put("&frasl;", "\u2044");
   265  	htmlEntities.put("&euro;", "\u20AC");
   266  	htmlEntities.put("&image;", "\u2111");
   267  	htmlEntities.put("&weierp;", "\u2118");
   268  	htmlEntities.put("&real;", "\u211C");
   269  	htmlEntities.put("&trade;", "\u2122");
   270  	htmlEntities.put("&alefsym;", "\u2135");
   271  	htmlEntities.put("&larr;", "\u2190");
   272  	htmlEntities.put("&uarr;", "\u2191");
   273  	htmlEntities.put("&rarr;", "\u2192");
   274  	htmlEntities.put("&darr;", "\u2193");
   275  	htmlEntities.put("&harr;", "\u2194");
   276  	htmlEntities.put("&crarr;", "\u21B5");
   277  	htmlEntities.put("&lArr;", "\u21D0");
   278  	htmlEntities.put("&uArr;", "\u21D1");
   279  	htmlEntities.put("&rArr;", "\u21D2");
   280  	htmlEntities.put("&dArr;", "\u21D3");
   281  	htmlEntities.put("&hArr;", "\u21D4");
   282  	htmlEntities.put("&forall;", "\u2200");
   283  	htmlEntities.put("&part;", "\u2202");
   284  	htmlEntities.put("&exist;", "\u2203");
   285  	htmlEntities.put("&empty;", "\u2205");
   286  	htmlEntities.put("&nabla;", "\u2207");
   287  	htmlEntities.put("&isin;", "\u2208");
   288  	htmlEntities.put("&notin;", "\u2209");
   289  	htmlEntities.put("&ni;", "\u220B");
   290  	htmlEntities.put("&prod;", "\u220F");
   291  	htmlEntities.put("&sum;", "\u2211");
   292  	htmlEntities.put("&minus;", "\u2212");
   293  	htmlEntities.put("&lowast;", "\u2217");
   294  	htmlEntities.put("&radic;", "\u221A");
   295  	htmlEntities.put("&prop;", "\u221D");
   296  	htmlEntities.put("&infin;", "\u221E");
   297  	htmlEntities.put("&ang;", "\u2220");
   298  	htmlEntities.put("&and;", "\u2227");
   299  	htmlEntities.put("&or;", "\u2228");
   300  	htmlEntities.put("&cap;", "\u2229");
   301  	htmlEntities.put("&cup;", "\u222A");
   302  	htmlEntities.put("&int;", "\u222B");
   303  	htmlEntities.put("&there4;", "\u2234");
   304  	htmlEntities.put("&sim;", "\u223C");
   305  	htmlEntities.put("&cong;", "\u2245");
   306  	htmlEntities.put("&asymp;", "\u2248");
   307  	htmlEntities.put("&ne;", "\u2260");
   308  	htmlEntities.put("&equiv;", "\u2261");
   309  	htmlEntities.put("&le;", "\u2264");
   310  	htmlEntities.put("&ge;", "\u2265");
   311  	htmlEntities.put("&sub;", "\u2282");
   312  	htmlEntities.put("&sup;", "\u2283");
   313  	htmlEntities.put("&nsub;", "\u2284");
   314  	htmlEntities.put("&sube;", "\u2286");
   315  	htmlEntities.put("&supe;", "\u2287");
   316  	htmlEntities.put("&oplus;", "\u2295");
   317  	htmlEntities.put("&otimes;", "\u2297");
   318  	htmlEntities.put("&perp;", "\u22A5");
   319  	htmlEntities.put("&sdot;", "\u22C5");
   320  	htmlEntities.put("&lceil;", "\u2308");
   321  	htmlEntities.put("&rceil;", "\u2309");
   322  	htmlEntities.put("&lfloor;", "\u230A");
   323  	htmlEntities.put("&rfloor;", "\u230B");
   324  	htmlEntities.put("&lang;", "\u2329");
   325  	htmlEntities.put("&rang;", "\u232A");
   326  	htmlEntities.put("&loz;", "\u25CA");
   327  	htmlEntities.put("&spades;", "\u2660");
   328  	htmlEntities.put("&clubs;", "\u2663");
   329  	htmlEntities.put("&hearts;", "\u2665");
   330  	htmlEntities.put("&diams;", "\u2666");
   331  
          // Whitelist of URL prefixes; validateUrl blanks any URL that starts with none of them.
   332    allowedSchemes.add("https://");
   333    allowedSchemes.add("http://");
   334    allowedSchemes.add("ftp://");
   335    allowedSchemes.add("mailto:");
   336    }
   337  	
   338  
   339    /**
   340     * Filters out characters that have meaning within JSP and HTML, and
   341     * replaces them with "escaped" versions.
   342     *
   343     * @param s   the String to filter
   344     * @return  the filtered String
   345     */
   346    public static String transformHTML(String s) {
   347  
             /* 
    P/P       *  Method: String transformHTML(String)
              * 
              *  Postconditions:
              *    init'ed(return_value)
              * 
              *  Test Vectors:
              *    s: Inverse{null}, Addr_Set{null}
              *    java.lang.String:charAt(...)@357: {34}, {38}, {60}, {62}, {0..33, 35..37, 39..59, 61, 63..216-1}
              */
   348      if (s == null) {
   349        return null;
   350      }
   351  
   352      StringBuffer buf = new StringBuffer(s.length());
   353  
   354      // loop through every character and replace if necessary
   355      int length = s.length();
   356      for (int i = 0; i < length; i++) {
   357        switch (s.charAt(i)) {
   358          case '<':
   359            buf.append("<");
   360            break;
   361          case '>':
   362            buf.append(">");
   363            break;
   364          case '&':
   365            buf.append("&");
   366            break;
   367          case '\"':
   368              buf.append(""");
   369              break;
   370          default :
   371            buf.append(s.charAt(i));
   372        }
   373      }
   374  
   375      return buf.toString();
   376    }
   377  
   378    /**
   379     * Transforms the given String into a subset of HTML displayable on a web
   380     * page. The subset includes <b>, <i>, <p>, <br>,
   381     * <pre> and <a href> (and their corresponding end tags).
   382     *
   383     * @param s   the String to transform
   384     * @return    the transformed String
   385     */
   386    public static String transformToHTMLSubset(String s) {
   387  
             /* 
    P/P       *  Method: String transformToHTMLSubset(String)
              * 
              *  Presumptions:
              *    java.lang.String:indexOf(...)@430 + java.lang.String:length(...)@432 in -231..232-1
              *    java.lang.String:indexOf(...)@436 + java.lang.String:length(...)@438 in -231..232-1
              *    java.lang.String:indexOf(...)@436 + java.lang.String:length(...)@439 in -231..232-1
              *    java.util.regex.Matcher:replaceAll(...)@462 != null
              *    java.util.regex.Pattern:compile(...)@45 != null
              *    ...
              * 
              *  Postconditions:
              *    init'ed(return_value)
              * 
              *  Test Vectors:
              *    s: Inverse{null}, Addr_Set{null}
              *    java.lang.String:indexOf(...)@430: {-231..-1}, {0..232-1}
              *    java.lang.String:indexOf(...)@436: {-231..-1}, {0..232-1}
              *    java.util.regex.Matcher:find(...)@425: {1}, {0}
              */
   388      if (s == null) {
   389        return null;
   390      }
   391  
   392      s = replace(s, OPENING_B_TAG_PATTERN, "<b>");
   393      s = replace(s, CLOSING_B_TAG_PATTERN, "</b>");
   394      s = replace(s, OPENING_STRONG_TAG_PATTERN, "<strong>");
   395      s = replace(s, CLOSING_STRONG_TAG_PATTERN, "</strong>");
   396      s = replace(s, OPENING_I_TAG_PATTERN, "<i>");
   397      s = replace(s, CLOSING_I_TAG_PATTERN, "</i>");
   398      s = replace(s, OPENING_EM_TAG_PATTERN, "<em>");
   399      s = replace(s, CLOSING_EM_TAG_PATTERN, "</em>");
   400      s = replace(s, OPENING_BLOCKQUOTE_TAG_PATTERN, "<blockquote>");
   401      s = replace(s, CLOSING_BLOCKQUOTE_TAG_PATTERN, "</blockquote>");
   402      s = replace(s, BR_TAG_PATTERN, "<br />");
   403      s = replace(s, OPENING_P_TAG_PATTERN, "<p>");
   404      s = replace(s, CLOSING_P_TAG_PATTERN, "</p>");
   405      s = replace(s, OPENING_PRE_TAG_PATTERN, "<pre>");
   406      s = replace(s, CLOSING_PRE_TAG_PATTERN, "</pre>");
   407      s = replace(s, OPENING_UL_TAG_PATTERN, "<ul>");
   408      s = replace(s, CLOSING_UL_TAG_PATTERN, "</ul>");
   409      s = replace(s, OPENING_OL_TAG_PATTERN, "<ol>");
   410      s = replace(s, CLOSING_OL_TAG_PATTERN, "</ol>");
   411      s = replace(s, OPENING_LI_TAG_PATTERN, "<li>");
   412      s = replace(s, CLOSING_LI_TAG_PATTERN, "</li>");
   413      s = replace(s, OPENING_SUP_TAG_PATTERN, "<sup>");
   414      s = replace(s, CLOSING_SUP_TAG_PATTERN, "</sup>");
   415      s = replace(s, OPENING_SUB_TAG_PATTERN, "<sub>");
   416      s = replace(s, CLOSING_SUB_TAG_PATTERN, "</sub>");
   417  
   418      // HTTP links - remove all attributes other than href
   419      s = replace(s, CLOSING_A_TAG_PATTERN, "</a>");
   420      Matcher m = OPENING_A_TAG_PATTERN.matcher(s);
   421      // Use a single buffer for efficiency
   422      StringBuffer buffer = new StringBuffer();
   423      // The position in the original string that we are up to
   424      int position = 0;
   425      while (m.find()) {
   426        int start = m.start();
   427        int end = m.end();
   428        buffer.append(s.subSequence(position, start)).append("<a href=");
   429        String link = s.substring(start, end);
   430        int startOfHrefIndex = link.indexOf("href="");
   431        if (startOfHrefIndex > -1) {
   432          int startOfHrefValue = startOfHrefIndex + "href="".length();
   433          int endOfHrefIndex = link.indexOf(""", startOfHrefValue);
   434          buffer.append("\"").append(validateUrl(link.substring(startOfHrefValue, endOfHrefIndex))).append("\"");
   435        } else {
   436          startOfHrefIndex = link.indexOf("href='");
   437          if (startOfHrefIndex > -1) {
   438            int startOfHrefValue = startOfHrefIndex + "href='".length();
   439            int endOfHrefIndex = link.indexOf("'", startOfHrefIndex+"href='".length());
   440            buffer.append("'").append(validateUrl(link.substring(startOfHrefValue, endOfHrefIndex))).append("'");
   441          }
   442        }
   443        buffer.append(">");
   444        position = end;
   445      }
   446      // If position is still 0 there were no matches, so don't do anything
   447      if (position > 0) {
   448        buffer.append(s.subSequence(position, s.length()));
   449        s = buffer.toString();
   450      }
   451  
   452      // escaped angle brackets and other allowed entities
   453      s = s.replaceAll("&lt;", "<");
   454      s = s.replaceAll("&gt;", ">");
   455      s = s.replaceAll("&([#a-zA-Z0-9]{1,}?);", "&$1;");
   456      
   457      return s;
   458    }
   459  
   460    private static String replace(String string, Pattern pattern, String replacement) {
             /* 
    P/P       *  Method: String replace(String, Pattern, String)
              * 
              *  Preconditions:
              *    pattern != null
              * 
              *  Presumptions:
              *    java.util.regex.Pattern:matcher(...)@461 != null
              * 
              *  Postconditions:
              *    init'ed(return_value)
              */
   461      Matcher m = pattern.matcher(string);
   462      return m.replaceAll(replacement);
   463    }
   464  
   465    /**
   466     * Filters out newline characters.
   467     *
   468     * @param s   the String to filter
   469     * @return  the filtered String
   470     */
   471    public static String filterNewlines(String s) {
   472  
             /* 
    P/P       *  Method: String filterNewlines(String)
              * 
              *  Postconditions:
              *    init'ed(return_value)
              * 
              *  Test Vectors:
              *    s: Inverse{null}, Addr_Set{null}
              */
   473      if (s == null) {
   474        return null;
   475      }
   476  
   477      StringBuffer buf = new StringBuffer(s.length());
   478  
   479      // loop through every character and replace if necessary
   480      int length = s.length();
   481      for (int i = 0; i < length; i++) {
   482        switch (s.charAt(i)) {
   483          case '\r':
   484            break;
   485          default :
   486            buf.append(s.charAt(i));
   487        }
   488      }
   489  
   490      return buf.toString();
   491    }
   492  
   493    /**
   494     * Filters out all HTML tags.
   495     *
   496     * @param s   the String to filter
   497     * @return    the filtered String
   498     */
   499    public static String filterHTML(String s) {
             /* 
    P/P       *  Method: String filterHTML(String)
              * 
              *  Postconditions:
              *    init'ed(return_value)
              * 
              *  Test Vectors:
              *    s: Inverse{null}, Addr_Set{null}
              */
   500      if (s == null) {
   501        return null;
   502      }
   503  
   504      s = s.replaceAll("<", "");
   505      s = s.replaceAll(">", "");
   506      s = s.replaceAll("&nbsp;", "");
   507      s = s.replaceAll("(?s)<!--.*?-->", "");
   508      return s.replaceAll("(?s)<.*?>", "");
   509    }
   510  
   511    public static String truncate(String s) {
             /* 
    P/P       *  Method: String truncate(String)
              * 
              *  Postconditions:
              *    return_value != null
              */
   512      return truncate(s, MAX_CONTENT_LENGTH);
   513    }
   514  
   515    public static String truncate(String s, int maxLength) {
             /* 
    P/P       *  Method: String truncate(String, int)
              * 
              *  Presumptions:
              *    words.length@524 <= 232-1
              *    words[i]@524 != null
              * 
              *  Postconditions:
              *    return_value != null
              * 
              *  Test Vectors:
              *    java.lang.String:length(...)@530: {0..20}, {21..232-1}
              */
   516      String content = StringUtils.filterHTML(s);
   517  
   518      // then truncate, if necessary
   519      if (content == null) {
   520        return "";
   521      } else {
   522        StringBuffer buf = new StringBuffer();
   523  
   524        String words[] = content.split("\\s");
   525        for (int i = 0; i < words.length; i++) {
   526          if (buf.length() + words[i].length() > maxLength) {
   527            // truncate here
   528            buf.append("...");
   529            return buf.toString();
   530          } else if (words[i].length() > MAX_WORD_LENGTH) {
   531            // truncate here
   532            buf.append(words[i].substring(0, MAX_WORD_LENGTH));
   533            buf.append("...");
   534            return buf.toString();
   535          } else {
   536            buf.append(words[i]);
   537            if ((i+1) < words.length) {
   538              buf.append(" ");
   539            }
   540          }
   541        }
   542  
   543        return buf.toString();
   544      }
   545    }
   546  
   547    public static String stripScriptTags(String html) {
             /* 
    P/P       *  Method: String stripScriptTags(String)
              * 
              *  Postconditions:
              *    init'ed(return_value)
              * 
              *  Test Vectors:
              *    html: Inverse{null}, Addr_Set{null}
              */
   548      if (html == null) {
   549        return html;
   550      }
   551  
   552      html = html.replaceAll("<script.*?>.*?</script.*?>", "");
   553      html = html.replaceAll("<script.*?/>", "");
   554      return html;
   555    }
   556  
   557  
   558    public static String unescapeHTMLEntities(String source) {
              /* 
    P/P        *  Method: String unescapeHTMLEntities(String)
               * 
               *  Preconditions:
               *    (soft) source != null
               * 
               *  Presumptions:
               *    java.util.Map:keySet(...)@559 != null
               * 
               *  Postconditions:
               *    return_value != null
               * 
               *  Test Vectors:
               *    java.util.Iterator:hasNext(...)@561: {1}, {0}
               */
   559       Iterator<String> it = htmlEntities.keySet().iterator(); 
   560  	 
   561  	 while(it.hasNext()) { 
   562  		 
   563  		 String key = it.next(); 
   564  		 String val = htmlEntities.get(key); 
   565  		 source = source.replaceAll(key, val);
   566  	 } 
   567       return source;
   568    }
   569  
   570    public static String validateUrl(String url) {
   571      // whitelist, don't blacklist.
             /* 
    P/P       *  Method: String validateUrl(String)
              * 
              *  Preconditions:
              *    (soft) url != null
              * 
              *  Postconditions:
              *    return_value == One-of{url, &""}
              *    (soft) return_value != null
              * 
              *  Test Vectors:
              *    java.lang.String:startsWith(...)@573: {0}, {1}
              *    java.util.Iterator:hasNext(...)@572: {1}, {0}
              */
   572      for (String scheme : allowedSchemes) {
   573        if (url.startsWith(scheme)) {
   574          return url;
   575        }
   576      }
   577      return "";
   578    }
   579  
   580  }








SofCheck Inspector Build Version : 2.22510
stringutils.java 2010-Jun-25 19:40:32
stringutils.class 2010-Jul-19 20:23:38