Java Tutorial/Development/HTML Parser — различия между версиями
| Admin (обсуждение | вклад)  м (1 версия) | |
| (нет различий) | |
Текущая версия на 15:28, 31 мая 2010
Содержание
- 1 A collection of all character entities defined in the HTML4 standard.
- 2 Convert to HTML string
- 3 Decode an HTML color string like "#F567BA;" into a Color
- 4 Escape HTML
- 5 extends HTMLEditorKit.ParserCallback
- 6 Filter message string for characters that are sensitive in HTML
- 7 Filter the specified message string for characters that are sensitive in HTML
- 8 HTML color names
- 9 html parser DTD
- 10 insert HTML block dynamically
- 11 List Tags
- 12 Parse HTML
- 13 Text To HTML
- 14 Unescape HTML
- 15 Use javax.swing.text.html.HTMLEditorKit to parse HTML
- 16 Utility methods for dealing with HTML
A collection of all character entities defined in the HTML4 standard.
/**
 * 
 * LibXML : a free Java layouting library
 * 
 *
 * Project Info:  http://reporting.pentaho.org/libxml/
 *
 * (C) Copyright 2006-2008, by Object Refinery Ltd, Pentaho Corporation and Contributors.
 *
 * This library is free software; you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Foundation;
 * either version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * [Java is a trademark or registered trademark of Sun Microsystems, Inc.
 * in the United States and other countries.]
 *
 *
 * ------------
 * HtmlCharacterEntities.java
 * ------------
 */
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
/**
 * A collection of the character entities defined in the HTML4 standard. The
 * property key is the entity name (without the leading ampersand and the
 * trailing semicolon), the property value is the decoded string, which is
 * always exactly one character long.
 *
 * @author Thomas Morgner
 */
public class HtmlCharacterEntities extends Properties
{
  /**
   * The lazily created, shared entity-parser instance for HTML content.
   */
  private static CharacterEntityParser entityParser;
  private static final long serialVersionUID = 5118172339379209383L;
  /**
   * Gets the character entity parser for HTML content. The parser is created
   * on first use from a fresh <code>HtmlCharacterEntities</code> table and
   * the same instance is returned by later calls.
   *
   * @return the character entity parser instance.
   */
  public static CharacterEntityParser getEntityParser()
  {
    if (entityParser == null)
    {
      entityParser = new CharacterEntityParser(new HtmlCharacterEntities());
    }
    return entityParser;
  }
  /**
   * Creates an instance that is pre-filled with the HTML4 entity table.
   */
  public HtmlCharacterEntities()
  {
    setProperty("ang", "\u2220");
    setProperty("spades", "\u2660");
    setProperty("frasl", "\u2044");
    setProperty("copy", "\u00a9");
    setProperty("Upsilon", "\u03a5");
    setProperty("rsquo", "\u2019");
    setProperty("sdot", "\u22c5");
    setProperty("beta", "\u03b2");
    setProperty("egrave", "\u00e8");
    setProperty("Pi", "\u03a0");
    setProperty("micro", "\u00b5");
    setProperty("lArr", "\u21d0");
    setProperty("Beta", "\u0392");
    setProperty("eacute", "\u00e9");
    setProperty("agrave", "\u00e0");
    setProperty("sbquo", "\u201a");
    setProperty("ucirc", "\u00fb");
    setProperty("mdash", "\u2014");
    setProperty("rho", "\u03c1");
    setProperty("Nu", "\u039d");
    setProperty("ne", "\u2260");
    setProperty("nsub", "\u2284");
    setProperty("AElig", "\u00c6");
    setProperty("raquo", "\u00bb");
    setProperty("aacute", "\u00e1");
    setProperty("le", "\u2264");
    setProperty("harr", "\u2194");
    setProperty("frac34", "\u00be");
    setProperty("bdquo", "\u201e");
    setProperty("cup", "\u222a");
    setProperty("frac14", "\u00bc");
    setProperty("exist", "\u2203");
    setProperty("Ccedil", "\u00c7");
    setProperty("phi", "\u03c6");
    setProperty("Lambda", "\u039b");
    setProperty("alpha", "\u03b1");
    setProperty("sigma", "\u03c3");
    setProperty("thetasym", "\u03d1");
    setProperty("Rho", "\u03a1");
    setProperty("hArr", "\u21d4");
    setProperty("Dagger", "\u2021");
    setProperty("otilde", "\u00f5");
    setProperty("Epsilon", "\u0395");
    setProperty("iuml", "\u00ef");
    setProperty("Phi", "\u03a6");
    setProperty("prod", "\u220f");
    setProperty("Aring", "\u00c5");
    setProperty("rlm", "\u200f");
    setProperty("yen", "\u00a5");
    setProperty("emsp", "\u2003");
    setProperty("rang", "\u232a");
    setProperty("Atilde", "\u00c3");
    setProperty("Iuml", "\u00cf");
    setProperty("iota", "\u03b9");
    setProperty("deg", "\u00b0");
    setProperty("prop", "\u221d");
    setProperty("and", "\u2227");
    setProperty("para", "\u00b6");
    setProperty("darr", "\u2193");
    setProperty("curren", "\u00a4");
    setProperty("crarr", "\u21b5");
    setProperty("not", "\u00ac");
    setProperty("Iota", "\u0399");
    setProperty("aelig", "\u00e6");
    setProperty("rdquo", "\u201d");
    setProperty("Ocirc", "\u00d4");
    setProperty("ntilde", "\u00f1");
    setProperty("reg", "\u00ae");
    setProperty("zeta", "\u03b6");
    setProperty("middot", "\u00b7");
    setProperty("cent", "\u00a2");
    setProperty("quot", "\"");
    setProperty("hellip", "\u2026");
    setProperty("Zeta", "\u0396");
    setProperty("rceil", "\u2309");
    setProperty("eta", "\u03b7");
    setProperty("nbsp", "\u00a0");
    setProperty("rarr", "\u2192");
    setProperty("frac12", "\u00bd");
    setProperty("real", "\u211c");
    setProperty("mu", "\u03bc");
    setProperty("dArr", "\u21d3");
    setProperty("divide", "\u00f7");
    setProperty("cap", "\u2229");
    setProperty("chi", "\u03c7");
    setProperty("times", "\u00d7");
    setProperty("euml", "\u00eb");
    setProperty("Gamma", "\u0393");
    setProperty("loz", "\u25ca");
    setProperty("acute", "\u00b4");
    setProperty("Omega", "\u03a9");
    setProperty("ndash", "\u2013");
    setProperty("clubs", "\u2663");
    setProperty("macr", "\u00af");
    setProperty("Yacute", "\u00dd");
    setProperty("Ugrave", "\u00d9");
    setProperty("Euml", "\u00cb");
    setProperty("Eta", "\u0397");
    setProperty("sect", "\u00a7");
    setProperty("asymp", "\u2248");
    setProperty("ordm", "\u00ba");
    setProperty("rArr", "\u21d2");
    setProperty("radic", "\u221a");
    setProperty("Uacute", "\u00da");
    setProperty("omicron", "\u03bf");
    setProperty("Chi", "\u03a7");
    setProperty("aring", "\u00e5");
    setProperty("Theta", "\u0398");
    setProperty("supe", "\u2287");
    setProperty("ensp", "\u2002");
    setProperty("uml", "\u00a8");
    setProperty("ccedil", "\u00e7");
    setProperty("lambda", "\u03bb");
    setProperty("gt", "\u003e");
    setProperty("uarr", "\u2191");
    setProperty("alefsym", "\u2135");
    setProperty("auml", "\u00e4");
    setProperty("sup3", "\u00b3");
    setProperty("circ", "\u02c6");
    setProperty("lsquo", "\u2018");
    setProperty("Auml", "\u00c4");
    setProperty("dagger", "\u2020");
    setProperty("Kappa", "\u039a");
    setProperty("cong", "\u2245");
    setProperty("zwnj", "\u200c");
    setProperty("shy", "\u00ad");
    setProperty("ouml", "\u00f6");
    setProperty("diams", "\u2666");
    setProperty("uArr", "\u21d1");
    setProperty("atilde", "\u00e3");
    setProperty("THORN", "\u00de");
    setProperty("or", "\u2228");
    setProperty("Ograve", "\u00d2");
    setProperty("ocirc", "\u00f4");
    // The HTML4 name of the plus-minus sign (U+00B1) is "plusmn"; the
    // previously registered key "plusm" is not a defined entity name.
    setProperty("plusmn", "\u00b1");
    setProperty("Ouml", "\u00d6");
    setProperty("nabla", "\u2207");
    setProperty("psi", "\u03c8");
    setProperty("sigmaf", "\u03c2");
    setProperty("euro", "\u20ac");
    setProperty("sube", "\u2286");
    setProperty("sup2", "\u00b2");
    setProperty("laquo", "\u00ab");
    setProperty("forall", "\u2200");
    setProperty("Oacute", "\u00d3");
    setProperty("iexcl", "\u00a1");
    fillMoreEntities();
  }
  /**
   * Registers the second half of the entity table. The initialization is
   * split over two methods only to keep each method at a size CheckStyle
   * accepts.
   */
  private void fillMoreEntities()
  {
    setProperty("piv", "\u03d6");
    setProperty("minus", "\u2212");
    setProperty("zwj", "\u200d");
    setProperty("tau", "\u03c4");
    setProperty("Mu", "\u039c");
    setProperty("gamma", "\u03b3");
    setProperty("sup", "\u2283");
    setProperty("Psi", "\u03a8");
    setProperty("omega", "\u03c9");
    setProperty("Oslash", "\u00d8");
    setProperty("weierp", "\u2118");
    setProperty("Igrave", "\u00cc");
    setProperty("OElig", "\u0152");
    setProperty("sup1", "\u00b9");
    setProperty("cedil", "\u00b8");
    setProperty("upsilon", "\u03c5");
    setProperty("equiv", "\u2261");
    setProperty("isin", "\u2208");
    setProperty("Delta", "\u0394");
    setProperty("yacute", "\u00fd");
    setProperty("ugrave", "\u00f9");
    setProperty("ge", "\u2265");
    setProperty("Iacute", "\u00cd");
    setProperty("brvbar", "\u00a6");
    setProperty("Tau", "\u03a4");
    setProperty("Prime", "\u2033");
    // right floor is U+230B (was wrongly mapped to U+22A7, "models")
    setProperty("rfloor", "\u230b");
    setProperty("Ecirc", "\u00ca");
    setProperty("ETH", "\u00d0");
    setProperty("int", "\u222b");
    setProperty("xi", "\u03be");
    setProperty("uacute", "\u00fa");
    setProperty("bull", "\u2022");
    setProperty("Scaron", "\u0160");
    setProperty("theta", "\u03b8");
    setProperty("yuml", "\u00ff");
    setProperty("oplus", "\u2295");
    setProperty("part", "\u2202");
    setProperty("ldquo", "\u201c");
    setProperty("Icirc", "\u00ce");
    setProperty("Yuml", "\u0178");
    setProperty("eth", "\u00f0");
    setProperty("Acirc", "\u00c2");
    setProperty("sub", "\u2282");
    setProperty("lceil", "\u2308");
    setProperty("Egrave", "\u00c8");
    setProperty("tilde", "\u02dc");
    setProperty("pi", "\u03c0");
    setProperty("rsaquo", "\u203a");
    setProperty("kappa", "\u03ba");
    setProperty("upsih", "\u03d2");
    setProperty("Omicron", "\u039f");
    setProperty("otimes", "\u2297");
    setProperty("ni", "\u220b");
    setProperty("amp", "\u0026");
    setProperty("Eacute", "\u00c9");
    setProperty("nu", "\u03bd");
    setProperty("Ucirc", "\u00db");
    setProperty("uuml", "\u00fc");
    setProperty("oslash", "\u00f8");
    setProperty("thorn", "\u00fe");
    setProperty("trade", "\u2122");
    setProperty("epsilon", "\u03b5");
    setProperty("ograve", "\u00f2");
    setProperty("hearts", "\u2665");
    setProperty("iquest", "\u00bf");
    setProperty("Uuml", "\u00dc");
    setProperty("empty", "\u2205");
    setProperty("lowast", "\u2217");
    setProperty("sum", "\u2211");
    // left floor is U+230A (was wrongly mapped to U+22A6, "assertion")
    setProperty("lfloor", "\u230a");
    setProperty("lrm", "\u200e");
    setProperty("oacute", "\u00f3");
    setProperty("image", "\u2111");
    setProperty("Agrave", "\u00c0");
    setProperty("oline", "\u203e");
    setProperty("oelig", "\u0153");
    setProperty("Sigma", "\u03a3");
    setProperty("permil", "\u2030");
    setProperty("perp", "\u22a5");
    setProperty("lt", "\u003c");
    setProperty("Aacute", "\u00c1");
    setProperty("acirc", "\u00e2");
    setProperty("lang", "\u2329");
    setProperty("delta", "\u03b4");
    setProperty("infin", "\u221e");
    setProperty("igrave", "\u00ec");
    setProperty("ordf", "\u00aa");
    setProperty("lsaquo", "\u2039");
    setProperty("prime", "\u2032");
    setProperty("ecirc", "\u00ea");
    setProperty("there4", "\u2234");
    setProperty("iacute", "\u00ed");
    setProperty("sim", "\u223c");
    setProperty("Alpha", "\u0391");
    setProperty("pound", "\u00a3");
    setProperty("notin", "\u2209");
    setProperty("Ntilde", "\u00d1");
    setProperty("Xi", "\u039e");
    setProperty("thinsp", "\u2009");
    setProperty("Otilde", "\u00d5");
    setProperty("icirc", "\u00ee");
    setProperty("scaron", "\u0161");
    setProperty("szlig", "\u00df");
    setProperty("larr", "\u2190");
  }
}
/**
 * 
 * LibXML : a free Java layouting library
 * 
 *
 * Project Info:  http://reporting.pentaho.org/libxml/
 *
 * (C) Copyright 2006-2008, by Object Refinery Ltd, Pentaho Corporation and Contributors.
 *
 * This library is free software; you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Foundation;
 * either version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * [Java is a trademark or registered trademark of Sun Microsystems, Inc.
 * in the United States and other countries.]
 *
 *
 * ------------
 * CharacterEntityParser.java
 * ------------
 */
/**
 * Translates between plain characters and character entities written in the
 * format <code>&amp;entityname;</code>. Encoding replaces every character that
 * has a known entity name by its entity; decoding replaces known named
 * entities and numeric character references (decimal <code>&amp;#65;</code>
 * or hexadecimal <code>&amp;#x41;</code>) by the character they stand for.
 *
 * @author Thomas Morgner
 */
class CharacterEntityParser
{
  /**
   * The largest value that fits into a single Java <code>char</code>.
   */
  private static final int MAX_CHAR_VALUE = 0xFFFF;

  /**
   * Reverse lookup table, indexed by character value; holds the entity name
   * for that character or <code>null</code> if the character has no entity.
   */
  private final String[] charMap;

  /**
   * the entities, keyed by entity name.
   */
  private final HashMap entities;

  /**
   * Creates a new CharacterEntityParser and initializes the parser with the
   * given set of entities.
   *
   * @param characterEntities the entities used for the parser
   * @throws NullPointerException  if the given properties are null.
   * @throws IllegalStateException if an entity value is not exactly one
   *                               character long.
   */
  public CharacterEntityParser(final Properties characterEntities)
  {
    if (characterEntities == null)
    {
      throw new NullPointerException("CharacterEntities must not be null");
    }
    entities = new HashMap(characterEntities);
    charMap = buildCharMap(entities);
  }

  /**
   * Creates a new CharacterEntityParser and initializes the parser with the
   * given set of entities. The map is copied, later changes to the given map
   * do not affect this parser.
   *
   * @param characterEntities the entities used for the parser
   * @throws NullPointerException  if the given map is null.
   * @throws IllegalStateException if an entity value is not exactly one
   *                               character long.
   */
  public CharacterEntityParser(final HashMap characterEntities)
  {
    if (characterEntities == null)
    {
      throw new NullPointerException("CharacterEntities must not be null");
    }
    entities = (HashMap) characterEntities.clone();
    charMap = buildCharMap(entities);
  }

  /**
   * Builds the character-to-entity-name table used by the encoder.
   *
   * @param entityMap the entities, keyed by entity name.
   * @return the lookup table, indexed by character value.
   */
  private static String[] buildCharMap(final Map entityMap)
  {
    final String[] map = new String[MAX_CHAR_VALUE + 1];
    final Iterator entries = entityMap.entrySet().iterator();
    while (entries.hasNext())
    {
      final Map.Entry entry = (Map.Entry) entries.next();
      final String value = (String) entry.getValue();
      final String entityName = (String) entry.getKey();
      if (value.length() != 1)
      {
        throw new IllegalStateException
            ("Entity '" + entityName + "' must map to exactly one character");
      }
      map[value.charAt(0)] = entityName;
    }
    return map;
  }

  /**
   * create a new Character entity parser and initializes the parser with the
   * entities defined in the XML standard.
   *
   * @return the CharacterEntityParser initialized with XML entities.
   */
  public static CharacterEntityParser createXMLEntityParser()
  {
    final HashMap entities = new HashMap();
    entities.put("amp", "&");
    entities.put("quot", "\"");
    entities.put("lt", "<");
    entities.put("gt", ">");
    entities.put("apos", "'");
    return new CharacterEntityParser(entities);
  }

  /**
   * returns the entities used in the parser.
   *
   * @return the properties for this parser.
   */
  private HashMap getEntities()
  {
    return entities;
  }

  /**
   * Looks up the character for the entity name specified in <code>key</code>.
   *
   * @param key the entity name
   * @return the character as string with a length of 1, or null if the name
   *         is not known.
   */
  private String lookupCharacter(final String key)
  {
    return (String) getEntities().get(key);
  }

  /**
   * Encode the given String, so that all known entities are encoded. All
   * characters represented by these entities are replaced by the entity.
   *
   * @param value the original string
   * @return the encoded string.
   * @throws NullPointerException if the value is null.
   */
  public String encodeEntities(final String value)
  {
    if (value == null)
    {
      throw new NullPointerException();
    }
    final int length = value.length();
    final StringBuffer writer = new StringBuffer(length);
    for (int i = 0; i < length; i++)
    {
      final char character = value.charAt(i);
      final String lookup = charMap[character];
      if (lookup == null)
      {
        writer.append(character);
      }
      else
      {
        writer.append('&');
        writer.append(lookup);
        writer.append(';');
      }
    }
    return writer.toString();
  }

  /**
   * Decode the string, all known entities are replaced by their resolved
   * characters. Unknown named entities and invalid numeric references are
   * copied to the result unchanged.
   *
   * @param value the string that should be decoded.
   * @return the decoded string.
   * @throws NullPointerException if the value is null.
   */
  public String decodeEntities(final String value)
  {
    if (value == null)
    {
      throw new NullPointerException();
    }
    int parserIndex = 0;
    int subStart = value.indexOf('&', parserIndex);
    if (subStart == -1)
    {
      return value;
    }
    int subEnd = value.indexOf(';', subStart);
    if (subEnd == -1)
    {
      return value;
    }
    final StringBuffer bufValue = new StringBuffer(value.length());
    bufValue.append(value.substring(0, subStart));
    do
    {
      // at this point we know, that there is at least one entity ..
      if (value.charAt(subStart + 1) == '#')
      {
        final int subValue =
            parseCharacterReference(value.substring(subStart + 2, subEnd));
        if ((subValue >= 1) && (subValue <= MAX_CHAR_VALUE))
        {
          bufValue.append((char) subValue);
        }
        else
        {
          // invalid entity, do not decode; copy it including the ';'
          bufValue.append(value.substring(subStart, subEnd + 1));
        }
      }
      else
      {
        final String entity = value.substring(subStart + 1, subEnd);
        final String replaceString = lookupCharacter(entity);
        if (replaceString != null)
        {
          // entity values are single characters (enforced by the
          // constructors), so there is nothing left to decode in them.
          bufValue.append(replaceString);
        }
        else
        {
          bufValue.append('&');
          bufValue.append(entity);
          bufValue.append(';');
        }
      }
      parserIndex = subEnd + 1;
      subStart = value.indexOf('&', parserIndex);
      if (subStart == -1)
      {
        bufValue.append(value.substring(parserIndex));
        subEnd = -1;
      }
      else
      {
        subEnd = value.indexOf(';', subStart);
        if (subEnd == -1)
        {
          // an ampersand without terminating semicolon: copy the rest as is
          bufValue.append(value.substring(parserIndex));
        }
        else
        {
          bufValue.append(value.substring(parserIndex, subStart));
        }
      }
    }
    while (subStart != -1 && subEnd != -1);
    return bufValue.toString();
  }

  /**
   * Parses the body of a numeric character reference, that is the text
   * between <code>&amp;#</code> and <code>;</code>. A leading 'x' or 'X'
   * selects hexadecimal notation, everything else is read as decimal.
   *
   * @param digits the text of the reference.
   * @return the parsed value, or zero if the text is not a valid number.
   */
  private int parseCharacterReference(final String digits)
  {
    if (digits.length() > 1
        && (digits.charAt(0) == 'x' || digits.charAt(0) == 'X'))
    {
      return parseInt(digits.substring(1), 16, 0);
    }
    return parseInt(digits, 10, 0);
  }

  /**
   * Parses the given string into an int-value. On errors the default value
   * is returned.
   *
   * @param s          the string
   * @param radix      the radix used to interpret the string
   * @param defaultVal the default value that should be used in case of errors
   * @return the parsed int or the default value.
   */
  private int parseInt(final String s, final int radix, final int defaultVal)
  {
    if (s == null)
    {
      return defaultVal;
    }
    try
    {
      return Integer.parseInt(s, radix);
    }
    catch (NumberFormatException ignored)
    {
      // not a number: the caller asked for the default in that case
      return defaultVal;
    }
  }
}
   
   
Convert to HTML string
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
/*
 *  soapUI, copyright (C) 2004-2009 eviware.ru 
 *
 *  soapUI is free software; you can redistribute it and/or modify it under the 
 *  terms of version 2.1 of the GNU Lesser General Public License as published by 
 *  the Free Software Foundation.
 *
 *  soapUI is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without 
 *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
 *  See the GNU Lesser General Public License for more details at gnu.org.
 */
public class Utils {

  /**
   * Wraps the given text into an HTML document, turning every input line
   * into a line terminated by a <code>&lt;br&gt;</code> tag. A line that
   * already consists only of a break tag is not given a second one; the
   * XHTML form <code>&lt;br/&gt;</code> is rewritten as
   * <code>&lt;br&gt;</code>.
   *
   * @param string the text to convert; when <code>StringUtils.isNullOrEmpty</code>
   *               reports it as null or empty, an empty document is returned
   * @return the HTML document as a string
   */
  public static String toHtml( String string )
  {
    if( StringUtils.isNullOrEmpty( string ) )
    {
      return "<html><body></body></html>";
    }

    final BufferedReader lines = new BufferedReader( new StringReader( string ) );
    final StringBuffer html = new StringBuffer( "<html><body>" );
    try
    {
      for( String line = lines.readLine(); line != null; line = lines.readLine() )
      {
        if( line.equalsIgnoreCase( "<br/>" ) )
        {
          // normalise the XHTML break; it already ends the line
          html.append( "<br>" );
        }
        else if( line.equalsIgnoreCase( "<br>" ) )
        {
          // the line is a break by itself, keep it as written
          html.append( line );
        }
        else
        {
          html.append( line );
          html.append( "<br>" );
        }
      }
    }
    catch( IOException e )
    {
      e.printStackTrace();
    }

    html.append( "</body></html>" );
    return html.toString();
  }
}
   
   
Decode an HTML color string like "#F567BA;" into a Color
/*
 * Copyright 2005 Joe Walker
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.awt.Color;
/**
 * Utilities for working with colors.
 * @author Joe Walker [joe at getahead dot ltd dot uk]
 */
public class ColorUtil
{
    /**
     * Factor that turns one hex digit into the value of the doubled digit,
     * e.g. <code>F</code> (15) into <code>FF</code> (255).
     */
    private static final int SHORTHAND_FACTOR = 0x11;

    /**
     * Decode an HTML color string like "#F567BA;" into a {@link Color}.
     * A leading '#' and a trailing ';' are optional. Three notations are
     * accepted: six hex digits (<code>RRGGBB</code>), three hex digits
     * (<code>RGB</code>, each digit is doubled, so "#FFF" is white) and a
     * single hex digit, which is doubled and used for all three channels.
     *
     * @param colorString The string to decode
     * @return The decoded color
     * @throws IllegalArgumentException if the color sequence is not valid;
     *         this includes the {@link NumberFormatException} raised for
     *         characters that are not hex digits
     * @throws NullPointerException if colorString is null
     */
    public static Color decodeHtmlColorString(String colorString)
    {
        String hex = colorString;
        if (hex.startsWith("#"))
        {
            hex = hex.substring(1);
        }
        if (hex.endsWith(";"))
        {
            hex = hex.substring(0, hex.length() - 1);
        }

        switch (hex.length())
        {
        case 6:
            return new Color(
                Integer.parseInt(hex.substring(0, 2), 16),
                Integer.parseInt(hex.substring(2, 4), 16),
                Integer.parseInt(hex.substring(4, 6), 16));
        case 3:
            // shorthand notation: every digit stands for the doubled digit
            return new Color(
                expandShorthand(hex.substring(0, 1)),
                expandShorthand(hex.substring(1, 2)),
                expandShorthand(hex.substring(2, 3)));
        case 1:
            final int grey = expandShorthand(hex);
            return new Color(grey, grey, grey);
        default:
            throw new IllegalArgumentException("Invalid color: " + hex);
        }
    }

    /**
     * Parses a single hex digit and expands it to the full 0-255 range.
     *
     * @param digit a string holding one hex digit
     * @return the channel value of the doubled digit
     */
    private static int expandShorthand(String digit)
    {
        return Integer.parseInt(digit, 16) * SHORTHAND_FACTOR;
    }
}
   
   
Escape HTML
/*
 * Static String formatting and query routines.
 * Copyright (C) 2001-2005 Stephen Ostermiller
 * http://ostermiller.org/contact.pl?regarding=Java+Utilities
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * See COPYING.TXT for details.
 */
import java.util.HashMap;
import java.util.regex.Pattern;
/**
 * Utilities for String formatting, manipulation, and queries.
 *
 * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
 * @since ostermillerutils 1.00.00
 */
public class StringHelper {
  /**
   * Replaces characters that may be confused by a HTML
   * parser with their equivalent character entity references.
   * <p>
   * Any data that will appear as text on a web page should
   * be escaped.  This is especially important for data
   * that comes from untrusted sources such as Internet users.
   * A common mistake in CGI programming is to ask a user for
   * data and then put that data on a web page.  For example:<pre>
   * Server: What is your name?
   * User: &lt;b&gt;Joe&lt;b&gt;
   * Server: Hello <b>Joe</b>, Welcome</pre>
   * If the name is put on the page without checking that it doesn't
   * contain HTML code or without sanitizing that HTML code, the user
   * could reformat the page, insert scripts, and control the
   * content on your web server.
   * <p>
   * This method replaces the double quote, the single quote, the ampersand
   * and the angle brackets with <code>&amp;quot;</code>,
   * <code>&amp;#39;</code>, <code>&amp;amp;</code>, <code>&amp;lt;</code>
   * and <code>&amp;gt;</code>, so that the html parser will
   * be sure to interpret them as plain text rather than HTML or script.
   * Control characters below 32 are removed, except for carriage return,
   * line feed, tab and form feed, which are kept unchanged.
   * <p>
   * This method should be used for both data to be displayed in text
   * in the html document, and data put in form elements.
   *
   * @param s String to be escaped
   * @return escaped String; the very same instance if nothing had to be
   *         escaped or removed
   * @throws NullPointerException if s is null.
   *
   * @since ostermillerutils 1.00.00
   */
  public static String escapeHTML(String s){
    int length = s.length();
    int newLength = length;
    boolean someCharacterEscaped = false;
    // first check for characters that might
    // be dangerous and calculate a length
    // of the string that has escapes.
    for (int i=0; i<length; i++){
      char c = s.charAt(i);
      if (c < 32){
        switch(c){
          case '\r':
          case '\n':
          case '\t':
          case '\f':{
            // harmless white space, kept as is
          } break;
          default: {
            // other control characters are dropped
            newLength -= 1;
            someCharacterEscaped = true;
          }
        }
      } else {
        switch(c){
          case '\"':{
            // &quot; is five characters longer than the quote
            newLength += 5;
            someCharacterEscaped = true;
          } break;
          case '&':
          case '\'':{
            // &amp; and &#39; are four characters longer
            newLength += 4;
            someCharacterEscaped = true;
          } break;
          case '<':
          case '>':{
            // &lt; and &gt; are three characters longer
            newLength += 3;
            someCharacterEscaped = true;
          } break;
          default: {
            // ordinary character, nothing to do
          }
        }
      }
    }
    if (!someCharacterEscaped){
      // nothing to escape in the string
      return s;
    }
    StringBuffer sb = new StringBuffer(newLength);
    for (int i=0; i<length; i++){
      char c = s.charAt(i);
      if (c < 32){
        switch(c){
          case '\r':
          case '\n':
          case '\t':
          case '\f':{
            sb.append(c);
          } break;
          default: {
            // Remove this character
          }
        }
      } else {
        switch(c){
          case '\"':{
            sb.append("&quot;");
          } break;
          case '\'':{
            sb.append("&#39;");
          } break;
          case '&':{
            sb.append("&amp;");
          } break;
          case '<':{
            sb.append("&lt;");
          } break;
          case '>':{
            sb.append("&gt;");
          } break;
          default: {
            sb.append(c);
          }
        }
      }
    }
    return sb.toString();
  }
}
   
   
extends HTMLEditorKit.ParserCallback
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Enumeration;
import javax.swing.text.AttributeSet;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Downloads one web page, parses it with the Swing HTML parser and writes
 * the re-serialized page (see PageSaver) into a local directory tree that
 * mirrors the host name and path of the URL.
 */
public class MainClass {
  /**
   * Fetches the page and saves it below a directory named after the host.
   *
   * @param args ignored
   * @throws Exception if the URL cannot be read, the target directory cannot
   *                   be created or the output file cannot be written
   */
  public static void main(String[] args) throws Exception {
    ParserGetter kit = new ParserGetter();
    HTMLEditorKit.Parser parser = kit.getParser();
    URL u = new URL("http://www.jexp.ru");

    String remoteFileName = u.getFile();
    // A URL without a path yields an empty file part; treat it like "/",
    // otherwise the output "file" would be the directory itself.
    if (remoteFileName.length() == 0 || remoteFileName.endsWith("/")) {
      remoteFileName += "index.html";
    }
    if (remoteFileName.startsWith("/")) {
      remoteFileName = remoteFileName.substring(1);
    }

    // Turn every leading path segment into a nested local directory.
    File localDirectory = new File(u.getHost());
    int slash = remoteFileName.indexOf("/");
    while (slash > -1) {
      String part = remoteFileName.substring(0, slash);
      remoteFileName = remoteFileName.substring(slash + 1);
      localDirectory = new File(localDirectory, part);
      slash = remoteFileName.indexOf("/");
    }

    // mkdirs() returns false when the directory already exists, so check
    // for an existing directory first instead of silently doing nothing.
    if (!localDirectory.isDirectory() && !localDirectory.mkdirs()) {
      throw new IOException("Cannot create directory " + localDirectory);
    }

    File output = new File(localDirectory, remoteFileName);
    InputStream in = u.openStream();
    try {
      InputStreamReader r = new InputStreamReader(in);
      Writer out = new FileWriter(output);
      try {
        HTMLEditorKit.ParserCallback callback = new PageSaver(out, u);
        parser.parse(r, callback, false);
      } finally {
        // closing also flushes whatever the callback has not flushed yet
        out.close();
      }
    } finally {
      in.close();
    }
  }
}
/**
 * Parser callback that writes the parsed document back out as HTML. The
 * link-carrying attributes (href, src, lowsrc, codebase) are resolved
 * against the base URL so that the saved copy keeps pointing at the
 * original resources. I/O errors are reported on System.err and do not
 * abort parsing.
 */
class PageSaver extends HTMLEditorKit.ParserCallback {
  private final Writer out;
  private final URL base;

  /**
   * @param out  the writer that receives the serialized page
   * @param base the URL the page was loaded from; relative links are
   *             resolved against it
   */
  public PageSaver(Writer out, URL base) {
    this.out = out;
    this.base = base;
  }

  /**
   * Writes an opening tag with its attributes. Applet tags without a
   * codebase attribute get one that points at the base URL, with a trailing
   * ".htm"/".html" file name cut off.
   */
  @Override
  public void handleStartTag(HTML.Tag tag, MutableAttributeSet attributes, int position) {
    try {
      out.write("<" + tag);
      this.writeAttributes(attributes);
      if (tag == HTML.Tag.APPLET && attributes.getAttribute(HTML.Attribute.CODEBASE) == null) {
        String codebase = base.toString();
        if (codebase.endsWith(".htm") || codebase.endsWith(".html")) {
          codebase = codebase.substring(0, codebase.lastIndexOf("/"));
        }
        out.write(" codebase=\"" + codebase + "\"");
      }
      out.write(">");
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }

  /** Writes a closing tag. */
  @Override
  public void handleEndTag(HTML.Tag tag, int position) {
    try {
      out.write("</" + tag + ">");
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }

  /**
   * Writes all attributes of a tag as <code> name="value"</code> pairs.
   *
   * @param attributes the attributes delivered by the parser
   * @throws IOException if the writer fails
   */
  private void writeAttributes(AttributeSet attributes) throws IOException {
    Enumeration<?> names = attributes.getAttributeNames();
    while (names.hasMoreElements()) {
      Object name = names.nextElement();
      if (HTMLEditorKit.ParserCallback.IMPLIED.equals(name)) {
        // Marker the parser adds to elements it made up itself; it is not
        // an HTML attribute and its value is not a String.
        continue;
      }
      // Values are not guaranteed to be Strings, so do not cast.
      String value = String.valueOf(attributes.getAttribute(name));
      boolean isLink = name == HTML.Attribute.HREF || name == HTML.Attribute.SRC
          || name == HTML.Attribute.LOWSRC || name == HTML.Attribute.CODEBASE;
      if (isLink) {
        try {
          value = new URL(base, value).toString();
        } catch (MalformedURLException ex) {
          // e.g. an unknown protocol: keep the link as it was written
          System.err.println(ex + " (base " + base + ", value " + value + ")");
        }
      }
      out.write(" " + name + "=\"" + value + "\"");
    }
  }

  /** Writes a comment, surrounded by the HTML comment delimiters. */
  @Override
  public void handleComment(char[] text, int position) {
    try {
      out.write("<!-- ");
      out.write(text);
      out.write(" -->");
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }

  /** Writes character data exactly as delivered by the parser. */
  @Override
  public void handleText(char[] text, int position) {
    try {
      out.write(text);
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }

  /** Writes a tag that has no closing counterpart, e.g. br or img. */
  @Override
  public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attributes, int position) {
    try {
      out.write("<" + tag);
      this.writeAttributes(attributes);
      out.write(">");
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }
}
/**
 * Editor kit subclass whose only purpose is to hand out the kit's HTML parser
 * through a {@code public} method.
 */
class ParserGetter extends HTMLEditorKit {
  /**
   * Returns the parser supplied by the superclass.
   *
   * @return the parser obtained from {@code super.getParser()}
   */
  @Override
  public HTMLEditorKit.Parser getParser() {
    HTMLEditorKit.Parser inherited = super.getParser();
    return inherited;
  }
}
   
   
Filter message string for characters that are sensitive in HTML
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * HTML filter utility.
 *
 * @author Craig R. McClanahan
 * @author Tim Tye
 * @version $Revision: 467217 $ $Date: 2006-10-24 05:14:34 +0200 (mar., 24 oct. 2006) $
 */
public final class HTMLFilter {
    /**
     * Filter the specified message string for characters that are sensitive
     * in HTML.  This avoids potential attacks caused by including JavaScript
     * codes in the request URL that is often reported in error messages.
     *
     * <p>The characters {@code <}, {@code >}, {@code &} and {@code "} are
     * replaced by {@code &lt;}, {@code &gt;}, {@code &amp;} and
     * {@code &quot;} respectively; every other character is copied unchanged.
     *
     * @param message The message string to be filtered
     * @return the filtered string, or {@code null} if {@code message} is
     *         {@code null}
     */
    public static String filter(String message) {
        if (message == null) {
            return null;
        }
        int length = message.length();
        // Leave some headroom for the longer entity replacements.
        StringBuilder result = new StringBuilder(length + 50);
        for (int i = 0; i < length; i++) {
            char c = message.charAt(i);
            switch (c) {
            case '<':
                result.append("&lt;");
                break;
            case '>':
                result.append("&gt;");
                break;
            case '&':
                result.append("&amp;");
                break;
            case '"':
                result.append("&quot;");
                break;
            default:
                result.append(c);
            }
        }
        return result.toString();
    }
}
   
   
Filter the specified message string for characters that are sensitive in HTML
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * HTML filter utility.
 *
 * @author Craig R. McClanahan
 * @author Tim Tye
 * @version $Revision: 467217 $ $Date: 2006-10-24 05:14:34 +0200 (Tue, 24 Oct 2006) $
 */
public final class HTMLFilter {
    /**
     * Filter the specified message string for characters that are sensitive
     * in HTML.  This avoids potential attacks caused by including JavaScript
     * codes in the request URL that is often reported in error messages.
     *
     * <p>The characters {@code <}, {@code >}, {@code &} and {@code "} are
     * replaced by {@code &lt;}, {@code &gt;}, {@code &amp;} and
     * {@code &quot;} respectively; every other character is copied unchanged.
     *
     * @param message The message string to be filtered
     * @return the filtered string, or {@code null} if {@code message} is
     *         {@code null}
     */
    public static String filter(String message) {
        if (message == null) {
            return null;
        }
        int length = message.length();
        // Leave some headroom for the longer entity replacements.
        StringBuilder result = new StringBuilder(length + 50);
        for (int i = 0; i < length; i++) {
            char c = message.charAt(i);
            switch (c) {
            case '<':
                result.append("&lt;");
                break;
            case '>':
                result.append("&gt;");
                break;
            case '&':
                result.append("&amp;");
                break;
            case '"':
                result.append("&quot;");
                break;
            default:
                result.append(c);
            }
        }
        return result.toString();
    }
}
   
   
HTML color names
//Revised from com.greef.ui;
import java.util.Map;
import java.util.HashMap;
import java.util.Collection;
import java.lang.reflect.Field;
import java.awt.Color;
/**
 * HTML color names. Its intended use is parsing a name and returning the
 * corresponding color, or returning a name for a given color.
 *
 * <p>Every {@code public static} {@link Color} constant declared below is
 * registered, under its field name, in two lookup maps when the class is
 * initialized.
 *
 * @author Adrian Ber
 */
public class HTMLColors {
    /** Don't instantiate this, use only the static methods. */
    private HTMLColors() {
    }
    /**
     * Map from lower-case color name to color. Though there is a field for
     * every color, a map is used because it is a faster way to get the color.
     */
    private static final Map<String, Color> name2color = new HashMap<String, Color>();
    /**
     * Map from color to color name. Several constants share one RGB value
     * (aqua/cyan, fuchsia/magenta); for those the name registered last wins,
     * which depends on the order in which reflection reports the fields.
     */
    private static final Map<Color, String> color2name = new HashMap<Color, String>();
    /**
     * Fills both maps from the public static {@link Color} constants of this
     * class. A constant that cannot be read is reported by an exception instead
     * of being silently registered as {@code null}.
     */
    private static void initColorsMap() {
        for (Field field : HTMLColors.class.getFields()) {
            boolean isStatic = java.lang.reflect.Modifier.isStatic(field.getModifiers());
            // Only static fields whose declared type is Color (or a subclass) can
            // be read without an instance and cast safely.
            if (!isStatic || !Color.class.isAssignableFrom(field.getType())) {
                continue;
            }
            try {
                addColor(field.getName(), (Color) field.get(null));
            }
            catch (IllegalAccessException exc) {
                throw new IllegalStateException(
                        "Cannot read color constant " + field.getName(), exc);
            }
        }
    }
    /** Registers one name/color pair in both maps; the name is stored in lower case. */
    private static void addColor(String colorName, Color color) {
        String key = colorName.toLowerCase(java.util.Locale.ROOT);
        name2color.put(key, color);
        color2name.put(color, key);
    }
    /**
     * Returns the name registered for the given color.
     * @param color the color to look up
     * @return the color name, or {@code null} if no constant has that value
     */
    public static String getName(Color color) {
        return color2name.get(color);
    }
    /**
     * Returns the color with the specified case-insensitive name.
     * The name is lower-cased with {@link java.util.Locale#ROOT} so the lookup
     * does not depend on the default locale.
     * @param name the color name, may be {@code null}
     * @return the color, or {@code null} if the name is {@code null} or unknown
     */
    public static Color getColor(String name) {
        if (name == null) {
            return null;
        }
        return name2color.get(name.toLowerCase(java.util.Locale.ROOT));
    }
    /**
     * Returns a collection of all color names.
     * @return a read-only view of the registered names
     */
    public static Collection<String> colors() {
        return java.util.Collections.unmodifiableSet(name2color.keySet());
    }
    /** Transform a color string into a color object.
     *  The string is first tried as a number accepted by
     *  {@link Color#decode(String)} (for example {@code #F567BA}); if that
     *  fails it is looked up as a color name.
     *  @param s the color string
     *  @return the color object, or {@code null} if {@code s} is {@code null}
     *          or is neither a number nor a known name
     */
    public static Color decodeColor(String s) {
        if (s == null)
            return null;
        Color c;
        try {
            c = Color.decode(s);
        }
        catch (NumberFormatException exc) {
            // Not numeric: fall back to the name table.
            c = HTMLColors.getColor(s);
        }
        return c;
    }
    public static final Color aliceblue = new Color(0xf0f8ff);
    public static final Color antiquewhite = new Color(0xfaebd7);
    public static final Color aqua = new Color(0x00ffff);
    public static final Color aquamarine = new Color(0x7fffd4);
    public static final Color azure = new Color(0xf0ffff);
    public static final Color beige = new Color(0xf5f5dc);
    public static final Color bisque = new Color(0xffe4c4);
    public static final Color black = new Color(0x000000);
    public static final Color blanchedalmond = new Color(0xffebcd);
    public static final Color blue = new Color(0x0000ff);
    public static final Color blueviolet = new Color(0x8a2be2);
    public static final Color brown = new Color(0xa52a2a);
    public static final Color burlywood = new Color(0xdeb887);
    public static final Color cadetblue = new Color(0x5f9ea0);
    public static final Color chartreuse = new Color(0x7fff00);
    public static final Color chocolate = new Color(0xd2691e);
    public static final Color coral = new Color(0xff7f50);
    public static final Color cornflowerblue = new Color(0x6495ed);
    public static final Color cornsilk = new Color(0xfff8dc);
    public static final Color crimson = new Color(0xdc143c);
    public static final Color cyan = new Color(0x00ffff);
    public static final Color darkblue = new Color(0x00008b);
    public static final Color darkcyan = new Color(0x008b8b);
    public static final Color darkgoldenrod = new Color(0xb8860b);
    public static final Color darkgray = new Color(0xa9a9a9);
    public static final Color darkgreen = new Color(0x006400);
    public static final Color darkkhaki = new Color(0xbdb76b);
    public static final Color darkmagenta = new Color(0x8b008b);
    public static final Color darkolivegreen = new Color(0x556b2f);
    public static final Color darkorange = new Color(0xff8c00);
    public static final Color darkorchid = new Color(0x9932cc);
    public static final Color darkred = new Color(0x8b0000);
    public static final Color darksalmon = new Color(0xe9967a);
    public static final Color darkseagreen = new Color(0x8fbc8f);
    public static final Color darkslateblue = new Color(0x483d8b);
    public static final Color darkslategray = new Color(0x2f4f4f);
    public static final Color darkturquoise = new Color(0x00ced1);
    public static final Color darkviolet = new Color(0x9400d3);
    public static final Color deeppink = new Color(0xff1493);
    public static final Color deepskyblue = new Color(0x00bfff);
    public static final Color dimgray = new Color(0x696969);
    public static final Color dodgerblue = new Color(0x1e90ff);
    public static final Color firebrick = new Color(0xb22222);
    public static final Color floralwhite = new Color(0xfffaf0);
    public static final Color forestgreen = new Color(0x228b22);
    public static final Color fuchsia = new Color(0xff00ff);
    public static final Color gainsboro = new Color(0xdcdcdc);
    public static final Color ghostwhite = new Color(0xf8f8ff);
    public static final Color gold = new Color(0xffd700);
    public static final Color goldenrod = new Color(0xdaa520);
    public static final Color gray = new Color(0x808080);
    public static final Color green = new Color(0x008000);
    public static final Color greenyellow = new Color(0xadff2f);
    public static final Color honeydew = new Color(0xf0fff0);
    public static final Color hotpink = new Color(0xff69b4);
    public static final Color indianred = new Color(0xcd5c5c);
    public static final Color indigo = new Color(0x4b0082);
    public static final Color ivory = new Color(0xfffff0);
    public static final Color khaki = new Color(0xf0e68c);
    public static final Color lavender = new Color(0xe6e6fa);
    public static final Color lavenderblush = new Color(0xfff0f5);
    public static final Color lawngreen = new Color(0x7cfc00);
    public static final Color lemonchiffon = new Color(0xfffacd);
    public static final Color lightblue = new Color(0xadd8e6);
    public static final Color lightcoral = new Color(0xf08080);
    public static final Color lightcyan = new Color(0xe0ffff);
    public static final Color lightgoldenrodyellow = new Color(0xfafad2);
    public static final Color lightgreen = new Color(0x90ee90);
    public static final Color lightgrey = new Color(0xd3d3d3);
    public static final Color lightpink = new Color(0xffb6c1);
    public static final Color lightsalmon = new Color(0xffa07a);
    public static final Color lightseagreen = new Color(0x20b2aa);
    public static final Color lightskyblue = new Color(0x87cefa);
    public static final Color lightslategray = new Color(0x778899);
    public static final Color lightsteelblue = new Color(0xb0c4de);
    public static final Color lightyellow = new Color(0xffffe0);
    public static final Color lime = new Color(0x00ff00);
    public static final Color limegreen = new Color(0x32cd32);
    public static final Color linen = new Color(0xfaf0e6);
    public static final Color magenta = new Color(0xff00ff);
    public static final Color maroon = new Color(0x800000);
    public static final Color mediumaquamarine = new Color(0x66cdaa);
    public static final Color mediumblue = new Color(0x0000cd);
    public static final Color mediumorchid = new Color(0xba55d3);
    public static final Color mediumpurple = new Color(0x9370db);
    public static final Color mediumseagreen = new Color(0x3cb371);
    public static final Color mediumslateblue = new Color(0x7b68ee);
    public static final Color mediumspringgreen = new Color(0x00fa9a);
    public static final Color mediumturquoise = new Color(0x48d1cc);
    public static final Color mediumvioletred = new Color(0xc71585);
    public static final Color midnightblue = new Color(0x191970);
    public static final Color mintcream = new Color(0xf5fffa);
    public static final Color mistyrose = new Color(0xffe4e1);
    public static final Color moccasin = new Color(0xffe4b5);
    public static final Color navajowhite = new Color(0xffdead);
    public static final Color navy = new Color(0x000080);
    public static final Color oldlace = new Color(0xfdf5e6);
    public static final Color olive = new Color(0x808000);
    public static final Color olivedrab = new Color(0x6b8e23);
    public static final Color orange = new Color(0xffa500);
    public static final Color orangered = new Color(0xff4500);
    public static final Color orchid = new Color(0xda70d6);
    public static final Color palegoldenrod = new Color(0xeee8aa);
    public static final Color palegreen = new Color(0x98fb98);
    public static final Color paleturquoise = new Color(0xafeeee);
    public static final Color palevioletred = new Color(0xdb7093);
    public static final Color papayawhip = new Color(0xffefd5);
    public static final Color peachpuff = new Color(0xffdab9);
    public static final Color peru = new Color(0xcd853f);
    public static final Color pink = new Color(0xffc0cb);
    public static final Color plum = new Color(0xdda0dd);
    public static final Color powderblue = new Color(0xb0e0e6);
    public static final Color purple = new Color(0x800080);
    public static final Color red = new Color(0xff0000);
    public static final Color rosybrown = new Color(0xbc8f8f);
    public static final Color royalblue = new Color(0x4169e1);
    public static final Color saddlebrown = new Color(0x8b4513);
    public static final Color salmon = new Color(0xfa8072);
    public static final Color sandybrown = new Color(0xf4a460);
    public static final Color seagreen = new Color(0x2e8b57);
    public static final Color seashell = new Color(0xfff5ee);
    public static final Color sienna = new Color(0xa0522d);
    public static final Color silver = new Color(0xc0c0c0);
    public static final Color skyblue = new Color(0x87ceeb);
    public static final Color slateblue = new Color(0x6a5acd);
    public static final Color slategray = new Color(0x708090);
    public static final Color snow = new Color(0xfffafa);
    public static final Color springgreen = new Color(0x00ff7f);
    public static final Color steelblue = new Color(0x4682b4);
    public static final Color tan = new Color(0xd2b48c);
    public static final Color teal = new Color(0x008080);
    public static final Color thistle = new Color(0xd8bfd8);
    public static final Color tomato = new Color(0xff6347);
    public static final Color turquoise = new Color(0x40e0d0);
    public static final Color violet = new Color(0xee82ee);
    public static final Color wheat = new Color(0xf5deb3);
    public static final Color white = new Color(0xffffff);
    public static final Color whitesmoke = new Color(0xf5f5f5);
    public static final Color yellow = new Color(0xffff00);
    public static final Color yellowgreen = new Color(0x9acd32);
    // Must stay after the constants: static initializers run in textual order,
    // so the fields above are already assigned when the maps are filled.
    static {
        initColorsMap();
    }
}
   
   
html parser DTD
import java.io.IOException;
import javax.swing.text.html.parser.DTD;
/**
 * Prints the names of the first 14 elements of the DTD returned by
 * {@code DTD.getDTD("html")}, one per line.
 */
public class MainClass {
  public static void main(String[] args) {
    final int elementCount = 14;
    DTD htmlDtd;
    try {
      htmlDtd = DTD.getDTD("html");
    } catch (IOException failure) {
      System.err.println(failure);
      failure.printStackTrace();
      return;
    }
    int index = 0;
    while (index < elementCount) {
      String elementName = htmlDtd.getElement(index).getName();
      System.out.println(elementName);
      index++;
    }
  }
}
   
   
#pcdata html meta base isindex head body applet param p title style link script
insert HTML block dynamically
/*
 *  Licensed to the Apache Software Foundation (ASF) under one
 *  or more contributor license agreements.  See the NOTICE file
 *  distributed with this work for additional information
 *  regarding copyright ownership.  The ASF licenses this file
 *  to you under the Apache License, Version 2.0 (the
 *  "License"); you may not use this file except in compliance
 *  with the License.  You may obtain a copy of the License at
 *  
 *    http://www.apache.org/licenses/LICENSE-2.0
 *  
 *  Unless required by applicable law or agreed to in writing,
 *  software distributed under the License is distributed on an
 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *  KIND, either express or implied.  See the License for the
 *  specific language governing permissions and limitations
 *  under the License. 
 *  
 */
/**
 * Various string manipulation methods that are more efficient than chaining
 * string operations: all is done in the same buffer without creating a bunch of
 * string objects.
 */
public class Main {
  /**
   * This method is used to insert HTML block dynamically.
   *
   * <p>Carriage returns are always dropped and {@code &} is always replaced by
   * {@code &amp;}; the remaining replacements are controlled by the flags.
   * 
   * @param source
   *            the HTML code to be processed
   * @param replaceNl
   *            if true '\n' will be replaced by {@code <br>} (or by
   *            {@code &lt;br&gt;} when {@code replaceTag} is also true)
   * @param replaceTag
   *            if true '&lt;' will be replaced by {@code &lt;} and '&gt;' will
   *            be replaced by {@code &gt;}
   * @param replaceQuote
   *            if true '\"' will be replaced by {@code &quot;}
   * @return the formatted html block, or {@code null} if {@code source} is
   *         {@code null}
   */
  public static final String formatHtml( String source, boolean replaceNl, boolean replaceTag,
      boolean replaceQuote )
  {
      if ( source == null )
      {
          return null;
      }

      int len = source.length();
      StringBuilder buf = new StringBuilder( len );

      for ( int ii = 0; ii < len; ii++ )
      {
          char ch = source.charAt( ii );
          
          switch ( ch )
          {
              case '\"':
                  if ( replaceQuote )
                  {
                      buf.append( "&quot;" );
                  }
                  else
                  {
                      buf.append( ch );
                  }
                  break;
              case '<':
                  if ( replaceTag )
                  {
                      buf.append( "&lt;" );
                  }
                  else
                  {
                      buf.append( ch );
                  }
                  break;
              case '>':
                  if ( replaceTag )
                  {
                      buf.append( "&gt;" );
                  }
                  else
                  {
                      buf.append( ch );
                  }
                  break;
              case '\n':
                  if ( replaceNl )
                  {
                      if ( replaceTag )
                      {
                          // Tags are being escaped, so the inserted break is escaped too.
                          buf.append( "&lt;br&gt;" );
                      }
                      else
                      {
                          buf.append( "<br>" );
                      }
                  }
                  else
                  {
                      buf.append( ch );
                  }
                  break;
              case '\r':
                  // Carriage returns are dropped; the '\n' that follows carries the line break.
                  break;
              case '&':
                  buf.append( "&amp;" );
                  break;
              default:
                  buf.append( ch );
                  break;
          }
      }
      return buf.toString();
  }
}
   
   
List Tags
import javax.swing.text.html.HTML;
/**
 * Prints every tag returned by {@code HTML.getAllTags()}, numbered from 1,
 * one per line in the form {@code number: tag}.
 */
public class MainClass {
  public static void main(String[] args) {
    int ordinal = 0;
    for (HTML.Tag tag : HTML.getAllTags()) {
      ordinal++;
      System.out.println(ordinal + ": " + tag);
    }
  }
}
   
   
1: a 2: address 3: applet 4: area 5: b 6: base 7: basefont 8: big 9: blockquote 10: body 11: br 12: caption 13: center 14: cite 15: code 16: dd 17: dfn 18: dir 19: div 20: dl 21: dt 22: em 23: font 24: form 25: frame 26: frameset 27: h1 28: h2 29: h3 30: h4 31: h5 32: h6 33: head 34: hr 35: html 36: i 37: img 38: input 39: isindex 40: kbd 41: li 42: link 43: map 44: menu 45: meta 46: nobr 47: noframes 48: object 49: ol 50: option 51: p 52: param 53: pre 54: samp 55: script 56: select 57: small 58: span 59: strike 60: s 61: strong 62: style 63: sub 64: sup 65: table 66: td 67: textarea 68: th 69: title 70: tr 71: tt 72: u 73: ul 74: var
Parse HTML
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.Enumeration;
import javax.swing.text.AttributeSet;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Downloads a page, runs it through the Swing HTML parser and lets a
 * {@code ReportAttributes} callback print the attributes of every tag.
 */
public class MainClass {
  public static void main(String[] args) {
    ParserGetter kit = new ParserGetter();
    HTMLEditorKit.Parser parser = kit.getParser();
    HTMLEditorKit.ParserCallback callback = new ReportAttributes();
    InputStream in = null;
    try {
      URL u = new URL("http://www.jexp.ru");
      in = u.openStream();
      InputStreamReader r = new InputStreamReader(in);
      // The final argument is the parser's ignoreCharSet flag.
      parser.parse(r, callback, false);
    } catch (IOException e) {
      System.err.println(e);
    } finally {
      // Always release the connection, whether or not parsing succeeded.
      if (in != null) {
        try {
          in.close();
        } catch (IOException closeFailure) {
          System.err.println(closeFailure);
        }
      }
    }
  }
}
/**
 * Parser callback that prints the attributes of every start tag and simple
 * tag as {@code name=value} lines on standard output.
 */
class ReportAttributes extends HTMLEditorKit.ParserCallback {
  public void handleStartTag(HTML.Tag tag, MutableAttributeSet attributes, int position) {
    this.listAttributes(attributes);
  }
  public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attributes, int position) {
    this.listAttributes(attributes);
  }
  /**
   * Prints each attribute of the set. Before printing a pair it also queries
   * the set using the string form of the attribute name and prints a
   * diagnostic line for each of the two queries that does not succeed.
   */
  private void listAttributes(AttributeSet attributes) {
    for (Enumeration<?> names = attributes.getAttributeNames(); names.hasMoreElements();) {
      Object key = names.nextElement();
      Object val = attributes.getAttribute(key);
      String keyText = key.toString();
      boolean foundByText = attributes.containsAttribute(keyText, val);
      if (!foundByText) {
        System.out.println("containsAttribute() fails");
      }
      boolean definedByText = attributes.isDefined(keyText);
      if (!definedByText) {
        System.out.println("isDefined() fails");
      }
      System.out.println(key + "=" + val);
    }
  }
}
/**
 * Editor kit subclass whose only purpose is to hand out the kit's HTML parser
 * through a {@code public} method.
 */
class ParserGetter extends HTMLEditorKit {
  /**
   * Returns the parser supplied by the superclass.
   *
   * @return the parser obtained from {@code super.getParser()}
   */
  @Override
  public HTMLEditorKit.Parser getParser() {
    HTMLEditorKit.Parser inherited = super.getParser();
    return inherited;
  }
}
   
   
Text To HTML
/*
    GNU LESSER GENERAL PUBLIC LICENSE
    Copyright (C) 2006 The XAMJ Project
    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.
    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.
    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    Contact info: lobochief@users.sourceforge.net
*/
/**
 * Helper for turning plain text into HTML markup.
 */
public class Html {
  /**
   * Converts plain text to HTML: the characters {@code "}, {@code <},
   * {@code >} and {@code &} are replaced by {@code &quot;}, {@code &lt;},
   * {@code &gt;} and {@code &amp;}, and every line break becomes
   * {@code <br>}.
   *
   * <p>A line break is a {@code \n}, a {@code \r\n} pair (one break), or a
   * {@code \r} on its own, including a {@code \r} that ends the text.
   *
   * @param text the plain text, may be {@code null}
   * @return the HTML form of the text, or {@code null} if {@code text} is
   *         {@code null}
   */
  public static String textToHTML(String text) {
    if(text == null) {
      return null;
    }
    int length = text.length();
    // True while a '\r' has been seen whose line break is not yet written; it
    // is held back so that "\r\n" yields a single <br>.
    boolean pendingCr = false;
    StringBuilder out = new StringBuilder(length);
    for(int i = 0; i < length; i++) {
      char ch = text.charAt(i);
      if(ch == '\r') {
        if(pendingCr) {
          out.append("<br>");
        }
        pendingCr = true;
        continue;
      }
      if(ch == '\n') {
        // Covers both a bare '\n' and the second half of "\r\n".
        pendingCr = false;
        out.append("<br>");
        continue;
      }
      if(pendingCr) {
        // The previous '\r' stood alone: emit its break before this character.
        out.append("<br>");
        pendingCr = false;
      }
      switch(ch) {
      case '"':
        out.append("&quot;");
        break;
      case '<':
        out.append("&lt;");
        break;
      case '>':
        out.append("&gt;");
        break;
      case '&':
        out.append("&amp;");
        break;
      default:
        out.append(ch);
        break;
      }
    }
    if(pendingCr) {
      // A '\r' at the very end of the text is still a line break.
      out.append("<br>");
    }
    return out.toString();
  }
}
   
   
Unescape HTML
/*
 * Static String formatting and query routines.
 * Copyright (C) 2001-2005 Stephen Ostermiller
 * http://ostermiller.org/contact.pl?regarding=Java+Utilities
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * See COPYING.TXT for details.
 */
import java.util.HashMap;
import java.util.regex.Pattern;
/**
 * Utilities for String formatting, manipulation, and queries.
 * More information about this class is available from ostermiller.org
 * (http://ostermiller.org/utils/StringHelper.html).
 *
 * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
 * @since ostermillerutils 1.00.00
 */
public class StringHelper {
  private static HashMap<String,Integer> htmlEntities = new HashMap<String,Integer>();
  static {
    htmlEntities.put("nbsp", new Integer(160));
    htmlEntities.put("iexcl", new Integer(161));
    htmlEntities.put("cent", new Integer(162));
    htmlEntities.put("pound", new Integer(163));
    htmlEntities.put("curren", new Integer(164));
    htmlEntities.put("yen", new Integer(165));
    htmlEntities.put("brvbar", new Integer(166));
    htmlEntities.put("sect", new Integer(167));
    htmlEntities.put("uml", new Integer(168));
    htmlEntities.put("copy", new Integer(169));
    htmlEntities.put("ordf", new Integer(170));
    htmlEntities.put("laquo", new Integer(171));
    htmlEntities.put("not", new Integer(172));
    htmlEntities.put("shy", new Integer(173));
    htmlEntities.put("reg", new Integer(174));
    htmlEntities.put("macr", new Integer(175));
    htmlEntities.put("deg", new Integer(176));
    htmlEntities.put("plusmn", new Integer(177));
    htmlEntities.put("sup2", new Integer(178));
    htmlEntities.put("sup3", new Integer(179));
    htmlEntities.put("acute", new Integer(180));
    htmlEntities.put("micro", new Integer(181));
    htmlEntities.put("para", new Integer(182));
    htmlEntities.put("middot", new Integer(183));
    htmlEntities.put("cedil", new Integer(184));
    htmlEntities.put("sup1", new Integer(185));
    htmlEntities.put("ordm", new Integer(186));
    htmlEntities.put("raquo", new Integer(187));
    htmlEntities.put("frac14", new Integer(188));
    htmlEntities.put("frac12", new Integer(189));
    htmlEntities.put("frac34", new Integer(190));
    htmlEntities.put("iquest", new Integer(191));
    htmlEntities.put("Agrave", new Integer(192));
    htmlEntities.put("Aacute", new Integer(193));
    htmlEntities.put("Acirc", new Integer(194));
    htmlEntities.put("Atilde", new Integer(195));
    htmlEntities.put("Auml", new Integer(196));
    htmlEntities.put("Aring", new Integer(197));
    htmlEntities.put("AElig", new Integer(198));
    htmlEntities.put("Ccedil", new Integer(199));
    htmlEntities.put("Egrave", new Integer(200));
    htmlEntities.put("Eacute", new Integer(201));
    htmlEntities.put("Ecirc", new Integer(202));
    htmlEntities.put("Euml", new Integer(203));
    htmlEntities.put("Igrave", new Integer(204));
    htmlEntities.put("Iacute", new Integer(205));
    htmlEntities.put("Icirc", new Integer(206));
    htmlEntities.put("Iuml", new Integer(207));
    htmlEntities.put("ETH", new Integer(208));
    htmlEntities.put("Ntilde", new Integer(209));
    htmlEntities.put("Ograve", new Integer(210));
    htmlEntities.put("Oacute", new Integer(211));
    htmlEntities.put("Ocirc", new Integer(212));
    htmlEntities.put("Otilde", new Integer(213));
    htmlEntities.put("Ouml", new Integer(214));
    htmlEntities.put("times", new Integer(215));
    htmlEntities.put("Oslash", new Integer(216));
    htmlEntities.put("Ugrave", new Integer(217));
    htmlEntities.put("Uacute", new Integer(218));
    htmlEntities.put("Ucirc", new Integer(219));
    htmlEntities.put("Uuml", new Integer(220));
    htmlEntities.put("Yacute", new Integer(221));
    htmlEntities.put("THORN", new Integer(222));
    htmlEntities.put("szlig", new Integer(223));
    htmlEntities.put("agrave", new Integer(224));
    htmlEntities.put("aacute", new Integer(225));
    htmlEntities.put("acirc", new Integer(226));
    htmlEntities.put("atilde", new Integer(227));
    htmlEntities.put("auml", new Integer(228));
    htmlEntities.put("aring", new Integer(229));
    htmlEntities.put("aelig", new Integer(230));
    htmlEntities.put("ccedil", new Integer(231));
    htmlEntities.put("egrave", new Integer(232));
    htmlEntities.put("eacute", new Integer(233));
    htmlEntities.put("ecirc", new Integer(234));
    htmlEntities.put("euml", new Integer(235));
    htmlEntities.put("igrave", new Integer(236));
    htmlEntities.put("iacute", new Integer(237));
    htmlEntities.put("icirc", new Integer(238));
    htmlEntities.put("iuml", new Integer(239));
    htmlEntities.put("eth", new Integer(240));
    htmlEntities.put("ntilde", new Integer(241));
    htmlEntities.put("ograve", new Integer(242));
    htmlEntities.put("oacute", new Integer(243));
    htmlEntities.put("ocirc", new Integer(244));
    htmlEntities.put("otilde", new Integer(245));
    htmlEntities.put("ouml", new Integer(246));
    htmlEntities.put("divide", new Integer(247));
    htmlEntities.put("oslash", new Integer(248));
    htmlEntities.put("ugrave", new Integer(249));
    htmlEntities.put("uacute", new Integer(250));
    htmlEntities.put("ucirc", new Integer(251));
    htmlEntities.put("uuml", new Integer(252));
    htmlEntities.put("yacute", new Integer(253));
    htmlEntities.put("thorn", new Integer(254));
    htmlEntities.put("yuml", new Integer(255));
    htmlEntities.put("fnof", new Integer(402));
    htmlEntities.put("Alpha", new Integer(913));
    htmlEntities.put("Beta", new Integer(914));
    htmlEntities.put("Gamma", new Integer(915));
    htmlEntities.put("Delta", new Integer(916));
    htmlEntities.put("Epsilon", new Integer(917));
    htmlEntities.put("Zeta", new Integer(918));
    htmlEntities.put("Eta", new Integer(919));
    htmlEntities.put("Theta", new Integer(920));
    htmlEntities.put("Iota", new Integer(921));
    htmlEntities.put("Kappa", new Integer(922));
    htmlEntities.put("Lambda", new Integer(923));
    htmlEntities.put("Mu", new Integer(924));
    htmlEntities.put("Nu", new Integer(925));
    htmlEntities.put("Xi", new Integer(926));
    htmlEntities.put("Omicron", new Integer(927));
    htmlEntities.put("Pi", new Integer(928));
    htmlEntities.put("Rho", new Integer(929));
    htmlEntities.put("Sigma", new Integer(931));
    htmlEntities.put("Tau", new Integer(932));
    htmlEntities.put("Upsilon", new Integer(933));
    htmlEntities.put("Phi", new Integer(934));
    htmlEntities.put("Chi", new Integer(935));
    htmlEntities.put("Psi", new Integer(936));
    htmlEntities.put("Omega", new Integer(937));
    htmlEntities.put("alpha", new Integer(945));
    htmlEntities.put("beta", new Integer(946));
    htmlEntities.put("gamma", new Integer(947));
    htmlEntities.put("delta", new Integer(948));
    htmlEntities.put("epsilon", new Integer(949));
    htmlEntities.put("zeta", new Integer(950));
    htmlEntities.put("eta", new Integer(951));
    htmlEntities.put("theta", new Integer(952));
    htmlEntities.put("iota", new Integer(953));
    htmlEntities.put("kappa", new Integer(954));
    htmlEntities.put("lambda", new Integer(955));
    htmlEntities.put("mu", new Integer(956));
    htmlEntities.put("nu", new Integer(957));
    htmlEntities.put("xi", new Integer(958));
    htmlEntities.put("omicron", new Integer(959));
    htmlEntities.put("pi", new Integer(960));
    htmlEntities.put("rho", new Integer(961));
    htmlEntities.put("sigmaf", new Integer(962));
    htmlEntities.put("sigma", new Integer(963));
    htmlEntities.put("tau", new Integer(964));
    htmlEntities.put("upsilon", new Integer(965));
    htmlEntities.put("phi", new Integer(966));
    htmlEntities.put("chi", new Integer(967));
    htmlEntities.put("psi", new Integer(968));
    htmlEntities.put("omega", new Integer(969));
    htmlEntities.put("thetasym", new Integer(977));
    htmlEntities.put("upsih", new Integer(978));
    htmlEntities.put("piv", new Integer(982));
    htmlEntities.put("bull", new Integer(8226));
    htmlEntities.put("hellip", new Integer(8230));
    htmlEntities.put("prime", new Integer(8242));
    htmlEntities.put("Prime", new Integer(8243));
    htmlEntities.put("oline", new Integer(8254));
    htmlEntities.put("frasl", new Integer(8260));
    htmlEntities.put("weierp", new Integer(8472));
    htmlEntities.put("image", new Integer(8465));
    htmlEntities.put("real", new Integer(8476));
    htmlEntities.put("trade", new Integer(8482));
    htmlEntities.put("alefsym", new Integer(8501));
    htmlEntities.put("larr", new Integer(8592));
    htmlEntities.put("uarr", new Integer(8593));
    htmlEntities.put("rarr", new Integer(8594));
    htmlEntities.put("darr", new Integer(8595));
    htmlEntities.put("harr", new Integer(8596));
    htmlEntities.put("crarr", new Integer(8629));
    htmlEntities.put("lArr", new Integer(8656));
    htmlEntities.put("uArr", new Integer(8657));
    htmlEntities.put("rArr", new Integer(8658));
    htmlEntities.put("dArr", new Integer(8659));
    htmlEntities.put("hArr", new Integer(8660));
    htmlEntities.put("forall", new Integer(8704));
    htmlEntities.put("part", new Integer(8706));
    htmlEntities.put("exist", new Integer(8707));
    htmlEntities.put("empty", new Integer(8709));
    htmlEntities.put("nabla", new Integer(8711));
    htmlEntities.put("isin", new Integer(8712));
    htmlEntities.put("notin", new Integer(8713));
    htmlEntities.put("ni", new Integer(8715));
    htmlEntities.put("prod", new Integer(8719));
    htmlEntities.put("sum", new Integer(8721));
    htmlEntities.put("minus", new Integer(8722));
    htmlEntities.put("lowast", new Integer(8727));
    htmlEntities.put("radic", new Integer(8730));
    htmlEntities.put("prop", new Integer(8733));
    htmlEntities.put("infin", new Integer(8734));
    htmlEntities.put("ang", new Integer(8736));
    htmlEntities.put("and", new Integer(8743));
    htmlEntities.put("or", new Integer(8744));
    htmlEntities.put("cap", new Integer(8745));
    htmlEntities.put("cup", new Integer(8746));
    htmlEntities.put("int", new Integer(8747));
    htmlEntities.put("there4", new Integer(8756));
    htmlEntities.put("sim", new Integer(8764));
    htmlEntities.put("cong", new Integer(8773));
    htmlEntities.put("asymp", new Integer(8776));
    htmlEntities.put("ne", new Integer(8800));
    htmlEntities.put("equiv", new Integer(8801));
    htmlEntities.put("le", new Integer(8804));
    htmlEntities.put("ge", new Integer(8805));
    htmlEntities.put("sub", new Integer(8834));
    htmlEntities.put("sup", new Integer(8835));
    htmlEntities.put("nsub", new Integer(8836));
    htmlEntities.put("sube", new Integer(8838));
    htmlEntities.put("supe", new Integer(8839));
    htmlEntities.put("oplus", new Integer(8853));
    htmlEntities.put("otimes", new Integer(8855));
    htmlEntities.put("perp", new Integer(8869));
    htmlEntities.put("sdot", new Integer(8901));
    htmlEntities.put("lceil", new Integer(8968));
    htmlEntities.put("rceil", new Integer(8969));
    htmlEntities.put("lfloor", new Integer(8970));
    htmlEntities.put("rfloor", new Integer(8971));
    htmlEntities.put("lang", new Integer(9001));
    htmlEntities.put("rang", new Integer(9002));
    htmlEntities.put("loz", new Integer(9674));
    htmlEntities.put("spades", new Integer(9824));
    htmlEntities.put("clubs", new Integer(9827));
    htmlEntities.put("hearts", new Integer(9829));
    htmlEntities.put("diams", new Integer(9830));
    htmlEntities.put("quot", new Integer(34));
    htmlEntities.put("amp", new Integer(38));
    htmlEntities.put("lt", new Integer(60));
    htmlEntities.put("gt", new Integer(62));
    htmlEntities.put("OElig", new Integer(338));
    htmlEntities.put("oelig", new Integer(339));
    htmlEntities.put("Scaron", new Integer(352));
    htmlEntities.put("scaron", new Integer(353));
    htmlEntities.put("Yuml", new Integer(376));
    htmlEntities.put("circ", new Integer(710));
    htmlEntities.put("tilde", new Integer(732));
    htmlEntities.put("ensp", new Integer(8194));
    htmlEntities.put("emsp", new Integer(8195));
    htmlEntities.put("thinsp", new Integer(8201));
    htmlEntities.put("zwnj", new Integer(8204));
    htmlEntities.put("zwj", new Integer(8205));
    htmlEntities.put("lrm", new Integer(8206));
    htmlEntities.put("rlm", new Integer(8207));
    htmlEntities.put("ndash", new Integer(8211));
    htmlEntities.put("mdash", new Integer(8212));
    htmlEntities.put("lsquo", new Integer(8216));
    htmlEntities.put("rsquo", new Integer(8217));
    htmlEntities.put("sbquo", new Integer(8218));
    htmlEntities.put("ldquo", new Integer(8220));
    htmlEntities.put("rdquo", new Integer(8221));
    htmlEntities.put("bdquo", new Integer(8222));
    htmlEntities.put("dagger", new Integer(8224));
    htmlEntities.put("Dagger", new Integer(8225));
    htmlEntities.put("permil", new Integer(8240));
    htmlEntities.put("lsaquo", new Integer(8249));
    htmlEntities.put("rsaquo", new Integer(8250));
    htmlEntities.put("euro", new Integer(8364));
  }
  /**
   * Turn any HTML escape entities in the string into
   * characters and return the resulting string.
   *
   * @param s String to be unescaped.
   * @return unescaped String.
   * @throws NullPointerException if s is null.
   *
   * @since ostermillerutils 1.00.00
   */
  public static String unescapeHTML(String s){
    StringBuffer result = new StringBuffer(s.length());
    int ampInd = s.indexOf("&");
    int lastEnd = 0;
    while (ampInd >= 0){
      int nextAmp = s.indexOf("&", ampInd+1);
      int nextSemi = s.indexOf(";", ampInd+1);
      if (nextSemi != -1 && (nextAmp == -1 || nextSemi < nextAmp)){
        int value = -1;
        String escape = s.substring(ampInd+1,nextSemi);
        try {
          if (escape.startsWith("#")){
            value = Integer.parseInt(escape.substring(1), 10);
          } else {
            if (htmlEntities.containsKey(escape)){
              value = ((Integer)(htmlEntities.get(escape))).intValue();
            }
          }
        } catch (NumberFormatException x){
        }
        result.append(s.substring(lastEnd, ampInd));
        lastEnd = nextSemi + 1;
        if (value >= 0 && value <= 0xffff){
          result.append((char)value);
        } else {
          result.append("&").append(escape).append(";");
        }
      }
      ampInd = nextAmp;
    }
    result.append(s.substring(lastEnd));
    return result.toString();
  }
}
   
   
Use javax.swing.text.html.HTMLEditorKit to parse HTML
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Demo entry point: fetches a page, determines its character encoding and
 * prints an outline of its headings (H1-H6) to standard output.
 */
public class MainClass {
  /**
   * Parses the document at the URL with the given encoding and writes the
   * heading outline to System.out.
   *
   * @param url      location of the HTML document.
   * @param encoding charset name used to decode the document bytes.
   * @throws IOException if the document cannot be opened or read.
   */
  private static void parse(URL url, String encoding) throws IOException {
    ParserGetter kit = new ParserGetter();
    HTMLEditorKit.Parser parser = kit.getParser();
    InputStream in = url.openStream();
    try {
      InputStreamReader r = new InputStreamReader(in, encoding);
      HTMLEditorKit.ParserCallback callback = new Outliner(new OutputStreamWriter(System.out));
      // ignoreCharSet == true: the encoding has already been settled by the caller.
      parser.parse(r, callback, true);
    } finally {
      in.close();
    }
  }
  /**
   * Reads the page once with a default encoding to discover the charset it
   * declares, then parses it again with that charset to print the outline.
   *
   * @param args ignored.
   * @throws Exception on any I/O or parsing failure.
   */
  public static void main(String[] args) throws Exception {
    ParserGetter kit = new ParserGetter();
    HTMLEditorKit.Parser parser = kit.getParser();
    String encoding = "ISO-8859-1";
    URL url = new URL("http://www.jexp.ru");
    InputStream in = url.openStream();
    try {
      InputStreamReader r = new InputStreamReader(in, encoding);
      // parse once just to detect the encoding
      HTMLEditorKit.ParserCallback doNothing = new HTMLEditorKit.ParserCallback();
      parser.parse(r, doNothing, false);
    } catch (javax.swing.text.ChangedCharSetException ex) {
      // With ignoreCharSet == false the parser reports a declared charset by
      // throwing. The spec is either the bare charset name or a content-type
      // value such as "text/html; charset=utf-8"; keep what follows '='.
      String spec = ex.getCharSetSpec();
      encoding = spec.substring(spec.indexOf('=') + 1).trim();
    } finally {
      in.close();
    }
    parse(url, encoding);
  }
}
/**
 * Parser callback that turns the headings (H1-H6) of an HTML document into
 * nested unordered lists written to a {@link Writer}. I/O errors are reported
 * on System.err and otherwise ignored.
 */
class Outliner extends HTMLEditorKit.ParserCallback {
  private final Writer out;
  /** Nesting depth of the currently open lists; 0 means no list is open. */
  private int level = 0;
  /** True while the parser is between a heading's start and end tag. */
  private boolean inHeader = false;
  private static final String lineSeparator = System.getProperty("line.separator", "\r\n");
  /**
   * @param out destination for the generated outline markup.
   */
  public Outliner(Writer out) {
    this.out = out;
  }
  /**
   * Opens or closes lists so the nesting depth matches the heading level,
   * then starts a new list item. Non-heading tags are ignored.
   */
  @Override
  public void handleStartTag(HTML.Tag tag, MutableAttributeSet attributes, int position) {
    int newLevel = headingLevel(tag);
    if (newLevel == 0) {
      return;
    }
    this.inHeader = true;
    try {
      if (newLevel > this.level) {
        for (int i = 0; i < newLevel - this.level; i++) {
          out.write("<ul>" + lineSeparator + "<li>");
        }
      } else if (newLevel < this.level) {
        for (int i = 0; i < this.level - newLevel; i++) {
          out.write(lineSeparator + "</ul>" + lineSeparator);
        }
        out.write(lineSeparator + "<li>");
      } else {
        out.write(lineSeparator + "<li>");
      }
      this.level = newLevel;
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }
  /**
   * Stops echoing text when a heading ends and closes all open lists when
   * the document's closing HTML tag is seen.
   */
  @Override
  public void handleEndTag(HTML.Tag tag, int position) {
    if (headingLevel(tag) != 0) {
      inHeader = false;
    }
    // work around bug in the parser that fails to call flush
    if (tag == HTML.Tag.HTML)
      this.flush();
  }
  /** Writes text through to the output only while inside a heading. */
  @Override
  public void handleText(char[] text, int position) {
    if (inHeader) {
      try {
        out.write(text);
        out.flush();
      } catch (IOException ex) {
        System.err.println(ex);
      }
    }
  }
  /**
   * Closes every list that is still open and flushes the writer. The depth
   * is left at 0 so the callback can be flushed again or keep receiving
   * headings without emitting extra list tags.
   */
  @Override
  public void flush() {
    try {
      while (this.level > 0) {
        out.write(lineSeparator + "</ul>");
        this.level--;
      }
      out.flush();
    } catch (IOException e) {
      System.err.println(e);
    }
  }
  /** Returns 1-6 for the tags H1-H6 and 0 for any other tag. */
  private static int headingLevel(HTML.Tag tag) {
    if (tag == HTML.Tag.H1) return 1;
    if (tag == HTML.Tag.H2) return 2;
    if (tag == HTML.Tag.H3) return 3;
    if (tag == HTML.Tag.H4) return 4;
    if (tag == HTML.Tag.H5) return 5;
    if (tag == HTML.Tag.H6) return 6;
    return 0;
  }
}
/**
 * Editor kit subclass whose only purpose is to re-declare
 * {@code getParser()} as public so code outside the kit can obtain the parser.
 */
class ParserGetter extends HTMLEditorKit {
  /**
   * @return the parser supplied by the superclass implementation.
   */
  @Override
  public HTMLEditorKit.Parser getParser() {
    final HTMLEditorKit.Parser inherited = super.getParser();
    return inherited;
  }
}
   
   
Utility methods for dealing with HTML
/* Copyright 2005-2006 Tim Fennell
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// sourceforge stripes
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.regex.Pattern;
/**
 * Provides simple utility methods for dealing with HTML.
 *
 * @author Tim Fennell
 */
public class HtmlUtil {
    private static final String FIELD_DELIMITER_STRING = "||";
    private static final Pattern FIELD_DELIMITER_PATTERN = Pattern.compile("\\|\\|");
    /**
     * Replaces the special HTML characters {@literal <}, {@literal >}, {@literal "} and
     * {@literal &} with their HTML escape codes.  Note that because the escape codes are
     * multi-character that the returned String could be longer than the one passed in.
     *
     * @param fragment a String fragment that might have HTML special characters in it
     * @return the fragment with special characters escaped, or null if fragment is null
     */
    public static String encode(String fragment) {
        // If the input is null, then the output is null
        if (fragment == null) return null;
        StringBuilder builder = new StringBuilder(fragment.length() + 10); // a little wiggle room
        char[] characters = fragment.toCharArray();
        // This loop used to also look for and replace single ticks with &#39; but it
        // turns out that it's not strictly necessary since Stripes uses double-quotes
        // around all form fields, and stupid IE6 will render &apos; verbatim instead
        // of as a single quote.
        for (int i=0; i<characters.length; ++i) {
            switch (characters[i]) {
                case '<'  : builder.append("&lt;"); break;
                case '>'  : builder.append("&gt;"); break;
                case '"'  : builder.append("&quot;"); break;
                case '&'  : builder.append("&amp;"); break;
                default: builder.append(characters[i]);
            }
        }
        return builder.toString();
    }
    /**
     * One of a pair of methods (the other is splitValues) that is used to combine several
     * un-encoded values into a single delimited, encoded value for placement into a
     * hidden field.  Every value, including the last, is followed by the delimiter.
     *
     * @param values One or more values which are to be combined
     * @return a single HTML-encoded String that contains all the values in such a way that
     *         they can be converted back into a Collection of Strings with splitValues();
     *         the empty String if values is null or empty.
     */
    public static String combineValues(Collection<String> values) {
        if (values == null || values.size() == 0) {
            return "";
        }
        else {
            StringBuilder builder = new StringBuilder(values.size() * 30);
            for (String value : values) {
                builder.append(value).append(FIELD_DELIMITER_STRING);
            }
            
            return encode(builder.toString());
        }
    }
    /**
     * Takes in a String produced by combineValues and returns a Collection of values that
     * contains the same values as originally supplied to combineValues.  Note that the order
     * or items in the collection (and indeed the type of Collection used) are not guaranteed
     * to be the same.  The value is split on the delimiter only; no HTML decoding is done here.
     *
     * @param value a String value produced by {@link #combineValues(Collection)}
     * @return a Collection of zero or more Strings
     */
    public static Collection<String> splitValues(String value) {
        if (value == null || value.length() == 0) {
            return Collections.emptyList();
        }
        else {
            String[] splits = FIELD_DELIMITER_PATTERN.split(value);
            return Arrays.asList(splits);
        }
    }
}
   
