Java Tutorial/Development/HTML Parser
Contents
- 1 A collection of all character entities defined in the HTML4 standard.
- 2 Convert to HTML string
- 3 Decode an HTML color string like "#F567BA;" into a Color
- 4 Escape HTML
- 5 extends HTMLEditorKit.ParserCallback
- 6 Filter message string for characters that are sensitive in HTML
- 7 Filter the specified message string for characters that are sensitive in HTML
- 8 HTML color names
- 9 html parser DTD
- 10 insert HTML block dynamically
- 11 List Tags
- 12 Parse HTML
- 13 Text To HTML
- 14 Unescape HTML
- 15 Use javax.swing.text.html.HTMLEditorKit to parse HTML
- 16 Utility methods for dealing with HTML
A collection of all character entities defined in the HTML4 standard.
/**
*
* LibXML : a free Java layouting library
*
*
* Project Info: http://reporting.pentaho.org/libxml/
*
* (C) Copyright 2006-2008, by Object Refinery Ltd, Pentaho Corporation and Contributors.
*
* This library is free software; you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Foundation;
* either version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License along with this
* library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* [Java is a trademark or registered trademark of Sun Microsystems, Inc.
* in the United States and other countries.]
*
*
* ------------
* HtmlCharacterEntities.java
* ------------
*/
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
/**
 * A collection of all character entities defined in the HTML4 standard. The key
 * is the entity name (without the leading ampersand and the trailing semicolon),
 * the property value is the decoded string, which is always exactly one character.
 *
 * @author Thomas Morgner
 */
public class HtmlCharacterEntities extends Properties
{
  /**
   * The singleton instance for this entity-parser implementation.
   */
  private static CharacterEntityParser entityParser;
  private static final long serialVersionUID = 5118172339379209383L;

  /**
   * Gets the character entity parser for HTML content. The CharacterEntity
   * parser translates known characters into predefined entities. The parser is
   * created on first use and then reused.
   *
   * @return the character entity parser instance.
   */
  public static CharacterEntityParser getEntityParser()
  {
    if (entityParser == null)
    {
      entityParser = new CharacterEntityParser(new HtmlCharacterEntities());
    }
    return entityParser;
  }

  /**
   * Creates an instance that is pre-filled with the HTML4 entity table.
   */
  public HtmlCharacterEntities()
  {
    setProperty("ang", "\u2220");
    setProperty("spades", "\u2660");
    setProperty("frasl", "\u2044");
    setProperty("copy", "\u00a9");
    setProperty("Upsilon", "\u03a5");
    setProperty("rsquo", "\u2019");
    setProperty("sdot", "\u22c5");
    setProperty("beta", "\u03b2");
    setProperty("egrave", "\u00e8");
    setProperty("Pi", "\u03a0");
    setProperty("micro", "\u00b5");
    setProperty("lArr", "\u21d0");
    setProperty("Beta", "\u0392");
    setProperty("eacute", "\u00e9");
    setProperty("agrave", "\u00e0");
    setProperty("sbquo", "\u201a");
    setProperty("ucirc", "\u00fb");
    setProperty("mdash", "\u2014");
    setProperty("rho", "\u03c1");
    setProperty("Nu", "\u039d");
    setProperty("ne", "\u2260");
    setProperty("nsub", "\u2284");
    setProperty("AElig", "\u00c6");
    setProperty("raquo", "\u00bb");
    setProperty("aacute", "\u00e1");
    setProperty("le", "\u2264");
    setProperty("harr", "\u2194");
    setProperty("frac34", "\u00be");
    setProperty("bdquo", "\u201e");
    setProperty("cup", "\u222a");
    setProperty("frac14", "\u00bc");
    setProperty("exist", "\u2203");
    setProperty("Ccedil", "\u00c7");
    setProperty("phi", "\u03c6");
    setProperty("Lambda", "\u039b");
    setProperty("alpha", "\u03b1");
    setProperty("sigma", "\u03c3");
    setProperty("thetasym", "\u03d1");
    setProperty("Rho", "\u03a1");
    setProperty("hArr", "\u21d4");
    setProperty("Dagger", "\u2021");
    setProperty("otilde", "\u00f5");
    setProperty("Epsilon", "\u0395");
    setProperty("iuml", "\u00ef");
    setProperty("Phi", "\u03a6");
    setProperty("prod", "\u220f");
    setProperty("Aring", "\u00c5");
    setProperty("rlm", "\u200f");
    setProperty("yen", "\u00a5");
    setProperty("emsp", "\u2003");
    setProperty("rang", "\u232a");
    setProperty("Atilde", "\u00c3");
    setProperty("Iuml", "\u00cf");
    setProperty("iota", "\u03b9");
    setProperty("deg", "\u00b0");
    setProperty("prop", "\u221d");
    setProperty("and", "\u2227");
    setProperty("para", "\u00b6");
    setProperty("darr", "\u2193");
    setProperty("curren", "\u00a4");
    setProperty("crarr", "\u21b5");
    setProperty("not", "\u00ac");
    setProperty("Iota", "\u0399");
    setProperty("aelig", "\u00e6");
    setProperty("rdquo", "\u201d");
    setProperty("Ocirc", "\u00d4");
    setProperty("ntilde", "\u00f1");
    setProperty("reg", "\u00ae");
    setProperty("zeta", "\u03b6");
    setProperty("middot", "\u00b7");
    setProperty("cent", "\u00a2");
    setProperty("quot", "\"");
    setProperty("hellip", "\u2026");
    setProperty("Zeta", "\u0396");
    setProperty("rceil", "\u2309");
    setProperty("eta", "\u03b7");
    setProperty("nbsp", "\u00a0");
    setProperty("rarr", "\u2192");
    setProperty("frac12", "\u00bd");
    setProperty("real", "\u211c");
    setProperty("mu", "\u03bc");
    setProperty("dArr", "\u21d3");
    setProperty("divide", "\u00f7");
    setProperty("cap", "\u2229");
    setProperty("chi", "\u03c7");
    setProperty("times", "\u00d7");
    setProperty("euml", "\u00eb");
    setProperty("Gamma", "\u0393");
    setProperty("loz", "\u25ca");
    setProperty("acute", "\u00b4");
    setProperty("Omega", "\u03a9");
    setProperty("ndash", "\u2013");
    setProperty("clubs", "\u2663");
    setProperty("macr", "\u00af");
    setProperty("Yacute", "\u00dd");
    setProperty("Ugrave", "\u00d9");
    setProperty("Euml", "\u00cb");
    setProperty("Eta", "\u0397");
    setProperty("sect", "\u00a7");
    setProperty("asymp", "\u2248");
    setProperty("ordm", "\u00ba");
    setProperty("rArr", "\u21d2");
    setProperty("radic", "\u221a");
    setProperty("Uacute", "\u00da");
    setProperty("omicron", "\u03bf");
    setProperty("Chi", "\u03a7");
    setProperty("aring", "\u00e5");
    setProperty("Theta", "\u0398");
    setProperty("supe", "\u2287");
    setProperty("ensp", "\u2002");
    setProperty("uml", "\u00a8");
    setProperty("ccedil", "\u00e7");
    setProperty("lambda", "\u03bb");
    setProperty("gt", "\u003e");
    setProperty("uarr", "\u2191");
    setProperty("alefsym", "\u2135");
    setProperty("auml", "\u00e4");
    setProperty("sup3", "\u00b3");
    setProperty("circ", "\u02c6");
    setProperty("lsquo", "\u2018");
    setProperty("Auml", "\u00c4");
    setProperty("dagger", "\u2020");
    setProperty("Kappa", "\u039a");
    setProperty("cong", "\u2245");
    setProperty("zwnj", "\u200c");
    setProperty("shy", "\u00ad");
    setProperty("ouml", "\u00f6");
    setProperty("diams", "\u2666");
    setProperty("uArr", "\u21d1");
    setProperty("atilde", "\u00e3");
    setProperty("THORN", "\u00de");
    setProperty("or", "\u2228");
    setProperty("Ograve", "\u00d2");
    setProperty("ocirc", "\u00f4");
    // The HTML4 name of the plus-minus sign is "plusmn"; it was misspelled "plusm" before.
    setProperty("plusmn", "\u00b1");
    setProperty("Ouml", "\u00d6");
    setProperty("nabla", "\u2207");
    setProperty("psi", "\u03c8");
    setProperty("sigmaf", "\u03c2");
    setProperty("euro", "\u20ac");
    setProperty("sube", "\u2286");
    setProperty("sup2", "\u00b2");
    setProperty("laquo", "\u00ab");
    setProperty("forall", "\u2200");
    setProperty("Oacute", "\u00d3");
    setProperty("iexcl", "\u00a1");
    fillMoreEntities();
  }

  /**
   * Externalized initialization method to make CheckStyle happy (keeps the
   * constructor below the method-length limit).
   */
  private void fillMoreEntities()
  {
    setProperty("piv", "\u03d6");
    setProperty("minus", "\u2212");
    setProperty("zwj", "\u200d");
    setProperty("tau", "\u03c4");
    setProperty("Mu", "\u039c");
    setProperty("gamma", "\u03b3");
    setProperty("sup", "\u2283");
    setProperty("Psi", "\u03a8");
    setProperty("omega", "\u03c9");
    setProperty("Oslash", "\u00d8");
    setProperty("weierp", "\u2118");
    setProperty("Igrave", "\u00cc");
    setProperty("OElig", "\u0152");
    setProperty("sup1", "\u00b9");
    setProperty("cedil", "\u00b8");
    setProperty("upsilon", "\u03c5");
    setProperty("equiv", "\u2261");
    setProperty("isin", "\u2208");
    setProperty("Delta", "\u0394");
    setProperty("yacute", "\u00fd");
    setProperty("ugrave", "\u00f9");
    setProperty("ge", "\u2265");
    setProperty("Iacute", "\u00cd");
    setProperty("brvbar", "\u00a6");
    setProperty("Tau", "\u03a4");
    setProperty("Prime", "\u2033");
    // Right floor is U+230B in HTML4 (was wrongly mapped to U+22A7).
    setProperty("rfloor", "\u230b");
    setProperty("Ecirc", "\u00ca");
    setProperty("ETH", "\u00d0");
    setProperty("int", "\u222b");
    setProperty("xi", "\u03be");
    setProperty("uacute", "\u00fa");
    setProperty("bull", "\u2022");
    setProperty("Scaron", "\u0160");
    setProperty("theta", "\u03b8");
    setProperty("yuml", "\u00ff");
    setProperty("oplus", "\u2295");
    setProperty("part", "\u2202");
    setProperty("ldquo", "\u201c");
    setProperty("Icirc", "\u00ce");
    setProperty("Yuml", "\u0178");
    setProperty("eth", "\u00f0");
    setProperty("Acirc", "\u00c2");
    setProperty("sub", "\u2282");
    setProperty("lceil", "\u2308");
    setProperty("Egrave", "\u00c8");
    setProperty("tilde", "\u02dc");
    setProperty("pi", "\u03c0");
    setProperty("rsaquo", "\u203a");
    setProperty("kappa", "\u03ba");
    setProperty("upsih", "\u03d2");
    setProperty("Omicron", "\u039f");
    setProperty("otimes", "\u2297");
    setProperty("ni", "\u220b");
    setProperty("amp", "\u0026");
    setProperty("Eacute", "\u00c9");
    setProperty("nu", "\u03bd");
    setProperty("Ucirc", "\u00db");
    setProperty("uuml", "\u00fc");
    setProperty("oslash", "\u00f8");
    setProperty("thorn", "\u00fe");
    setProperty("trade", "\u2122");
    setProperty("epsilon", "\u03b5");
    setProperty("ograve", "\u00f2");
    setProperty("hearts", "\u2665");
    setProperty("iquest", "\u00bf");
    setProperty("Uuml", "\u00dc");
    setProperty("empty", "\u2205");
    setProperty("lowast", "\u2217");
    setProperty("sum", "\u2211");
    // Left floor is U+230A in HTML4 (was wrongly mapped to U+22A6).
    setProperty("lfloor", "\u230a");
    setProperty("lrm", "\u200e");
    setProperty("oacute", "\u00f3");
    setProperty("image", "\u2111");
    setProperty("Agrave", "\u00c0");
    setProperty("oline", "\u203e");
    setProperty("oelig", "\u0153");
    setProperty("Sigma", "\u03a3");
    setProperty("permil", "\u2030");
    setProperty("perp", "\u22a5");
    setProperty("lt", "\u003c");
    setProperty("Aacute", "\u00c1");
    setProperty("acirc", "\u00e2");
    setProperty("lang", "\u2329");
    setProperty("delta", "\u03b4");
    setProperty("infin", "\u221e");
    setProperty("igrave", "\u00ec");
    setProperty("ordf", "\u00aa");
    setProperty("lsaquo", "\u2039");
    setProperty("prime", "\u2032");
    setProperty("ecirc", "\u00ea");
    setProperty("there4", "\u2234");
    setProperty("iacute", "\u00ed");
    setProperty("sim", "\u223c");
    setProperty("Alpha", "\u0391");
    setProperty("pound", "\u00a3");
    setProperty("notin", "\u2209");
    setProperty("Ntilde", "\u00d1");
    setProperty("Xi", "\u039e");
    setProperty("thinsp", "\u2009");
    setProperty("Otilde", "\u00d5");
    setProperty("icirc", "\u00ee");
    setProperty("scaron", "\u0161");
    setProperty("szlig", "\u00df");
    setProperty("larr", "\u2190");
    // Latin small f with hook (U+0192) was the only HTML4 entity missing from the table.
    setProperty("fnof", "\u0192");
  }
}
/**
*
* LibXML : a free Java layouting library
*
*
* Project Info: http://reporting.pentaho.org/libxml/
*
* (C) Copyright 2006-2008, by Object Refinery Ltd, Pentaho Corporation and Contributors.
*
* This library is free software; you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Foundation;
* either version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License along with this
* library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307, USA.
*
* [Java is a trademark or registered trademark of Sun Microsystems, Inc.
* in the United States and other countries.]
*
*
* ------------
* CharacterEntityParser.java
* ------------
*/
/**
 * The character entity parser replaces all known occurrences of an entity in
 * the format {@code &entityname;} (and decimal numeric references in the
 * format {@code &#NNN;}) by the character they stand for, and encodes known
 * characters back into their named entity.
 *
 * @author Thomas Morgner
 */
class CharacterEntityParser
{
  /** Number of UTF-16 code units; also the size of the reverse lookup table. */
  private static final int CHAR_RANGE = 65536;

  /**
   * Reverse lookup table: the index is the character code, the value is the
   * entity name for that character or <code>null</code> if there is none.
   */
  private final String[] charMap;

  /**
   * the entities, keyed by entity name.
   */
  private final Map<?, ?> entities;

  /**
   * Creates a new CharacterEntityParser and initializes the parser with the
   * given set of entities. Only the entries stored directly in the given
   * properties are copied.
   *
   * @param characterEntities the entities used for the parser
   * @throws NullPointerException if the entities are null
   * @throws IllegalStateException if an entity value is not exactly one character long
   */
  public CharacterEntityParser(final Properties characterEntities)
  {
    if (characterEntities == null)
    {
      throw new NullPointerException("CharacterEntities must not be null");
    }
    entities = new HashMap<Object, Object>(characterEntities);
    charMap = buildCharMap(entities);
  }

  /**
   * Creates a new CharacterEntityParser and initializes the parser with the
   * given set of entities. The map is cloned, later changes to the argument
   * do not affect the parser.
   *
   * @param characterEntities the entities used for the parser
   * @throws NullPointerException if the entities are null
   * @throws IllegalStateException if an entity value is not exactly one character long
   */
  public CharacterEntityParser(final HashMap characterEntities)
  {
    if (characterEntities == null)
    {
      throw new NullPointerException("CharacterEntities must not be null");
    }
    entities = (HashMap<?, ?>) characterEntities.clone();
    charMap = buildCharMap(entities);
  }

  /**
   * Builds the character-to-entity-name table shared by both constructors.
   *
   * @param source the entity map (name to single-character string)
   * @return the reverse lookup table
   */
  private static String[] buildCharMap(final Map<?, ?> source)
  {
    final String[] map = new String[CHAR_RANGE];
    for (final Map.Entry<?, ?> entry : source.entrySet())
    {
      final String entityName = (String) entry.getKey();
      final String value = (String) entry.getValue();
      if (value.length() != 1)
      {
        throw new IllegalStateException
            ("Entity '" + entityName + "' must map to exactly one character");
      }
      map[value.charAt(0)] = entityName;
    }
    return map;
  }

  /**
   * create a new Character entity parser and initializes the parser with the
   * entities defined in the XML standard.
   *
   * @return the CharacterEntityParser initialized with XML entities.
   */
  public static CharacterEntityParser createXMLEntityParser()
  {
    final HashMap<String, String> xmlEntities = new HashMap<String, String>();
    xmlEntities.put("amp", "&");
    xmlEntities.put("quot", "\"");
    xmlEntities.put("lt", "<");
    xmlEntities.put("gt", ">");
    xmlEntities.put("apos", "'");
    return new CharacterEntityParser(xmlEntities);
  }

  /**
   * Looks up the character for the entity name specified in <code>key</code>.
   *
   * @param key the entity name
   * @return the character as string with a length of 1, or null if the name is unknown
   */
  private String lookupCharacter(final String key)
  {
    return (String) entities.get(key);
  }

  /**
   * Encode the given String, so that all known entities are encoded. All
   * characters represented by these entities are replaced by
   * {@code &name;} in the result.
   *
   * @param value the original string
   * @return the encoded string.
   * @throws NullPointerException if the value is null
   */
  public String encodeEntities(final String value)
  {
    if (value == null)
    {
      throw new NullPointerException();
    }
    final int length = value.length();
    final StringBuilder writer = new StringBuilder(length);
    for (int i = 0; i < length; i++)
    {
      final char character = value.charAt(i);
      final String lookup = charMap[character];
      if (lookup == null)
      {
        writer.append(character);
      }
      else
      {
        writer.append('&').append(lookup).append(';');
      }
    }
    return writer.toString();
  }

  /**
   * Decode the string, all known entities are replaced by their resolved
   * characters. Unknown named entities and numeric references outside of
   * 1..65535 are copied to the result unchanged.
   *
   * @param value the string that should be decoded.
   * @return the decoded string.
   * @throws NullPointerException if the value is null
   */
  public String decodeEntities(final String value)
  {
    if (value == null)
    {
      throw new NullPointerException();
    }
    int parserIndex = 0;
    int subStart = value.indexOf('&', parserIndex);
    if (subStart == -1)
    {
      return value;
    }
    int subEnd = value.indexOf(';', subStart);
    if (subEnd == -1)
    {
      return value;
    }
    final StringBuilder bufValue = new StringBuilder(value.length());
    bufValue.append(value, 0, subStart);
    do
    {
      // at this point we know, that there is at least one entity ..
      // (subEnd > subStart, so subStart + 1 is always a valid index)
      if (value.charAt(subStart + 1) == '#')
      {
        final int subValue = parseInt(value.substring(subStart + 2, subEnd), 0);
        // 65536 would silently wrap around to char 0 when cast, so the upper bound is 65535.
        if ((subValue >= 1) && (subValue < CHAR_RANGE))
        {
          bufValue.append((char) subValue);
        }
        else
        {
          // invalid entity, do not decode: copy it verbatim, including the ';'
          bufValue.append(value, subStart, subEnd + 1);
        }
      }
      else
      {
        final String entity = value.substring(subStart + 1, subEnd);
        final String replaceString = lookupCharacter(entity);
        if (replaceString != null)
        {
          // every replacement is a single character (enforced by the constructors),
          // so it can never contain a further entity that would need decoding
          bufValue.append(replaceString);
        }
        else
        {
          bufValue.append('&').append(entity).append(';');
        }
      }
      parserIndex = subEnd + 1;
      subStart = value.indexOf('&', parserIndex);
      if (subStart == -1)
      {
        bufValue.append(value, parserIndex, value.length());
        subEnd = -1;
      }
      else
      {
        subEnd = value.indexOf(';', subStart);
        if (subEnd == -1)
        {
          // an unterminated '&' follows: keep the remainder untouched
          bufValue.append(value, parserIndex, value.length());
        }
        else
        {
          bufValue.append(value, parserIndex, subStart);
        }
      }
    }
    while (subStart != -1 && subEnd != -1);
    return bufValue.toString();
  }

  /**
   * Parses the given string into an int-value. On errors the default value
   * is returned.
   *
   * @param s the string
   * @param defaultVal the default value that should be used in case of errors
   * @return the parsed int or the default value.
   */
  private int parseInt(final String s, final int defaultVal)
  {
    if (s == null)
    {
      return defaultVal;
    }
    try
    {
      return Integer.parseInt(s);
    }
    catch (NumberFormatException ignored)
    {
      // not a decimal number: the caller treats the default as "invalid"
      return defaultVal;
    }
  }
}
Convert to HTML string
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
/*
* soapUI, copyright (C) 2004-2009 eviware.ru
*
* soapUI is free software; you can redistribute it and/or modify it under the
* terms of version 2.1 of the GNU Lesser General Public License as published by
* the Free Software Foundation.
*
* soapUI is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details at gnu.org.
*/
public class Utils {
    /**
     * Wraps plain text into a minimal HTML document, turning every line of the
     * input into a line terminated by a {@code <br>} tag.
     * <p>
     * A line that consists only of {@code <br/>} is written as {@code <br>}, and a
     * line that already is a {@code <br>} tag (in any letter case) is written as
     * it is without adding a second break.
     *
     * @param string the text to convert; may be null or empty
     * @return the HTML document; an empty document for null or empty input
     */
    public static String toHtml( String string )
    {
        if( StringUtils.isNullOrEmpty( string ) )
        {
            return "<html><body></body></html>";
        }

        final StringBuilder html = new StringBuilder( "<html><body>" );
        final BufferedReader lineReader = new BufferedReader( new StringReader( string ) );
        try
        {
            for( String line = lineReader.readLine(); line != null; line = lineReader.readLine() )
            {
                if( line.equalsIgnoreCase( "<br/>" ) )
                {
                    // normalize the XHTML form of the tag
                    html.append( "<br>" );
                }
                else if( line.equalsIgnoreCase( "<br>" ) )
                {
                    // already a break: keep it in its original letter case
                    html.append( line );
                }
                else
                {
                    html.append( line ).append( "<br>" );
                }
            }
        }
        catch( IOException e )
        {
            e.printStackTrace();
        }

        return html.append( "</body></html>" ).toString();
    }
}
Decode an HTML color string like "#F567BA;" into a Color
/*
* Copyright 2005 Joe Walker
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.awt.Color;
/**
 * Utilities for working with colors.
 * @author Joe Walker [joe at getahead dot ltd dot uk]
 */
public class ColorUtil
{
    /**
     * Decode an HTML color string like "#F567BA;" into a {@link Color}.
     * <p>
     * A leading "#" and a trailing ";" are optional. Three forms are accepted:
     * <ul>
     * <li>six hex digits: {@code RRGGBB}</li>
     * <li>three hex digits: the shorthand {@code RGB}, where each digit is
     * doubled, so "#F80" is the same color as "#FF8800"</li>
     * <li>one hex digit: a gray level, expanded the same way for all three
     * channels</li>
     * </ul>
     * @param colorString The string to decode
     * @return The decoded color
     * @throws IllegalArgumentException if the color sequence is not valid
     * @throws NullPointerException if colorString is null
     */
    public static Color decodeHtmlColorString(String colorString)
    {
        if (colorString.startsWith("#"))
        {
            colorString = colorString.substring(1);
        }

        if (colorString.endsWith(";"))
        {
            colorString = colorString.substring(0, colorString.length() - 1);
        }

        // NumberFormatException is an IllegalArgumentException, so bad digits
        // surface with the documented exception type.
        switch (colorString.length())
        {
        case 6:
            return new Color(
                Integer.parseInt(colorString.substring(0, 2), 16),
                Integer.parseInt(colorString.substring(2, 4), 16),
                Integer.parseInt(colorString.substring(4, 6), 16));

        case 3:
            return new Color(
                expandHexDigit(colorString, 0),
                expandHexDigit(colorString, 1),
                expandHexDigit(colorString, 2));

        case 1:
            int gray = expandHexDigit(colorString, 0);
            return new Color(gray, gray, gray);

        default:
            throw new IllegalArgumentException("Invalid color: " + colorString);
        }
    }

    /**
     * Parses the single hex digit at the given position and doubles it (0xF becomes 0xFF).
     */
    private static int expandHexDigit(String hex, int index)
    {
        // multiplying a 4-bit value by 17 (0x11) repeats the digit: 0xA * 0x11 == 0xAA
        return Integer.parseInt(hex.substring(index, index + 1), 16) * 17;
    }
}
Escape HTML
/*
* Static String formatting and query routines.
* Copyright (C) 2001-2005 Stephen Ostermiller
* http://ostermiller.org/contact.pl?regarding=Java+Utilities
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* See COPYING.TXT for details.
*/
import java.util.HashMap;
import java.util.regex.Pattern;
/**
 * Utilities for String formatting, manipulation, and queries.
 *
 * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
 * @since ostermillerutils 1.00.00
 */
public class StringHelper {
    /**
     * Replaces characters that may be confused by a HTML
     * parser with their equivalent character entity references.
     * <p>
     * Any data that will appear as text on a web page should
     * be escaped. This is especially important for data
     * that comes from untrusted sources such as Internet users.
     * If, for example, a user-supplied name is put on a page without
     * sanitizing it, the user could reformat the page, insert scripts,
     * and control the content shown by your web server.
     * <p>
     * The following replacements are made:
     * <ul>
     * <li>the double quote becomes {@code &quot;}</li>
     * <li>the apostrophe becomes {@code &#39;}</li>
     * <li>the ampersand becomes {@code &amp;}</li>
     * <li>the less-than sign becomes {@code &lt;}</li>
     * <li>the greater-than sign becomes {@code &gt;}</li>
     * <li>control characters below 32 are removed, except for carriage
     * return, line feed, tab and form feed, which are kept</li>
     * </ul>
     * This method should be used for both data to be displayed as text
     * in the html document, and data put in form element attributes.
     *
     * @param s String to be escaped
     * @return escaped String; the very same instance as {@code s} when nothing had to be changed
     * @throws NullPointerException if s is null.
     *
     * @since ostermillerutils 1.00.00
     */
    public static String escapeHTML(String s){
        final int length = s.length();
        // Allocated only when the first character that needs a change is found,
        // so that clean input is returned without copying.
        StringBuilder sb = null;
        for (int i=0; i<length; i++){
            final char c = s.charAt(i);
            final String replacement;
            switch(c){
                case '"': replacement = "&quot;"; break;
                case '\'': replacement = "&#39;"; break;
                case '&': replacement = "&amp;"; break;
                case '<': replacement = "&lt;"; break;
                case '>': replacement = "&gt;"; break;
                case '\r':
                case '\n':
                case '\t':
                case '\f': replacement = null; break; // harmless whitespace, keep
                default: {
                    // remaining control characters are dropped (empty replacement)
                    replacement = (c < 32) ? "" : null;
                }
            }
            if (replacement == null){
                if (sb != null){
                    sb.append(c);
                }
            } else {
                if (sb == null){
                    sb = new StringBuilder(length + 16);
                    sb.append(s, 0, i);
                }
                sb.append(replacement);
            }
        }
        return (sb == null) ? s : sb.toString();
    }
}
extends HTMLEditorKit.ParserCallback
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Enumeration;
import javax.swing.text.AttributeSet;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Downloads one HTML page, parses it with the Swing HTML parser and saves a
 * rewritten copy (see PageSaver) below a local directory named after the host.
 */
public class MainClass {
  /**
   * Fetches the page, derives the local directory and file name from the URL
   * and streams the parsed page into that file.
   *
   * @param args ignored
   * @throws Exception if the page cannot be read or the file cannot be written
   */
  public static void main(String[] args) throws Exception {
    ParserGetter kit = new ParserGetter();
    HTMLEditorKit.Parser parser = kit.getParser();
    URL u = new URL("http://www.jexp.ru");

    String remoteFileName = u.getFile();
    // A URL without any path ("http://host") has an empty file part; treat it like "/".
    if (remoteFileName.length() == 0 || remoteFileName.endsWith("/")) {
      remoteFileName += "index.html";
    }
    if (remoteFileName.startsWith("/")) {
      remoteFileName = remoteFileName.substring(1);
    }

    // Turn every leading path segment of the remote name into a sub-directory.
    File localDirectory = new File(u.getHost());
    while (remoteFileName.indexOf("/") > -1) {
      String part = remoteFileName.substring(0, remoteFileName.indexOf("/"));
      remoteFileName = remoteFileName.substring(remoteFileName.indexOf("/") + 1);
      localDirectory = new File(localDirectory, part);
    }

    // mkdirs() returns false when the directory already exists, so check that first.
    if (!localDirectory.isDirectory() && !localDirectory.mkdirs()) {
      System.err.println("Could not create directory " + localDirectory);
      return;
    }

    File output = new File(localDirectory, remoteFileName);
    InputStream in = u.openStream();
    try {
      Writer out = new FileWriter(output);
      try {
        InputStreamReader r = new InputStreamReader(in);
        HTMLEditorKit.ParserCallback callback = new PageSaver(out, u);
        parser.parse(r, callback, false);
      } finally {
        out.close();
      }
    } finally {
      in.close();
    }
  }
}
/**
 * Parser callback that writes the parsed document back out to a writer,
 * converting link-like attributes (href, src, lowsrc, codebase) into absolute
 * URLs relative to a base URL.
 * <p>
 * I/O errors are reported on System.err and do not abort parsing.
 */
class PageSaver extends HTMLEditorKit.ParserCallback {
  private Writer out;
  private URL base;

  /**
   * @param out  the writer that receives the rewritten page
   * @param base the URL the page was loaded from; used to resolve relative links
   */
  public PageSaver(Writer out, URL base) {
    this.out = out;
    this.base = base;
  }

  /**
   * Writes the start tag with its attributes. An applet tag without an explicit
   * codebase gets one derived from the base URL.
   */
  @Override
  public void handleStartTag(HTML.Tag tag, MutableAttributeSet attributes, int position) {
    try {
      out.write("<" + tag);
      this.writeAttributes(attributes);
      if (tag == HTML.Tag.APPLET && attributes.getAttribute(HTML.Attribute.CODEBASE) == null) {
        String codebase = base.toString();
        if (codebase.endsWith(".htm") || codebase.endsWith(".html")) {
          // the base points at a document: use its directory
          codebase = codebase.substring(0, codebase.lastIndexOf("/"));
        }
        out.write(" codebase=\"" + codebase + "\"");
      }
      out.write(">");
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }

  /** Writes the closing tag. */
  @Override
  public void handleEndTag(HTML.Tag tag, int position) {
    try {
      out.write("</" + tag + ">");
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }

  /**
   * Writes all attributes as <code> name="value"</code> pairs.
   *
   * @param attributes the attributes of the current tag
   * @throws IOException if the writer fails
   */
  private void writeAttributes(AttributeSet attributes) throws IOException {
    Enumeration<?> names = attributes.getAttributeNames();
    while (names.hasMoreElements()) {
      Object name = names.nextElement();
      if (IMPLIED.equals(name)) {
        // Marker the parser adds to elements it had to synthesize; not a real attribute.
        continue;
      }
      // Attribute values are not guaranteed to be Strings, so do not cast.
      String value = String.valueOf(attributes.getAttribute(name));
      if (name == HTML.Attribute.HREF || name == HTML.Attribute.SRC
          || name == HTML.Attribute.LOWSRC || name == HTML.Attribute.CODEBASE) {
        try {
          value = new URL(base, value).toString();
        } catch (MalformedURLException ex) {
          // e.g. an unsupported scheme: report it and keep the attribute as written
          System.err.println(ex);
          System.err.println(base);
          System.err.println(value);
        }
      }
      out.write(" " + name + "=\"" + value + "\"");
    }
  }

  /** Writes the comment wrapped in comment delimiters. */
  @Override
  public void handleComment(char[] text, int position) {
    try {
      out.write("<!-- ");
      out.write(text);
      out.write(" -->");
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }

  /** Writes the text as received from the parser. */
  @Override
  public void handleText(char[] text, int position) {
    try {
      out.write(text);
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }

  /** Writes a tag that has no end tag, together with its attributes. */
  @Override
  public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attributes, int position) {
    try {
      out.write("<" + tag);
      this.writeAttributes(attributes);
      out.write(">");
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }
}
/**
 * Editor kit whose only purpose is to make the parser of
 * {@link HTMLEditorKit} reachable: it re-declares getParser() as public.
 */
class ParserGetter extends HTMLEditorKit {
  /**
   * Returns the parser provided by the superclass.
   *
   * @return the parser obtained from HTMLEditorKit
   */
  @Override
  public HTMLEditorKit.Parser getParser() {
    final HTMLEditorKit.Parser inherited = super.getParser();
    return inherited;
  }
}
Filter message string for characters that are sensitive in HTML
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * HTML filter utility.
 *
 * @author Craig R. McClanahan
 * @author Tim Tye
 * @version $Revision: 467217 $ $Date: 2006-10-24 05:14:34 +0200 (mar., 24 oct. 2006) $
 */
public final class HTMLFilter {
    /**
     * Filter the specified message string for characters that are sensitive
     * in HTML. This avoids potential attacks caused by including JavaScript
     * codes in the request URL that is often reported in error messages.
     * <p>
     * The characters {@code <}, {@code >}, {@code &} and the double quote are
     * replaced by {@code &lt;}, {@code &gt;}, {@code &amp;} and {@code &quot;};
     * everything else is copied unchanged.
     *
     * @param message The message string to be filtered
     * @return the filtered message, or null if the message was null
     */
    public static String filter(String message) {
        if (message == null)
            return (null);
        char content[] = new char[message.length()];
        message.getChars(0, message.length(), content, 0);
        // a little head room so that a few replacements do not force a resize
        StringBuffer result = new StringBuffer(content.length + 50);
        for (int i = 0; i < content.length; i++) {
            switch (content[i]) {
            case '<':
                result.append("&lt;");
                break;
            case '>':
                result.append("&gt;");
                break;
            case '&':
                result.append("&amp;");
                break;
            case '"':
                result.append("&quot;");
                break;
            default:
                result.append(content[i]);
            }
        }
        return (result.toString());
    }
}
Filter the specified message string for characters that are sensitive in HTML
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * HTML filter utility.
 *
 * @author Craig R. McClanahan
 * @author Tim Tye
 * @version $Revision: 467217 $ $Date: 2006-10-24 05:14:34 +0200 (Tue, 24 Oct 2006) $
 */
public final class HTMLFilter {
    /**
     * Filter the specified message string for characters that are sensitive
     * in HTML. This avoids potential attacks caused by including JavaScript
     * codes in the request URL that is often reported in error messages.
     * <p>
     * Replaced characters: {@code <} by {@code &lt;}, {@code >} by
     * {@code &gt;}, {@code &} by {@code &amp;} and the double quote by
     * {@code &quot;}. All other characters are passed through.
     *
     * @param message The message string to be filtered
     * @return the filtered message, or null if the message was null
     */
    public static String filter(String message) {
        if (message == null)
            return (null);
        final int length = message.length();
        // reserve some extra room for the longer entity replacements
        StringBuffer result = new StringBuffer(length + 50);
        for (int i = 0; i < length; i++) {
            final char current = message.charAt(i);
            switch (current) {
            case '<':
                result.append("&lt;");
                break;
            case '>':
                result.append("&gt;");
                break;
            case '&':
                result.append("&amp;");
                break;
            case '"':
                result.append("&quot;");
                break;
            default:
                result.append(current);
            }
        }
        return (result.toString());
    }
}
HTML color names
//Revised from com.greef.ui;
import java.util.Map;
import java.util.HashMap;
import java.util.Collection;
import java.lang.reflect.Field;
import java.awt.Color;
/**
* HTML color names. Its intended use is for parsing a name and returning the
* corresponding color, or returning the name for a given color.
* @author Adrian Ber
*/
public class HTMLColors {
    /** Don't instantiate this, use only the static methods. */
    private HTMLColors() {
    }

    /**
     * Map from (lower-case) color name to color. Though there is a field for
     * every color, a map is used because it is a faster way to look one up.
     */
    private static final Map<String, Color> name2color = new HashMap<String, Color>();

    /**
     * Map from color to color name. Some names share one RGB value
     * (aqua/cyan, fuchsia/magenta); for those the name registered last wins,
     * and registration follows the order returned by
     * {@link Class#getFields()}.
     */
    private static final Map<Color, String> color2name = new HashMap<Color, String>();

    /** Registers every public static {@link Color} field of this class. */
    private static void initColorsMap() {
        for (Field field : HTMLColors.class.getFields()) {
            boolean isColor = Color.class.isAssignableFrom(field.getType());
            boolean isStatic = java.lang.reflect.Modifier.isStatic(field.getModifiers());
            if (isColor && isStatic) {
                addColor(field.getName());
            }
        }
    }

    /** Records the pair in both lookup maps. */
    private static void addColor(String colorName, Color color) {
        name2color.put(colorName, color);
        color2name.put(color, colorName);
    }

    /** Registers the color held by the field called {@code colorName}, if it can be read. */
    private static void addColor(String colorName) {
        Color color = getColorFromField(colorName);
        // A failed reflective lookup must not leave null keys/values in the maps.
        if (color != null) {
            addColor(colorName, color);
        }
    }

    /**
     * Reads the public static color field with the given case-insensitive
     * name.
     *
     * @return the field's value, or {@code null} if there is no such
     *         readable field
     */
    private static Color getColorFromField(String name) {
        try {
            Field colorField = HTMLColors.class.getField(name.toLowerCase(java.util.Locale.ROOT));
            return (Color) colorField.get(null);
        }
        catch (NoSuchFieldException ignored) {
            // not a known color name: fall through and report "no color"
        }
        catch (SecurityException ignored) {
            // reflection not permitted: treated as "no color" (best effort)
        }
        catch (IllegalAccessException ignored) {
            // field not accessible: treated as "no color" (best effort)
        }
        catch (IllegalArgumentException ignored) {
            // field is not static: treated as "no color" (best effort)
        }
        return null;
    }

    /**
     * Returns the HTML name registered for the given color.
     *
     * @param color the color to look up
     * @return the lower-case color name, or {@code null} if the color has no
     *         registered name
     */
    public static String getName(Color color) {
        return color2name.get(color);
    }

    /**
     * Returns the color with the specified case-insensitive name.
     *
     * @param name the color name, e.g. {@code "AliceBlue"}
     * @return the color, or {@code null} if {@code name} is {@code null} or
     *         not a registered color name
     */
    public static Color getColor(String name) {
        if (name == null) {
            return null;
        }
        // Locale.ROOT keeps the lookup independent of the default locale
        // (e.g. the Turkish dotless i).
        return name2color.get(name.toLowerCase(java.util.Locale.ROOT));
    }

    /**
     * Returns a read-only view of all color names.
     *
     * @return the registered (lower-case) color names
     */
    public static Collection<String> colors() {
        return java.util.Collections.unmodifiableSet(name2color.keySet());
    }

    /**
     * Transform a color string into a color object. The string is first
     * handed to {@link Color#decode(String)}; if that rejects it as a number,
     * it is looked up as a color name.
     *
     * @param s the color string
     * @return the color object, or {@code null} if {@code s} is {@code null}
     *         or neither a decodable number nor a known color name
     */
    public static Color decodeColor(String s) {
        if (s == null) {
            return null;
        }
        try {
            return Color.decode(s);
        }
        catch (NumberFormatException notNumeric) {
            return HTMLColors.getColor(s);
        }
    }

    public static final Color aliceblue = new Color(0xf0f8ff);
    public static final Color antiquewhite = new Color(0xfaebd7);
    public static final Color aqua = new Color(0x00ffff);
    public static final Color aquamarine = new Color(0x7fffd4);
    public static final Color azure = new Color(0xf0ffff);
    public static final Color beige = new Color(0xf5f5dc);
    public static final Color bisque = new Color(0xffe4c4);
    public static final Color black = new Color(0x000000);
    public static final Color blanchedalmond = new Color(0xffebcd);
    public static final Color blue = new Color(0x0000ff);
    public static final Color blueviolet = new Color(0x8a2be2);
    public static final Color brown = new Color(0xa52a2a);
    public static final Color burlywood = new Color(0xdeb887);
    public static final Color cadetblue = new Color(0x5f9ea0);
    public static final Color chartreuse = new Color(0x7fff00);
    public static final Color chocolate = new Color(0xd2691e);
    public static final Color coral = new Color(0xff7f50);
    public static final Color cornflowerblue = new Color(0x6495ed);
    public static final Color cornsilk = new Color(0xfff8dc);
    public static final Color crimson = new Color(0xdc143c);
    public static final Color cyan = new Color(0x00ffff);
    public static final Color darkblue = new Color(0x00008b);
    public static final Color darkcyan = new Color(0x008b8b);
    public static final Color darkgoldenrod = new Color(0xb8860b);
    public static final Color darkgray = new Color(0xa9a9a9);
    public static final Color darkgreen = new Color(0x006400);
    public static final Color darkkhaki = new Color(0xbdb76b);
    public static final Color darkmagenta = new Color(0x8b008b);
    public static final Color darkolivegreen = new Color(0x556b2f);
    public static final Color darkorange = new Color(0xff8c00);
    public static final Color darkorchid = new Color(0x9932cc);
    public static final Color darkred = new Color(0x8b0000);
    public static final Color darksalmon = new Color(0xe9967a);
    public static final Color darkseagreen = new Color(0x8fbc8f);
    public static final Color darkslateblue = new Color(0x483d8b);
    public static final Color darkslategray = new Color(0x2f4f4f);
    public static final Color darkturquoise = new Color(0x00ced1);
    public static final Color darkviolet = new Color(0x9400d3);
    public static final Color deeppink = new Color(0xff1493);
    public static final Color deepskyblue = new Color(0x00bfff);
    public static final Color dimgray = new Color(0x696969);
    public static final Color dodgerblue = new Color(0x1e90ff);
    public static final Color firebrick = new Color(0xb22222);
    public static final Color floralwhite = new Color(0xfffaf0);
    public static final Color forestgreen = new Color(0x228b22);
    public static final Color fuchsia = new Color(0xff00ff);
    public static final Color gainsboro = new Color(0xdcdcdc);
    public static final Color ghostwhite = new Color(0xf8f8ff);
    public static final Color gold = new Color(0xffd700);
    public static final Color goldenrod = new Color(0xdaa520);
    public static final Color gray = new Color(0x808080);
    public static final Color green = new Color(0x008000);
    public static final Color greenyellow = new Color(0xadff2f);
    public static final Color honeydew = new Color(0xf0fff0);
    public static final Color hotpink = new Color(0xff69b4);
    public static final Color indianred = new Color(0xcd5c5c);
    public static final Color indigo = new Color(0x4b0082);
    public static final Color ivory = new Color(0xfffff0);
    public static final Color khaki = new Color(0xf0e68c);
    public static final Color lavender = new Color(0xe6e6fa);
    public static final Color lavenderblush = new Color(0xfff0f5);
    public static final Color lawngreen = new Color(0x7cfc00);
    public static final Color lemonchiffon = new Color(0xfffacd);
    public static final Color lightblue = new Color(0xadd8e6);
    public static final Color lightcoral = new Color(0xf08080);
    public static final Color lightcyan = new Color(0xe0ffff);
    public static final Color lightgoldenrodyellow = new Color(0xfafad2);
    public static final Color lightgreen = new Color(0x90ee90);
    public static final Color lightgrey = new Color(0xd3d3d3);
    public static final Color lightpink = new Color(0xffb6c1);
    public static final Color lightsalmon = new Color(0xffa07a);
    public static final Color lightseagreen = new Color(0x20b2aa);
    public static final Color lightskyblue = new Color(0x87cefa);
    public static final Color lightslategray = new Color(0x778899);
    public static final Color lightsteelblue = new Color(0xb0c4de);
    public static final Color lightyellow = new Color(0xffffe0);
    public static final Color lime = new Color(0x00ff00);
    public static final Color limegreen = new Color(0x32cd32);
    public static final Color linen = new Color(0xfaf0e6);
    public static final Color magenta = new Color(0xff00ff);
    public static final Color maroon = new Color(0x800000);
    public static final Color mediumaquamarine = new Color(0x66cdaa);
    public static final Color mediumblue = new Color(0x0000cd);
    public static final Color mediumorchid = new Color(0xba55d3);
    public static final Color mediumpurple = new Color(0x9370db);
    public static final Color mediumseagreen = new Color(0x3cb371);
    public static final Color mediumslateblue = new Color(0x7b68ee);
    public static final Color mediumspringgreen = new Color(0x00fa9a);
    public static final Color mediumturquoise = new Color(0x48d1cc);
    public static final Color mediumvioletred = new Color(0xc71585);
    public static final Color midnightblue = new Color(0x191970);
    public static final Color mintcream = new Color(0xf5fffa);
    public static final Color mistyrose = new Color(0xffe4e1);
    public static final Color moccasin = new Color(0xffe4b5);
    public static final Color navajowhite = new Color(0xffdead);
    public static final Color navy = new Color(0x000080);
    public static final Color oldlace = new Color(0xfdf5e6);
    public static final Color olive = new Color(0x808000);
    public static final Color olivedrab = new Color(0x6b8e23);
    public static final Color orange = new Color(0xffa500);
    public static final Color orangered = new Color(0xff4500);
    public static final Color orchid = new Color(0xda70d6);
    public static final Color palegoldenrod = new Color(0xeee8aa);
    public static final Color palegreen = new Color(0x98fb98);
    public static final Color paleturquoise = new Color(0xafeeee);
    public static final Color palevioletred = new Color(0xdb7093);
    public static final Color papayawhip = new Color(0xffefd5);
    public static final Color peachpuff = new Color(0xffdab9);
    public static final Color peru = new Color(0xcd853f);
    public static final Color pink = new Color(0xffc0cb);
    public static final Color plum = new Color(0xdda0dd);
    public static final Color powderblue = new Color(0xb0e0e6);
    public static final Color purple = new Color(0x800080);
    public static final Color red = new Color(0xff0000);
    public static final Color rosybrown = new Color(0xbc8f8f);
    public static final Color royalblue = new Color(0x4169e1);
    public static final Color saddlebrown = new Color(0x8b4513);
    public static final Color salmon = new Color(0xfa8072);
    public static final Color sandybrown = new Color(0xf4a460);
    public static final Color seagreen = new Color(0x2e8b57);
    public static final Color seashell = new Color(0xfff5ee);
    public static final Color sienna = new Color(0xa0522d);
    public static final Color silver = new Color(0xc0c0c0);
    public static final Color skyblue = new Color(0x87ceeb);
    public static final Color slateblue = new Color(0x6a5acd);
    public static final Color slategray = new Color(0x708090);
    public static final Color snow = new Color(0xfffafa);
    public static final Color springgreen = new Color(0x00ff7f);
    public static final Color steelblue = new Color(0x4682b4);
    public static final Color tan = new Color(0xd2b48c);
    public static final Color teal = new Color(0x008080);
    public static final Color thistle = new Color(0xd8bfd8);
    public static final Color tomato = new Color(0xff6347);
    public static final Color turquoise = new Color(0x40e0d0);
    public static final Color violet = new Color(0xee82ee);
    public static final Color wheat = new Color(0xf5deb3);
    public static final Color white = new Color(0xffffff);
    public static final Color whitesmoke = new Color(0xf5f5f5);
    public static final Color yellow = new Color(0xffff00);
    public static final Color yellowgreen = new Color(0x9acd32);

    // Must stay after the color constants: static initializers run in textual
    // order, and the reflective scan reads the constants' values.
    static {
        initColorsMap();
    }
}
html parser DTD
import java.io.IOException;
import javax.swing.text.html.parser.DTD;
public class MainClass {
    /**
     * Obtains the DTD registered under the name "html" and prints the names
     * of its elements with indices 0 through 13, one per line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final int elementCount = 14;
        try {
            DTD htmlDtd = DTD.getDTD("html");
            int index = 0;
            while (index < elementCount) {
                String elementName = htmlDtd.getElement(index).getName();
                System.out.println(elementName);
                index++;
            }
        } catch (IOException e) {
            // Report the failure twice, as a one-line summary and a full trace.
            System.err.println(e);
            e.printStackTrace();
        }
    }
}
#pcdata html meta base isindex head body applet param p title style link script
insert HTML block dynamically
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
/**
* Various string manipulation methods that are more efficient than chaining
* string operations: all is done in the same buffer without creating a bunch of
* string objects.
*
* @author
*/
public class Main
{
    /**
     * This method is used to insert HTML block dynamically.
     * <p>
     * The ampersand is always escaped as {@code &amp;} and carriage returns
     * are always dropped; the remaining replacements are controlled by the
     * flags.
     *
     * @param source
     *            the HTML code to be processed
     * @param replaceNl
     *            if true "\n" will be replaced by {@code <br>} (or by the
     *            escaped form {@code &lt;br&gt;} when {@code replaceTag} is
     *            also true)
     * @param replaceTag
     *            if true "&lt;" will be replaced by {@code &lt;} and "&gt;"
     *            will be replaced by {@code &gt;}
     * @param replaceQuote
     *            if true "\"" will be replaced by {@code &quot;}
     * @return the formatted html block, or {@code null} if {@code source} is
     *         {@code null}
     */
    public static final String formatHtml( String source, boolean replaceNl, boolean replaceTag,
        boolean replaceQuote )
    {
        if ( source == null )
        {
            return null;
        }

        int len = source.length();
        StringBuilder buf = new StringBuilder( len );

        for ( int ii = 0; ii < len; ii++ )
        {
            char ch = source.charAt( ii );

            switch ( ch )
            {
                case '\"':
                    if ( replaceQuote )
                    {
                        buf.append( "&quot;" );
                    }
                    else
                    {
                        buf.append( ch );
                    }
                    break;

                case '<':
                    if ( replaceTag )
                    {
                        buf.append( "&lt;" );
                    }
                    else
                    {
                        buf.append( ch );
                    }
                    break;

                case '>':
                    if ( replaceTag )
                    {
                        buf.append( "&gt;" );
                    }
                    else
                    {
                        buf.append( ch );
                    }
                    break;

                case '\n':
                    if ( replaceNl )
                    {
                        // When tags are being escaped, the inserted line break
                        // is escaped as well so it shows up as literal text.
                        if ( replaceTag )
                        {
                            buf.append( "&lt;br&gt;" );
                        }
                        else
                        {
                            buf.append( "<br>" );
                        }
                    }
                    else
                    {
                        buf.append( ch );
                    }
                    break;

                case '\r':
                    // carriage returns are always removed
                    break;

                case '&':
                    buf.append( "&amp;" );
                    break;

                default:
                    buf.append( ch );
                    break;
            }
        }

        return buf.toString();
    }
}
List Tags
import javax.swing.text.html.HTML;
public class MainClass {
    /**
     * Prints every tag returned by {@code HTML.getAllTags()}, one per line,
     * each prefixed with its 1-based position.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        int ordinal = 0;
        for (HTML.Tag tag : HTML.getAllTags()) {
            ordinal++;
            System.out.println(ordinal + ": " + tag);
        }
    }
}
1: a 2: address 3: applet 4: area 5: b 6: base 7: basefont 8: big 9: blockquote 10: body 11: br 12: caption 13: center 14: cite 15: code 16: dd 17: dfn 18: dir 19: div 20: dl 21: dt 22: em 23: font 24: form 25: frame 26: frameset 27: h1 28: h2 29: h3 30: h4 31: h5 32: h6 33: head 34: hr 35: html 36: i 37: img 38: input 39: isindex 40: kbd 41: li 42: link 43: map 44: menu 45: meta 46: nobr 47: noframes 48: object 49: ol 50: option 51: p 52: param 53: pre 54: samp 55: script 56: select 57: small 58: span 59: strike 60: s 61: strong 62: style 63: sub 64: sup 65: table 66: td 67: textarea 68: th 69: title 70: tr 71: tt 72: u 73: ul 74: var
Parse HTML
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.Enumeration;
import javax.swing.text.AttributeSet;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
public class MainClass {
    /**
     * Downloads the page at http://www.jexp.ru and feeds it to the Swing HTML
     * parser, reporting the attributes of every tag through a
     * {@code ReportAttributes} callback. I/O failures are printed to standard
     * error.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        ParserGetter kit = new ParserGetter();
        HTMLEditorKit.Parser parser = kit.getParser();
        HTMLEditorKit.ParserCallback callback = new ReportAttributes();
        InputStream in = null;
        try {
            URL u = new URL("http://www.jexp.ru");
            in = u.openStream();
            InputStreamReader r = new InputStreamReader(in);
            parser.parse(r, callback, false);
        } catch (IOException e) {
            System.err.println(e);
        } finally {
            // Release the network connection whether or not parsing succeeded.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
        }
    }
}
/**
 * Parser callback that prints the attributes of every start tag and simple
 * tag as {@code name=value} lines on standard output.
 */
class ReportAttributes extends HTMLEditorKit.ParserCallback {
    @Override
    public void handleStartTag(HTML.Tag tag, MutableAttributeSet attributes, int position) {
        listAttributes(attributes);
    }

    @Override
    public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attributes, int position) {
        listAttributes(attributes);
    }

    /**
     * Prints each attribute of the set. Before printing, the set is queried
     * using the <em>string form</em> of the attribute name rather than the
     * name object itself; a diagnostic line is printed for each query that
     * does not succeed.
     */
    private void listAttributes(AttributeSet attributes) {
        for (Enumeration<?> names = attributes.getAttributeNames(); names.hasMoreElements();) {
            Object key = names.nextElement();
            Object val = attributes.getAttribute(key);
            String keyText = key.toString();

            boolean contained = attributes.containsAttribute(keyText, val);
            if (!contained) {
                System.out.println("containsAttribute() fails");
            }

            boolean defined = attributes.isDefined(keyText);
            if (!defined) {
                System.out.println("isDefined() fails");
            }

            System.out.println(key + "=" + val);
        }
    }
}
/**
 * Editor kit subclass whose only purpose is to re-declare
 * {@code getParser()} as {@code public} so code outside the kit can obtain
 * the parser.
 */
class ParserGetter extends HTMLEditorKit {
    /**
     * Returns the parser supplied by the superclass.
     *
     * @return the parser returned by {@code HTMLEditorKit.getParser()}
     */
    @Override
    public HTMLEditorKit.Parser getParser() {
        HTMLEditorKit.Parser inherited = super.getParser();
        return inherited;
    }
}
Text To HTML
/*
GNU LESSER GENERAL PUBLIC LICENSE
Copyright (C) 2006 The XAMJ Project
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
Contact info: lobochief@users.sourceforge.net
*/
public class Html {
    /**
     * Converts plain text to HTML: the characters {@code "}, {@code <},
     * {@code >} and {@code &} are replaced by their entity references, and
     * each line terminator ({@code \r\n}, {@code \n} or a lone {@code \r})
     * becomes a single {@code <br>}.
     *
     * @param text the plain text to convert; may be {@code null}
     * @return the HTML form of {@code text}, or {@code null} if
     *         {@code text} is {@code null}
     */
    public static String textToHTML(String text) {
        if (text == null) {
            return null;
        }
        int length = text.length();
        // True while a '\r' has been seen whose <br> is still owed; it is held
        // back so that "\r\n" produces one break instead of two.
        boolean pendingCr = false;
        StringBuilder out = new StringBuilder(length + 16);
        for (int i = 0; i < length; i++) {
            char ch = text.charAt(i);
            if (ch == '\n') {
                // Covers both a bare "\n" and the tail of "\r\n".
                pendingCr = false;
                out.append("<br>");
                continue;
            }
            if (pendingCr) {
                // The previous '\r' was not followed by '\n': emit its break now.
                out.append("<br>");
                pendingCr = false;
            }
            switch (ch) {
                case '\r':
                    pendingCr = true;
                    break;
                case '"':
                    out.append("&quot;");
                    break;
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                case '&':
                    out.append("&amp;");
                    break;
                default:
                    out.append(ch);
                    break;
            }
        }
        if (pendingCr) {
            // Text ended on a lone '\r'; do not lose that line break.
            out.append("<br>");
        }
        return out.toString();
    }
}
Unescape HTML
/*
* Static String formatting and query routines.
* Copyright (C) 2001-2005 Stephen Ostermiller
* http://ostermiller.org/contact.pl?regarding=Java+Utilities
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* See COPYING.TXT for details.
*/
import java.util.HashMap;
import java.util.regex.Pattern;
/**
* Utilities for String formatting, manipulation, and queries.
* More information about this class is available from <a href="http://ostermiller.org/utils/StringHelper.html">ostermiller.org</a>.
*
* @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
* @since ostermillerutils 1.00.00
*/
public class StringHelper {
    /** Maps an HTML entity name (without the surrounding {@code &} and {@code ;}) to its code point. */
    private static final HashMap<String,Integer> htmlEntities = new HashMap<String,Integer>();
    static {
        htmlEntities.put("nbsp", 160);
        htmlEntities.put("iexcl", 161);
        htmlEntities.put("cent", 162);
        htmlEntities.put("pound", 163);
        htmlEntities.put("curren", 164);
        htmlEntities.put("yen", 165);
        htmlEntities.put("brvbar", 166);
        htmlEntities.put("sect", 167);
        htmlEntities.put("uml", 168);
        htmlEntities.put("copy", 169);
        htmlEntities.put("ordf", 170);
        htmlEntities.put("laquo", 171);
        htmlEntities.put("not", 172);
        htmlEntities.put("shy", 173);
        htmlEntities.put("reg", 174);
        htmlEntities.put("macr", 175);
        htmlEntities.put("deg", 176);
        htmlEntities.put("plusmn", 177);
        htmlEntities.put("sup2", 178);
        htmlEntities.put("sup3", 179);
        htmlEntities.put("acute", 180);
        htmlEntities.put("micro", 181);
        htmlEntities.put("para", 182);
        htmlEntities.put("middot", 183);
        htmlEntities.put("cedil", 184);
        htmlEntities.put("sup1", 185);
        htmlEntities.put("ordm", 186);
        htmlEntities.put("raquo", 187);
        htmlEntities.put("frac14", 188);
        htmlEntities.put("frac12", 189);
        htmlEntities.put("frac34", 190);
        htmlEntities.put("iquest", 191);
        htmlEntities.put("Agrave", 192);
        htmlEntities.put("Aacute", 193);
        htmlEntities.put("Acirc", 194);
        htmlEntities.put("Atilde", 195);
        htmlEntities.put("Auml", 196);
        htmlEntities.put("Aring", 197);
        htmlEntities.put("AElig", 198);
        htmlEntities.put("Ccedil", 199);
        htmlEntities.put("Egrave", 200);
        htmlEntities.put("Eacute", 201);
        htmlEntities.put("Ecirc", 202);
        htmlEntities.put("Euml", 203);
        htmlEntities.put("Igrave", 204);
        htmlEntities.put("Iacute", 205);
        htmlEntities.put("Icirc", 206);
        htmlEntities.put("Iuml", 207);
        htmlEntities.put("ETH", 208);
        htmlEntities.put("Ntilde", 209);
        htmlEntities.put("Ograve", 210);
        htmlEntities.put("Oacute", 211);
        htmlEntities.put("Ocirc", 212);
        htmlEntities.put("Otilde", 213);
        htmlEntities.put("Ouml", 214);
        htmlEntities.put("times", 215);
        htmlEntities.put("Oslash", 216);
        htmlEntities.put("Ugrave", 217);
        htmlEntities.put("Uacute", 218);
        htmlEntities.put("Ucirc", 219);
        htmlEntities.put("Uuml", 220);
        htmlEntities.put("Yacute", 221);
        htmlEntities.put("THORN", 222);
        htmlEntities.put("szlig", 223);
        htmlEntities.put("agrave", 224);
        htmlEntities.put("aacute", 225);
        htmlEntities.put("acirc", 226);
        htmlEntities.put("atilde", 227);
        htmlEntities.put("auml", 228);
        htmlEntities.put("aring", 229);
        htmlEntities.put("aelig", 230);
        htmlEntities.put("ccedil", 231);
        htmlEntities.put("egrave", 232);
        htmlEntities.put("eacute", 233);
        htmlEntities.put("ecirc", 234);
        htmlEntities.put("euml", 235);
        htmlEntities.put("igrave", 236);
        htmlEntities.put("iacute", 237);
        htmlEntities.put("icirc", 238);
        htmlEntities.put("iuml", 239);
        htmlEntities.put("eth", 240);
        htmlEntities.put("ntilde", 241);
        htmlEntities.put("ograve", 242);
        htmlEntities.put("oacute", 243);
        htmlEntities.put("ocirc", 244);
        htmlEntities.put("otilde", 245);
        htmlEntities.put("ouml", 246);
        htmlEntities.put("divide", 247);
        htmlEntities.put("oslash", 248);
        htmlEntities.put("ugrave", 249);
        htmlEntities.put("uacute", 250);
        htmlEntities.put("ucirc", 251);
        htmlEntities.put("uuml", 252);
        htmlEntities.put("yacute", 253);
        htmlEntities.put("thorn", 254);
        htmlEntities.put("yuml", 255);
        htmlEntities.put("fnof", 402);
        htmlEntities.put("Alpha", 913);
        htmlEntities.put("Beta", 914);
        htmlEntities.put("Gamma", 915);
        htmlEntities.put("Delta", 916);
        htmlEntities.put("Epsilon", 917);
        htmlEntities.put("Zeta", 918);
        htmlEntities.put("Eta", 919);
        htmlEntities.put("Theta", 920);
        htmlEntities.put("Iota", 921);
        htmlEntities.put("Kappa", 922);
        htmlEntities.put("Lambda", 923);
        htmlEntities.put("Mu", 924);
        htmlEntities.put("Nu", 925);
        htmlEntities.put("Xi", 926);
        htmlEntities.put("Omicron", 927);
        htmlEntities.put("Pi", 928);
        htmlEntities.put("Rho", 929);
        htmlEntities.put("Sigma", 931);
        htmlEntities.put("Tau", 932);
        htmlEntities.put("Upsilon", 933);
        htmlEntities.put("Phi", 934);
        htmlEntities.put("Chi", 935);
        htmlEntities.put("Psi", 936);
        htmlEntities.put("Omega", 937);
        htmlEntities.put("alpha", 945);
        htmlEntities.put("beta", 946);
        htmlEntities.put("gamma", 947);
        htmlEntities.put("delta", 948);
        htmlEntities.put("epsilon", 949);
        htmlEntities.put("zeta", 950);
        htmlEntities.put("eta", 951);
        htmlEntities.put("theta", 952);
        htmlEntities.put("iota", 953);
        htmlEntities.put("kappa", 954);
        htmlEntities.put("lambda", 955);
        htmlEntities.put("mu", 956);
        htmlEntities.put("nu", 957);
        htmlEntities.put("xi", 958);
        htmlEntities.put("omicron", 959);
        htmlEntities.put("pi", 960);
        htmlEntities.put("rho", 961);
        htmlEntities.put("sigmaf", 962);
        htmlEntities.put("sigma", 963);
        htmlEntities.put("tau", 964);
        htmlEntities.put("upsilon", 965);
        htmlEntities.put("phi", 966);
        htmlEntities.put("chi", 967);
        htmlEntities.put("psi", 968);
        htmlEntities.put("omega", 969);
        htmlEntities.put("thetasym", 977);
        htmlEntities.put("upsih", 978);
        htmlEntities.put("piv", 982);
        htmlEntities.put("bull", 8226);
        htmlEntities.put("hellip", 8230);
        htmlEntities.put("prime", 8242);
        htmlEntities.put("Prime", 8243);
        htmlEntities.put("oline", 8254);
        htmlEntities.put("frasl", 8260);
        htmlEntities.put("weierp", 8472);
        htmlEntities.put("image", 8465);
        htmlEntities.put("real", 8476);
        htmlEntities.put("trade", 8482);
        htmlEntities.put("alefsym", 8501);
        htmlEntities.put("larr", 8592);
        htmlEntities.put("uarr", 8593);
        htmlEntities.put("rarr", 8594);
        htmlEntities.put("darr", 8595);
        htmlEntities.put("harr", 8596);
        htmlEntities.put("crarr", 8629);
        htmlEntities.put("lArr", 8656);
        htmlEntities.put("uArr", 8657);
        htmlEntities.put("rArr", 8658);
        htmlEntities.put("dArr", 8659);
        htmlEntities.put("hArr", 8660);
        htmlEntities.put("forall", 8704);
        htmlEntities.put("part", 8706);
        htmlEntities.put("exist", 8707);
        htmlEntities.put("empty", 8709);
        htmlEntities.put("nabla", 8711);
        htmlEntities.put("isin", 8712);
        htmlEntities.put("notin", 8713);
        htmlEntities.put("ni", 8715);
        htmlEntities.put("prod", 8719);
        htmlEntities.put("sum", 8721);
        htmlEntities.put("minus", 8722);
        htmlEntities.put("lowast", 8727);
        htmlEntities.put("radic", 8730);
        htmlEntities.put("prop", 8733);
        htmlEntities.put("infin", 8734);
        htmlEntities.put("ang", 8736);
        htmlEntities.put("and", 8743);
        htmlEntities.put("or", 8744);
        htmlEntities.put("cap", 8745);
        htmlEntities.put("cup", 8746);
        htmlEntities.put("int", 8747);
        htmlEntities.put("there4", 8756);
        htmlEntities.put("sim", 8764);
        htmlEntities.put("cong", 8773);
        htmlEntities.put("asymp", 8776);
        htmlEntities.put("ne", 8800);
        htmlEntities.put("equiv", 8801);
        htmlEntities.put("le", 8804);
        htmlEntities.put("ge", 8805);
        htmlEntities.put("sub", 8834);
        htmlEntities.put("sup", 8835);
        htmlEntities.put("nsub", 8836);
        htmlEntities.put("sube", 8838);
        htmlEntities.put("supe", 8839);
        htmlEntities.put("oplus", 8853);
        htmlEntities.put("otimes", 8855);
        htmlEntities.put("perp", 8869);
        htmlEntities.put("sdot", 8901);
        htmlEntities.put("lceil", 8968);
        htmlEntities.put("rceil", 8969);
        htmlEntities.put("lfloor", 8970);
        htmlEntities.put("rfloor", 8971);
        htmlEntities.put("lang", 9001);
        htmlEntities.put("rang", 9002);
        htmlEntities.put("loz", 9674);
        htmlEntities.put("spades", 9824);
        htmlEntities.put("clubs", 9827);
        htmlEntities.put("hearts", 9829);
        htmlEntities.put("diams", 9830);
        htmlEntities.put("quot", 34);
        htmlEntities.put("amp", 38);
        htmlEntities.put("lt", 60);
        htmlEntities.put("gt", 62);
        htmlEntities.put("OElig", 338);
        htmlEntities.put("oelig", 339);
        htmlEntities.put("Scaron", 352);
        htmlEntities.put("scaron", 353);
        htmlEntities.put("Yuml", 376);
        htmlEntities.put("circ", 710);
        htmlEntities.put("tilde", 732);
        htmlEntities.put("ensp", 8194);
        htmlEntities.put("emsp", 8195);
        htmlEntities.put("thinsp", 8201);
        htmlEntities.put("zwnj", 8204);
        htmlEntities.put("zwj", 8205);
        htmlEntities.put("lrm", 8206);
        htmlEntities.put("rlm", 8207);
        htmlEntities.put("ndash", 8211);
        htmlEntities.put("mdash", 8212);
        htmlEntities.put("lsquo", 8216);
        htmlEntities.put("rsquo", 8217);
        htmlEntities.put("sbquo", 8218);
        htmlEntities.put("ldquo", 8220);
        htmlEntities.put("rdquo", 8221);
        htmlEntities.put("bdquo", 8222);
        htmlEntities.put("dagger", 8224);
        htmlEntities.put("Dagger", 8225);
        htmlEntities.put("permil", 8240);
        htmlEntities.put("lsaquo", 8249);
        htmlEntities.put("rsaquo", 8250);
        htmlEntities.put("euro", 8364);
    }

    /**
     * Turn any HTML escape entities in the string into
     * characters and return the resulting string.
     * <p>
     * Recognized forms are named entities from the table above
     * ({@code &amp;}), decimal references ({@code &#65;}) and hexadecimal
     * references ({@code &#x41;}). Code points above U+FFFF are emitted as a
     * surrogate pair. Anything that cannot be decoded, and any {@code &}
     * that is not closed by a {@code ;} before the next {@code &}, is copied
     * to the result unchanged.
     *
     * @param s String to be unescaped.
     * @return unescaped String.
     * @throws NullPointerException if s is null.
     *
     * @since ostermillerutils 1.00.00
     */
    public static String unescapeHTML(String s) {
        StringBuilder result = new StringBuilder(s.length());
        int ampInd = s.indexOf('&');
        int lastEnd = 0;
        while (ampInd >= 0) {
            int nextAmp = s.indexOf('&', ampInd + 1);
            int nextSemi = s.indexOf(';', ampInd + 1);
            // Only treat "&...;" as a candidate when no other '&' intervenes.
            if (nextSemi != -1 && (nextAmp == -1 || nextSemi < nextAmp)) {
                String escape = s.substring(ampInd + 1, nextSemi);
                int value = decodeEntity(escape);
                result.append(s, lastEnd, ampInd);
                lastEnd = nextSemi + 1;
                if (Character.isValidCodePoint(value)) {
                    result.appendCodePoint(value);
                } else {
                    // Unknown or malformed: keep the original text.
                    result.append('&').append(escape).append(';');
                }
            }
            ampInd = nextAmp;
        }
        result.append(s, lastEnd, s.length());
        return result.toString();
    }

    /**
     * Resolves the text between {@code &} and {@code ;} to a code point.
     *
     * @return the code point, or -1 if the text is not a known entity name
     *         or a well-formed numeric reference
     */
    private static int decodeEntity(String escape) {
        if (escape.startsWith("#")) {
            try {
                boolean hex = escape.length() > 1
                        && (escape.charAt(1) == 'x' || escape.charAt(1) == 'X');
                if (hex) {
                    return Integer.parseInt(escape.substring(2), 16);
                }
                return Integer.parseInt(escape.substring(1), 10);
            } catch (NumberFormatException notANumber) {
                return -1;
            }
        }
        Integer codePoint = htmlEntities.get(escape);
        return codePoint == null ? -1 : codePoint.intValue();
    }
}
Use javax.swing.text.html.HTMLEditorKit to parse HTML
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
public class MainClass {
    /**
     * Parses the document at {@code url}, decoding it with {@code encoding},
     * and writes an outline of its headings to standard output through an
     * {@code Outliner} callback. Charset declarations inside the document are
     * ignored during this pass.
     *
     * @param url the document to parse
     * @param encoding name of the charset used to decode the document
     * @throws IOException if the document cannot be read or the charset name
     *         is not supported
     */
    private static void parse(URL url, String encoding) throws IOException {
        ParserGetter kit = new ParserGetter();
        HTMLEditorKit.Parser parser = kit.getParser();
        InputStream in = url.openStream();
        try {
            InputStreamReader r = new InputStreamReader(in, encoding);
            HTMLEditorKit.ParserCallback callback = new Outliner(new OutputStreamWriter(System.out));
            parser.parse(r, callback, true);
        } finally {
            in.close();
        }
    }

    /**
     * Outlines http://www.jexp.ru. The page is first parsed with a no-op
     * callback, assuming ISO-8859-1, purely to find out whether it declares
     * its own charset; it is then parsed again with the resulting encoding.
     *
     * @param args ignored
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static void main(String[] args) throws Exception {
        ParserGetter kit = new ParserGetter();
        HTMLEditorKit.Parser parser = kit.getParser();
        String encoding = "ISO-8859-1";
        URL url = new URL("http://www.jexp.ru");
        InputStream in = url.openStream();
        try {
            InputStreamReader r = new InputStreamReader(in, encoding);
            // parse once just to detect the encoding
            HTMLEditorKit.ParserCallback doNothing = new HTMLEditorKit.ParserCallback();
            parser.parse(r, doNothing, false);
        } catch (javax.swing.text.ChangedCharSetException ex) {
            // With ignoreCharSet == false the parser reports a charset
            // declared in the document by throwing this exception.
            encoding = charsetFrom(ex, encoding);
        } finally {
            in.close();
        }
        parse(url, encoding);
    }

    /**
     * Extracts a charset name from the exception's charset specification.
     *
     * @param ex the exception raised by the detection pass
     * @param fallback the encoding to keep when no charset name can be
     *        extracted
     * @return the declared charset name, or {@code fallback}
     */
    private static String charsetFrom(javax.swing.text.ChangedCharSetException ex, String fallback) {
        String spec = ex.getCharSetSpec();
        if (spec == null) {
            return fallback;
        }
        if (!ex.keyEqualsCharSet()) {
            // The spec is a content-type style value such as
            // "text/html; charset=utf-8": keep only what follows "charset=".
            final String marker = "charset=";
            int at = spec.toLowerCase(java.util.Locale.ROOT).indexOf(marker);
            if (at < 0) {
                return fallback;
            }
            spec = spec.substring(at + marker.length());
            int end = spec.indexOf(';');
            if (end >= 0) {
                spec = spec.substring(0, end);
            }
        }
        spec = spec.trim();
        return spec.length() == 0 ? fallback : spec;
    }
}
/**
 * Parser callback that turns the headings (H1-H6) of an HTML document into an
 * outline of nested {@code <ul>} lists written to a {@link Writer}.
 * <p>
 * Only text that appears inside a heading element is copied to the output; all
 * other content is ignored. I/O failures are reported on {@code System.err} and
 * do not interrupt parsing.
 */
class Outliner extends HTMLEditorKit.ParserCallback {
  private final Writer out;
  /** Nesting depth of the lists currently open (0 = none). */
  private int level = 0;
  /** True while the parser is between a heading's start and end tag. */
  private boolean inHeader = false;
  private static final String lineSeparator = System.getProperty("line.separator", "\r\n");

  public Outliner(Writer out) {
    this.out = out;
  }

  /**
   * Maps a tag to its heading level.
   *
   * @return 1-6 for H1-H6, 0 for every other tag
   */
  private static int headingLevel(HTML.Tag tag) {
    if (tag == HTML.Tag.H1) {
      return 1;
    } else if (tag == HTML.Tag.H2) {
      return 2;
    } else if (tag == HTML.Tag.H3) {
      return 3;
    } else if (tag == HTML.Tag.H4) {
      return 4;
    } else if (tag == HTML.Tag.H5) {
      return 5;
    } else if (tag == HTML.Tag.H6) {
      return 6;
    }
    return 0;
  }

  /**
   * Opens a list item for a heading, first opening or closing as many lists as
   * needed to move from the current depth to the heading's depth. Non-heading
   * tags are ignored.
   */
  @Override
  public void handleStartTag(HTML.Tag tag, MutableAttributeSet attributes, int position) {
    int newLevel = headingLevel(tag);
    if (newLevel == 0) {
      return;
    }
    this.inHeader = true;
    try {
      if (newLevel > this.level) {
        // Going deeper: one new list (and item) per level descended.
        for (int i = this.level; i < newLevel; i++) {
          out.write("<ul>" + lineSeparator + "<li>");
        }
      } else {
        // Same depth or shallower: close the surplus lists, then start a sibling item.
        for (int i = newLevel; i < this.level; i++) {
          out.write(lineSeparator + "</ul>" + lineSeparator);
        }
        out.write(lineSeparator + "<li>");
      }
      this.level = newLevel;
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }

  /** Leaves heading mode at a heading's end tag and flushes at {@code </html>}. */
  @Override
  public void handleEndTag(HTML.Tag tag, int position) {
    if (headingLevel(tag) != 0) {
      inHeader = false;
    }
    // work around bug in the parser that fails to call flush
    if (tag == HTML.Tag.HTML) {
      this.flush();
    }
  }

  /** Copies text to the output, but only while inside a heading. */
  @Override
  public void handleText(char[] text, int position) {
    if (!inHeader) {
      return;
    }
    try {
      out.write(text);
      out.flush();
    } catch (IOException ex) {
      System.err.println(ex);
    }
  }

  /**
   * Closes every list that is still open and flushes the writer. The depth is
   * left at exactly 0, so calling this more than once, or receiving further
   * headings afterwards, behaves as if starting from an empty outline.
   */
  @Override
  public void flush() {
    try {
      while (this.level > 0) {
        out.write(lineSeparator + "</ul>");
        this.level--;
      }
      out.flush();
    } catch (IOException e) {
      System.err.println(e);
    }
  }
}
/**
 * Thin {@link HTMLEditorKit} subclass whose only purpose is to make the
 * inherited {@code getParser()} callable from outside the kit's own hierarchy.
 */
class ParserGetter extends HTMLEditorKit {
  /**
   * Returns the parser supplied by the superclass, with public visibility.
   *
   * @return the parser obtained from {@code HTMLEditorKit.getParser()}
   */
  @Override
  public HTMLEditorKit.Parser getParser() {
    final HTMLEditorKit.Parser inherited = super.getParser();
    return inherited;
  }
}
Utility methods for dealing with HTML
/* Copyright 2005-2006 Tim Fennell
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// sourceforge stripes
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.regex.Pattern;
/**
 * Provides simple utility methods for dealing with HTML.
 *
 * @author Tim Fennell
 */
public class HtmlUtil {
    private static final String FIELD_DELIMITER_STRING = "||";
    private static final Pattern FIELD_DELIMITER_PATTERN = Pattern.compile("\\|\\|");

    /**
     * Replaces the special HTML characters {@code <}, {@code >}, {@code "} and {@code &}
     * with their HTML escape codes. Note that because the escape codes are multi-character
     * the returned String could be longer than the one passed in.
     *
     * @param fragment a String fragment that might have HTML special characters in it
     * @return the fragment with special characters escaped, or null if fragment is null
     */
    public static String encode(String fragment) {
        // If the input is null, then the output is null
        if (fragment == null) return null;

        StringBuilder builder = new StringBuilder(fragment.length() + 10); // a little wiggle room
        char[] characters = fragment.toCharArray();

        // This loop used to also look for and replace single ticks with an apostrophe
        // entity, but it turns out that it's not strictly necessary since Stripes uses
        // double-quotes around all form fields, and IE6 will render that entity verbatim
        // instead of as a single quote.
        for (int i = 0; i < characters.length; ++i) {
            switch (characters[i]) {
                case '<'  : builder.append("&lt;");   break;
                case '>'  : builder.append("&gt;");   break;
                case '"'  : builder.append("&quot;"); break;
                case '&'  : builder.append("&amp;");  break;
                default: builder.append(characters[i]);
            }
        }

        return builder.toString();
    }

    /**
     * One of a pair of methods (the other is splitValues) that is used to combine several
     * un-encoded values into a single delimited, encoded value for placement into a
     * hidden field. Every value, including the last, is followed by the delimiter.
     *
     * @param values One or more values which are to be combined
     * @return a single HTML-encoded String that contains all the values in such a way that
     *         they can be converted back into a Collection of Strings with splitValues();
     *         the empty String if values is null or empty.
     */
    public static String combineValues(Collection<String> values) {
        if (values == null || values.size() == 0) {
            return "";
        }
        else {
            StringBuilder builder = new StringBuilder(values.size() * 30);
            for (String value : values) {
                builder.append(value).append(FIELD_DELIMITER_STRING);
            }

            return encode(builder.toString());
        }
    }

    /**
     * Takes in a String produced by combineValues and returns a Collection of values that
     * contains the same values as originally supplied to combineValues. Note that the order
     * of items in the collection (and indeed the type of Collection used) are not guaranteed
     * to be the same. The input is split on the delimiter only; no HTML decoding is done here.
     *
     * @param value a String value produced by combineValues
     * @return a Collection of zero or more Strings
     */
    public static Collection<String> splitValues(String value) {
        if (value == null || value.length() == 0) {
            return Collections.emptyList();
        }
        else {
            String[] splits = FIELD_DELIMITER_PATTERN.split(value);
            return Arrays.asList(splits);
        }
    }
}