Java Tutorial/Network/HTML Parser
Версия от 17:44, 31 мая 2010; (обсуждение)
Содержание
- 1 Escape HTML special characters from a String
- 2 extends HTMLEditorKit.ParserCallback
- 3 Extract links from an HTML page
- 4 Find and display hyperlinks contained within a web page
- 5 Get all hyper links from a web page
- 6 Getting the Links in an HTML Document
- 7 Getting the Text in an HTML Document
- 8 HTML Parser
- 9 HTML parser based on HTMLEditorKit.ParserCallback
- 10 Using javax.swing.text.html.HTMLEditorKit to parse html document
Escape HTML special characters from a String
/**
 * Demonstrates escaping of HTML special characters and a set of accented
 * Latin-1 letters and symbols into named HTML entities.
 */
public class Main {
  /**
   * Prints the escaped form of the two-character string "&gt;&lt;".
   *
   * @param argv ignored
   */
  public static void main(String[] argv) {
    System.out.println(escapeHTML("><"));
  }

  /**
   * Returns a copy of {@code s} in which the HTML metacharacters
   * (less-than, greater-than, ampersand, double quote) and the accented
   * letters and symbols listed in the switch below are replaced by their
   * named HTML entities. Every other character, including the plain space,
   * is copied unchanged.
   *
   * <p>Non-ASCII case labels are written as Unicode escapes so the source
   * compiles identically regardless of the file encoding.
   *
   * @param s the text to escape; must not be {@code null}
   * @return the escaped text
   * @throws NullPointerException if {@code s} is {@code null}
   */
  public static final String escapeHTML(String s) {
    int n = s.length();
    StringBuilder sb = new StringBuilder(n + 16);
    for (int i = 0; i < n; i++) {
      char c = s.charAt(i);
      switch (c) {
        // Markup-significant characters.
        case '<':
          sb.append("&lt;");
          break;
        case '>':
          sb.append("&gt;");
          break;
        case '&':
          sb.append("&amp;");
          break;
        case '"':
          sb.append("&quot;");
          break;
        // Accented letters.
        case '\u00E0':
          sb.append("&agrave;");
          break;
        case '\u00C0':
          sb.append("&Agrave;");
          break;
        case '\u00E2':
          sb.append("&acirc;");
          break;
        case '\u00C2':
          sb.append("&Acirc;");
          break;
        case '\u00E4':
          sb.append("&auml;");
          break;
        case '\u00C4':
          sb.append("&Auml;");
          break;
        case '\u00E5':
          sb.append("&aring;");
          break;
        case '\u00C5':
          sb.append("&Aring;");
          break;
        case '\u00E6':
          sb.append("&aelig;");
          break;
        case '\u00C6':
          sb.append("&AElig;");
          break;
        case '\u00E7':
          sb.append("&ccedil;");
          break;
        case '\u00C7':
          sb.append("&Ccedil;");
          break;
        case '\u00E9':
          sb.append("&eacute;");
          break;
        case '\u00C9':
          sb.append("&Eacute;");
          break;
        case '\u00E8':
          sb.append("&egrave;");
          break;
        case '\u00C8':
          sb.append("&Egrave;");
          break;
        case '\u00EA':
          sb.append("&ecirc;");
          break;
        case '\u00CA':
          sb.append("&Ecirc;");
          break;
        case '\u00EB':
          sb.append("&euml;");
          break;
        case '\u00CB':
          sb.append("&Euml;");
          break;
        case '\u00EF':
          sb.append("&iuml;");
          break;
        case '\u00CF':
          sb.append("&Iuml;");
          break;
        case '\u00F4':
          sb.append("&ocirc;");
          break;
        case '\u00D4':
          sb.append("&Ocirc;");
          break;
        case '\u00F6':
          sb.append("&ouml;");
          break;
        case '\u00D6':
          sb.append("&Ouml;");
          break;
        case '\u00F8':
          sb.append("&oslash;");
          break;
        case '\u00D8':
          sb.append("&Oslash;");
          break;
        case '\u00DF':
          sb.append("&szlig;");
          break;
        case '\u00F9':
          sb.append("&ugrave;");
          break;
        case '\u00D9':
          sb.append("&Ugrave;");
          break;
        case '\u00FB':
          sb.append("&ucirc;");
          break;
        case '\u00DB':
          sb.append("&Ucirc;");
          break;
        case '\u00FC':
          sb.append("&uuml;");
          break;
        case '\u00DC':
          sb.append("&Uuml;");
          break;
        // Symbols.
        case '\u00AE':
          sb.append("&reg;");
          break;
        case '\u00A9':
          sb.append("&copy;");
          break;
        case '\u20AC':
          sb.append("&euro;");
          break;
        default:
          sb.append(c);
          break;
      }
    }
    return sb.toString();
  }
}
// Output: &gt;&lt;
extends HTMLEditorKit.ParserCallback
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Downloads the HTML page named by the first command-line argument and
 * feeds it to a {@code TextOnly} callback through the Swing HTML parser.
 */
public class Main {
  /**
   * @param args {@code args[0]} is the URL of the page to parse
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    URL url = new URL(args[0]);
    // openStream() always yields an InputStream, whereas the object returned
    // by getContent() depends on the content type and made the old cast unsafe.
    InputStream in = url.openStream();
    Reader reader = new InputStreamReader(in);
    try {
      new ParserDelegator().parse(reader, new TextOnly(), false);
    } finally {
      // Closing the reader also closes the underlying network stream.
      reader.close();
    }
  }
}
/**
 * Parser callback that writes every run of document text to standard
 * output, one run per line, and ignores all markup events.
 */
class TextOnly extends HTMLEditorKit.ParserCallback {
  /**
   * Prints the characters of one text run followed by a line break.
   *
   * @param data the characters of the text run
   * @param pos  position reported by the parser (unused)
   */
  @Override
  public void handleText(char[] data, int pos) {
    String text = String.valueOf(data);
    System.out.println(text);
  }
}
Extract links from an HTML page
import java.io.FileReader;
import java.util.ArrayList;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Collects the {@code href} value of every anchor tag in the local file
 * {@code a.html} and prints the resulting list.
 */
public class Main {
  /**
   * @param args ignored
   * @throws Exception if {@code a.html} cannot be opened or read
   */
  public final static void main(String[] args) throws Exception {
    final ArrayList<String> list = new ArrayList<String>();
    // Only start tags matter here; the other ParserCallback methods already
    // default to doing nothing, so they are not overridden.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
        if (tag != Tag.A) {
          return;
        }
        Object address = attribute.getAttribute(Attribute.HREF);
        // Anchors without an href (e.g. named anchors) are not links; skipping
        // them keeps null entries out of the result.
        if (address != null) {
          list.add(address.toString());
        }
      }
    };
    FileReader reader = new FileReader("a.html");
    try {
      new ParserDelegator().parse(reader, parserCallback, false);
    } finally {
      reader.close();
    }
    System.out.println(list);
  }
}
Find and display hyperlinks contained within a web page
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Reads the local file {@code a.htm} and prints the {@code href} target of
 * every anchor tag it contains, one per line.
 */
public class Main {
  /**
   * @param arguments ignored
   * @throws Exception if {@code a.htm} cannot be opened or read
   */
  public static void main(String[] arguments) throws Exception {
    StringBuilder output = new StringBuilder();
    BufferedReader buff = new BufferedReader(new FileReader("a.htm"));
    try {
      String line;
      while ((line = buff.readLine()) != null) {
        output.append(line).append("\n");
      }
    } finally {
      // Closing the BufferedReader also closes the wrapped FileReader.
      buff.close();
    }
    String page = output.toString();
    // The attribute run between the tag name and href must stay inside one
    // tag (no '>') and is matched reluctantly, so several anchors on one line
    // are each reported instead of only the last one.
    Pattern pattern =
        Pattern.compile("<a\\s[^>]*?href=\"([^\"]+)\"", Pattern.CASE_INSENSITIVE);
    Matcher matcher = pattern.matcher(page);
    while (matcher.find()) {
      System.out.println(matcher.group(1));
    }
  }
}
Get all hyper links from a web page
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Downloads the page named by the first command-line argument and writes a
 * small HTML document to standard output whose body is produced by a
 * {@code LinkPage} callback.
 */
public class Main {
  /**
   * @param args {@code args[0]} is the URL of the page to scan
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    URL url = new URL(args[0]);
    // openStream() always yields an InputStream, whereas the object returned
    // by getContent() depends on the content type and made the old cast unsafe.
    InputStream in = url.openStream();
    Reader reader = new InputStreamReader(in);
    try {
      System.out.println("<HTML><HEAD><TITLE>Links for " + args[0] + "</TITLE>");
      System.out.println("<BASE HREF=\"" + args[0] + "\"></HEAD>");
      System.out.println("<BODY>");
      new ParserDelegator().parse(reader, new LinkPage(), false);
      System.out.println("</BODY></HTML>");
    } finally {
      // Closing the reader also closes the underlying network stream.
      reader.close();
    }
  }
}
/**
 * Parser callback that re-emits every hyperlink of the parsed page as an
 * HTML anchor followed by a line break.
 */
class LinkPage extends HTMLEditorKit.ParserCallback {
  /**
   * For each anchor start tag carrying an {@code href}, prints
   * {@code <A HREF="target">target</A><BR>} to standard output. Other tags,
   * and anchors without an {@code href}, produce no output.
   *
   * @param t   the tag that was opened
   * @param a   the attributes of the tag
   * @param pos position reported by the parser (unused)
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    if (t != HTML.Tag.A) {
      return;
    }
    Object href = a.getAttribute(HTML.Attribute.HREF);
    if (href == null) {
      return;
    }
    // The target comes from an arbitrary web page, so escape it before
    // embedding it in the generated markup.
    String target = escape(href.toString());
    System.out.println("<A HREF=\"" + target + "\">" + target + "</A><BR>");
  }

  /** Escapes the four characters that are significant in HTML text and attributes. */
  private static String escape(String text) {
    StringBuilder out = new StringBuilder(text.length() + 8);
    for (int i = 0; i < text.length(); i++) {
      char ch = text.charAt(i);
      switch (ch) {
        case '&':
          out.append("&amp;");
          break;
        case '<':
          out.append("&lt;");
          break;
        case '>':
          out.append("&gt;");
          break;
        case '"':
          out.append("&quot;");
          break;
        default:
          out.append(ch);
          break;
      }
    }
    return out.toString();
  }
}
Getting the Links in an HTML Document
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.EditorKit;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Loads a web page into an {@link HTMLDocument} and prints the {@code href}
 * of every anchor found in it.
 */
public class Main {
  /**
   * @param argv ignored
   * @throws Exception if the page cannot be fetched or parsed
   */
  public static void main(String[] argv) throws Exception {
    URL url = new URI("http://www.google.ru").toURL();
    URLConnection conn = url.openConnection();
    Reader rd = new InputStreamReader(conn.getInputStream());
    EditorKit kit = new HTMLEditorKit();
    HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
    try {
      kit.read(rd, doc, 0);
    } finally {
      // Closing the reader also closes the connection's input stream.
      rd.close();
    }
    HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
    while (it.isValid()) {
      // getAttributes() is declared to return AttributeSet; reading through the
      // interface avoids a ClassCastException on other implementations.
      javax.swing.text.AttributeSet attrs = it.getAttributes();
      Object link = (attrs == null) ? null : attrs.getAttribute(HTML.Attribute.HREF);
      if (link != null) {
        System.out.println(link);
      }
      it.next();
    }
  }
}
Getting the Text in an HTML Document
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Prints the text content of a web page by loading it into an
 * {@link HTMLDocument} whose reader callback only reacts to text events.
 */
public class Main {
  /**
   * @param argv ignored
   * @throws Exception if the page cannot be fetched or parsed
   */
  public static void main(String[] argv) throws Exception {
    // The document hands the editor kit a callback that prints each text run
    // instead of building document structure.
    HTMLDocument doc = new HTMLDocument() {
      @Override
      public HTMLEditorKit.ParserCallback getReader(int pos) {
        return new HTMLEditorKit.ParserCallback() {
          @Override
          public void handleText(char[] data, int pos) {
            System.out.println(data);
          }
        };
      }
    };
    URL url = new URI("http://www.google.ru").toURL();
    URLConnection conn = url.openConnection();
    Reader rd = new InputStreamReader(conn.getInputStream());
    try {
      EditorKit kit = new HTMLEditorKit();
      kit.read(rd, doc, 0);
    } finally {
      // Closing the reader also closes the connection's input stream.
      rd.close();
    }
  }
}
HTML Parser
/*******************************************************************************
* Copyright (c) 2004 Actuate Corporation.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Actuate Corporation - initial API and implementation
*******************************************************************************/
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.ArrayList;
/**
 * A small, forgiving pull tokenizer for HTML files.
 *
 * <p>After {@link #open(String)}, each call to {@link #getToken()} reads the
 * next token and returns its kind ({@link #TEXT}, {@link #ELEMENT},
 * {@link #COMMENT}, {@link #SPECIAL_ELEMENT} or {@link #EOF}). The token's
 * text and, for elements, its attributes are then available through the
 * accessor methods until the next call to {@code getToken()}.
 * {@link #DOCTYPE} is declared but never returned by this class.
 *
 * <p>Known limitation: a comment or other {@code <!...>} construct is taken
 * to end at the first {@code '>'} character.
 */
public class HTMLParser
{
    FileReader reader;
    LineNumberReader in;
    /** Text of the most recently read token; for elements, the tag name with any leading or trailing slash. */
    String token;
    /** Attributes of the most recently read start or single element. */
    ArrayList<AttribPair> attribs = new ArrayList<AttribPair>( );
    /** One-character push-back buffer; -1 means empty. */
    int pushC = -1;
    private boolean ignoreWhitespace = true;

    // Token kinds returned by getToken().
    public static final int EOF = -1;
    public static final int TEXT = 1;
    public static final int DOCTYPE = 2;
    public static final int ELEMENT = 3;
    public static final int COMMENT = 4;
    public static final int SPECIAL_ELEMENT = 5;

    // Element kinds returned by getElementType().
    public static final int START_ELEMENT = 0;
    public static final int END_ELEMENT = 1;
    public static final int SINGLE_ELEMENT = 2;

    public HTMLParser( )
    {
    }

    /**
     * Opens the file to tokenize.
     *
     * @param fileName path of the HTML file
     * @throws FileNotFoundException if the file cannot be opened
     */
    public void open( String fileName ) throws FileNotFoundException
    {
        reader = new FileReader( fileName );
        in = new LineNumberReader( reader );
    }

    /**
     * Closes the underlying readers. Safe to call when nothing was opened;
     * I/O errors during close are ignored.
     */
    public void close( )
    {
        try
        {
            if ( in != null )
                in.close( );
            if ( reader != null )
                reader.close( );
        }
        catch ( IOException e1 )
        {
            // Ignore: nothing useful can be done if close fails.
        }
    }

    /** Returns the raw text of the current token. */
    public String getTokenText( )
    {
        return token;
    }

    /**
     * Classifies the current element token.
     *
     * @return {@link #END_ELEMENT} if the token starts with a slash,
     *         {@link #SINGLE_ELEMENT} if it ends with one, otherwise
     *         {@link #START_ELEMENT}
     */
    public int getElementType( )
    {
        if ( token.startsWith( "/" ) ) //$NON-NLS-1$
            return END_ELEMENT;
        if ( token.endsWith( "/" ) ) //$NON-NLS-1$
            return SINGLE_ELEMENT;
        return START_ELEMENT;
    }

    /** Returns the tag name of the current element token without its leading or trailing slash. */
    public String getElement( )
    {
        if ( token.startsWith( "/" ) ) //$NON-NLS-1$
            return token.substring( 1 );
        if ( token.endsWith( "/" ) ) //$NON-NLS-1$
            return token.substring( 0, token.length( ) - 1 );
        return token;
    }

    /** Returns the live attribute list of the current element. */
    public ArrayList<AttribPair> getAttribs( )
    {
        return attribs;
    }

    /**
     * Looks up an attribute of the current element, ignoring case.
     *
     * @param name attribute name
     * @return the attribute value, or {@code null} if the attribute is absent
     *         or was written without a value
     */
    public String getAttrib( String name )
    {
        for ( int i = 0; i < attribs.size( ); i++ )
        {
            AttribPair a = attribs.get( i );
            if ( a.attrib.equalsIgnoreCase( name ) )
                return a.value;
        }
        return null;
    }

    /** Reads the next character, honoring the push-back buffer; I/O errors are reported as EOF. */
    private int getC( )
    {
        if ( pushC != -1 )
        {
            int c = pushC;
            pushC = -1;
            return c;
        }
        try
        {
            return in.read( );
        }
        catch ( IOException e )
        {
            return EOF;
        }
    }

    /** Pushes one character back so the next getC() returns it. */
    private void pushC( int c )
    {
        pushC = c;
    }

    /**
     * Reads the next token.
     *
     * @return the token kind; whitespace-only text is skipped while
     *         whitespace is being ignored (the default)
     */
    public int getToken( )
    {
        for ( ;; )
        {
            int c = getC( );
            switch ( c )
            {
                case EOF :
                    return EOF;
                case '<' :
                    return getElement( c );
                default :
                {
                    parseText( c );
                    if ( !ignoreWhitespace || token.trim( ).length( ) > 0 )
                        return TEXT;
                }
            }
        }
    }

    /** Collects text up to, but not including, the next '<' or end of input. */
    private int parseText( int c )
    {
        StringBuilder text = new StringBuilder( );
        for ( ;; )
        {
            if ( c == EOF )
                break;
            if ( c == '<' )
            {
                pushC( c );
                break;
            }
            // Convert MS-Word-style quotes (code points 8220 and 8221) to a plain double quote.
            if ( c == 8220 || c == 8221 )
                text.append( '"' );
            else
                text.append( (char) c );
            c = getC( );
        }
        token = text.toString( );
        return TEXT;
    }

    /** Skips whitespace starting at c and returns the first non-whitespace character (or EOF). */
    private int skipSpace( int c )
    {
        while ( c != EOF && Character.isWhitespace( (char) c ) )
        {
            c = getC( );
        }
        return c;
    }

    /** Parses an element after its opening '<' has been consumed. */
    private int getElement( int c )
    {
        c = getC( );
        // Broken element
        if ( c == EOF )
            return EOF;
        if ( c == '!' )
            return getSpecialElement( );
        attribs.clear( );
        c = skipSpace( c );
        if ( c == EOF )
            return EOF;
        StringBuilder tag = new StringBuilder( );
        if ( c == '/' )
        {
            // End tag: keep the slash in the token, ignore anything after the name.
            tag.append( (char) c );
            c = skipSpace( getC( ) );
            while ( c != EOF && c != '>' && !Character.isWhitespace( (char) c ) )
            {
                tag.append( (char) c );
                c = getC( );
            }
            token = tag.toString( );
            while ( c != '>' && c != EOF )
            {
                c = getC( );
            }
            return ELEMENT;
        }
        while ( c != EOF && c != '>' && c != '/' && !Character.isWhitespace( (char) c ) )
        {
            tag.append( (char) c );
            c = getC( );
        }
        if ( c == EOF )
        {
            token = tag.toString( );
            return ELEMENT;
        }
        // Attributes: each getAttrib() call returns the first character it did not consume.
        for ( ;; )
        {
            c = skipSpace( c );
            if ( c == EOF || c == '>' || c == '/' )
                break;
            c = getAttrib( c );
        }
        if ( c == '/' )
        {
            // Self-closing element: record the trailing slash, then skip to '>'.
            tag.append( (char) c );
            for ( ;; )
            {
                c = getC( );
                if ( c == EOF || c == '>' )
                    break;
            }
        }
        token = tag.toString( );
        return ELEMENT;
    }

    /**
     * Parses one attribute starting at character c and appends it to the
     * attribute list.
     *
     * @return the first character following the attribute that was not
     *         consumed (possibly '>' or '/', which end the element)
     */
    private int getAttrib( int c )
    {
        AttribPair a = new AttribPair( );
        StringBuilder s = new StringBuilder( );
        // The name also ends at '>' or '/', so a value-less attribute right
        // before the end of the tag does not swallow the tag terminator.
        while ( c != EOF && c != '=' && c != '>' && c != '/'
                && !Character.isWhitespace( (char) c ) )
        {
            s.append( (char) c );
            c = getC( );
        }
        a.attrib = s.toString( );
        c = skipSpace( c );
        if ( c != '=' )
        {
            // Attribute without a value; its value stays null.
            attribs.add( a );
            return c;
        }
        s = new StringBuilder( );
        c = skipSpace( getC( ) );
        if ( c == '\'' || c == '"' )
        {
            int quote = c;
            for ( ;; )
            {
                c = getC( );
                if ( c == EOF )
                    break;
                if ( c == quote )
                {
                    c = getC( );
                    break;
                }
                if ( c == '\\' )
                {
                    // Keep a backslash and the character after it verbatim, so
                    // an escaped quote does not end the value.
                    c = getC( );
                    if ( c == EOF )
                        break;
                    s.append( '\\' );
                    s.append( (char) c );
                }
                else
                {
                    s.append( (char) c );
                }
            }
        }
        else
        {
            // Unquoted value: starts with the character already read and ends,
            // without consuming it, at whitespace, '>' or '/'.
            while ( c != EOF && c != '>' && c != '/'
                    && !Character.isWhitespace( (char) c ) )
            {
                s.append( (char) c );
                c = getC( );
            }
        }
        a.value = s.toString( );
        attribs.add( a );
        return c;
    }

    /** Name/value pair of one attribute; value is null when the attribute had none. */
    class AttribPair
    {
        String attrib;
        String value;
    }

    /** Reads a "<!...>" construct up to the first '>' and classifies it as comment or special element. */
    private int getSpecialElement( )
    {
        StringBuilder text = new StringBuilder( );
        text.append( "<!" ); //$NON-NLS-1$
        for ( ;; )
        {
            int c = getC( );
            if ( c == EOF || c == '>' )
                break;
            text.append( (char) c );
        }
        text.append( '>' );
        token = text.toString( );
        if ( token.startsWith( "<!--" ) ) //$NON-NLS-1$
            return COMMENT;
        return SPECIAL_ELEMENT;
    }

    /** Tag names that isFormatTag() reports as format tags. */
    static String formatTags[] =
    {
        "i", "b", //$NON-NLS-1$//$NON-NLS-2$
        "strong", "em", //$NON-NLS-1$//$NON-NLS-2$
        "code", "span", //$NON-NLS-1$ //$NON-NLS-2$
        "a" //$NON-NLS-1$
    };

    /** Returns whether the current element is one of the format tags. */
    public boolean isFormatTag( )
    {
        return isFormatTag( getElement( ) );
    }

    /**
     * Returns whether the given tag name, compared case-insensitively, is one
     * of the format tags.
     */
    public boolean isFormatTag( String tag )
    {
        for ( int i = 0; i < formatTags.length; i++ )
        {
            if ( formatTags[i].equalsIgnoreCase( tag ) )
                return true;
        }
        return false;
    }

    /**
     * Rebuilds the markup of the current element from its name, kind and
     * attributes. Every attribute is written as name="value", with an empty
     * value when it had none.
     *
     * @return the reconstructed tag as a String
     */
    public Object getFullElement( )
    {
        StringBuilder text = new StringBuilder( );
        text.append( '<' );
        int elementType = getElementType( );
        if ( elementType == END_ELEMENT )
            text.append( '/' );
        text.append( getElement( ) );
        for ( int i = 0; i < attribs.size( ); i++ )
        {
            text.append( ' ' );
            AttribPair a = attribs.get( i );
            text.append( a.attrib );
            text.append( "=\"" ); //$NON-NLS-1$
            if ( a.value != null )
                text.append( a.value );
            text.append( "\"" ); //$NON-NLS-1$
        }
        if ( elementType == SINGLE_ELEMENT )
            text.append( '/' );
        text.append( '>' );
        return text.toString( );
    }

    /** Returns the current line number reported by the underlying LineNumberReader. */
    public int getLineNo( )
    {
        return in.getLineNumber( );
    }

    /** Sets whether whitespace-only text tokens are skipped by getToken(). */
    public void ignoreWhitespace( boolean b )
    {
        ignoreWhitespace = b;
    }
}
HTML parser based on HTMLEditorKit.ParserCallback
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Downloads the HTML page named by the first command-line argument and
 * feeds it to an {@code HTMLParse} callback through the Swing HTML parser.
 */
public class Main {
  /**
   * @param args {@code args[0]} is the URL of the page to parse
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    URL url = new URL(args[0]);
    // openStream() always yields an InputStream, whereas the object returned
    // by getContent() depends on the content type and made the old cast unsafe.
    InputStream in = url.openStream();
    Reader reader = new InputStreamReader(in);
    try {
      new ParserDelegator().parse(reader, new HTMLParse(), false);
    } finally {
      // Closing the reader also closes the underlying network stream.
      reader.close();
    }
  }
}
/**
 * Parser callback that traces the parse to standard output: text runs are
 * printed as-is, and each tag is printed on its own line with a one-character
 * marker ('+' start tag, '*' simple tag, '-' end tag).
 */
class HTMLParse extends HTMLEditorKit.ParserCallback {
  /** Prints the characters of one text run followed by a line break. */
  @Override
  public void handleText(char[] data, int pos) {
    String text = String.valueOf(data);
    System.out.println(text);
  }

  /** Prints the opened tag prefixed with '+'. */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    trace('+', t);
  }

  /** Prints the simple (empty) tag prefixed with '*'. */
  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    trace('*', t);
  }

  /** Prints the closed tag prefixed with '-'. */
  @Override
  public void handleEndTag(HTML.Tag t, int pos) {
    trace('-', t);
  }

  /** Writes one trace line consisting of the marker and the tag's string form. */
  private static void trace(char marker, HTML.Tag tag) {
    String line = marker + tag.toString();
    System.out.println(line);
  }
}
Using javax.swing.text.html.HTMLEditorKit to parse html document
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Collects every run of text in the local file {@code a.html} and prints
 * the resulting list.
 */
public class Main {
  /**
   * @param args ignored
   * @throws Exception if {@code a.html} cannot be opened or read
   */
  public static void main(String[] args) throws Exception {
    final List<String> list = new ArrayList<String>();
    // Only text events matter here; the other ParserCallback methods already
    // default to doing nothing, so they are not overridden.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleText(final char[] data, final int pos) {
        list.add(new String(data));
      }
    };
    Reader reader = new FileReader("a.html");
    try {
      new ParserDelegator().parse(reader, parserCallback, true);
    } finally {
      reader.close();
    }
    System.out.println(list);
  }
}