Java Tutorial/Network/HTML Parser
Содержание
- 1 Escape HTML special characters from a String
- 2 extends HTMLEditorKit.ParserCallback
- 3 Extract links from an HTML page
- 4 Find and display hyperlinks contained within a web page
- 5 Get all hyper links from a web page
- 6 Getting the Links in an HTML Document
- 7 Getting the Text in an HTML Document
- 8 HTML Parser
- 9 HTML parser based on HTMLEditorKit.ParserCallback
- 10 Using javax.swing.text.html.HTMLEditorKit to parse html document
Escape HTML special characters from a String
<source lang="java">
/**
 * Demonstrates replacing HTML special characters and a set of accented
 * Latin-1 characters with their named HTML entities.
 */
public class Main {

  /** Prints the escaped form of the two-character string {@code "><"}. */
  public static void main(String[] argv) {
    System.out.println(escapeHTML("><"));
  }

  /**
   * Returns a copy of {@code s} in which every character that has an entry in
   * the table below is replaced by the corresponding named HTML entity; all
   * other characters are copied unchanged.
   *
   * <p>The non-ASCII case labels are written as Unicode escapes so that the
   * source compiles identically regardless of the encoding used to store or
   * transmit this file.
   *
   * @param s the text to escape; must not be {@code null}
   * @return the escaped text
   * @throws NullPointerException if {@code s} is {@code null}
   */
  public static final String escapeHTML(String s) {
    StringBuilder sb = new StringBuilder();
    int n = s.length();
    for (int i = 0; i < n; i++) {
      char c = s.charAt(i);
      switch (c) {
        // Markup-significant characters.
        case '<': sb.append("&lt;"); break;
        case '>': sb.append("&gt;"); break;
        case '&': sb.append("&amp;"); break;
        case '"': sb.append("&quot;"); break;
        // Accented letters, lower case followed by upper case.
        case '\u00E0': sb.append("&agrave;"); break;
        case '\u00C0': sb.append("&Agrave;"); break;
        case '\u00E2': sb.append("&acirc;"); break;
        case '\u00C2': sb.append("&Acirc;"); break;
        case '\u00E4': sb.append("&auml;"); break;
        case '\u00C4': sb.append("&Auml;"); break;
        case '\u00E5': sb.append("&aring;"); break;
        case '\u00C5': sb.append("&Aring;"); break;
        case '\u00E6': sb.append("&aelig;"); break;
        case '\u00C6': sb.append("&AElig;"); break;
        case '\u00E7': sb.append("&ccedil;"); break;
        case '\u00C7': sb.append("&Ccedil;"); break;
        case '\u00E9': sb.append("&eacute;"); break;
        case '\u00C9': sb.append("&Eacute;"); break;
        case '\u00E8': sb.append("&egrave;"); break;
        case '\u00C8': sb.append("&Egrave;"); break;
        case '\u00EA': sb.append("&ecirc;"); break;
        case '\u00CA': sb.append("&Ecirc;"); break;
        case '\u00EB': sb.append("&euml;"); break;
        case '\u00CB': sb.append("&Euml;"); break;
        case '\u00EF': sb.append("&iuml;"); break;
        case '\u00CF': sb.append("&Iuml;"); break;
        case '\u00F4': sb.append("&ocirc;"); break;
        case '\u00D4': sb.append("&Ocirc;"); break;
        case '\u00F6': sb.append("&ouml;"); break;
        case '\u00D6': sb.append("&Ouml;"); break;
        case '\u00F8': sb.append("&oslash;"); break;
        case '\u00D8': sb.append("&Oslash;"); break;
        case '\u00DF': sb.append("&szlig;"); break;
        case '\u00F9': sb.append("&ugrave;"); break;
        case '\u00D9': sb.append("&Ugrave;"); break;
        case '\u00FB': sb.append("&ucirc;"); break;
        case '\u00DB': sb.append("&Ucirc;"); break;
        case '\u00FC': sb.append("&uuml;"); break;
        case '\u00DC': sb.append("&Uuml;"); break;
        // Symbols.
        case '\u00AE': sb.append("&reg;"); break;
        case '\u00A9': sb.append("&copy;"); break;
        case '\u20AC': sb.append("&euro;"); break;
        // Be careful with this one: every plain space becomes a
        // non-breaking space, which prevents line wrapping in the output.
        case ' ': sb.append("&nbsp;"); break;
        default: sb.append(c); break;
      }
    }
    return sb.toString();
  }
} //&gt;&lt;</source>
extends HTMLEditorKit.ParserCallback
<source lang="java">
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;

import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Prints the text content of the web page whose URL is passed as the first
 * command-line argument.
 */
public class Main {

  /**
   * Downloads the page and feeds it to the Swing HTML parser.
   *
   * @param args {@code args[0]} is the URL of the page to read
   * @throws IllegalArgumentException if no URL was supplied
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    if (args.length < 1) {
      throw new IllegalArgumentException("Usage: java Main <url>");
    }
    URL url = new URL(args[0]);
    // openStream() always yields the raw byte stream, whereas getContent()
    // may return a non-stream object depending on the content type.
    InputStream in = url.openStream();
    try {
      Reader reader = new InputStreamReader(in);
      // ignoreCharSet = true: otherwise the parser throws
      // ChangedCharSetException as soon as the page declares its charset
      // in a META tag, and nothing after that point is parsed.
      new ParserDelegator().parse(reader, new TextOnly(), true);
    } finally {
      in.close();
    }
  }
}

/** Parser callback that prints every run of document text on its own line. */
class TextOnly extends HTMLEditorKit.ParserCallback {

  @Override
  public void handleText(char[] data, int pos) {
    System.out.println(data);
  }
}</source>
Extract links from an HTML page
<source lang="java">
import java.io.FileReader;
import java.util.ArrayList;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Collects the {@code href} value of every anchor tag found in the local
 * file {@code a.html} and prints the resulting list.
 */
public class Main {

  /**
   * Parses {@code a.html} from the working directory and prints its links.
   *
   * @param args ignored
   * @throws Exception if the file cannot be opened or read
   */
  public final static void main(String[] args) throws Exception {
    final ArrayList<String> list = new ArrayList<String>();

    // ParserCallback already provides empty implementations of every
    // handler, so only the one that is needed is overridden.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
        if (tag == Tag.A) {
          Object address = attribute.getAttribute(Attribute.HREF);
          // Named anchors (<a name="...">) carry no href; skip them
          // instead of storing null in the result.
          if (address != null) {
            list.add(address.toString());
          }
        }
      }
    };

    FileReader reader = new FileReader("a.html");
    try {
      // ignoreCharSet = true: a charset declaration in a META tag would
      // otherwise abort parsing with ChangedCharSetException.
      new ParserDelegator().parse(reader, parserCallback, true);
    } finally {
      reader.close();
    }
    System.out.println(list);
  }
}</source>
Find and display hyperlinks contained within a web page
<source lang="java">
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Finds the hyperlinks in the local file {@code a.htm} with a regular
 * expression and prints each link target on its own line.
 */
public class Main {

  /**
   * Matches an opening anchor tag and captures the double-quoted value of
   * its href attribute. {@code [^>]*?} keeps the match inside a single tag,
   * so several links on one line are all reported.
   */
  private static final Pattern LINK =
      Pattern.compile("<a\\b[^>]*?\\shref\\s*=\\s*\"([^\"]+)\"", Pattern.CASE_INSENSITIVE);

  /**
   * Reads {@code a.htm} from the working directory and prints its links.
   *
   * @param arguments ignored
   * @throws Exception if the file cannot be opened or read
   */
  public static void main(String[] arguments) throws Exception {
    StringBuilder output = new StringBuilder();
    BufferedReader buff = new BufferedReader(new FileReader("a.htm"));
    try {
      String line;
      while ((line = buff.readLine()) != null) {
        output.append(line).append('\n');
      }
    } finally {
      // Closing the BufferedReader also closes the wrapped FileReader.
      buff.close();
    }

    Matcher matcher = LINK.matcher(output.toString());
    while (matcher.find()) {
      System.out.println(matcher.group(1));
    }
  }
}</source>
Get all hyper links from a web page
<source lang="java">
import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.net.URL; import javax.swing.text.MutableAttributeSet; import javax.swing.text.html.HTML; import javax.swing.text.html.HTMLEditorKit; import javax.swing.text.html.parser.ParserDelegator; public class Main {
public static void main(String args[]) throws Exception { URL url = new URL(args[0]); Reader reader = new InputStreamReader((InputStream) url.getContent()); System.out.println("<HTML><HEAD><TITLE>Links for " + args[0] + "</TITLE>"); System.out.println("<BASE HREF=\"" + args[0] + "\"></HEAD>"); System.out.println("<BODY>"); new ParserDelegator().parse(reader, new LinkPage(), false); System.out.println("</BODY></HTML>"); }
} class LinkPage extends HTMLEditorKit.ParserCallback {
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { if (t == HTML.Tag.A) { System.out.println("
"); } }
}</source>
Getting the Links in an HTML Document
<source lang="java">
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;

import javax.swing.text.AttributeSet;
import javax.swing.text.EditorKit;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Loads a web page into an {@link HTMLDocument} and prints the target of
 * every anchor element that has an {@code href} attribute.
 */
public class Main {

  /**
   * Fetches the page, parses it and prints its links, one per line.
   *
   * @param argv ignored
   * @throws Exception if the page cannot be fetched or parsed
   */
  public static void main(String[] argv) throws Exception {
    URL url = new URI("http://www.google.ru").toURL();
    URLConnection conn = url.openConnection();
    Reader rd = new InputStreamReader(conn.getInputStream());
    try {
      EditorKit kit = new HTMLEditorKit();
      HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
      // Without this property the kit aborts with ChangedCharSetException
      // when the page declares its charset in a META tag.
      doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
      kit.read(rd, doc, 0);

      for (HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A); it.isValid(); it.next()) {
        // The iterator is declared to return AttributeSet; no cast to a
        // concrete implementation is needed to read an attribute.
        AttributeSet s = it.getAttributes();
        Object link = s.getAttribute(HTML.Attribute.HREF);
        if (link != null) {
          System.out.println(link);
        }
      }
    } finally {
      rd.close();
    }
  }
}</source>
Getting the Text in an HTML Document
<source lang="java">
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;

import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Prints the text of a web page by giving {@link HTMLEditorKit} a document
 * whose parser callback reports text runs instead of building a model.
 */
public class Main {

  /**
   * Fetches the page and prints each run of text on its own line.
   *
   * @param argv ignored
   * @throws Exception if the page cannot be fetched or parsed
   */
  public static void main(String[] argv) throws Exception {
    HTMLDocument doc = new HTMLDocument() {
      @Override
      public HTMLEditorKit.ParserCallback getReader(int pos) {
        // Replace the document-building reader with one that only prints.
        return new HTMLEditorKit.ParserCallback() {
          @Override
          public void handleText(char[] data, int pos) {
            System.out.println(data);
          }
        };
      }
    };
    // Without this property the kit aborts with ChangedCharSetException
    // when the page declares its charset in a META tag.
    doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

    URL url = new URI("http://www.google.ru").toURL();
    URLConnection conn = url.openConnection();
    Reader rd = new InputStreamReader(conn.getInputStream());
    try {
      EditorKit kit = new HTMLEditorKit();
      kit.read(rd, doc, 0);
    } finally {
      rd.close();
    }
  }
}</source>
HTML Parser
<source lang="java">
/*******************************************************************************
* Copyright (c) 2004 Actuate Corporation. * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * Contributors: * Actuate Corporation - initial API and implementation *******************************************************************************/
import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.LineNumberReader; import java.util.ArrayList; public class HTMLParser {
FileReader reader; LineNumberReader in; String token; ArrayList attribs = new ArrayList( ); int pushC = -1; private boolean ignoreWhitespace = true; public static final int EOF = -1; public static final int TEXT = 1; public static final int DOCTYPE = 2; public static final int ELEMENT = 3; public static final int COMMENT = 4; public static final int SPECIAL_ELEMENT = 5; public static final int START_ELEMENT = 0; public static final int END_ELEMENT = 1; public static final int SINGLE_ELEMENT = 2; public HTMLParser( ) { } public void open( String fileName ) throws FileNotFoundException { reader = new FileReader( fileName ); in = new LineNumberReader( reader ); } /** * */ public void close( ) { try { in.close( ); reader.close( ); } catch ( IOException e1 ) { // Ignore } } public String getTokenText( ) { return token; } public int getElementType( ) { if ( token.startsWith( "/" ) ) //$NON-NLS-1$ return END_ELEMENT; if ( token.endsWith( "/" ) ) //$NON-NLS-1$ return SINGLE_ELEMENT; return START_ELEMENT; } public String getElement( ) { if ( token.startsWith( "/" ) ) //$NON-NLS-1$ return token.substring( 1 ); if ( token.endsWith( "/" ) ) //$NON-NLS-1$ return token.substring( 0, token.length( ) - 1 ); return token; } public ArrayList getAttribs( ) { return attribs; } public String getAttrib( String name ) { for ( int i = 0; i < attribs.size( ); i++ ) { AttribPair a = (AttribPair) attribs.get( i ); if ( a.attrib.equalsIgnoreCase( name ) ) return a.value; } return null; } private int getC( ) { if ( pushC != -1 ) { int c = pushC; pushC = -1; return c; } try { return in.read( ); } catch ( IOException e ) { return EOF; } } private void pushC( int c ) { pushC = c; } public int getToken( ) { for ( ; ; ) { int c = getC( ); switch ( c ) { case -1: return EOF; case "<": return getElement( c ); default: { parseText( c ); if ( ! 
ignoreWhitespace || token.trim( ).length( ) > 0 ) return TEXT; } } } } private int parseText( int c ) { StringBuffer text = new StringBuffer( ); for ( ; ; ) { if ( c == EOF ) break; if ( c == "<" ) { pushC( c ); break; } // Convert MS-Word-style quotes. if ( c == 8220 || c == 8221 ) text.append( """ ); else text.append( (char) c ); c = getC( ); } token = text.toString( ); return TEXT; } private int skipSpace( int c ) { while ( c != EOF && Character.isWhitespace( (char)c ) ) { c = getC( ); } return c; } private int getElement( int c ) { c = getC( ); // Broken element if ( c == EOF ) return EOF; if ( c == "!" ) return getSpecialElement( ); attribs.clear( ); c = skipSpace( c ); if ( c == EOF ) return EOF; StringBuffer tag = new StringBuffer( ); if ( c == "/" ) { tag.append( (char) c ); c = skipSpace( getC( ) ); while ( c != EOF && c != ">" && ! Character.isWhitespace( (char)c ) ) { tag.append( (char) c ); c = getC( ); } token = tag.toString( ); for ( ; ; ) { if ( c == ">" || c == -1 ) break; c = getC( ); } return ELEMENT; } while ( c != EOF && c != ">" && c != "/" && ! Character.isWhitespace( (char)c ) ) { tag.append( (char) c ); c = getC( ); } if ( c == EOF ) { token = tag.toString( ); return ELEMENT; } for ( ; ; ) { c = skipSpace( c ); if ( c == EOF || c == ">" || c == "/" ) break; c = getAttrib( c ); } if ( c == "/" ) { tag.append( (char) c ); for ( ; ; ) { c = getC( ); if ( c == -1 || c == ">" ) break; } } token = tag.toString( ); return ELEMENT; } private int getAttrib( int c ) { AttribPair a = new AttribPair( ); StringBuffer s = new StringBuffer( ); while ( c != EOF && c != "=" && ! 
Character.isWhitespace( (char)c ) ) { s.append( (char) c ); c = getC( ); } a.attrib = s.toString( ); c = skipSpace( c ); if ( c != "=" ) { attribs.add( a ); return c; } s = new StringBuffer( ); c = skipSpace( getC( ) ); if ( c == "\"" || c == """ ) { int quote = c; for ( ; ; ) { c = getC( ); if ( c == -1 ) break; if ( c == quote ) { c = getC( ); break; } if ( c == "\\" ) { c = getC( ); if ( c == EOF ) break; s.append( "\\" ); s.append( (char) c ); } else { s.append( (char) c ); } } } else { for ( ; ; ) { c = getC( ); if ( c == -1 ) break; if ( c == ">" || c == "/" || Character.isWhitespace( (char)c ) ) { c = getC( ); break; } s.append( (char) c ); } } a.value = s.toString( ); attribs.add( a ); return c; } class AttribPair { String attrib; String value; } private int getSpecialElement( ) { StringBuffer text = new StringBuffer( ); text.append( "<!" ); //$NON-NLS-1$ for ( ; ; ) { int c = getC( ); if ( c == EOF || c == ">" ) break; text.append( (char) c ); } text.append( ">" ); token = text.toString( ); if ( token.startsWith( "
HTML parser based on HTMLEditorKit.ParserCallback
<source lang="java">
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Traces the parse events (start, simple and end tags, and text) of the web
 * page whose URL is passed as the first command-line argument.
 */
public class Main {

  /**
   * Downloads the page and feeds it to the Swing HTML parser.
   *
   * @param args {@code args[0]} is the URL of the page to parse
   * @throws IllegalArgumentException if no URL was supplied
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    if (args.length < 1) {
      throw new IllegalArgumentException("Usage: java Main <url>");
    }
    URL url = new URL(args[0]);
    // openStream() always yields the raw byte stream, whereas getContent()
    // may return a non-stream object depending on the content type.
    InputStream in = url.openStream();
    try {
      Reader reader = new InputStreamReader(in);
      // ignoreCharSet = true: otherwise the parser throws
      // ChangedCharSetException as soon as the page declares its charset
      // in a META tag, and nothing after that point is parsed.
      new ParserDelegator().parse(reader, new HTMLParse(), true);
    } finally {
      in.close();
    }
  }
}

/**
 * Parser callback that prints text as-is and tags with a one-character
 * prefix: {@code +} for a start tag, {@code *} for a simple tag and
 * {@code -} for an end tag.
 */
class HTMLParse extends HTMLEditorKit.ParserCallback {

  @Override
  public void handleText(char[] data, int pos) {
    System.out.println(data);
  }

  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    System.out.println("+" + t.toString());
  }

  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    System.out.println("*" + t.toString());
  }

  @Override
  public void handleEndTag(HTML.Tag t, int pos) {
    System.out.println("-" + t.toString());
  }
}</source>
Using javax.swing.text.html.HTMLEditorKit to parse html document
<source lang="java">
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Collects every run of text found in the local file {@code a.html} and
 * prints the resulting list.
 */
public class Main {

  /**
   * Parses {@code a.html} from the working directory and prints its text.
   *
   * @param args ignored
   * @throws Exception if the file cannot be opened or read
   */
  public static void main(String[] args) throws Exception {
    final List<String> list = new ArrayList<String>();

    // ParserCallback already provides empty implementations of every
    // handler, so only the one that is needed is overridden.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleText(final char[] data, final int pos) {
        list.add(new String(data));
      }
    };

    Reader reader = new FileReader("a.html");
    try {
      new ParserDelegator().parse(reader, parserCallback, true);
    } finally {
      reader.close();
    }
    System.out.println(list);
  }
}</source>