Java/Network Protocol/HTML Parser
Содержание
- 1 Escape HTML special characters from a String
- 2 extends HTMLEditorKit.ParserCallback
- 3 Extract links from an HTML page
- 4 Find and display hyperlinks contained within a web page
- 5 Get all hyper links from a web page
- 6 Getting the Links in an HTML Document
- 7 Getting the Text in an HTML Document
- 8 HTML parser based on HTMLEditorKit.ParserCallback
- 9 Use regular expression to get web page title
- 10 Using javax.swing.text.html.HTMLEditorKit to parse html document
Escape HTML special characters from a String
<source lang="java">
/**
 * Demonstrates replacing HTML special characters and a set of accented
 * Latin letters and symbols with their named character entities.
 */
public class Main {
  public static void main(String[] argv) {
    System.out.println(escapeHTML("><"));
  }

  /**
   * Returns a copy of {@code s} in which every character that has an entry
   * in the table below is replaced by its named HTML entity; all other
   * characters are copied unchanged.
   *
   * @param s the text to escape; must not be {@code null}
   * @return the escaped text
   * @throws NullPointerException if {@code s} is {@code null}
   */
  public static final String escapeHTML(String s) {
    int n = s.length();
    StringBuilder sb = new StringBuilder(n + 16);
    for (int i = 0; i < n; i++) {
      char c = s.charAt(i);
      // Non-ASCII labels are written as Unicode escapes so that the source
      // compiles identically whatever encoding the file is saved in.
      switch (c) {
        case '<': sb.append("&lt;"); break;
        case '>': sb.append("&gt;"); break;
        case '&': sb.append("&amp;"); break;
        case '"': sb.append("&quot;"); break;
        case '\u00e0': sb.append("&agrave;"); break;
        case '\u00c0': sb.append("&Agrave;"); break;
        case '\u00e2': sb.append("&acirc;"); break;
        case '\u00c2': sb.append("&Acirc;"); break;
        case '\u00e4': sb.append("&auml;"); break;
        case '\u00c4': sb.append("&Auml;"); break;
        case '\u00e5': sb.append("&aring;"); break;
        case '\u00c5': sb.append("&Aring;"); break;
        case '\u00e6': sb.append("&aelig;"); break;
        case '\u00c6': sb.append("&AElig;"); break;
        case '\u00e7': sb.append("&ccedil;"); break;
        case '\u00c7': sb.append("&Ccedil;"); break;
        case '\u00e9': sb.append("&eacute;"); break;
        case '\u00c9': sb.append("&Eacute;"); break;
        case '\u00e8': sb.append("&egrave;"); break;
        case '\u00c8': sb.append("&Egrave;"); break;
        case '\u00ea': sb.append("&ecirc;"); break;
        case '\u00ca': sb.append("&Ecirc;"); break;
        case '\u00eb': sb.append("&euml;"); break;
        case '\u00cb': sb.append("&Euml;"); break;
        case '\u00ef': sb.append("&iuml;"); break;
        case '\u00cf': sb.append("&Iuml;"); break;
        case '\u00f4': sb.append("&ocirc;"); break;
        case '\u00d4': sb.append("&Ocirc;"); break;
        case '\u00f6': sb.append("&ouml;"); break;
        case '\u00d6': sb.append("&Ouml;"); break;
        case '\u00f8': sb.append("&oslash;"); break;
        case '\u00d8': sb.append("&Oslash;"); break;
        case '\u00df': sb.append("&szlig;"); break;
        case '\u00f9': sb.append("&ugrave;"); break;
        case '\u00d9': sb.append("&Ugrave;"); break;
        case '\u00fb': sb.append("&ucirc;"); break;
        case '\u00db': sb.append("&Ucirc;"); break;
        case '\u00fc': sb.append("&uuml;"); break;
        case '\u00dc': sb.append("&Uuml;"); break;
        case '\u00ae': sb.append("&reg;"); break;
        case '\u00a9': sb.append("&copy;"); break;
        case '\u20ac': sb.append("&euro;"); break;
        // NOTE(review): every plain space becomes a non-breaking space, which
        // disables line wrapping in the rendered text. Presumably intended by
        // the original listing (an identity mapping would be pointless) --
        // confirm before reusing this method for ordinary body text.
        case ' ': sb.append("&nbsp;"); break;
        default: sb.append(c); break;
      }
    }
    return sb.toString();
  }
}
// Output: &gt;&lt;
</source>
extends HTMLEditorKit.ParserCallback
<source lang="java">
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Prints the text content of the HTML page whose URL is passed as the
 * first command-line argument.
 */
public class Main {
  /**
   * @param args {@code args[0]} is the URL of the page to parse
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    if (args.length < 1) {
      System.err.println("Usage: java Main <url>");
      return;
    }
    URL url = new URL(args[0]);
    // openStream() always yields a stream; getContent() may hand back some
    // other object chosen by a content handler, which made the cast unsafe.
    Reader reader = new InputStreamReader(url.openStream());
    try {
      new ParserDelegator().parse(reader, new TextOnly(), false);
    } finally {
      reader.close();
    }
  }
}

/** Parser callback that writes each run of document text to standard output. */
class TextOnly extends HTMLEditorKit.ParserCallback {
  @Override
  public void handleText(char[] data, int pos) {
    System.out.println(data);
  }
}
</source>
Extract links from an HTML page
<source lang="java">
import java.io.FileReader;
import java.util.ArrayList;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Collects the {@code href} value of every anchor in the file
 * {@code a.html} (resolved against the working directory) and prints the
 * resulting list.
 */
public class Main {
  /**
   * @param args unused
   * @throws Exception if {@code a.html} cannot be opened or read
   */
  public final static void main(String[] args) throws Exception {
    final ArrayList<String> list = new ArrayList<String>();
    // ParserCallback already supplies empty implementations of every
    // handler, so only the one that does work is overridden.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
        if (tag == Tag.A) {
          Object address = attribute.getAttribute(Attribute.HREF);
          // Anchors without an href (e.g. named anchors) are not links;
          // skip them instead of recording null.
          if (address != null) {
            list.add(address.toString());
          }
        }
      }
    };
    FileReader reader = new FileReader("a.html");
    try {
      new ParserDelegator().parse(reader, parserCallback, false);
    } finally {
      reader.close();
    }
    System.out.println(list);
  }
}
</source>
Find and display hyperlinks contained within a web page
<source lang="java">
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Prints the {@code href} value of each {@code <a ...>} tag found in the
 * file named by the first command-line argument.
 */
public class Main {
  /**
   * @param arguments {@code arguments[0]} is the path of the HTML file
   * @throws Exception if the file cannot be read
   */
  public static void main(String[] arguments) throws Exception {
    if (arguments.length < 1) {
      System.err.println("Usage: java Main <file>");
      return;
    }
    String page = loadPage(arguments[0]);
    // The match is confined to a single tag ([^>]) and is reluctant, so
    // several links on one line are each reported; the optional group
    // requires whitespace before "href" so attributes that merely end in
    // "href" are not mistaken for it.
    Pattern pattern = Pattern.compile("<a\\s+(?:[^>]*?\\s)?href=\"([^\"]+)\"");
    Matcher matcher = pattern.matcher(page);
    while (matcher.find()) {
      System.out.println(matcher.group(1));
    }
  }

  /**
   * Reads a whole text file into a string.
   *
   * @param name path of the file to read
   * @return the file's lines, each terminated by {@code "\n"}
   * @throws Exception if the file cannot be opened or read
   */
  static String loadPage(String name) throws Exception {
    StringBuilder output = new StringBuilder();
    BufferedReader buff = new BufferedReader(new FileReader(name));
    try {
      String line;
      while ((line = buff.readLine()) != null) {
        output.append(line).append('\n');
      }
    } finally {
      // Close even when readLine() fails part-way through.
      buff.close();
    }
    return output.toString();
  }
}
</source>
Get all hyper links from a web page
<source lang="java">
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Writes to standard output a small HTML page that lists the hyperlinks of
 * the page whose URL is passed as the first command-line argument.
 */
public class Main {
  /**
   * @param args {@code args[0]} is the URL of the page to scan
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    if (args.length < 1) {
      System.err.println("Usage: java Main <url>");
      return;
    }
    URL url = new URL(args[0]);
    String shown = LinkPage.escape(args[0]);
    // openStream() always yields a stream; getContent() may return another
    // kind of object, which made the original cast unsafe.
    Reader reader = new InputStreamReader(url.openStream());
    try {
      System.out.println("<HTML><HEAD><TITLE>Links for " + shown + "</TITLE>");
      System.out.println("<BASE HREF=\"" + shown + "\"></HEAD>");
      System.out.println("<BODY>");
      new ParserDelegator().parse(reader, new LinkPage(), false);
      System.out.println("</BODY></HTML>");
    } finally {
      reader.close();
    }
  }
}

/** Parser callback that prints one clickable line per anchor that has an href. */
class LinkPage extends HTMLEditorKit.ParserCallback {
  // NOTE(review): the markup printed here was lost when this listing was
  // rendered as HTML (the string literal arrived empty and split over two
  // lines). The anchor-plus-line-break form below is a reconstruction --
  // confirm against the original example if exact output matters.
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    if (t == HTML.Tag.A) {
      Object href = a.getAttribute(HTML.Attribute.HREF);
      if (href != null) {
        String link = escape(href.toString());
        System.out.println("<A HREF=\"" + link + "\">" + link + "</A><BR>");
      }
    }
  }

  /**
   * Escapes the four characters that are significant in HTML text and
   * double-quoted attribute values, so that text taken from the remote
   * page cannot break out of the generated markup.
   *
   * @param text the raw text
   * @return {@code text} with {@code & < > "} replaced by entities
   */
  static String escape(String text) {
    StringBuilder out = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
      char ch = text.charAt(i);
      switch (ch) {
        case '&': out.append("&amp;"); break;
        case '<': out.append("&lt;"); break;
        case '>': out.append("&gt;"); break;
        case '"': out.append("&quot;"); break;
        default: out.append(ch); break;
      }
    }
    return out.toString();
  }
}
</source>
Getting the Links in an HTML Document
<source lang="java">
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.AttributeSet;
import javax.swing.text.EditorKit;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Loads a web page into an {@link HTMLDocument} and prints the
 * {@code href} of every anchor element that has one.
 */
public class Main {
  /**
   * @param argv unused
   * @throws Exception if the page cannot be fetched or parsed
   */
  public static void main(String[] argv) throws Exception {
    URL url = new URI("http://www.google.ru").toURL();
    URLConnection conn = url.openConnection();
    EditorKit kit = new HTMLEditorKit();
    HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
    // Without this property the kit aborts with ChangedCharSetException as
    // soon as the page declares its charset in a <meta> tag.
    doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
    Reader rd = new InputStreamReader(conn.getInputStream());
    try {
      kit.read(rd, doc, 0);
    } finally {
      rd.close();
    }
    for (HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A); it.isValid(); it.next()) {
      // Use the declared return type; a downcast to a concrete attribute-set
      // class is not needed to look up a value.
      AttributeSet s = it.getAttributes();
      Object link = s.getAttribute(HTML.Attribute.HREF);
      if (link != null) {
        System.out.println(link);
      }
    }
  }
}
</source>
Getting the Text in an HTML Document
<source lang="java">
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Prints the text of a web page by substituting the document's parser
 * callback with one that writes each run of text to standard output.
 */
public class Main {
  /**
   * @param argv unused
   * @throws Exception if the page cannot be fetched or parsed
   */
  public static void main(String[] argv) throws Exception {
    HTMLDocument doc = new HTMLDocument() {
      // Hand the kit a callback that only reports text.
      @Override
      public HTMLEditorKit.ParserCallback getReader(int pos) {
        return new HTMLEditorKit.ParserCallback() {
          @Override
          public void handleText(char[] data, int pos) {
            System.out.println(data);
          }
        };
      }
    };
    // Without this property the kit aborts with ChangedCharSetException as
    // soon as the page declares its charset in a <meta> tag.
    doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

    URL url = new URI("http://www.google.ru").toURL();
    URLConnection conn = url.openConnection();
    EditorKit kit = new HTMLEditorKit();
    Reader rd = new InputStreamReader(conn.getInputStream());
    try {
      kit.read(rd, doc, 0);
    } finally {
      rd.close();
    }
  }
}
</source>
HTML parser based on HTMLEditorKit.ParserCallback
<source lang="java">
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Parses the HTML page whose URL is passed as the first command-line
 * argument and traces the parser events on standard output.
 */
public class Main {
  /**
   * @param args {@code args[0]} is the URL of the page to parse
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    if (args.length < 1) {
      System.err.println("Usage: java Main <url>");
      return;
    }
    URL url = new URL(args[0]);
    // openStream() always yields a stream; getContent() may return another
    // kind of object, which made the original cast unsafe.
    Reader reader = new InputStreamReader(url.openStream());
    try {
      new ParserDelegator().parse(reader, new HTMLParse(), false);
    } finally {
      reader.close();
    }
  }
}

/**
 * Parser callback that prints text as-is and tag names with a one-character
 * prefix: {@code +} for a start tag, {@code *} for a simple tag and
 * {@code -} for an end tag.
 */
class HTMLParse extends HTMLEditorKit.ParserCallback {
  @Override
  public void handleText(char[] data, int pos) {
    System.out.println(data);
  }

  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    System.out.println("+" + t.toString());
  }

  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    System.out.println("*" + t.toString());
  }

  @Override
  public void handleEndTag(HTML.Tag t, int pos) {
    System.out.println("-" + t.toString());
  }
}
</source>
Use regular expression to get web page title
<source lang="java">
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads a web page and prints the content of each
 * {@code <title>...</title>} element found in it.
 */
public class Main {
  /**
   * @param argv unused
   * @throws Exception if the page cannot be fetched or read
   */
  public static void main(String[] argv) throws Exception {
    URL url = new URL("http://www.java.ru/");
    URLConnection urlConnection = url.openConnection();

    // Read the page as text, line by line. DataInputStream.readUTF() is not
    // suitable here: it decodes a length-prefixed binary format and signals
    // end of input with EOFException rather than by returning null.
    // NOTE(review): assumes the page is UTF-8 encoded -- TODO honour the
    // charset announced in the Content-Type header.
    StringBuilder html = new StringBuilder();
    BufferedReader in =
        new BufferedReader(new InputStreamReader(urlConnection.getInputStream(), "UTF-8"));
    try {
      String tmp;
      while ((tmp = in.readLine()) != null) {
        html.append(' ').append(tmp);
      }
    } finally {
      in.close();
    }

    // Collapse all whitespace so a title that spans lines still matches.
    String flat = html.toString().replaceAll("\\s+", " ");
    // Tag names are case-insensitive in HTML, so accept <TITLE> as well.
    Pattern p = Pattern.compile("<title>(.*?)</title>", Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(flat);
    while (m.find()) {
      System.out.println(m.group(1));
    }
  }
}
</source>
Using javax.swing.text.html.HTMLEditorKit to parse html document
<source lang="java">
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Collects every run of text in the file {@code a.html} (resolved against
 * the working directory) and prints the resulting list.
 */
public class Main {
  /**
   * @param args unused
   * @throws Exception if {@code a.html} cannot be opened or read
   */
  public static void main(String[] args) throws Exception {
    final List<String> list = new ArrayList<String>();
    // ParserCallback already supplies empty implementations of every
    // handler, so only the one that does work is overridden.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleText(final char[] data, final int pos) {
        list.add(new String(data));
      }
    };
    Reader reader = new FileReader("a.html");
    try {
      // Third argument true: charset directives in the document are ignored.
      new ParserDelegator().parse(reader, parserCallback, true);
    } finally {
      reader.close();
    }
    System.out.println(list);
  }
}
</source>