Java/Network Protocol/HTML Parser
Содержание
- 1 Escape HTML special characters from a String
- 2 extends HTMLEditorKit.ParserCallback
- 3 Extract links from an HTML page
- 4 Find and display hyperlinks contained within a web page
- 5 Get all hyper links from a web page
- 6 Getting the Links in an HTML Document
- 7 Getting the Text in an HTML Document
- 8 HTML parser based on HTMLEditorKit.ParserCallback
- 9 Use regular expression to get web page title
- 10 Using javax.swing.text.html.HTMLEditorKit to parse html document
Escape HTML special characters from a String
/**
 * Demonstrates escaping of HTML markup characters and a fixed set of accented
 * letters and symbols into named HTML character entities.
 */
public class Main {
  /**
   * Prints the escaped form of the sample string {@code "><"}.
   *
   * @param argv ignored
   */
  public static void main(String[] argv) {
    System.out.println(escapeHTML("><"));
  }

  /**
   * Returns a copy of {@code s} in which every character that has a mapping in
   * {@link #entityFor(char)} is replaced by its named entity
   * ({@code &name;}); all other characters are copied unchanged.
   *
   * <p>Note that a plain space is written as {@code &nbsp;}, so the result is
   * meant for display text rather than for attribute values or URLs.
   *
   * @param s the text to escape; must not be {@code null}
   * @return the escaped text, never {@code null}
   * @throws NullPointerException if {@code s} is {@code null}
   */
  public static final String escapeHTML(String s) {
    int n = s.length();
    StringBuilder sb = new StringBuilder(n + 16);
    for (int i = 0; i < n; i++) {
      char c = s.charAt(i);
      String entity = entityFor(c);
      if (entity == null) {
        sb.append(c);
      } else {
        sb.append('&').append(entity).append(';');
      }
    }
    return sb.toString();
  }

  /**
   * Looks up the entity name for one character.
   *
   * <p>Non-ASCII labels are written as Unicode escapes so the mapping does not
   * depend on the encoding used to store or compile this file.
   *
   * @param c the character to look up
   * @return the entity name without the surrounding {@code &} and {@code ;},
   *     or {@code null} when the character is emitted as is
   */
  private static String entityFor(char c) {
    switch (c) {
      // Markup-significant characters.
      case '<': return "lt";
      case '>': return "gt";
      case '&': return "amp";
      case '"': return "quot";
      // Accented letters (lower case first, then upper case).
      case '\u00e0': return "agrave";
      case '\u00c0': return "Agrave";
      case '\u00e2': return "acirc";
      case '\u00c2': return "Acirc";
      case '\u00e4': return "auml";
      case '\u00c4': return "Auml";
      case '\u00e5': return "aring";
      case '\u00c5': return "Aring";
      case '\u00e6': return "aelig";
      case '\u00c6': return "AElig";
      case '\u00e7': return "ccedil";
      case '\u00c7': return "Ccedil";
      case '\u00e9': return "eacute";
      case '\u00c9': return "Eacute";
      case '\u00e8': return "egrave";
      case '\u00c8': return "Egrave";
      case '\u00ea': return "ecirc";
      case '\u00ca': return "Ecirc";
      case '\u00eb': return "euml";
      case '\u00cb': return "Euml";
      case '\u00ef': return "iuml";
      case '\u00cf': return "Iuml";
      case '\u00f4': return "ocirc";
      case '\u00d4': return "Ocirc";
      case '\u00f6': return "ouml";
      case '\u00d6': return "Ouml";
      case '\u00f8': return "oslash";
      case '\u00d8': return "Oslash";
      case '\u00df': return "szlig";
      case '\u00f9': return "ugrave";
      case '\u00d9': return "Ugrave";
      case '\u00fb': return "ucirc";
      case '\u00db': return "Ucirc";
      case '\u00fc': return "uuml";
      case '\u00dc': return "Uuml";
      // Symbols: registered sign, copyright sign, euro sign.
      case '\u00ae': return "reg";
      case '\u00a9': return "copy";
      case '\u20ac': return "euro";
      // Every plain space becomes a non-breaking space.
      case ' ': return "nbsp";
      default: return null;
    }
  }
}
// Output: &gt;&lt;
extends HTMLEditorKit.ParserCallback
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Downloads the HTML page named on the command line and feeds it to the Swing
 * HTML parser with a {@code TextOnly} callback.
 */
public class Main {
  /**
   * Parses the page at {@code args[0]}.
   *
   * @param args {@code args[0]} is the URL of the page to parse
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    URL url = new URL(args[0]);
    // openStream() always yields an InputStream; getContent() may return another
    // type for non-HTML content, which made the original cast fail.
    Reader reader = new InputStreamReader(url.openStream());
    try {
      // The stream is already decoded to characters, so ignore any charset
      // directive in the page; with ignoreCharSet=false the parser aborts with
      // ChangedCharSetException on pages that declare one.
      new ParserDelegator().parse(reader, new TextOnly(), true);
    } finally {
      reader.close();
    }
  }
}
/** Parser callback that writes each chunk of document text to standard output. */
class TextOnly extends HTMLEditorKit.ParserCallback {
  /**
   * Prints one chunk of text followed by a line break.
   *
   * @param data the characters of the text chunk
   * @param pos position reported by the parser; not used here
   */
  @Override
  public void handleText(char[] data, int pos) {
    String chunk = String.valueOf(data);
    System.out.println(chunk);
  }
}
Extract links from an HTML page
import java.io.FileReader;
import java.util.ArrayList;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Collects the {@code href} value of every {@code <a>} start tag found in the
 * file {@code a.html} and prints the resulting list.
 */
public class Main {
  /**
   * Parses {@code a.html} from the working directory and prints its links.
   *
   * @param args ignored
   * @throws Exception if the file cannot be opened or read
   */
  public final static void main(String[] args) throws Exception {
    final ArrayList<String> list = new ArrayList<String>();
    // ParserCallback's handlers are all no-ops by default, so only the one
    // that matters is overridden.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
        if (tag == Tag.A) {
          Object address = attribute.getAttribute(Attribute.HREF);
          // Anchors such as <a name="..."> carry no href; skip them instead of
          // storing null in the result.
          if (address != null) {
            list.add(address.toString());
          }
        }
      }
    };
    FileReader reader = new FileReader("a.html");
    try {
      // The reader already supplies decoded characters, so a charset directive
      // inside the file is ignored rather than aborting the parse.
      new ParserDelegator().parse(reader, parserCallback, true);
    } finally {
      reader.close();
    }
    System.out.println(list);
  }
}
Find and display hyperlinks contained within a web page
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Prints the target of every hyperlink found in a local HTML file, using a
 * regular expression rather than a full HTML parser.
 */
public class Main {
  /**
   * Matches an {@code <a ...>} start tag and captures its double-quoted
   * {@code href} value. The attribute search is confined to a single tag
   * ({@code [^>]*?}) so that several links on one line are each reported.
   */
  private static final Pattern ANCHOR_HREF = Pattern.compile("<a\\s[^>]*?href=\"(.+?)\"");

  /**
   * Loads the file named by {@code arguments[0]} and prints one link per line.
   *
   * @param arguments {@code arguments[0]} is the path of the HTML file
   * @throws Exception if the file cannot be read
   */
  public static void main(String[] arguments) throws Exception {
    String page = loadPage(arguments[0]);
    Matcher matcher = ANCHOR_HREF.matcher(page);
    while (matcher.find()) {
      System.out.println(matcher.group(1));
    }
  }

  /**
   * Reads a whole text file into a string.
   *
   * @param name path of the file to read
   * @return the file content, with every line terminated by {@code "\n"}
   * @throws Exception if the file cannot be opened or read
   */
  static String loadPage(String name) throws Exception {
    StringBuilder output = new StringBuilder();
    BufferedReader buff = new BufferedReader(new FileReader(name));
    try {
      String line;
      while ((line = buff.readLine()) != null) {
        output.append(line).append('\n');
      }
    } finally {
      // Close even when readLine fails part-way through.
      buff.close();
    }
    return output.toString();
  }
}
Get all hyper links from a web page
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Writes to standard output an HTML page for the URL given on the command
 * line; the page body is produced by a {@code LinkPage} parser callback.
 */
public class Main {
  /**
   * Downloads {@code args[0]}, prints the page header, lets the callback
   * print the body while the source page is parsed, then prints the footer.
   *
   * @param args {@code args[0]} is the URL of the page to scan
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    URL url = new URL(args[0]);
    // openStream() always yields an InputStream; getContent() may return another
    // type for non-HTML content, which made the original cast fail.
    Reader reader = new InputStreamReader(url.openStream());
    try {
      System.out.println("<HTML><HEAD><TITLE>Links for " + args[0] + "</TITLE>");
      System.out.println("<BASE HREF=\"" + args[0] + "\"></HEAD>");
      System.out.println("<BODY>");
      // Ignore charset directives: the stream is already decoded, and with
      // ignoreCharSet=false the parser aborts with ChangedCharSetException.
      new ParserDelegator().parse(reader, new LinkPage(), true);
      System.out.println("</BODY></HTML>");
    } finally {
      reader.close();
    }
  }
}
/**
 * Parser callback that prints one HTML anchor line per hyperlink encountered
 * in the parsed document.
 */
class LinkPage extends HTMLEditorKit.ParserCallback {
  /**
   * Prints {@code <A HREF="target">target</A><BR>} for every {@code <a>} start
   * tag that has an {@code href} attribute. Other tags, and anchors without an
   * {@code href}, produce no output.
   *
   * @param t the tag that was opened
   * @param a the attributes of the tag
   * @param pos position reported by the parser; not used here
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    if (t != HTML.Tag.A) {
      return;
    }
    Object href = a.getAttribute(HTML.Attribute.HREF);
    if (href == null) {
      return;
    }
    // The value comes from a remote page and is written back into HTML, so
    // markup-significant characters must be escaped.
    String target = escape(href.toString());
    System.out.println("<A HREF=\"" + target + "\">" + target + "</A><BR>");
  }

  /** Escapes the four characters that are significant in HTML text and attributes. */
  private static String escape(String text) {
    return text.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace("\"", "&quot;");
  }
}
Getting the Links in an HTML Document
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.EditorKit;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Loads a web page into an {@code HTMLDocument} and prints the {@code href}
 * of every anchor found by the document's tag iterator.
 */
public class Main {
  /**
   * Fetches the page, builds the document and lists its links.
   *
   * @param argv ignored
   * @throws Exception if the page cannot be fetched or read
   */
  public static void main(String[] argv) throws Exception {
    URL url = new URI("http://www.google.ru").toURL();
    URLConnection conn = url.openConnection();
    Reader rd = new InputStreamReader(conn.getInputStream());
    EditorKit kit = new HTMLEditorKit();
    HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
    // The reader already delivers decoded characters; without this property
    // the kit aborts with ChangedCharSetException when the page declares a
    // charset.
    doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
    try {
      kit.read(rd, doc, 0);
    } finally {
      rd.close();
    }
    for (HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A); it.isValid(); it.next()) {
      // getAttribute is declared on AttributeSet, so no downcast to a concrete
      // attribute-set class is needed.
      Object link = it.getAttributes().getAttribute(HTML.Attribute.HREF);
      if (link != null) {
        System.out.println(link);
      }
    }
  }
}
Getting the Text in an HTML Document
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Prints the text content of a web page by reading it into an
 * {@code HTMLDocument} whose reader callback only reports text.
 */
public class Main {
  /**
   * Fetches the page and prints each text chunk on its own line.
   *
   * @param argv ignored
   * @throws Exception if the page cannot be fetched or read
   */
  public static void main(String[] argv) throws Exception {
    // The document hands the kit a callback that prints text chunks; no other
    // parser event is handled.
    HTMLDocument doc = new HTMLDocument() {
      @Override
      public HTMLEditorKit.ParserCallback getReader(int pos) {
        return new HTMLEditorKit.ParserCallback() {
          @Override
          public void handleText(char[] data, int pos) {
            System.out.println(data);
          }
        };
      }
    };
    // The reader already delivers decoded characters; without this property
    // the kit aborts with ChangedCharSetException when the page declares a
    // charset.
    doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
    URL url = new URI("http://www.google.ru").toURL();
    URLConnection conn = url.openConnection();
    Reader rd = new InputStreamReader(conn.getInputStream());
    try {
      EditorKit kit = new HTMLEditorKit();
      kit.read(rd, doc, 0);
    } finally {
      rd.close();
    }
  }
}
HTML parser based on HTMLEditorKit.ParserCallback
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Downloads the HTML page named on the command line and feeds it to the Swing
 * HTML parser with an {@code HTMLParse} callback.
 */
public class Main {
  /**
   * Parses the page at {@code args[0]}.
   *
   * @param args {@code args[0]} is the URL of the page to parse
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    URL url = new URL(args[0]);
    // openStream() always yields an InputStream; getContent() may return another
    // type for non-HTML content, which made the original cast fail.
    Reader reader = new InputStreamReader(url.openStream());
    try {
      // Ignore charset directives: the stream is already decoded, and with
      // ignoreCharSet=false the parser aborts with ChangedCharSetException.
      new ParserDelegator().parse(reader, new HTMLParse(), true);
    } finally {
      reader.close();
    }
  }
}
/**
 * Parser callback that traces parser events on standard output: text chunks
 * are printed as they are, tags are printed with a one-character marker.
 */
class HTMLParse extends HTMLEditorKit.ParserCallback {
  /** Prints a text chunk followed by a line break. */
  @Override
  public void handleText(char[] data, int pos) {
    String chunk = String.valueOf(data);
    System.out.println(chunk);
  }

  /** Prints {@code +} followed by the name of the opened tag. */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    trace('+', t);
  }

  /** Prints {@code *} followed by the name of the simple (empty) tag. */
  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    trace('*', t);
  }

  /** Prints {@code -} followed by the name of the closed tag. */
  @Override
  public void handleEndTag(HTML.Tag t, int pos) {
    trace('-', t);
  }

  /** Writes one marker character and the tag's string form on a line of its own. */
  private static void trace(char marker, HTML.Tag tag) {
    System.out.println(marker + tag.toString());
  }
}
Use regular expression to get web page title
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a web page and prints the content of each {@code <title>} element
 * found with a regular expression.
 */
public class Main {
  /** Captures the text between a lower-case {@code <title>} tag pair. */
  private static final Pattern TITLE = Pattern.compile("<title>(.*?)</title>");

  /**
   * Fetches the page, collapses its whitespace and prints every title match.
   *
   * @param argv ignored
   * @throws Exception if the page cannot be fetched or read
   */
  public static void main(String[] argv) throws Exception {
    URL url = new URL("http://www.java.ru/");
    URLConnection urlConnection = url.openConnection();
    StringBuilder page = new StringBuilder();
    // Read the response as text lines. DataInputStream.readUTF() expects
    // length-prefixed modified UTF-8 and signals end of input with
    // EOFException, never null, so it cannot be used for an HTML response.
    BufferedReader in =
        new BufferedReader(new InputStreamReader(urlConnection.getInputStream()));
    try {
      String line;
      while ((line = in.readLine()) != null) {
        page.append(' ').append(line);
      }
    } finally {
      in.close();
    }
    // Collapse whitespace so a title spanning several lines matches on one.
    String html = page.toString().replaceAll("\\s+", " ");
    Matcher m = TITLE.matcher(html);
    while (m.find()) {
      System.out.println(m.group(1));
    }
  }
}
Using javax.swing.text.html.HTMLEditorKit to parse html document
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Collects every text chunk of the file {@code a.html} with the Swing HTML
 * parser and prints the resulting list.
 */
public class Main {
  /**
   * Parses {@code a.html} from the working directory and prints its text chunks.
   *
   * @param args ignored
   * @throws Exception if the file cannot be opened or read
   */
  public static void main(String[] args) throws Exception {
    final List<String> list = new ArrayList<String>();
    // ParserCallback's handlers are all no-ops by default, so only the text
    // handler is overridden.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleText(final char[] data, final int pos) {
        list.add(new String(data));
      }
    };
    Reader reader = new FileReader("a.html");
    try {
      new ParserDelegator().parse(reader, parserCallback, true);
    } finally {
      // Release the file handle even when parsing fails.
      reader.close();
    }
    System.out.println(list);
  }
}