Java/Network Protocol/HTML Parser

Материал из Java эксперт
Перейти к: навигация, поиск

Escape HTML special characters from a String

  
/**
 * Demonstrates replacing HTML special characters and a fixed set of accented
 * Latin letters and symbols with their named HTML entities.
 */
public class Main {
  /**
   * Prints the escaped form of the sample string {@code "><"}.
   *
   * @param argv command-line arguments (ignored)
   */
  public static void main(String[] argv){
    System.out.println(escapeHTML("><"));
  }

  /**
   * Returns a copy of {@code s} in which every character that has a mapping
   * below is replaced by its named HTML entity; all other characters are
   * copied through unchanged.
   *
   * <p>Non-ASCII characters are written as Unicode escapes so that the source
   * stays correct regardless of the encoding the file is saved or compiled in.
   * Note that every plain space is turned into a non-breaking space entity.
   *
   * @param s the text to escape; must not be {@code null}
   * @return the escaped text (empty when {@code s} is empty)
   * @throws NullPointerException if {@code s} is {@code null}
   */
  public static final String escapeHTML(String s) {
    int n = s.length();
    // Entities are longer than the characters they replace, so leave some headroom.
    StringBuilder sb = new StringBuilder(n + 16);
    for (int i = 0; i < n; i++) {
      char c = s.charAt(i);
      switch (c) {
      // Markup-significant characters.
      case '<':
        sb.append("&lt;");
        break;
      case '>':
        sb.append("&gt;");
        break;
      case '&':
        sb.append("&amp;");
        break;
      case '"':
        sb.append("&quot;");
        break;
      // Accented letters, lower case followed by upper case.
      case '\u00e0': // a with grave
        sb.append("&agrave;");
        break;
      case '\u00c0': // A with grave
        sb.append("&Agrave;");
        break;
      case '\u00e2': // a with circumflex
        sb.append("&acirc;");
        break;
      case '\u00c2': // A with circumflex
        sb.append("&Acirc;");
        break;
      case '\u00e4': // a with diaeresis
        sb.append("&auml;");
        break;
      case '\u00c4': // A with diaeresis
        sb.append("&Auml;");
        break;
      case '\u00e5': // a with ring
        sb.append("&aring;");
        break;
      case '\u00c5': // A with ring
        sb.append("&Aring;");
        break;
      case '\u00e6': // ae ligature
        sb.append("&aelig;");
        break;
      case '\u00c6': // AE ligature
        sb.append("&AElig;");
        break;
      case '\u00e7': // c with cedilla
        sb.append("&ccedil;");
        break;
      case '\u00c7': // C with cedilla
        sb.append("&Ccedil;");
        break;
      case '\u00e9': // e with acute
        sb.append("&eacute;");
        break;
      case '\u00c9': // E with acute
        sb.append("&Eacute;");
        break;
      case '\u00e8': // e with grave
        sb.append("&egrave;");
        break;
      case '\u00c8': // E with grave
        sb.append("&Egrave;");
        break;
      case '\u00ea': // e with circumflex
        sb.append("&ecirc;");
        break;
      case '\u00ca': // E with circumflex
        sb.append("&Ecirc;");
        break;
      case '\u00eb': // e with diaeresis
        sb.append("&euml;");
        break;
      case '\u00cb': // E with diaeresis
        sb.append("&Euml;");
        break;
      case '\u00ef': // i with diaeresis
        sb.append("&iuml;");
        break;
      case '\u00cf': // I with diaeresis
        sb.append("&Iuml;");
        break;
      case '\u00f4': // o with circumflex
        sb.append("&ocirc;");
        break;
      case '\u00d4': // O with circumflex
        sb.append("&Ocirc;");
        break;
      case '\u00f6': // o with diaeresis
        sb.append("&ouml;");
        break;
      case '\u00d6': // O with diaeresis
        sb.append("&Ouml;");
        break;
      case '\u00f8': // o with stroke
        sb.append("&oslash;");
        break;
      case '\u00d8': // O with stroke
        sb.append("&Oslash;");
        break;
      case '\u00df': // sharp s
        sb.append("&szlig;");
        break;
      case '\u00f9': // u with grave
        sb.append("&ugrave;");
        break;
      case '\u00d9': // U with grave
        sb.append("&Ugrave;");
        break;
      case '\u00fb': // u with circumflex
        sb.append("&ucirc;");
        break;
      case '\u00db': // U with circumflex
        sb.append("&Ucirc;");
        break;
      case '\u00fc': // u with diaeresis
        sb.append("&uuml;");
        break;
      case '\u00dc': // U with diaeresis
        sb.append("&Uuml;");
        break;
      // Symbols.
      case '\u00ae': // registered sign
        sb.append("&reg;");
        break;
      case '\u00a9': // copyright sign
        sb.append("&copy;");
        break;
      case '\u20ac': // euro sign
        sb.append("&euro;");
        break;
      case ' ':
        sb.append("&nbsp;");
        break;
      default:
        sb.append(c);
        break;
      }
    }
    return sb.toString();
  }
}
//&gt;&lt;





Print the text of a web page with a class that extends HTMLEditorKit.ParserCallback

 
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Fetches the document at a URL and runs it through the Swing HTML parser
 * with a {@code TextOnly} callback.
 */
public class Main {
  /**
   * Parses the page whose URL is given as the first command-line argument.
   *
   * @param args {@code args[0]} is the URL of the page to parse
   * @throws Exception if the URL is malformed, the content cannot be fetched,
   *         or parsing fails
   */
  public static void main(String args[]) throws Exception {
    URL url = new URL(args[0]);
    Reader reader = new InputStreamReader((InputStream) url.getContent());
    try {
      // The last argument is ignoreCharSet; false keeps charset directives active.
      new ParserDelegator().parse(reader, new TextOnly(), false);
    } finally {
      // Release the underlying connection stream even when parsing fails.
      reader.close();
    }
  }
}
/**
 * Parser callback that writes each run of text it receives to standard
 * output, one run per line. All other parser events keep the inherited
 * behavior.
 */
class TextOnly extends HTMLEditorKit.ParserCallback {
  /**
   * Prints the received characters followed by a line terminator.
   *
   * @param data the characters of the text run
   * @param pos the position reported by the parser (unused)
   */
  @Override
  public void handleText(char[] data, int pos) {
    String text = String.valueOf(data);
    System.out.println(text);
  }
}





Extract links from an HTML page

  
import java.io.FileReader;
import java.util.ArrayList;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Collects the {@code href} values of all anchor tags in the local file
 * {@code a.html} and prints them as a list.
 */
public class Main {
  /**
   * Parses {@code a.html} from the working directory and prints the collected
   * link addresses.
   *
   * @param args command-line arguments (ignored)
   * @throws Exception if the file cannot be opened or read
   */
  public final static void main(String[] args) throws Exception {
    final ArrayList<String> list = new ArrayList<String>();
    ParserDelegator parserDelegator = new ParserDelegator();
    // Only start tags matter here; the other ParserCallback methods are
    // already no-ops in the base class, so they are not overridden.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
        if (tag == Tag.A) {
          String address = (String) attribute.getAttribute(Attribute.HREF);
          // Anchors without an href (e.g. named anchors) are not links.
          if (address != null) {
            list.add(address);
          }
        }
      }
    };
    FileReader reader = new FileReader("a.html");
    try {
      parserDelegator.parse(reader, parserCallback, false);
    } finally {
      // Close the file even when parsing fails.
      reader.close();
    }
    System.out.println(list);
  }
}





Find and display hyperlinks contained within a web page

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Prints the {@code href} value of every anchor found in a local HTML file,
 * using a regular expression rather than a real HTML parser.
 */
public class Main {
  /**
   * Matches an opening anchor tag up to and including its double-quoted href;
   * group 1 is the href value. Both quantifiers are reluctant so that several
   * links on the same line are reported individually instead of only the
   * last one.
   */
  private static final Pattern HREF = Pattern.compile("<a.+?href=\"(.+?)\"");

  /**
   * Loads the file named by the first argument and prints one link per line.
   *
   * @param arguments {@code arguments[0]} is the path of the HTML file
   * @throws Exception if the file cannot be read
   */
  public static void main(String[] arguments) throws Exception{
    String page = loadPage(arguments[0]);
    Matcher matcher = HREF.matcher(page);
    while (matcher.find()) {
      System.out.println(matcher.group(1));
    }
  }

  /**
   * Reads a whole text file into a string.
   *
   * @param name path of the file to read
   * @return the file content with every line terminated by {@code '\n'}
   * @throws Exception if the file cannot be opened or read
   */
  static String loadPage(String name) throws Exception {
    StringBuilder output = new StringBuilder();
    BufferedReader buff = new BufferedReader(new FileReader(name));
    try {
      String line;
      while ((line = buff.readLine()) != null) {
        output.append(line).append('\n');
      }
    } finally {
      // Close the file even when reading fails part-way.
      buff.close();
    }
    return output.toString();
  }
}





Get all hyper links from a web page

 

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Fetches a web page and writes an HTML document to standard output whose
 * body is produced by a {@code LinkPage} parser callback.
 */
public class Main {
  /**
   * Emits the HTML header, parses the page given as the first argument, then
   * emits the closing tags.
   *
   * @param args {@code args[0]} is the URL of the page to scan; it is also
   *        used as the title suffix and as the BASE HREF of the output
   * @throws Exception if the URL is malformed, the content cannot be fetched,
   *         or parsing fails
   */
  public static void main(String args[]) throws Exception {
    URL url = new URL(args[0]);
    Reader reader = new InputStreamReader((InputStream) url.getContent());
    try {
      System.out.println("<HTML><HEAD><TITLE>Links for " + args[0] + "</TITLE>");
      System.out.println("<BASE HREF=\"" + args[0] + "\"></HEAD>");
      System.out.println("<BODY>");
      new ParserDelegator().parse(reader, new LinkPage(), false);
      System.out.println("</BODY></HTML>");
    } finally {
      // Release the underlying connection stream even when parsing fails.
      reader.close();
    }
  }
}
/**
 * Parser callback that re-emits every hyperlink of the parsed document as an
 * anchor followed by a line break, one per output line.
 */
class LinkPage extends HTMLEditorKit.ParserCallback {
  /**
   * Prints {@code <A HREF="url">url</A><BR>} for each anchor start tag that
   * carries an href attribute; every other tag is ignored.
   *
   * @param t the tag that was opened
   * @param a the attributes of the tag
   * @param pos the position reported by the parser (unused)
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    if (t == HTML.Tag.A) {
      Object href = a.getAttribute(HTML.Attribute.HREF);
      // Named anchors have no href and therefore are not links.
      if (href != null) {
        System.out.println("<A HREF=\"" + href + "\">" + href + "</A><BR>");
      }
    }
  }
}





Getting the Links in an HTML Document

  
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.EditorKit;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Loads a web page into an {@link HTMLDocument} and prints the href of every
 * anchor element found in it.
 */
public class Main {
  /**
   * Downloads the page, builds the document model and walks its anchor tags.
   *
   * @param argv command-line arguments (ignored)
   * @throws Exception if the page cannot be fetched or read into the document
   */
  public static void main(String[] argv) throws Exception {
    URL url = new URI("http://www.google.ru").toURL();
    URLConnection conn = url.openConnection();
    EditorKit kit = new HTMLEditorKit();
    HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
    Reader rd = new InputStreamReader(conn.getInputStream());
    try {
      kit.read(rd, doc, 0);
    } finally {
      // The document is fully built (or has failed) here; release the stream.
      rd.close();
    }
    HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
    while (it.isValid()) {
      SimpleAttributeSet s = (SimpleAttributeSet) it.getAttributes();
      String link = (String) s.getAttribute(HTML.Attribute.HREF);
      // Anchors without an href are skipped.
      if (link != null) {
        System.out.println(link);
      }
      it.next();
    }
  }
}





Getting the Text in an HTML Document

  
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Prints the text of a web page by reading it into an {@link HTMLDocument}
 * whose reader callback is replaced with one that echoes text runs.
 */
public class Main {
  /**
   * Downloads the page and prints each text run on its own line.
   *
   * @param argv command-line arguments (ignored)
   * @throws Exception if the page cannot be fetched or parsed
   */
  public static void main(String[] argv) throws Exception {
    // Override getReader so the kit feeds parser events to our callback
    // instead of the document's own reader.
    HTMLDocument doc = new HTMLDocument() {
      @Override
      public HTMLEditorKit.ParserCallback getReader(int pos) {
        return new HTMLEditorKit.ParserCallback() {
          @Override
          public void handleText(char[] data, int pos) {
            System.out.println(data);
          }
        };
      }
    };
    URL url = new URI("http://www.google.ru").toURL();
    URLConnection conn = url.openConnection();
    EditorKit kit = new HTMLEditorKit();
    Reader rd = new InputStreamReader(conn.getInputStream());
    try {
      kit.read(rd, doc, 0);
    } finally {
      // Release the connection stream even when reading fails.
      rd.close();
    }
  }
}





HTML parser based on HTMLEditorKit.ParserCallback

 
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Fetches the document at a URL and runs it through the Swing HTML parser
 * with an {@code HTMLParse} callback.
 */
public class Main {
  /**
   * Parses the page whose URL is given as the first command-line argument.
   *
   * @param args {@code args[0]} is the URL of the page to parse
   * @throws Exception if the URL is malformed, the content cannot be fetched,
   *         or parsing fails
   */
  public static void main(String args[]) throws Exception {
    URL url = new URL(args[0]);
    Reader reader = new InputStreamReader((InputStream) url.getContent());
    try {
      // The last argument is ignoreCharSet; false keeps charset directives active.
      new ParserDelegator().parse(reader, new HTMLParse(), false);
    } finally {
      // Release the underlying connection stream even when parsing fails.
      reader.close();
    }
  }
}
/**
 * Parser callback that traces parser events on standard output: text runs are
 * printed as-is, start tags are prefixed with "+", simple tags with "*" and
 * end tags with "-".
 */
class HTMLParse extends HTMLEditorKit.ParserCallback {
  /** Prints a tag name preceded by the marker for its event type. */
  private static void trace(String marker, HTML.Tag tag) {
    System.out.println(marker + tag.toString());
  }

  /** Prints the characters of a text run followed by a line terminator. */
  @Override
  public void handleText(char[] data, int pos) {
    String text = String.valueOf(data);
    System.out.println(text);
  }

  /** Reports an opening tag. */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    trace("+", t);
  }

  /** Reports a tag that has no separate end tag. */
  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    trace("*", t);
  }

  /** Reports a closing tag. */
  @Override
  public void handleEndTag(HTML.Tag t, int pos) {
    trace("-", t);
  }
}





Use regular expression to get web page title

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a web page and prints the content of its {@code <title>} element,
 * located with a regular expression.
 */
public class Main {
  /**
   * Fetches the page, collapses all whitespace runs to single spaces and
   * prints every match of the title pattern.
   *
   * @param argv command-line arguments (ignored)
   * @throws Exception if the page cannot be fetched or read
   */
  public static void main(String[] argv) throws Exception {
    URL url = new URL("http://www.java.ru/");
    URLConnection urlConnection = url.openConnection();
    StringBuilder page = new StringBuilder();
    // Read the response as text lines. DataInputStream.readUTF is not suitable
    // here: it expects a length-prefixed modified-UTF-8 record and signals end
    // of stream with an exception rather than null.
    BufferedReader in =
        new BufferedReader(new InputStreamReader(urlConnection.getInputStream()));
    try {
      String line;
      while ((line = in.readLine()) != null) {
        page.append(' ').append(line);
      }
    } finally {
      // Release the connection stream even when reading fails part-way.
      in.close();
    }
    // Join everything onto one line so a title split across lines still matches.
    String html = page.toString().replaceAll("\\s+", " ");
    Pattern p = Pattern.compile("<title>(.*?)</title>");
    Matcher m = p.matcher(html);
    while (m.find()) {
      System.out.println(m.group(1));
    }
  }
}





Using javax.swing.text.html.HTMLEditorKit to parse html document

  
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Collects every run of text in the local file {@code a.html} using the Swing
 * HTML parser and prints the runs as a list.
 */
public class Main {
  /**
   * Parses {@code a.html} from the working directory and prints the collected
   * text runs.
   *
   * @param args command-line arguments (ignored)
   * @throws Exception if the file cannot be opened or read
   */
  public static void main(String[] args) throws Exception {
    final List<String> list = new ArrayList<String>();
    ParserDelegator parserDelegator = new ParserDelegator();
    // Only text events matter here; the other ParserCallback methods are
    // already no-ops in the base class, so they are not overridden.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleText(final char[] data, final int pos) {
        list.add(new String(data));
      }
    };
    FileReader reader = new FileReader("a.html");
    try {
      // The last argument is ignoreCharSet; true skips charset directives.
      parserDelegator.parse(reader, parserCallback, true);
    } finally {
      // Close the file even when parsing fails.
      reader.close();
    }
    System.out.println(list);
  }
}