Java/Network Protocol/HTML Parser

Материал из Java эксперт
Перейти к: навигация, поиск

Escape HTML special characters from a String

   <source lang="java">
 

/**
 * Demonstrates escaping of HTML special characters in a string.
 *
 * <p>Non-ASCII characters are written as Unicode escapes so that the class
 * compiles identically regardless of the source-file encoding.
 */
public class Main {

  public static void main(String[] argv) {
    System.out.println(escapeHTML("><"));
  }

  /**
   * Replaces HTML markup characters, a fixed set of accented Latin letters,
   * the registered/copyright/euro signs and the space character with their
   * named HTML entities. Every other character is copied unchanged.
   *
   * <p>Note that each plain space becomes {@code &nbsp;}, so the result will
   * not line-wrap when rendered.
   *
   * @param s the text to escape; must not be {@code null}
   * @return the escaped text
   * @throws NullPointerException if {@code s} is {@code null}
   */
  public static final String escapeHTML(String s) {
    int n = s.length();
    StringBuilder sb = new StringBuilder(n + 16);
    for (int i = 0; i < n; i++) {
      char c = s.charAt(i);
      String entity = entityFor(c);
      if (entity == null) {
        sb.append(c);
      } else {
        sb.append(entity);
      }
    }
    return sb.toString();
  }

  /** Returns the named entity for {@code c}, or {@code null} if it needs no escaping. */
  private static String entityFor(char c) {
    switch (c) {
      case '<':
        return "&lt;";
      case '>':
        return "&gt;";
      case '&':
        return "&amp;";
      case '"':
        return "&quot;";
      case '\u00e0':
        return "&agrave;";
      case '\u00c0':
        return "&Agrave;";
      case '\u00e2':
        return "&acirc;";
      case '\u00c2':
        return "&Acirc;";
      case '\u00e4':
        return "&auml;";
      case '\u00c4':
        return "&Auml;";
      case '\u00e5':
        return "&aring;";
      case '\u00c5':
        return "&Aring;";
      case '\u00e6':
        return "&aelig;";
      case '\u00c6':
        return "&AElig;";
      case '\u00e7':
        return "&ccedil;";
      case '\u00c7':
        return "&Ccedil;";
      case '\u00e9':
        return "&eacute;";
      case '\u00c9':
        return "&Eacute;";
      case '\u00e8':
        return "&egrave;";
      case '\u00c8':
        return "&Egrave;";
      case '\u00ea':
        return "&ecirc;";
      case '\u00ca':
        return "&Ecirc;";
      case '\u00eb':
        return "&euml;";
      case '\u00cb':
        return "&Euml;";
      case '\u00ef':
        return "&iuml;";
      case '\u00cf':
        return "&Iuml;";
      case '\u00f4':
        return "&ocirc;";
      case '\u00d4':
        return "&Ocirc;";
      case '\u00f6':
        return "&ouml;";
      case '\u00d6':
        return "&Ouml;";
      case '\u00f8':
        return "&oslash;";
      case '\u00d8':
        return "&Oslash;";
      case '\u00df':
        return "&szlig;";
      case '\u00f9':
        return "&ugrave;";
      case '\u00d9':
        return "&Ugrave;";
      case '\u00fb':
        return "&ucirc;";
      case '\u00db':
        return "&Ucirc;";
      case '\u00fc':
        return "&uuml;";
      case '\u00dc':
        return "&Uuml;";
      case '\u00ae':
        return "&reg;";
      case '\u00a9':
        return "&copy;";
      case '\u20ac':
        return "&euro;";
      case ' ':
        return "&nbsp;";
      default:
        return null;
    }
  }

} // main prints: &gt;&lt;


 </source>
   
  
 
  



extends HTMLEditorKit.ParserCallback

   <source lang="java">

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Prints the text content of the HTML page whose URL is given as the first
 * command-line argument.
 */
public class Main {

  /**
   * @param args {@code args[0]} is the URL of the page to parse
   * @throws IllegalArgumentException if no URL is supplied
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    if (args.length < 1) {
      throw new IllegalArgumentException("usage: java Main <url>");
    }
    URL url = new URL(args[0]);
    // openStream() always yields a stream; getContent() may return another
    // object type depending on the content handler, which broke the old cast.
    Reader reader = new InputStreamReader(url.openStream());
    try {
      // The bytes are already decoded by the Reader, so a charset declared in
      // a <meta> tag cannot be honoured; passing true stops the parser from
      // aborting with ChangedCharSetException on such pages.
      new ParserDelegator().parse(reader, new TextOnly(), true);
    } finally {
      reader.close();
    }
  }

}

/** Parser callback that prints every text run and ignores all markup. */
class TextOnly extends HTMLEditorKit.ParserCallback {

  @Override
  public void handleText(char[] data, int pos) {
    System.out.println(data);
  }

}

 </source>
   
  
 
  



Extract links from an HTML page

   <source lang="java">
 

import java.io.FileReader;
import java.util.ArrayList;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Collects the {@code href} of every {@code <a>} start tag in a local HTML
 * file and prints the resulting list.
 */
public class Main {

  /**
   * @param args optional; {@code args[0]} is the file to parse. When absent
   *     the file {@code a.html} in the working directory is used.
   * @throws Exception if the file cannot be opened or read
   */
  public final static void main(String[] args) throws Exception {
    String fileName = args.length > 0 ? args[0] : "a.html";
    final ArrayList<String> list = new ArrayList<String>();
    // Only start tags matter here; the other callbacks keep the no-op
    // defaults inherited from ParserCallback.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
        if (tag != Tag.A) {
          return;
        }
        // Named anchors (<a name="...">) carry no href; skip them instead of
        // storing null in the result.
        Object address = attribute.getAttribute(Attribute.HREF);
        if (address != null) {
          list.add(address.toString());
        }
      }
    };
    FileReader reader = new FileReader(fileName);
    try {
      // true: ignore a <meta> charset declaration, which would otherwise
      // abort parsing with ChangedCharSetException.
      new ParserDelegator().parse(reader, parserCallback, true);
    } finally {
      reader.close();
    }
    System.out.println(list);
  }

}


 </source>
   
  
 
  



Find and display hyperlinks contained within a web page

   <source lang="java">

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Prints the target of every hyperlink found in a local HTML file, using a
 * regular expression rather than an HTML parser.
 */
public class Main {

  /**
   * Matches an {@code <a ...>} tag and captures the double-quoted, non-empty
   * value of its {@code href} attribute. The attribute search is confined to
   * a single tag ({@code [^>]*?}) so several links on one line are all found.
   */
  private static final Pattern LINK =
      Pattern.compile("<a\\b[^>]*?\\shref\\s*=\\s*\"([^\"]+)\"", Pattern.CASE_INSENSITIVE);

  /**
   * @param arguments {@code arguments[0]} is the path of the HTML file
   * @throws IllegalArgumentException if no file name is supplied
   * @throws Exception if the file cannot be read
   */
  public static void main(String[] arguments) throws Exception {
    if (arguments.length < 1) {
      throw new IllegalArgumentException("usage: java Main <file>");
    }
    String page = loadPage(arguments[0]);
    Matcher matcher = LINK.matcher(page);
    while (matcher.find()) {
      System.out.println(matcher.group(1));
    }
  }

  /**
   * Reads a whole text file into a string.
   *
   * @param name path of the file to read
   * @return the file content with every line terminated by {@code "\n"}
   * @throws Exception if the file cannot be opened or read
   */
  static String loadPage(String name) throws Exception {
    StringBuilder output = new StringBuilder();
    BufferedReader buff = new BufferedReader(new FileReader(name));
    try {
      String line;
      while ((line = buff.readLine()) != null) {
        output.append(line).append('\n');
      }
    } finally {
      // Close even when readLine fails so the file handle is not leaked.
      buff.close();
    }
    return output.toString();
  }

}

</source>
   
  
 
  



Get all hyper links from a web page

   <source lang="java">

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Fetches a web page and writes, to standard output, a small HTML document
 * listing every hyperlink the page contains.
 */
public class Main {

  /**
   * @param args {@code args[0]} is the URL of the page to scan
   * @throws IllegalArgumentException if no URL is supplied
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    if (args.length < 1) {
      throw new IllegalArgumentException("usage: java Main <url>");
    }
    URL url = new URL(args[0]);
    // openStream() always yields a stream, unlike the cast of getContent().
    Reader reader = new InputStreamReader(url.openStream());
    try {
      // The URL is echoed into markup, so escape it first.
      String base = LinkPage.escape(args[0]);
      System.out.println("<HTML><HEAD><TITLE>Links for " + base + "</TITLE>");
      System.out.println("<BASE HREF=\"" + base + "\"></HEAD>");
      System.out.println("<BODY>");
      // true: ignore <meta> charset declarations instead of aborting with
      // ChangedCharSetException.
      new ParserDelegator().parse(reader, new LinkPage(), true);
      System.out.println("</BODY></HTML>");
    } finally {
      reader.close();
    }
  }

}

/** Parser callback that prints one HTML anchor line per link found. */
class LinkPage extends HTMLEditorKit.ParserCallback {

  /**
   * Prints {@code <A HREF="link">link</A><BR>} for each {@code <a>} start tag
   * that has an {@code href}; all other tags are ignored.
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    if (t != HTML.Tag.A) {
      return;
    }
    Object href = a.getAttribute(HTML.Attribute.HREF);
    if (href == null) {
      return; // named anchor without a target
    }
    String link = escape(href.toString());
    System.out.println("<A HREF=\"" + link + "\">" + link + "</A><BR>");
  }

  /** Escapes the characters that are significant in HTML text and attribute values. */
  static String escape(String text) {
    StringBuilder out = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      switch (c) {
        case '&':
          out.append("&amp;");
          break;
        case '<':
          out.append("&lt;");
          break;
        case '>':
          out.append("&gt;");
          break;
        case '"':
          out.append("&quot;");
          break;
        default:
          out.append(c);
          break;
      }
    }
    return out.toString();
  }

}

 </source>
   
  
 
  



Getting the Links in an HTML Document

   <source lang="java">
 

import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.AttributeSet;
import javax.swing.text.EditorKit;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Loads an HTML page into an {@link HTMLDocument} and prints the target of
 * every link it contains.
 */
public class Main {

  /**
   * @param argv optional; {@code argv[0]} is the URL to load. When absent the
   *     original hard-coded address is used.
   * @throws Exception if the address is invalid or the page cannot be read
   */
  public static void main(String[] argv) throws Exception {
    String address = argv.length > 0 ? argv[0] : "http://www.google.ru";
    URL url = new URI(address).toURL();
    URLConnection conn = url.openConnection();
    Reader rd = new InputStreamReader(conn.getInputStream());
    try {
      EditorKit kit = new HTMLEditorKit();
      HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
      // Without this property the kit aborts with ChangedCharSetException as
      // soon as the page declares a charset in a <meta> tag.
      doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
      kit.read(rd, doc, 0);
      for (HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A); it.isValid(); it.next()) {
        // Use the AttributeSet interface; the concrete class is not part of
        // the iterator's contract.
        AttributeSet attrs = it.getAttributes();
        Object link = attrs == null ? null : attrs.getAttribute(HTML.Attribute.HREF);
        if (link != null) {
          System.out.println(link);
        }
      }
    } finally {
      rd.close();
    }
  }

}


 </source>
   
  
 
  



Getting the Text in an HTML Document

   <source lang="java">
 

import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Prints the text content of an HTML page by substituting the document's
 * reader with a callback that only reacts to text runs.
 */
public class Main {

  /**
   * @param argv optional; {@code argv[0]} is the URL to load. When absent the
   *     original hard-coded address is used.
   * @throws Exception if the address is invalid or the page cannot be read
   */
  public static void main(String[] argv) throws Exception {
    String address = argv.length > 0 ? argv[0] : "http://www.google.ru";
    // The document itself is never populated: its reader is replaced by a
    // callback that prints each text run as the parser reports it.
    HTMLDocument doc = new HTMLDocument() {
      @Override
      public HTMLEditorKit.ParserCallback getReader(int pos) {
        return new HTMLEditorKit.ParserCallback() {
          @Override
          public void handleText(char[] data, int pos) {
            System.out.println(data);
          }
        };
      }
    };
    // Without this property the kit aborts with ChangedCharSetException as
    // soon as the page declares a charset in a <meta> tag.
    doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
    URL url = new URI(address).toURL();
    URLConnection conn = url.openConnection();
    Reader rd = new InputStreamReader(conn.getInputStream());
    try {
      EditorKit kit = new HTMLEditorKit();
      kit.read(rd, doc, 0);
    } finally {
      rd.close();
    }
  }

}


 </source>
   
  
 
  



HTML parser based on HTMLEditorKit.ParserCallback

   <source lang="java">

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Parses the page at the URL given on the command line and prints a trace of
 * the parser events (text, start tags, simple tags, end tags).
 */
public class Main {

  /**
   * @param args {@code args[0]} is the URL of the page to parse
   * @throws IllegalArgumentException if no URL is supplied
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    if (args.length < 1) {
      throw new IllegalArgumentException("usage: java Main <url>");
    }
    URL url = new URL(args[0]);
    // openStream() always yields a stream, unlike the cast of getContent().
    Reader reader = new InputStreamReader(url.openStream());
    try {
      // true: ignore <meta> charset declarations instead of aborting with
      // ChangedCharSetException.
      new ParserDelegator().parse(reader, new HTMLParse(), true);
    } finally {
      reader.close();
    }
  }

}

/**
 * Parser callback that prints text runs verbatim and tags with a one-character
 * prefix: {@code +} start tag, {@code *} simple tag, {@code -} end tag.
 */
class HTMLParse extends HTMLEditorKit.ParserCallback {

  @Override
  public void handleText(char[] data, int pos) {
    System.out.println(data);
  }

  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    System.out.println("+" + t.toString());
  }

  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    System.out.println("*" + t.toString());
  }

  @Override
  public void handleEndTag(HTML.Tag t, int pos) {
    System.out.println("-" + t.toString());
  }

}

 </source>
   
  
 
  



Use regular expression to get web page title

   <source lang="java">

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/** Downloads a web page and prints the content of its {@code <title>} element. */
public class Main {

  /** Matches a title element; applied after all whitespace runs are collapsed. */
  private static final Pattern TITLE =
      Pattern.compile("<title[^>]*>(.*?)</title>", Pattern.CASE_INSENSITIVE);

  /**
   * @param argv optional; {@code argv[0]} is the URL to load. When absent the
   *     original hard-coded address is used.
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String[] argv) throws Exception {
    String address = argv.length > 0 ? argv[0] : "http://www.java.ru/";
    URL url = new URL(address);
    URLConnection urlConnection = url.openConnection();
    StringBuilder html = new StringBuilder();
    // Read the page as text lines. DataInputStream.readUTF() expects Java's
    // length-prefixed modified UTF-8 records, not a text stream, and signals
    // end of input with EOFException rather than null.
    BufferedReader in =
        new BufferedReader(new InputStreamReader(urlConnection.getInputStream()));
    try {
      String line;
      while ((line = in.readLine()) != null) {
        html.append(' ').append(line);
      }
    } finally {
      in.close();
    }
    // Collapse whitespace so a title split over several lines still matches.
    String flattened = html.toString().replaceAll("\\s+", " ");
    Matcher m = TITLE.matcher(flattened);
    while (m.find()) {
      System.out.println(m.group(1).trim());
    }
  }

}

</source>
   
  
 
  



Using javax.swing.text.html.HTMLEditorKit to parse html document

   <source lang="java">
 

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Collects every text run of a local HTML file into a list and prints the
 * list.
 */
public class Main {

  /**
   * @param args optional; {@code args[0]} is the file to parse. When absent
   *     the file {@code a.html} in the working directory is used.
   * @throws Exception if the file cannot be opened or read
   */
  public static void main(String[] args) throws Exception {
    String fileName = args.length > 0 ? args[0] : "a.html";
    final List<String> list = new ArrayList<String>();
    // Only text is of interest; the other callbacks keep the no-op defaults
    // inherited from ParserCallback.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleText(final char[] data, final int pos) {
        list.add(new String(data));
      }
    };
    Reader reader = new FileReader(fileName);
    try {
      new ParserDelegator().parse(reader, parserCallback, true);
    } finally {
      // Release the file handle even if parsing fails.
      reader.close();
    }
    System.out.println(list);
  }

}


 </source>