Java Tutorial/Network/HTML Parser

Материал из Java эксперт
Перейти к: навигация, поиск

Escape HTML special characters in a String

   <source lang="java">

public class Main {

 public static void main(String[] argv){
   System.out.println(escapeHTML("><"));
 }

 /**
  * Replaces HTML markup characters, a fixed set of accented Latin letters,
  * a few symbols and the plain space with their named HTML entities.
  * Characters without a mapping are copied through unchanged.
  *
  * @param s the text to escape; must not be {@code null}
  * @return the escaped text
  * @throws NullPointerException if {@code s} is {@code null}
  */
 public static final String escapeHTML(String s) {
   int n = s.length();
   StringBuilder sb = new StringBuilder(n + 16);
   for (int i = 0; i < n; i++) {
     char c = s.charAt(i);
     String entity = entityFor(c);
     if (entity != null) {
       sb.append(entity);
     } else {
       sb.append(c);
     }
   }
   return sb.toString();
 }

 /**
  * Looks up the named entity for a single character.
  * Non-ASCII characters are written as Unicode escapes so the source does
  * not depend on the compiler's file encoding.
  *
  * @param c the character to look up
  * @return the entity (including {@code &} and {@code ;}), or {@code null}
  *         if the character has no mapping
  */
 private static String entityFor(char c) {
   switch (c) {
   case '<': return "&lt;";
   case '>': return "&gt;";
   case '&': return "&amp;";
   case '"': return "&quot;";
   case '\u00e0': return "&agrave;";
   case '\u00c0': return "&Agrave;";
   case '\u00e2': return "&acirc;";
   case '\u00c2': return "&Acirc;";
   case '\u00e4': return "&auml;";
   case '\u00c4': return "&Auml;";
   case '\u00e5': return "&aring;";
   case '\u00c5': return "&Aring;";
   case '\u00e6': return "&aelig;";
   case '\u00c6': return "&AElig;";
   case '\u00e7': return "&ccedil;";
   case '\u00c7': return "&Ccedil;";
   case '\u00e9': return "&eacute;";
   case '\u00c9': return "&Eacute;";
   case '\u00e8': return "&egrave;";
   case '\u00c8': return "&Egrave;";
   case '\u00ea': return "&ecirc;";
   case '\u00ca': return "&Ecirc;";
   case '\u00eb': return "&euml;";
   case '\u00cb': return "&Euml;";
   case '\u00ef': return "&iuml;";
   case '\u00cf': return "&Iuml;";
   case '\u00f4': return "&ocirc;";
   case '\u00d4': return "&Ocirc;";
   case '\u00f6': return "&ouml;";
   case '\u00d6': return "&Ouml;";
   case '\u00f8': return "&oslash;";
   case '\u00d8': return "&Oslash;";
   case '\u00df': return "&szlig;";
   case '\u00f9': return "&ugrave;";
   case '\u00d9': return "&Ugrave;";
   case '\u00fb': return "&ucirc;";
   case '\u00db': return "&Ucirc;";
   case '\u00fc': return "&uuml;";
   case '\u00dc': return "&Uuml;";
   case '\u00ae': return "&reg;";
   case '\u00a9': return "&copy;";
   case '\u20ac': return "&euro;";
   // Plain spaces become non-breaking spaces so runs of blanks survive rendering.
   case ' ': return "&nbsp;";
   default: return null;
   }
 }

} //&gt;&lt;</source>





extends HTMLEditorKit.ParserCallback

   <source lang="java">

import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.net.URL; import javax.swing.text.html.HTMLEditorKit; import javax.swing.text.html.parser.ParserDelegator; public class Main {

 /**
  * Downloads the page named by the first argument and feeds it to the
  * HTML parser with a {@code TextOnly} callback.
  *
  * @param args command-line arguments; {@code args[0]} is the page URL
  * @throws Exception if the URL is malformed or the page cannot be read
  */
 public static void main(String args[]) throws Exception {
   URL url = new URL(args[0]);
   // openStream() always yields an InputStream; casting the result of
   // getContent() fails when a content handler returns another type.
   Reader reader = new InputStreamReader(url.openStream());
   try {
     new ParserDelegator().parse(reader, new TextOnly(), false);
   } finally {
     // Release the connection even when parsing fails.
     reader.close();
   }
 }

} class TextOnly extends HTMLEditorKit.ParserCallback {

 /**
  * Writes one run of text reported by the parser to standard output,
  * followed by a line break.
  *
  * @param data the characters of the text run
  * @param pos  position reported by the parser (unused)
  */
 public void handleText(char[] data, int pos) {
   String text = String.valueOf(data);
   System.out.println(text);
 }

}</source>





Extract links from an HTML page

   <source lang="java">

import java.io.FileReader; import java.util.ArrayList; import javax.swing.text.MutableAttributeSet; import javax.swing.text.html.HTML.Attribute; import javax.swing.text.html.HTML.Tag; import javax.swing.text.html.HTMLEditorKit.ParserCallback; import javax.swing.text.html.parser.ParserDelegator; public class Main {

 /**
  * Parses {@code a.html} from the working directory and prints the list of
  * HREF values found on its anchor tags.
  *
  * @param args unused
  * @throws Exception if the file cannot be opened or read
  */
 public final static void main(String[] args) throws Exception {
   final ArrayList<String> list = new ArrayList<String>();
   // Only start tags matter here; the other ParserCallback methods keep
   // their inherited no-op behaviour.
   ParserCallback parserCallback = new ParserCallback() {
     public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
       if (tag == Tag.A) {
         Object address = attribute.getAttribute(Attribute.HREF);
         // Named anchors (<a name="...">) carry no HREF; skip them instead
         // of storing null.
         if (address != null) {
           list.add(address.toString());
         }
       }
     }
   };
   FileReader reader = new FileReader("a.html");
   try {
     new ParserDelegator().parse(reader, parserCallback, false);
   } finally {
     // Close the file even when parsing fails.
     reader.close();
   }
   System.out.println(list);
 }

}</source>





Find and display hyperlinks contained within a web page

   <source lang="java">

import java.io.BufferedReader; import java.io.FileReader; import java.util.regex.Matcher; import java.util.regex.Pattern; public class Main {

 /**
  * Reads {@code a.htm} from the working directory and prints the value of
  * every double-quoted {@code href} attribute that follows an {@code <a}.
  *
  * @param arguments unused
  * @throws Exception if the file cannot be opened or read
  */
 public static void main(String[] arguments)throws Exception {
   StringBuilder output = new StringBuilder();
   BufferedReader buff = new BufferedReader(new FileReader("a.htm"));
   try {
     String line;
     while ((line = buff.readLine()) != null) {
       output.append(line).append("\n");
     }
   } finally {
     // Closing the BufferedReader also closes the wrapped FileReader.
     buff.close();
   }
   String page = output.toString();
   // Reluctant ".+?" keeps each match inside one anchor; a greedy ".+" would
   // jump to the last href on a line and lose the earlier links.
   Pattern pattern = Pattern.compile("<a.+?href=\"(.+?)\"");
   Matcher matcher = pattern.matcher(page);
   while (matcher.find()) {
     System.out.println(matcher.group(1));
   }
 }

}</source>





Get all hyper links from a web page

   <source lang="java">

import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.net.URL; import javax.swing.text.MutableAttributeSet; import javax.swing.text.html.HTML; import javax.swing.text.html.HTMLEditorKit; import javax.swing.text.html.parser.ParserDelegator; public class Main {

 /**
  * Downloads the page named by the first argument and prints an HTML page
  * to standard output; the body is produced by a {@code LinkPage} callback.
  *
  * @param args command-line arguments; {@code args[0]} is the page URL
  * @throws Exception if the URL is malformed or the page cannot be read
  */
 public static void main(String args[]) throws Exception {
   URL url = new URL(args[0]);
   // openStream() always yields an InputStream; casting the result of
   // getContent() fails when a content handler returns another type.
   Reader reader = new InputStreamReader(url.openStream());
   try {
     System.out.println("<HTML><HEAD><TITLE>Links for " + args[0] + "</TITLE>");
     System.out.println("<BASE HREF=\"" + args[0] + "\"></HEAD>");
     System.out.println("<BODY>");
     new ParserDelegator().parse(reader, new LinkPage(), false);
     System.out.println("</BODY></HTML>");
   } finally {
     // Release the connection even when parsing fails.
     reader.close();
   }
 }

} class LinkPage extends HTMLEditorKit.ParserCallback {

 public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
   if (t == HTML.Tag.A) {
     System.out.println("
"); } }

}</source>





Getting the Links in an HTML Document

   <source lang="java">

import java.io.InputStreamReader; import java.io.Reader; import java.net.URI; import java.net.URL; import java.net.URLConnection; import javax.swing.text.EditorKit; import javax.swing.text.SimpleAttributeSet; import javax.swing.text.html.HTML; import javax.swing.text.html.HTMLDocument; import javax.swing.text.html.HTMLEditorKit; public class Main {

 public static void main(String[] argv) throws Exception {
   URL url = new URI("http://www.google.ru").toURL();
   URLConnection conn = url.openConnection();
   Reader rd = new InputStreamReader(conn.getInputStream());
   EditorKit kit = new HTMLEditorKit();
   HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
   kit.read(rd, doc, 0);
   HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
   while (it.isValid()) {
     SimpleAttributeSet s = (SimpleAttributeSet) it.getAttributes();
     String link = (String) s.getAttribute(HTML.Attribute.HREF);
     if (link != null) {
       System.out.println(link);
     }
     it.next();
   }
 }

}</source>





Getting the Text in an HTML Document

   <source lang="java">

import java.io.InputStreamReader; import java.io.Reader; import java.net.URI; import java.net.URL; import java.net.URLConnection; import javax.swing.text.EditorKit; import javax.swing.text.html.HTMLDocument; import javax.swing.text.html.HTMLEditorKit; public class Main {

 /**
  * Reads http://www.google.ru through an {@code HTMLDocument} whose parser
  * callback prints every text run instead of building document content.
  *
  * @param argv unused
  * @throws Exception if the page cannot be fetched or parsed
  */
 public static void main(String[] argv) throws Exception {
   // Callback that echoes each text run to standard output.
   class TextPrinter extends HTMLEditorKit.ParserCallback {
     public void handleText(char[] data, int pos) {
       System.out.println(data);
     }
   }

   // Document that hands out a fresh TextPrinter whenever a reader is requested.
   class PrintingDocument extends HTMLDocument {
     public HTMLEditorKit.ParserCallback getReader(int pos) {
       return new TextPrinter();
     }
   }

   HTMLDocument doc = new PrintingDocument();
   URL pageUrl = new URI("http://www.google.ru").toURL();
   URLConnection connection = pageUrl.openConnection();
   Reader source = new InputStreamReader(connection.getInputStream());
   EditorKit editorKit = new HTMLEditorKit();
   editorKit.read(source, doc, 0);
 }

}</source>





HTML Parser

   <source lang="java">

/*******************************************************************************

* Copyright (c) 2004 Actuate Corporation.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
*  Actuate Corporation  - initial API and implementation
*******************************************************************************/

import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.LineNumberReader; import java.util.ArrayList; public class HTMLParser {

 // Underlying file reader; assigned in open().
 FileReader reader;
 // Reader that wraps 'reader'; all characters are read through it (see getC()).
 LineNumberReader in;
 // Text of the most recently parsed token.
 String token;
 // AttribPair entries collected for the most recently parsed element.
 ArrayList attribs = new ArrayList( );
 // One-character pushback slot; -1 means no character is pending.
 int pushC = -1;
 // When true, getToken() skips text tokens that are empty after trimming.
 private boolean ignoreWhitespace = true;
 
 // Token types. EOF, TEXT and ELEMENT are returned by the visible parsing
 // code; DOCTYPE, COMMENT and SPECIAL_ELEMENT are presumably returned by
 // getSpecialElement() - TODO confirm.
 public static final int EOF = -1;
 public static final int TEXT = 1;
 public static final int DOCTYPE = 2;
 public static final int ELEMENT = 3;
 public static final int COMMENT = 4;
 public static final int SPECIAL_ELEMENT = 5;
 
 // Element kinds returned by getElementType().
 public static final int START_ELEMENT = 0;
 public static final int END_ELEMENT = 1;
 public static final int SINGLE_ELEMENT = 2;
 
 /**
  * Creates a parser with no input attached; the readers are only assigned
  * by {@code open}.
  */
 public HTMLParser( )
 {
 }
 
 /**
  * Attaches the parser to a file.
  *
  * @param fileName path of the file to parse
  * @throws FileNotFoundException if the file cannot be opened for reading
  */
 public void open( String fileName ) throws FileNotFoundException
 {
   FileReader source = new FileReader( fileName );
   this.reader = source;
   this.in = new LineNumberReader( source );
 }
 
 /**
  * Closes the readers created by {@code open}. Calling this before
  * {@code open} does nothing, and I/O errors during closing are ignored.
  */
 public void close( )
 {
   // Nothing to release if open() was never called.
   if ( in == null )
     return;
   try
   {
     in.close( );
     reader.close( );
   }
   catch ( IOException ignored )
   {
     // Best-effort close: there is nothing useful a caller could do here.
   }
 }
 /**
  * @return the text of the most recently parsed token
  */
 public String getTokenText( )
 {
   return this.token;
 }
 
 /**
  * Classifies the current element token by where its slash sits.
  *
  * @return {@code END_ELEMENT} for a leading slash, {@code SINGLE_ELEMENT}
  *         for a trailing slash, otherwise {@code START_ELEMENT}
  */
 public int getElementType( )
 {
   int kind = START_ELEMENT;
   if ( token.startsWith( "/" ) ) //$NON-NLS-1$
   {
     kind = END_ELEMENT;
   }
   else if ( token.endsWith( "/" ) ) //$NON-NLS-1$
   {
     kind = SINGLE_ELEMENT;
   }
   return kind;
 }
 
 /**
  * Returns the current element token without its leading or trailing slash.
  *
  * @return the bare element name
  */
 public String getElement( )
 {
   String name = token;
   if ( name.startsWith( "/" ) ) //$NON-NLS-1$
   {
     name = name.substring( 1 );
   }
   else if ( name.endsWith( "/" ) ) //$NON-NLS-1$
   {
     name = name.substring( 0, name.length( ) - 1 );
   }
   return name;
 }
 
 /**
  * @return the live list of attribute pairs collected for the current
  *         element (not a copy)
  */
 public ArrayList getAttribs( )
 {
   return this.attribs;
 }
 
 /**
  * Finds an attribute of the current element by name, ignoring case.
  *
  * @param name the attribute name to look for
  * @return the value of the first matching attribute, or {@code null} if
  *         no attribute matches
  */
 public String getAttrib( String name )
 {
   for ( Object entry : attribs )
   {
     AttribPair pair = (AttribPair) entry;
     if ( pair.attrib.equalsIgnoreCase( name ) )
     {
       return pair.value;
     }
   }
   return null;
 }
 
 private int getC( )
 {
   if ( pushC != -1 )
   {
     int c = pushC;
     pushC = -1;
     return c;
   }
   try
   {
     return in.read( );
   }
   catch ( IOException e )
   {
     return EOF;
   }
 }
 
 /**
  * Stores one character so the next {@code getC} call returns it again.
  * Only a single character is kept; a second call overwrites the first.
  *
  * @param c the character to push back
  */
 private void pushC( int c )
 {
   this.pushC = c;
 }
 
 public int getToken( )
 {
   for ( ; ; )
   {
     int c = getC( );
     switch ( c )
     {
       case -1:
         return EOF;
       case "<":
         return getElement( c );
       default:
       {
         parseText( c );
         if ( ! ignoreWhitespace  ||  token.trim( ).length( ) > 0 )
           return TEXT;
       }
     }
   }
 }
 private int parseText( int c )
 {
   StringBuffer text = new StringBuffer( );
   for ( ; ; )
   {
     if ( c == EOF )
       break;
     if ( c == "<" )
     {
       pushC( c );
       break;
     }
     
     // Convert MS-Word-style quotes.
     
     if ( c == 8220  ||  c == 8221 )
       text.append( """ );
     else
       text.append( (char) c );
     c = getC( );
   }
   token = text.toString( );
   return TEXT;
 }
 private int skipSpace( int c )
 {
   while ( c != EOF  &&  Character.isWhitespace( (char)c ) )
   {
     c = getC( );
   }
   return c;
 }
 
 private int getElement( int c )
 {
   c = getC( );
   
   // Broken element
   
   if ( c == EOF )
     return EOF;
   
   if ( c == "!" )
     return getSpecialElement( );
   
   attribs.clear( );
   c = skipSpace( c );
   if ( c == EOF )
     return EOF;
   
   StringBuffer tag = new StringBuffer( );
   if ( c == "/" )
   {
     tag.append( (char) c );
     c = skipSpace( getC( ) );
     while ( c != EOF  &&  c != ">"  && ! Character.isWhitespace( (char)c ) )
     {
       tag.append( (char) c );
       c = getC( );
     }
     token = tag.toString( );
     for ( ; ; )
     {
       if ( c == ">"  ||  c == -1 )
         break;
       c = getC( );
     }
     return ELEMENT;     
   }
   
   while ( c != EOF  &&  c != ">"  &&  c != "/"  && ! Character.isWhitespace( (char)c ) )
   {
     tag.append( (char) c );
     c = getC( );
   }
   if ( c == EOF )
   {
     token = tag.toString( );
     return ELEMENT;
   }
   
   for ( ; ; )
   {
     c = skipSpace( c );
     if ( c == EOF  ||  c == ">" || c == "/" )
       break;
     c = getAttrib( c );
   }
   if ( c == "/" )
   {
     tag.append( (char) c );
     for ( ; ; )
     {
       c = getC( );
       if ( c == -1  ||  c == ">" )
         break;
     }
   }
   token = tag.toString( );
   return ELEMENT;
 }
 
 private int getAttrib( int c )
 {
   AttribPair a = new AttribPair( );
   StringBuffer s = new StringBuffer( );
   while ( c != EOF  &&  c != "="  &&  ! Character.isWhitespace( (char)c ) )
   {
     s.append( (char) c );
     c = getC( );
   }
   a.attrib = s.toString( );
   c = skipSpace( c );
   if ( c != "=" )
   {
     attribs.add( a );
     return c;
   }
   s = new StringBuffer( );
   c = skipSpace( getC( ) );
   if ( c == "\"" || c == """ )
   {
     int quote = c;
     for ( ; ; )
     {
       c = getC( );
       if ( c == -1 )
         break;
       if ( c == quote )
       {
         c = getC( );
         break;
       }
       if ( c == "\\" )
       {
         c = getC( );
         if ( c == EOF )
           break;
         s.append( "\\" );
         s.append( (char) c );
       }
       else
       {
         s.append( (char) c );
       }
     }
   }
   else
   {
     for ( ; ; )
     {
       c = getC( );
       if ( c == -1 )
         break;
       if ( c == ">"  ||  c == "/"  ||  Character.isWhitespace( (char)c ) )
       {
         c = getC( );
         break;
       }
       s.append( (char) c );
     }
   }
   a.value = s.toString( );
   attribs.add( a );
   return c;
 }
 
 /**
  * One attribute of an element: its name and, when present, its value.
  */
 class AttribPair
 {
   // Attribute name as it appeared in the input.
   String attrib;
   // Attribute value; left unset (null) when the attribute had no '='.
   String value;
 }
 
 private int getSpecialElement(  )
 {
   StringBuffer text = new StringBuffer( );
   text.append( "<!" ); //$NON-NLS-1$
   for ( ; ; )
   {
     int c = getC( );
     if ( c == EOF || c == ">" )
       break;
     text.append( (char) c );
   }
   text.append( ">" );
   token = text.toString( );
   if ( token.startsWith( "
  



HTML parser based on HTMLEditorKit.ParserCallback

   <source lang="java">

import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.net.URL; import javax.swing.text.MutableAttributeSet; import javax.swing.text.html.HTML; import javax.swing.text.html.HTMLEditorKit; import javax.swing.text.html.parser.ParserDelegator; public class Main {

 /**
  * Downloads the page named by the first argument and feeds it to the
  * HTML parser with an {@code HTMLParse} callback.
  *
  * @param args command-line arguments; {@code args[0]} is the page URL
  * @throws Exception if the URL is malformed or the page cannot be read
  */
 public static void main(String args[]) throws Exception {
   URL url = new URL(args[0]);
   // openStream() always yields an InputStream; casting the result of
   // getContent() fails when a content handler returns another type.
   Reader reader = new InputStreamReader(url.openStream());
   try {
     new ParserDelegator().parse(reader, new HTMLParse(), false);
   } finally {
     // Release the connection even when parsing fails.
     reader.close();
   }
 }

} class HTMLParse extends HTMLEditorKit.ParserCallback {

 /**
  * Prints one text run on its own line.
  *
  * @param data the characters of the text run
  * @param pos  position reported by the parser (unused)
  */
 public void handleText(char[] data, int pos) {
   String text = String.valueOf(data);
   System.out.println(text);
 }
 /**
  * Prints the name of an opened tag, prefixed with {@code +}.
  *
  * @param t   the tag that was opened
  * @param a   the attributes of the tag (unused)
  * @param pos position reported by the parser (unused)
  */
 public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
   String tagName = t.toString();
   System.out.println("+" + tagName);
 }
 /**
  * Prints the name of a simple tag, prefixed with {@code *}.
  *
  * @param t   the tag that was encountered
  * @param a   the attributes of the tag (unused)
  * @param pos position reported by the parser (unused)
  */
 public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
   String tagName = t.toString();
   System.out.println("*" + tagName);
 }
 /**
  * Prints the name of a closed tag, prefixed with {@code -}.
  *
  * @param t   the tag that was closed
  * @param pos position reported by the parser (unused)
  */
 public void handleEndTag(HTML.Tag t, int pos) {
   String tagName = t.toString();
   System.out.println("-" + tagName);
 }

}</source>





Using javax.swing.text.html.HTMLEditorKit to parse an HTML document

   <source lang="java">

import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.util.ArrayList; import java.util.List; import javax.swing.text.MutableAttributeSet; import javax.swing.text.html.HTML.Tag; import javax.swing.text.html.HTMLEditorKit.ParserCallback; import javax.swing.text.html.parser.ParserDelegator; public class Main {

 /**
  * Parses {@code a.html} from the working directory and prints the list of
  * text runs it contains.
  *
  * @param args unused
  * @throws Exception if the file cannot be opened or read
  */
 public static void main(String[] args) throws Exception {
   final List<String> list = new ArrayList<String>();
   // Only text matters here; the other ParserCallback methods keep their
   // inherited no-op behaviour.
   ParserCallback parserCallback = new ParserCallback() {
     public void handleText(final char[] data, final int pos) {
       list.add(new String(data));
     }
   };
   Reader reader = new FileReader("a.html");
   try {
     new ParserDelegator().parse(reader, parserCallback, true);
   } finally {
     // Close the file even when parsing fails.
     reader.close();
   }
   System.out.println(list);
 }

}</source>