Java Tutorial/Network/HTML Parser

Материал из Java эксперт
Версия от 05:03, 1 июня 2010; Admin (обсуждение | вклад) (1 версия)
(разн.) ← Предыдущая | Текущая версия (разн.) | Следующая → (разн.)
Перейти к: навигация, поиск

Escape HTML special characters from a String

/**
 * Demonstrates escaping of HTML markup characters and a fixed set of
 * accented letters and symbols into named HTML entities.
 */
public class Main {
  public static void main(String[] argv){
    System.out.println(escapeHTML("><"));
  }

  /**
   * Returns a copy of {@code s} in which every character that has a mapping
   * in {@link #entityName(char)} is replaced by its {@code &name;} entity.
   * All other characters are copied unchanged.
   *
   * @param s the text to escape; must not be {@code null}
   * @return the escaped text (empty when {@code s} is empty)
   * @throws NullPointerException if {@code s} is {@code null}
   */
  public static final String escapeHTML(String s) {
    int n = s.length();
    // Single-threaded local buffer, so the unsynchronized builder is enough.
    StringBuilder sb = new StringBuilder(n + 16);
    for (int i = 0; i < n; i++) {
      char c = s.charAt(i);
      String name = entityName(c);
      if (name == null) {
        sb.append(c);
      } else {
        sb.append('&').append(name).append(';');
      }
    }
    return sb.toString();
  }

  /**
   * Looks up the entity name (without the leading ampersand and trailing
   * semicolon) for a character, or returns {@code null} when the character
   * is emitted as-is. Non-ASCII labels are written as Unicode escapes so the
   * source does not depend on the file encoding.
   */
  private static String entityName(char c) {
    switch (c) {
    case '<':      return "lt";
    case '>':      return "gt";
    case '&':      return "amp";
    case '"':      return "quot";
    case '\u00E0': return "agrave";
    case '\u00C0': return "Agrave";
    case '\u00E2': return "acirc";
    case '\u00C2': return "Acirc";
    case '\u00E4': return "auml";
    case '\u00C4': return "Auml";
    case '\u00E5': return "aring";
    case '\u00C5': return "Aring";
    case '\u00E6': return "aelig";
    case '\u00C6': return "AElig";
    case '\u00E7': return "ccedil";
    case '\u00C7': return "Ccedil";
    case '\u00E9': return "eacute";
    case '\u00C9': return "Eacute";
    case '\u00E8': return "egrave";
    case '\u00C8': return "Egrave";
    case '\u00EA': return "ecirc";
    case '\u00CA': return "Ecirc";
    case '\u00EB': return "euml";
    case '\u00CB': return "Euml";
    case '\u00EF': return "iuml";
    case '\u00CF': return "Iuml";
    case '\u00F4': return "ocirc";
    case '\u00D4': return "Ocirc";
    case '\u00F6': return "ouml";
    case '\u00D6': return "Ouml";
    case '\u00F8': return "oslash";
    case '\u00D8': return "Oslash";
    case '\u00DF': return "szlig";
    case '\u00F9': return "ugrave";
    case '\u00D9': return "Ugrave";
    case '\u00FB': return "ucirc";
    case '\u00DB': return "Ucirc";
    case '\u00FC': return "uuml";
    case '\u00DC': return "Uuml";
    case '\u00AE': return "reg";
    case '\u00A9': return "copy";
    case '\u20AC': return "euro";
    // A plain space becomes a non-breaking space so runs of blanks survive rendering.
    case ' ':      return "nbsp";
    default:       return null;
    }
  }
}
//&gt;&lt;





extends HTMLEditorKit.ParserCallback

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Downloads the page whose URL is given as the first command-line argument
 * and feeds it to the Swing HTML parser with a {@link TextOnly} callback.
 */
public class Main {
  /**
   * @param args {@code args[0]} is the URL of the HTML page to parse
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    URL url = new URL(args[0]);
    // openStream() always returns the raw stream, whereas getContent() may
    // return a non-stream object for some content types and break the cast.
    Reader reader = new InputStreamReader(url.openStream());
    try {
      new ParserDelegator().parse(reader, new TextOnly(), false);
    } finally {
      // Release the connection even when parsing fails.
      reader.close();
    }
  }
}
/**
 * Parser callback that writes each run of document text reported by the
 * parser to standard output, one run per line. Tags are ignored.
 */
class TextOnly extends HTMLEditorKit.ParserCallback {
  /**
   * Prints the characters of one text run followed by a line break.
   *
   * @param data the characters of the text run
   * @param pos  the position reported by the parser (unused)
   */
  @Override
  public void handleText(char[] data, int pos) {
    final String run = String.valueOf(data);
    System.out.println(run);
  }
}





Extract links from an HTML page

import java.io.FileReader;
import java.util.ArrayList;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Collects the {@code href} of every anchor start tag in the local file
 * {@code a.html} and prints the resulting list.
 */
public class Main {
  /**
   * @param args unused
   * @throws Exception if {@code a.html} cannot be opened or read
   */
  public final static void main(String[] args) throws Exception {
    final ArrayList<String> list = new ArrayList<String>();
    ParserDelegator parserDelegator = new ParserDelegator();
    // Only start tags matter here; every other callback keeps the no-op
    // implementation inherited from ParserCallback.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
        if (tag == Tag.A) {
          String address = (String) attribute.getAttribute(Attribute.HREF);
          // Anchors without an href carry no link, so they are skipped
          // instead of adding null entries to the result.
          if (address != null) {
            list.add(address);
          }
        }
      }
    };
    FileReader reader = new FileReader("a.html");
    try {
      parserDelegator.parse(reader, parserCallback, false);
    } finally {
      // Release the file handle even when parsing fails.
      reader.close();
    }
    System.out.println(list);
  }
}





Find and display hyperlinks contained within a web page

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Reads the local file {@code a.htm} and prints the {@code href} value of
 * every anchor tag found in it, one per line.
 */
public class Main {
  /**
   * Matches an opening anchor tag and captures its double-quoted, non-empty
   * href value. The tag name must be exactly "a" (followed by whitespace) and
   * the search for href never crosses the closing '>' of the tag, so several
   * links on one line are all reported and tags such as abbr are not matched.
   */
  private static final Pattern LINK_PATTERN =
      Pattern.compile("<a\\s(?:[^>]*?\\s)?href=\"([^\"]+)\"");

  /**
   * @param arguments unused
   * @throws Exception if {@code a.htm} cannot be opened or read
   */
  public static void main(String[] arguments)throws Exception {
    StringBuilder output = new StringBuilder();
    BufferedReader buff = new BufferedReader(new FileReader("a.htm"));
    try {
      String line;
      while ((line = buff.readLine()) != null) {
        output.append(line).append('\n');
      }
    } finally {
      // Release the file handle even when reading fails.
      buff.close();
    }
    Matcher matcher = LINK_PATTERN.matcher(output);
    while (matcher.find()) {
      System.out.println(matcher.group(1));
    }
  }
}





Get all hyper links from a web page

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Writes an HTML page to standard output whose body is produced by a
 * {@link LinkPage} callback while parsing the page at the URL given as the
 * first command-line argument.
 */
public class Main {
  /**
   * @param args {@code args[0]} is the URL of the HTML page to parse
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    URL url = new URL(args[0]);
    // openStream() always returns the raw stream, whereas getContent() may
    // return a non-stream object for some content types and break the cast.
    Reader reader = new InputStreamReader(url.openStream());
    try {
      System.out.println("<HTML><HEAD><TITLE>Links for " + args[0] + "</TITLE>");
      // The BASE element lets relative links in the output resolve against the source page.
      System.out.println("<BASE HREF=\"" + args[0] + "\"></HEAD>");
      System.out.println("<BODY>");
      new ParserDelegator().parse(reader, new LinkPage(), false);
      System.out.println("</BODY></HTML>");
    } finally {
      // Release the connection even when parsing fails.
      reader.close();
    }
  }
}
/**
 * Parser callback that prints one HTML anchor line for every anchor start
 * tag that carries an {@code href} attribute.
 */
class LinkPage extends HTMLEditorKit.ParserCallback {
  /**
   * Prints {@code <A HREF="target">target</A><BR>} for an anchor tag with an
   * href; every other tag, and anchors without an href, produce no output.
   *
   * @param t   the tag that was opened
   * @param a   the attributes of the tag
   * @param pos the position reported by the parser (unused)
   */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    if (t != HTML.Tag.A) {
      return;
    }
    Object href = a.getAttribute(HTML.Attribute.HREF);
    if (href == null) {
      // Named anchors (no href) are not links.
      return;
    }
    // The href comes from an arbitrary page, so escape it before writing it into markup.
    String target = escape(href.toString());
    System.out.println("<A HREF=\"" + target + "\">" + target + "</A><BR>");
  }

  /** Escapes the characters that would break out of an attribute value or text node. */
  private static String escape(String s) {
    return s.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace("\"", "&quot;");
  }
}





Getting the Links in an HTML Document

import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.EditorKit;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Loads a web page into an {@link HTMLDocument} and prints the {@code href}
 * of every anchor element found by the document's tag iterator.
 */
public class Main {
  /**
   * @param argv unused
   * @throws Exception if the page cannot be fetched or parsed
   */
  public static void main(String[] argv) throws Exception {
    URL url = new URI("http://www.google.ru").toURL();
    URLConnection conn = url.openConnection();
    Reader rd = new InputStreamReader(conn.getInputStream());
    EditorKit kit = new HTMLEditorKit();
    HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
    try {
      kit.read(rd, doc, 0);
    } finally {
      // Release the connection even when parsing fails.
      rd.close();
    }
    for (HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A); it.isValid(); it.next()) {
      // getAttributes() is declared to return AttributeSet; use that type
      // instead of down-casting to one particular implementation.
      javax.swing.text.AttributeSet attrs = it.getAttributes();
      if (attrs == null) {
        continue;
      }
      Object link = attrs.getAttribute(HTML.Attribute.HREF);
      if (link != null) {
        System.out.println(link);
      }
    }
  }
}





Getting the Text in an HTML Document

import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
/**
 * Prints the text content of a web page by giving the editor kit an
 * {@link HTMLDocument} whose reader callback echoes every text run.
 */
public class Main {
  /**
   * @param argv unused
   * @throws Exception if the page cannot be fetched or parsed
   */
  public static void main(String[] argv) throws Exception {
    // The document is only used as a hook: its reader prints text instead of building content.
    HTMLDocument doc = new HTMLDocument() {
      @Override
      public HTMLEditorKit.ParserCallback getReader(int pos) {
        return new HTMLEditorKit.ParserCallback() {
          @Override
          public void handleText(char[] data, int pos) {
            System.out.println(data);
          }
        };
      }
    };
    URL url = new URI("http://www.google.ru").toURL();
    URLConnection conn = url.openConnection();
    Reader rd = new InputStreamReader(conn.getInputStream());
    try {
      EditorKit kit = new HTMLEditorKit();
      kit.read(rd, doc, 0);
    } finally {
      // Release the connection even when parsing fails.
      rd.close();
    }
  }
}





HTML Parser

/*******************************************************************************
 * Copyright (c) 2004 Actuate Corporation.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *  Actuate Corporation  - initial API and implementation
 *******************************************************************************/

import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.ArrayList;
/**
 * A small, forgiving pull parser for HTML files. After {@link #open(String)},
 * each call to {@link #getToken()} reads the next token and returns its kind
 * ({@link #TEXT}, {@link #ELEMENT}, {@link #COMMENT}, {@link #SPECIAL_ELEMENT}
 * or {@link #EOF}); the accessor methods then describe that token.
 * <p>
 * Known limitations: a comment or {@code <!...>} declaration ends at the first
 * '&gt;', and an unquoted attribute value ends at the first '/', '&gt;' or
 * whitespace character.
 */
public class HTMLParser
{
  FileReader reader;
  LineNumberReader in;
  /** Text of the most recently read token; for elements, the tag name with a leading or trailing '/'. */
  String token;
  /** Attributes of the most recently read start or single element. */
  ArrayList<AttribPair> attribs = new ArrayList<AttribPair>( );
  /** One-character push-back slot; -1 when empty. */
  int pushC = -1;
  private boolean ignoreWhitespace = true;
  
  // Token kinds returned by getToken().
  public static final int EOF = -1;
  public static final int TEXT = 1;
  public static final int DOCTYPE = 2;
  public static final int ELEMENT = 3;
  public static final int COMMENT = 4;
  public static final int SPECIAL_ELEMENT = 5;
  
  // Element kinds returned by getElementType().
  public static final int START_ELEMENT = 0;
  public static final int END_ELEMENT = 1;
  public static final int SINGLE_ELEMENT = 2;
  
  public HTMLParser( )
  {
  }
  
  /**
   * Opens the file to parse.
   *
   * @param fileName path of the HTML file
   * @throws FileNotFoundException if the file cannot be opened for reading
   */
  public void open( String fileName ) throws FileNotFoundException
  {
    reader = new FileReader( fileName );
    in = new LineNumberReader( reader );
  }
  
  /**
   * Closes the underlying readers. Safe to call when nothing was opened;
   * I/O errors while closing are ignored.
   */
  public void close( )
  {
    try
    {
      if ( in != null )
        in.close( );
      if ( reader != null )
        reader.close( );
    }
    catch ( IOException e1 )
    {
      // Ignore
    }
  }

  /** Returns the raw text of the current token. */
  public String getTokenText( )
  {
    return token;
  }
  
  /**
   * Classifies the current element token as {@link #END_ELEMENT} (leading '/'),
   * {@link #SINGLE_ELEMENT} (trailing '/') or {@link #START_ELEMENT}.
   */
  public int getElementType( )
  {
    if ( token.startsWith( "/" ) ) //$NON-NLS-1$
      return END_ELEMENT;
    if ( token.endsWith( "/" ) ) //$NON-NLS-1$
      return SINGLE_ELEMENT;
    return START_ELEMENT;
  }
  
  /** Returns the tag name of the current element token without any leading or trailing '/'. */
  public String getElement( )
  {
    if ( token.startsWith( "/" ) ) //$NON-NLS-1$
      return token.substring( 1 );
    if ( token.endsWith( "/" ) ) //$NON-NLS-1$
      return token.substring( 0, token.length( ) - 1 );
    return token;
    
  }
  
  /** Returns the live list of attributes collected for the current element. */
  public ArrayList<AttribPair> getAttribs( )
  {
    return attribs;
  }
  
  /**
   * Returns the value of the named attribute of the current element.
   *
   * @param name attribute name, compared case-insensitively
   * @return the value, or {@code null} if the attribute is absent or has no value
   */
  public String getAttrib( String name )
  {
    for ( int i = 0;  i < attribs.size( );  i++ )
    {
      AttribPair a = attribs.get( i );
      if ( a.attrib.equalsIgnoreCase( name ) )
        return a.value;
    }
    return null;
  }
  
  /** Reads the next character, honoring the push-back slot; I/O errors are reported as EOF. */
  private int getC( )
  {
    if ( pushC != -1 )
    {
      int c = pushC;
      pushC = -1;
      return c;
    }
    try
    {
      return in.read( );
    }
    catch ( IOException e )
    {
      return EOF;
    }
  }
  
  /** Pushes one character back so the next getC() returns it. */
  private void pushC( int c )
  {
    pushC = c;
  }
  
  /**
   * Reads the next token from the input.
   *
   * @return the token kind; whitespace-only text is skipped while
   *         whitespace is being ignored
   */
  public int getToken( )
  {
    for ( ; ; )
    {
      int c = getC( );
      switch ( c )
      {
        case -1:
          return EOF;
        case '<':
          return getElement( c );
        default:
        {
          parseText( c );
          if ( ! ignoreWhitespace  ||  token.trim( ).length( ) > 0 )
            return TEXT;
        }
      }
    }
  }

  /** Collects text starting with {@code c} up to (not including) the next '<' or EOF. */
  private int parseText( int c )
  {
    StringBuilder text = new StringBuilder( );
    for ( ; ; )
    {
      if ( c == EOF )
        break;
      if ( c == '<' )
      {
        pushC( c );
        break;
      }
      
      // Convert MS-Word-style quotes.
      
      if ( c == 8220  ||  c == 8221 )
        text.append( "&quot;" );
      else
        text.append( (char) c );
      c = getC( );
    }
    token = text.toString( );
    return TEXT;
  }

  /** Skips whitespace starting at {@code c} and returns the first non-whitespace character (or EOF). */
  private int skipSpace( int c )
  {
    while ( c != EOF  &&  Character.isWhitespace( (char)c ) )
    {
      c = getC( );
    }
    return c;
  }
  
  /**
   * Parses an element after its opening '<' has been read: the tag name,
   * any attributes, and the closing '>'.
   */
  private int getElement( int c )
  {
    c = getC( );
    
    // Broken element
    
    if ( c == EOF )
      return EOF;
    
    if ( c == '!' )
      return getSpecialElement( );
    
    attribs.clear( );
    c = skipSpace( c );
    if ( c == EOF )
      return EOF;
    
    StringBuilder tag = new StringBuilder( );
    if ( c == '/' )
    {
      // End tag: keep the leading '/', read the name, discard the rest up to '>'.
      tag.append( (char) c );
      c = skipSpace( getC( ) );
      while ( c != EOF  &&  c != '>'  && ! Character.isWhitespace( (char)c ) )
      {
        tag.append( (char) c );
        c = getC( );
      }
      token = tag.toString( );
      for ( ; ; )
      {
        if ( c == '>'  ||  c == -1 )
          break;
        c = getC( );
      }
      return ELEMENT;     
    }
    
    while ( c != EOF  &&  c != '>'  &&  c != '/'  && ! Character.isWhitespace( (char)c ) )
    {
      tag.append( (char) c );
      c = getC( );
    }
    if ( c == EOF )
    {
      token = tag.toString( );
      return ELEMENT;
    }
    
    for ( ; ; )
    {
      c = skipSpace( c );
      if ( c == EOF  ||  c == '>' || c == '/' )
        break;
      c = getAttrib( c );
    }
    if ( c == '/' )
    {
      // Self-closing element: record the trailing '/' and consume through '>'.
      tag.append( (char) c );
      for ( ; ; )
      {
        c = getC( );
        if ( c == -1  ||  c == '>' )
          break;
      }
    }
    token = tag.toString( );
    return ELEMENT;
  }
  
  /**
   * Parses one attribute whose first character is {@code c} and appends it
   * to the attribute list.
   *
   * @return the first character after the attribute that has not been consumed
   */
  private int getAttrib( int c )
  {
    AttribPair a = new AttribPair( );
    StringBuilder s = new StringBuilder( );
    // The name also ends at '>' or '/', so a valueless attribute directly
    // before the end of the tag does not swallow the tag terminator.
    while ( c != EOF  &&  c != '='  &&  c != '>'  &&  c != '/'
        &&  ! Character.isWhitespace( (char)c ) )
    {
      s.append( (char) c );
      c = getC( );
    }
    a.attrib = s.toString( );
    c = skipSpace( c );
    if ( c != '=' )
    {
      attribs.add( a );
      return c;
    }
    s = new StringBuilder( );
    c = skipSpace( getC( ) );
    if ( c == '\'' || c == '"' )
    {
      int quote = c;
      for ( ; ; )
      {
        c = getC( );
        if ( c == -1 )
          break;
        if ( c == quote )
        {
          c = getC( );
          break;
        }
        if ( c == '\\' )
        {
          // Keep backslash escapes verbatim so an escaped quote does not end the value.
          c = getC( );
          if ( c == EOF )
            break;
          s.append( '\\' );
          s.append( (char) c );
        }
        else
        {
          s.append( (char) c );
        }
      }
    }
    else
    {
      // Unquoted value: c already holds its first character. The terminator
      // is returned unconsumed so the caller still sees '>' or '/'.
      while ( c != EOF  &&  c != '>'  &&  c != '/'
          &&  ! Character.isWhitespace( (char)c ) )
      {
        s.append( (char) c );
        c = getC( );
      }
    }
    a.value = s.toString( );
    attribs.add( a );
    return c;
  }
  
  /** Name/value pair for one attribute; {@code value} is null when the attribute has no '='. */
  class AttribPair
  {
    String attrib;
    String value;
  }
  
  /**
   * Reads a {@code <!...>} construct up to the first '>' and stores its full
   * text as the token.
   *
   * @return {@link #COMMENT} if the text starts with {@code <!--},
   *         otherwise {@link #SPECIAL_ELEMENT}
   */
  private int getSpecialElement(  )
  {
    StringBuilder text = new StringBuilder( );
    text.append( "<!" ); //$NON-NLS-1$
    for ( ; ; )
    {
      int c = getC( );
      if ( c == EOF || c == '>' )
        break;
      text.append( (char) c );
    }
    text.append( ">" );
    token = text.toString( );
    if ( token.startsWith( "<!--" ) ) //$NON-NLS-1$
      return COMMENT;
    return SPECIAL_ELEMENT;
  }

  /** Tag names reported as format tags by isFormatTag(). */
  static String formatTags[ ] =
  {
      "i", "b",  //$NON-NLS-1$//$NON-NLS-2$
      "strong", "em",  //$NON-NLS-1$//$NON-NLS-2$
      "code", "span", //$NON-NLS-1$ //$NON-NLS-2$
      "a" //$NON-NLS-1$
  };
  
  /** Returns whether the current element is one of the format tags. */
  public boolean isFormatTag( )
  {
    return isFormatTag( getElement( ) );
  }
  
  /** Returns whether {@code tag} is one of the format tags, ignoring case. */
  public boolean isFormatTag( String tag )
  {
    for ( int i = 0;  i < formatTags.length;  i++ )
    {
      if ( formatTags[ i ].equalsIgnoreCase( tag ) )
        return true;
    }
    return false;
  }

  /**
   * Rebuilds the markup of the current element from its name and attributes.
   * Every attribute is written as {@code name="value"}, with an empty value
   * when none was present.
   *
   * @return the element markup as a String
   */
  public Object getFullElement( )
  {
    StringBuilder text = new StringBuilder( );
    text.append( "<" );
    int elementType = getElementType( );
    if ( elementType == END_ELEMENT )
      text.append( "/" );
    text.append( getElement( ) );
    
    for ( int i = 0;  i < attribs.size( );  i++ )
    {
      text.append( " " );
      AttribPair a = attribs.get( i );
      text.append( a.attrib );
      text.append( "=\"" ); //$NON-NLS-1$
      if ( a.value != null )
        text.append( a.value );
      text.append( "\"" ); //$NON-NLS-1$
    }
    if ( elementType == SINGLE_ELEMENT )
      text.append( "/" );
    text.append( ">" );
    return text.toString( );
  }

  /** Returns the current line number reported by the underlying LineNumberReader. */
  public int getLineNo( )
  {
    return in.getLineNumber( );
  }

  /** Sets whether whitespace-only text tokens are skipped by getToken(). */
  public void ignoreWhitespace( boolean b )
  {
    ignoreWhitespace = b;
  }
}





HTML parser based on HTMLEditorKit.ParserCallback

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Downloads the page whose URL is given as the first command-line argument
 * and traces its tags and text through an {@link HTMLParse} callback.
 */
public class Main {
  /**
   * @param args {@code args[0]} is the URL of the HTML page to parse
   * @throws Exception if the URL is malformed or the page cannot be read
   */
  public static void main(String args[]) throws Exception {
    URL url = new URL(args[0]);
    // openStream() always returns the raw stream, whereas getContent() may
    // return a non-stream object for some content types and break the cast.
    Reader reader = new InputStreamReader(url.openStream());
    try {
      new ParserDelegator().parse(reader, new HTMLParse(), false);
    } finally {
      // Release the connection even when parsing fails.
      reader.close();
    }
  }
}
/**
 * Parser callback that traces parse events on standard output: text runs
 * are printed as-is, start tags are prefixed with "+", simple tags with "*"
 * and end tags with "-".
 */
class HTMLParse extends HTMLEditorKit.ParserCallback {
  /** Prints one text run on its own line. */
  @Override
  public void handleText(char[] data, int pos) {
    System.out.println(String.valueOf(data));
  }

  /** Prints the name of an opened tag, prefixed with "+". */
  @Override
  public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    report("+", t);
  }

  /** Prints the name of a simple (self-contained) tag, prefixed with "*". */
  @Override
  public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
    report("*", t);
  }

  /** Prints the name of a closed tag, prefixed with "-". */
  @Override
  public void handleEndTag(HTML.Tag t, int pos) {
    report("-", t);
  }

  /** Writes the marker immediately followed by the tag's string form. */
  private static void report(String marker, HTML.Tag tag) {
    final String line = marker + tag.toString();
    System.out.println(line);
  }
}





Using javax.swing.text.html.HTMLEditorKit to parse html document

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Collects every text run of the local file {@code a.html} with the Swing
 * HTML parser and prints the resulting list.
 */
public class Main {
  /**
   * @param args unused
   * @throws Exception if {@code a.html} cannot be opened or read
   */
  public static void main(String[] args) throws Exception {
    final List<String> list = new ArrayList<String>();
    ParserDelegator parserDelegator = new ParserDelegator();
    // Only text matters here; every other callback keeps the no-op
    // implementation inherited from ParserCallback.
    ParserCallback parserCallback = new ParserCallback() {
      @Override
      public void handleText(final char[] data, final int pos) {
        list.add(new String(data));
      }
    };
    Reader reader = new FileReader("a.html");
    try {
      // true: ignore any charset declared inside the document.
      parserDelegator.parse(reader, parserCallback, true);
    } finally {
      // Release the file handle even when parsing fails.
      reader.close();
    }
    System.out.println(list);
  }
}