Java/Network Protocol/Crawler

Материал из Java эксперт
Версия от 18:01, 31 мая 2010; (обсуждение)
(разн.) ← Предыдущая | Текущая версия (разн.) | Следующая → (разн.)
Перейти к: навигация, поиск

Search Crawler

 
// The SearchCrawler class is shown here and is examined in detail in the
// following sections. Notice that it extends JFrame:
/*
 * Chapter 6 - Crawling the Web with Java 
 * The Art of Java 
 * by Herbert Schildt and James Holmes 
 * McGraw-Hill/Osborne 2003
 *  
 */
import java.awt.BorderLayout;
import java.awt.Cursor;
import java.awt.Font;
import java.awt.GridBagConstraints;
import java.awt.GridBagLayout;
import java.awt.Insets;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.KeyEvent;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.BorderFactory;
import javax.swing.JButton;
import javax.swing.JCheckBox;
import javax.swing.JComboBox;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JMenu;
import javax.swing.JMenuBar;
import javax.swing.JMenuItem;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JProgressBar;
import javax.swing.JScrollPane;
import javax.swing.JSeparator;
import javax.swing.JTable;
import javax.swing.JTextField;
import javax.swing.table.DefaultTableModel;
// The Search Web Crawler
// The Search Web Crawler: a Swing application that crawls the web from a
// start URL, honoring robots.txt, and logs pages matching a search string.
public class SearchCrawler extends JFrame {
  // Max URLs drop-down values.
  private static final String[] MAX_URLS = { "50", "100", "500", "1000" };
  // Cache of robot disallow lists, keyed by lowercase host name.
  private HashMap<String, ArrayList<String>> disallowListCache =
      new HashMap<String, ArrayList<String>>();
  // Search GUI controls.
  private JTextField startTextField;
  private JComboBox maxComboBox;
  private JCheckBox limitCheckBox;
  private JTextField logTextField;
  private JTextField searchTextField;
  private JCheckBox caseCheckBox;
  private JButton searchButton;
  // Search stats GUI controls.
  private JLabel crawlingLabel2;
  private JLabel crawledLabel2;
  private JLabel toCrawlLabel2;
  private JProgressBar progressBar;
  private JLabel matchesLabel2;
  // Table listing search matches.
  private JTable table;
  // Flag for whether or not crawling is underway.
  private boolean crawling;
  // Matches log file print writer.
  private PrintWriter logFileWriter;
  // Constructor for Search Web Crawler: builds the whole GUI.
  public SearchCrawler() {
    // Set application title.
    setTitle("Search Crawler");
    // Set window size.
    setSize(600, 600);
    // Handle window closing events.
    addWindowListener(new WindowAdapter() {
      public void windowClosing(WindowEvent e) {
        actionExit();
      }
    });
    // Set up File menu.
    JMenuBar menuBar = new JMenuBar();
    JMenu fileMenu = new JMenu("File");
    fileMenu.setMnemonic(KeyEvent.VK_F);
    JMenuItem fileExitMenuItem = new JMenuItem("Exit", KeyEvent.VK_X);
    fileExitMenuItem.addActionListener(new ActionListener() {
      public void actionPerformed(ActionEvent e) {
        actionExit();
      }
    });
    fileMenu.add(fileExitMenuItem);
    menuBar.add(fileMenu);
    setJMenuBar(menuBar);
    // Set up search panel.
    JPanel searchPanel = new JPanel();
    GridBagConstraints constraints;
    GridBagLayout layout = new GridBagLayout();
    searchPanel.setLayout(layout);
    JLabel startLabel = new JLabel("Start URL:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(startLabel, constraints);
    searchPanel.add(startLabel);
    startTextField = new JTextField();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(startTextField, constraints);
    searchPanel.add(startTextField);
    JLabel maxLabel = new JLabel("Max URLs to Crawl:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(maxLabel, constraints);
    searchPanel.add(maxLabel);
    maxComboBox = new JComboBox(MAX_URLS);
    maxComboBox.setEditable(true);
    constraints = new GridBagConstraints();
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(maxComboBox, constraints);
    searchPanel.add(maxComboBox);
    limitCheckBox = new JCheckBox("Limit crawling to Start URL site");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.WEST;
    constraints.insets = new Insets(0, 10, 0, 0);
    layout.setConstraints(limitCheckBox, constraints);
    searchPanel.add(limitCheckBox);
    JLabel blankLabel = new JLabel();
    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    layout.setConstraints(blankLabel, constraints);
    searchPanel.add(blankLabel);
    JLabel logLabel = new JLabel("Matches Log File:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(logLabel, constraints);
    searchPanel.add(logLabel);
    // Default log file lives in the current working directory.
    String file = System.getProperty("user.dir")
        + System.getProperty("file.separator") + "crawler.log";
    logTextField = new JTextField(file);
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(logTextField, constraints);
    searchPanel.add(logTextField);
    JLabel searchLabel = new JLabel("Search String:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(searchLabel, constraints);
    searchPanel.add(searchLabel);
    searchTextField = new JTextField();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.insets = new Insets(5, 5, 0, 0);
    constraints.gridwidth = 2;
    constraints.weightx = 1.0d;
    layout.setConstraints(searchTextField, constraints);
    searchPanel.add(searchTextField);
    caseCheckBox = new JCheckBox("Case Sensitive");
    constraints = new GridBagConstraints();
    constraints.insets = new Insets(5, 5, 0, 5);
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    layout.setConstraints(caseCheckBox, constraints);
    searchPanel.add(caseCheckBox);
    searchButton = new JButton("Search");
    searchButton.addActionListener(new ActionListener() {
      public void actionPerformed(ActionEvent e) {
        actionSearch();
      }
    });
    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 5, 5);
    layout.setConstraints(searchButton, constraints);
    searchPanel.add(searchButton);
    JSeparator separator = new JSeparator();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 5, 5);
    layout.setConstraints(separator, constraints);
    searchPanel.add(separator);
    JLabel crawlingLabel1 = new JLabel("Crawling:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(crawlingLabel1, constraints);
    searchPanel.add(crawlingLabel1);
    crawlingLabel2 = new JLabel();
    crawlingLabel2.setFont(crawlingLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(crawlingLabel2, constraints);
    searchPanel.add(crawlingLabel2);
    JLabel crawledLabel1 = new JLabel("Crawled URLs:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(crawledLabel1, constraints);
    searchPanel.add(crawledLabel1);
    crawledLabel2 = new JLabel();
    crawledLabel2.setFont(crawledLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(crawledLabel2, constraints);
    searchPanel.add(crawledLabel2);
    JLabel toCrawlLabel1 = new JLabel("URLs to Crawl:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(toCrawlLabel1, constraints);
    searchPanel.add(toCrawlLabel1);
    toCrawlLabel2 = new JLabel();
    toCrawlLabel2.setFont(toCrawlLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(toCrawlLabel2, constraints);
    searchPanel.add(toCrawlLabel2);
    JLabel progressLabel = new JLabel("Crawling Progress:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(progressLabel, constraints);
    searchPanel.add(progressLabel);
    progressBar = new JProgressBar();
    progressBar.setMinimum(0);
    progressBar.setStringPainted(true);
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(progressBar, constraints);
    searchPanel.add(progressBar);
    JLabel matchesLabel1 = new JLabel("Search Matches:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 10, 0);
    layout.setConstraints(matchesLabel1, constraints);
    searchPanel.add(matchesLabel1);
    matchesLabel2 = new JLabel();
    matchesLabel2.setFont(matchesLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 10, 5);
    layout.setConstraints(matchesLabel2, constraints);
    searchPanel.add(matchesLabel2);
    // Set up matches table (read-only).
    table = new JTable(new DefaultTableModel(new Object[][] {},
        new String[] { "URL" }) {
      public boolean isCellEditable(int row, int column) {
        return false;
      }
    });
    // Set up Matches panel.
    JPanel matchesPanel = new JPanel();
    matchesPanel.setBorder(BorderFactory.createTitledBorder("Matches"));
    matchesPanel.setLayout(new BorderLayout());
    matchesPanel.add(new JScrollPane(table), BorderLayout.CENTER);
    // Add panels to display.
    getContentPane().setLayout(new BorderLayout());
    getContentPane().add(searchPanel, BorderLayout.NORTH);
    getContentPane().add(matchesPanel, BorderLayout.CENTER);
  }
  // Exit this program.
  private void actionExit() {
    System.exit(0);
  }
  // Handle Search/Stop button being clicked: validate inputs, then start
  // (or stop) the crawl.
  private void actionSearch() {
    // If stop button clicked, turn crawling flag off.
    if (crawling) {
      crawling = false;
      return;
    }
    ArrayList<String> errorList = new ArrayList<String>();
    // Validate that start URL has been entered.
    String startUrl = startTextField.getText().trim();
    if (startUrl.length() < 1) {
      errorList.add("Missing Start URL.");
    }
    // Verify start URL.
    else if (verifyUrl(startUrl) == null) {
      errorList.add("Invalid Start URL.");
    }
    // Validate that Max URLs is either empty or is a positive number.
    int maxUrls = 0;
    String max = ((String) maxComboBox.getSelectedItem()).trim();
    if (max.length() > 0) {
      try {
        maxUrls = Integer.parseInt(max);
      } catch (NumberFormatException e) {
        // Non-numeric input leaves maxUrls at 0, caught just below.
      }
      if (maxUrls < 1) {
        errorList.add("Invalid Max URLs value.");
      }
    }
    // Validate that matches log file has been entered.
    String logFile = logTextField.getText().trim();
    if (logFile.length() < 1) {
      errorList.add("Missing Matches Log File.");
    }
    // Validate that search string has been entered.
    String searchString = searchTextField.getText().trim();
    if (searchString.length() < 1) {
      errorList.add("Missing Search String.");
    }
    // Show errors, if any, and return.
    if (errorList.size() > 0) {
      StringBuffer message = new StringBuffer();
      // Concatenate errors into single message.
      for (int i = 0; i < errorList.size(); i++) {
        message.append(errorList.get(i));
        if (i + 1 < errorList.size()) {
          message.append("\n");
        }
      }
      showError(message.toString());
      return;
    }
    // Remove "www" from start URL if present.
    startUrl = removeWwwFromUrl(startUrl);
    // Start the Search Crawler.
    search(logFile, startUrl, maxUrls, searchString);
  }
  // Run the crawl on a background thread so the GUI stays responsive.
  private void search(final String logFile, final String startUrl,
      final int maxUrls, final String searchString) {
    // Start the search in a new thread.
    Thread thread = new Thread(new Runnable() {
      public void run() {
        // Show hour glass cursor while crawling is under way.
        setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));
        // Disable search controls.
        startTextField.setEnabled(false);
        maxComboBox.setEnabled(false);
        limitCheckBox.setEnabled(false);
        logTextField.setEnabled(false);
        searchTextField.setEnabled(false);
        caseCheckBox.setEnabled(false);
        // Switch Search button to "Stop."
        searchButton.setText("Stop");
        // Reset stats with a fresh, read-only table model.
        table.setModel(new DefaultTableModel(new Object[][] {},
            new String[] { "URL" }) {
          public boolean isCellEditable(int row, int column) {
            return false;
          }
        });
        updateStats(startUrl, 0, 0, maxUrls);
        // Open matches log file.
        try {
          logFileWriter = new PrintWriter(new FileWriter(logFile));
        } catch (Exception e) {
          showError("Unable to open matches log file.");
          // NOTE(review): returning here leaves the controls disabled
          // until the application is restarted; preserved from the
          // original behavior.
          return;
        }
        // Turn crawling flag on.
        crawling = true;
        // Perform the actual crawling.
        crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
            searchString, caseCheckBox.isSelected());
        // Turn crawling flag off.
        crawling = false;
        // Close matches log file.
        try {
          logFileWriter.close();
        } catch (Exception e) {
          showError("Unable to close matches log file.");
        }
        // Mark search as done.
        crawlingLabel2.setText("Done");
        // Enable search controls.
        startTextField.setEnabled(true);
        maxComboBox.setEnabled(true);
        limitCheckBox.setEnabled(true);
        logTextField.setEnabled(true);
        searchTextField.setEnabled(true);
        caseCheckBox.setEnabled(true);
        // Switch search button back to "Search."
        searchButton.setText("Search");
        // Return to default cursor.
        setCursor(Cursor.getDefaultCursor());
        // Show message if search string not found.
        if (table.getRowCount() == 0) {
          JOptionPane
              .showMessageDialog(
                  SearchCrawler.this,
                  "Your Search String was not found. Please try another.",
                  "Search String Not Found",
                  JOptionPane.WARNING_MESSAGE);
        }
      }
    });
    thread.start();
  }
  // Show dialog box with error message.
  private void showError(String message) {
    JOptionPane.showMessageDialog(this, message, "Error",
        JOptionPane.ERROR_MESSAGE);
  }
  // Update crawling stats labels and progress bar.
  private void updateStats(String crawling, int crawled, int toCrawl,
      int maxUrls) {
    crawlingLabel2.setText(crawling);
    crawledLabel2.setText("" + crawled);
    toCrawlLabel2.setText("" + toCrawl);
    // Update progress bar: with no max, use the number of URLs seen so far.
    if (maxUrls == -1) {
      progressBar.setMaximum(crawled + toCrawl);
    } else {
      progressBar.setMaximum(maxUrls);
    }
    progressBar.setValue(crawled);
    matchesLabel2.setText("" + table.getRowCount());
  }
  // Add match to matches table and log file.
  private void addMatch(String url) {
    // Add URL to matches table.
    DefaultTableModel model = (DefaultTableModel) table.getModel();
    model.addRow(new Object[] { url });
    // Add URL to matches log file.
    try {
      logFileWriter.println(url);
    } catch (Exception e) {
      showError("Unable to log match.");
    }
  }
  // Verify URL format; returns the parsed URL or null if invalid.
  private URL verifyUrl(String url) {
    // Only allow HTTP URLs.
    if (!url.toLowerCase().startsWith("http://"))
      return null;
    // Verify format of URL.
    URL verifiedUrl = null;
    try {
      verifiedUrl = new URL(url);
    } catch (Exception e) {
      return null;
    }
    return verifiedUrl;
  }
  // Check if robot is allowed to access the given URL per robots.txt.
  private boolean isRobotAllowed(URL urlToCheck) {
    String host = urlToCheck.getHost().toLowerCase();
    // Retrieve host's disallow list from cache.
    ArrayList<String> disallowList = disallowListCache.get(host);
    // If list is not in the cache, download and cache it.
    if (disallowList == null) {
      disallowList = new ArrayList<String>();
      try {
        URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
        // Open connection to robot file URL for reading.
        BufferedReader reader = new BufferedReader(
            new InputStreamReader(robotsFileUrl.openStream()));
        try {
          // Read robot file, creating list of disallowed paths.
          String line;
          while ((line = reader.readLine()) != null) {
            if (line.indexOf("Disallow:") == 0) {
              String disallowPath = line.substring("Disallow:"
                  .length());
              // Check disallow path for comments and remove if
              // present.
              int commentIndex = disallowPath.indexOf("#");
              if (commentIndex != -1) {
                disallowPath = disallowPath.substring(0,
                    commentIndex);
              }
              // Remove leading or trailing spaces from disallow path.
              disallowPath = disallowPath.trim();
              // Add disallow path to list.
              disallowList.add(disallowPath);
            }
          }
        } finally {
          // Always release the connection's stream.
          reader.close();
        }
        // Add new disallow list to cache.
        disallowListCache.put(host, disallowList);
      } catch (Exception e) {
        /*
         * Assume robot is allowed since an exception is thrown if the
         * robot file doesn't exist. Cache the empty list so the host
         * is not re-fetched on every URL.
         */
        disallowListCache.put(host, disallowList);
        return true;
      }
    }
    /*
     * Loop through disallow list to see if crawling is allowed for the
     * given URL.
     */
    String file = urlToCheck.getFile();
    for (int i = 0; i < disallowList.size(); i++) {
      String disallow = disallowList.get(i);
      if (file.startsWith(disallow)) {
        return false;
      }
    }
    return true;
  }
  // Download page at given URL; returns null on any failure.
  private String downloadPage(URL pageUrl) {
    try {
      // Open connection to URL for reading.
      BufferedReader reader = new BufferedReader(new InputStreamReader(
          pageUrl.openStream()));
      try {
        // Read page into buffer. Note: line terminators are dropped,
        // preserved from the original behavior.
        String line;
        StringBuffer pageBuffer = new StringBuffer();
        while ((line = reader.readLine()) != null) {
          pageBuffer.append(line);
        }
        return pageBuffer.toString();
      } finally {
        reader.close();
      }
    } catch (Exception e) {
      // Treat any I/O problem as "page unavailable".
    }
    return null;
  }
  // Remove leading "www" from a URL's host if present.
  private String removeWwwFromUrl(String url) {
    int index = url.indexOf("://www.");
    if (index != -1) {
      return url.substring(0, index + 3) + url.substring(index + 7);
    }
    return (url);
  }
  // Parse through page contents and retrieve links, resolving relative
  // URLs against pageUrl and skipping anchors, mailto, JavaScript,
  // already-crawled, and (optionally) off-site links.
  private ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
      HashSet<String> crawledList, boolean limitHost) {
    // Compile link matching pattern.
    // (Scrape fix: original text had "Pattern.rupile", a corruption of
    // Pattern.compile.)
    Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\" |>]",
        Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(pageContents);
    // Create list of link matches.
    ArrayList<String> linkList = new ArrayList<String>();
    while (m.find()) {
      String link = m.group(1).trim();
      // Skip empty links.
      if (link.length() < 1) {
        continue;
      }
      // Skip links that are just page anchors.
      if (link.charAt(0) == '#') {
        continue;
      }
      // Skip mailto links.
      if (link.indexOf("mailto:") != -1) {
        continue;
      }
      // Skip JavaScript links.
      if (link.toLowerCase().indexOf("javascript") != -1) {
        continue;
      }
      // Prefix absolute and relative URLs if necessary.
      if (link.indexOf("://") == -1) {
        // Handle absolute URLs.
        if (link.charAt(0) == '/') {
          link = "http://" + pageUrl.getHost() + link;
          // Handle relative URLs.
        } else {
          String file = pageUrl.getFile();
          if (file.indexOf("/") == -1) {
            link = "http://" + pageUrl.getHost() + "/" + link;
          } else {
            String path = file.substring(0,
                file.lastIndexOf("/") + 1);
            link = "http://" + pageUrl.getHost() + path + link;
          }
        }
      }
      // Remove anchors from link.
      int index = link.indexOf("#");
      if (index != -1) {
        link = link.substring(0, index);
      }
      // Remove leading "www" from URL's host if present.
      link = removeWwwFromUrl(link);
      // Verify link and skip if invalid.
      URL verifiedLink = verifyUrl(link);
      if (verifiedLink == null) {
        continue;
      }
      /*
       * If specified, limit links to those having the same host as the
       * start URL.
       */
      if (limitHost
          && !pageUrl.getHost().toLowerCase().equals(
              verifiedLink.getHost().toLowerCase())) {
        continue;
      }
      // Skip link if it has already been crawled.
      if (crawledList.contains(link)) {
        continue;
      }
      // Add link to list.
      linkList.add(link);
    }
    return (linkList);
  }
  /*
   * Determine whether or not every whitespace-separated term of the search
   * string is matched in the given page contents.
   */
  private boolean searchStringMatches(String pageContents,
      String searchString, boolean caseSensitive) {
    String searchContents = pageContents;
    /*
     * If case-insensitive search, lowercase page contents for comparison.
     */
    if (!caseSensitive) {
      searchContents = pageContents.toLowerCase();
    }
    // Split search string into individual terms.
    // (Scrape fix: "Pattern.rupile" restored to Pattern.compile.)
    Pattern p = Pattern.compile("[\\s]+");
    String[] terms = p.split(searchString);
    // Check to see if each term matches.
    for (int i = 0; i < terms.length; i++) {
      if (caseSensitive) {
        if (searchContents.indexOf(terms[i]) == -1) {
          return false;
        }
      } else {
        if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
          return false;
        }
      }
    }
    return true;
  }
  // Perform the actual crawling, searching for the search string.
  public void crawl(String startUrl, int maxUrls, boolean limitHost,
      String searchString, boolean caseSensitive) {
    // Set up crawl lists (LinkedHashSet preserves discovery order).
    HashSet<String> crawledList = new HashSet<String>();
    LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();
    // Add start URL to the to crawl list.
    toCrawlList.add(startUrl);
    /*
     * Perform actual crawling by looping through the To Crawl list.
     */
    while (crawling && toCrawlList.size() > 0) {
      /*
       * Check to see if the max URL count has been reached, if it was
       * specified.
       */
      if (maxUrls != -1) {
        if (crawledList.size() == maxUrls) {
          break;
        }
      }
      // Get URL at bottom of the list.
      String url = toCrawlList.iterator().next();
      // Remove URL from the To Crawl list.
      toCrawlList.remove(url);
      // Convert string url to URL object.
      URL verifiedUrl = verifyUrl(url);
      // Skip URL if robots are not allowed to access it.
      if (!isRobotAllowed(verifiedUrl)) {
        continue;
      }
      // Update crawling stats.
      updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
      // Add page to the crawled list.
      crawledList.add(url);
      // Download the page at the given URL.
      String pageContents = downloadPage(verifiedUrl);
      /*
       * If the page was downloaded successfully, retrieve all its links
       * and then see if it contains the search string.
       */
      if (pageContents != null && pageContents.length() > 0) {
        // Retrieve list of valid links from page.
        ArrayList<String> links = retrieveLinks(verifiedUrl, pageContents,
            crawledList, limitHost);
        // Add links to the To Crawl list.
        toCrawlList.addAll(links);
        /*
         * Check if search string is present in page, and if so, record
         * a match.
         */
        if (searchStringMatches(pageContents, searchString,
            caseSensitive)) {
          addMatch(url);
        }
      }
      // Update crawling stats.
      updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
    }
  }
  // Run the Search Crawler.
  public static void main(String[] args) {
    SearchCrawler crawler = new SearchCrawler();
    // setVisible replaces the deprecated Frame.show().
    crawler.setVisible(true);
  }
}
/**
A quantifier determines how many times an expression is matched. The quantifiers are shown here:
+   Match one or more.
*   Match zero or more.
?   Match zero or one.
*/
/*
Character Sequence Explanation
<a Look for the characters "<a".
\\s+ Look for one or more space characters.
href Look for the characters "href".
\\s* Look for zero or more space characters.
=    Look for the character "=".
\\s* Look for zero or more space characters.
\"?  Look for zero or one quote character.
(.*?)Look for zero or more of any character until the next part of the pattern is matched, and place the results in a group.
[\">]Look for quote character or greater than (">") character.
*/





Web crawler

 
Revised from: http://java.sun.com/developer/technicalArticles/ThirdParty/WebCrawler/
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Vector;
// A simple console web crawler: breadth-first search from a start URL,
// printing links whose guessed content type matches the target type.
// The crawl runs on its own thread, started from the constructor.
public class WebCrawler implements Runnable {
  public static final String SEARCH = "Search";
  public static final String STOP = "Stop";
  public static final String DISALLOW = "Disallow:";
  // Upper bound on both pages searched and matches reported.
  public static final int SEARCH_LIMIT = 50;
  Vector vectorToSearch = new Vector();
  Vector vectorSearched = new Vector();
  Vector vectorMatches = new Vector();
  Thread searchThread;
  public WebCrawler() {
    // ("text/html");
    // ("audio/basic");
    // ("audio/au");
    // ("audio/aiff");
    // ("audio/wav");
    // ("video/mpeg");
    // ("video/x-avi");
    URLConnection.setDefaultAllowUserInteraction(false);
    searchThread = new Thread(this);
    searchThread.start();
  }
  // Crawl loop; exits when the To-Search list empties, the search limit
  // is hit, or searchThread is replaced (cooperative stop).
  public void run() {
    // Scrape fix: the page's "com"->"ru" corruption turned google.com
    // into google.ru; restored.
    String strURL = "http://www.google.com";
    String strTargetType = "text/html";
    int numberSearched = 0;
    int numberFound = 0;
    if (strURL.length() == 0) {
      System.out.println("ERROR: must enter a starting URL");
      return;
    }
    vectorToSearch = new Vector();
    vectorSearched = new Vector();
    vectorMatches = new Vector();
    vectorToSearch.addElement(strURL);
    while ((vectorToSearch.size() > 0)
        && (Thread.currentThread() == searchThread)) {
      strURL = (String) vectorToSearch.elementAt(0);
      System.out.println("searching " + strURL);
      URL url = null;
      try {
        url = new URL(strURL);
      } catch (MalformedURLException e1) {
        e1.printStackTrace();
      }
      vectorToSearch.removeElementAt(0);
      vectorSearched.addElement(strURL);
      // Bug fix: a malformed URL previously fell through to
      // url.openConnection() and crashed the thread with an NPE.
      if (url == null) {
        continue;
      }
      try {
        URLConnection urlConnection = url.openConnection();
        urlConnection.setAllowUserInteraction(false);
        InputStream urlStream = url.openStream();
        String type = URLConnection.guessContentTypeFromStream(urlStream);
        if (type == null)
          break;
        // Scrape fix: "rupareTo" restored to compareTo throughout.
        if (type.compareTo("text/html") != 0)
          break;
        // Read the whole page into a string, 5000 bytes at a time.
        byte b[] = new byte[5000];
        // Bug fix: start from the empty string so an immediately-empty
        // stream (first read() == -1) no longer throws.
        String content = "";
        int numRead = urlStream.read(b);
        while (numRead != -1) {
          if (Thread.currentThread() != searchThread)
            break;
          content += new String(b, 0, numRead);
          numRead = urlStream.read(b);
        }
        urlStream.close();
        if (Thread.currentThread() != searchThread)
          break;
        String lowerCaseContent = content.toLowerCase();
        // Scan for <a ... href=... links.
        int index = 0;
        while ((index = lowerCaseContent.indexOf("<a", index)) != -1) {
          if ((index = lowerCaseContent.indexOf("href", index)) == -1)
            break;
          if ((index = lowerCaseContent.indexOf("=", index)) == -1)
            break;
          if (Thread.currentThread() != searchThread)
            break;
          index++;
          String remaining = content.substring(index);
          StringTokenizer st = new StringTokenizer(remaining, "\t\n\r\">#");
          String strLink = st.nextToken();
          URL urlLink;
          try {
            // Resolve the (possibly relative) link against the page URL.
            urlLink = new URL(url, strLink);
            strLink = urlLink.toString();
          } catch (MalformedURLException e) {
            System.out.println("ERROR: bad URL " + strLink);
            continue;
          }
          if (urlLink.getProtocol().compareTo("http") != 0)
            break;
          if (Thread.currentThread() != searchThread)
            break;
          try {
            URLConnection urlLinkConnection = urlLink.openConnection();
            urlLinkConnection.setAllowUserInteraction(false);
            InputStream linkStream = urlLink.openStream();
            String strType = URLConnection
                .guessContentTypeFromStream(linkStream);
            linkStream.close();
            if (strType == null)
              break;
            if (strType.compareTo("text/html") == 0) {
              // Queue HTML pages not already seen or queued.
              if ((!vectorSearched.contains(strLink))
                  && (!vectorToSearch.contains(strLink))) {
                vectorToSearch.addElement(strLink);
              }
            }
            if (strType.compareTo(strTargetType) == 0) {
              if (vectorMatches.contains(strLink) == false) {
                System.out.println(strLink);
                vectorMatches.addElement(strLink);
                numberFound++;
                if (numberFound >= SEARCH_LIMIT)
                  break;
              }
            }
          } catch (IOException e) {
            // Scrape fix: restored apostrophe that broke the literal.
            System.out.println("ERROR: couldn't open URL " + strLink);
            continue;
          }
        }
      } catch (IOException e) {
        System.out.println("ERROR: couldn't open URL " + strURL);
        break;
      }
      numberSearched++;
      if (numberSearched >= SEARCH_LIMIT)
        break;
    }
    if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
      System.out.println("reached search limit of " + SEARCH_LIMIT);
    else
      System.out.println("done");
    searchThread = null;
  }
  public static void main(String argv[]) {
    WebCrawler applet = new WebCrawler();
    /*
     * Behind a firewall set your proxy and port here!
     */
    Properties props = new Properties(System.getProperties());
    props.put("http.proxySet", "true");
    props.put("http.proxyHost", "webcache-cup");
    props.put("http.proxyPort", "8080");
    Properties newprops = new Properties(props);
    System.setProperties(newprops);
  }
}





Web Crawler from Sun Microsystems

 
/* Copyright 2004 Sun Microsystems, Inc.  All rights reserved.  You may not modify, use, reproduce, or distribute this software except in compliance with the terms of the License at:*/ 
import java.applet.Applet;
import java.awt.BorderLayout;
import java.awt.Button;
import java.awt.Choice;
import java.awt.FlowLayout;
import java.awt.Frame;
import java.awt.Graphics;
import java.awt.Label;
import java.awt.List;
import java.awt.Panel;
import java.awt.TextField;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Vector;
/**
 * A small breadth-first web crawler applet. Starting from a user-supplied
 * URL, it fetches HTML pages, extracts anchor links, honors robots.txt
 * "Disallow:" rules, and lists every link whose content type matches the
 * user's selection, up to SEARCH_LIMIT pages/matches.
 *
 * Fixes applied to the scraped original: "rupareTo" restored to
 * compareTo/equals, mangled apostrophes in string literals restored,
 * -1 read guarded before String construction, thread restart bug in
 * actionPerformed corrected, and the static content-type probe called
 * through the class rather than an instance.
 */
public class WebCrawler extends Applet implements ActionListener, Runnable {
    public static final String SEARCH = "Search";
    public static final String STOP = "Stop";
    public static final String DISALLOW = "Disallow:";
    // Hard cap on both pages visited and matches collected.
    public static final int SEARCH_LIMIT = 50;

    Panel panelMain;
    List listMatches;      // AWT list displaying matching URLs
    Label labelStatus;
    Vector vectorToSearch; // frontier: URLs queued to be crawled
    Vector vectorSearched; // URLs already visited
    Vector vectorMatches;  // URLs whose content type matched
    Thread searchThread;   // null whenever no search is running
    TextField textURL;
    Choice choiceType;

    /** Builds the AWT user interface and the crawl bookkeeping vectors. */
    public void init() {
        // set up the main UI panel
        panelMain = new Panel();
        panelMain.setLayout(new BorderLayout(5, 5));
        // text entry components
        Panel panelEntry = new Panel();
        panelEntry.setLayout(new BorderLayout(5, 5));
        Panel panelURL = new Panel();
        panelURL.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
        Label labelURL = new Label("Starting URL: ", Label.RIGHT);
        panelURL.add(labelURL);
        textURL = new TextField("", 40);
        panelURL.add(textURL);
        panelEntry.add("North", panelURL);
        Panel panelType = new Panel();
        panelType.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
        Label labelType = new Label("Content type: ", Label.RIGHT);
        panelType.add(labelType);
        choiceType = new Choice();
        choiceType.addItem("text/html");
        choiceType.addItem("audio/basic");
        choiceType.addItem("audio/au");
        choiceType.addItem("audio/aiff");
        choiceType.addItem("audio/wav");
        choiceType.addItem("video/mpeg");
        choiceType.addItem("video/x-avi");
        panelType.add(choiceType);
        panelEntry.add("South", panelType);
        panelMain.add("North", panelEntry);
        // list of result URLs
        Panel panelListButtons = new Panel();
        panelListButtons.setLayout(new BorderLayout(5, 5));
        Panel panelList = new Panel();
        panelList.setLayout(new BorderLayout(5, 5));
        Label labelResults = new Label("Search results");
        panelList.add("North", labelResults);
        Panel panelListCurrent = new Panel();
        panelListCurrent.setLayout(new BorderLayout(5, 5));
        listMatches = new List(10);
        panelListCurrent.add("North", listMatches);
        labelStatus = new Label("");
        panelListCurrent.add("South", labelStatus);
        panelList.add("South", panelListCurrent);
        panelListButtons.add("North", panelList);
        // control buttons
        Panel panelButtons = new Panel();
        Button buttonSearch = new Button(SEARCH);
        buttonSearch.addActionListener(this);
        panelButtons.add(buttonSearch);
        Button buttonStop = new Button(STOP);
        buttonStop.addActionListener(this);
        panelButtons.add(buttonStop);
        panelListButtons.add("South", panelButtons);
        panelMain.add("South", panelListButtons);
        add(panelMain);
        setVisible(true);
        repaint();
        // initialize search data structures
        vectorToSearch = new Vector();
        vectorSearched = new Vector();
        vectorMatches = new Vector();
        // never pop up auth dialogs while crawling
        URLConnection.setDefaultAllowUserInteraction(false);
    }

    public void start() {
    }

    /** Signals the worker thread to stop; run() polls searchThread. */
    public void stop() {
        if (searchThread != null) {
            setStatus("stopping...");
            searchThread = null;
        }
    }

    public void destroy() {
    }

    /**
     * Checks the host's robots.txt for a "Disallow:" rule that covers
     * the given URL. Returns true when crawling is permitted (including
     * when no robots.txt exists), false when disallowed or the robots
     * URL itself is malformed.
     */
    boolean robotSafe(URL url) {
        String strHost = url.getHost();
        // form URL of the robots.txt file
        String strRobot = "http://" + strHost + "/robots.txt";
        URL urlRobot;
        try {
            urlRobot = new URL(strRobot);
        } catch (MalformedURLException e) {
            // something weird is happening, so don't trust it
            return false;
        }
        String strCommands;
        try {
            InputStream urlRobotStream = urlRobot.openStream();
            // Read in the entire file. FIX: the original passed the very
            // first read's result straight to new String(b, 0, numRead),
            // which throws when the stream is empty (numRead == -1).
            byte b[] = new byte[1000];
            StringBuffer commands = new StringBuffer();
            int numRead = urlRobotStream.read(b);
            while (numRead != -1) {
                if (Thread.currentThread() != searchThread)
                    break;
                commands.append(new String(b, 0, numRead));
                numRead = urlRobotStream.read(b);
            }
            urlRobotStream.close();
            strCommands = commands.toString();
        } catch (IOException e) {
            // if there is no robots.txt file, it is OK to search
            return true;
        }
        // assume that this robots.txt refers to us and
        // search for "Disallow:" commands.
        String strURL = url.getFile();
        int index = 0;
        while ((index = strCommands.indexOf(DISALLOW, index)) != -1) {
            index += DISALLOW.length();
            String strPath = strCommands.substring(index);
            StringTokenizer st = new StringTokenizer(strPath);
            if (!st.hasMoreTokens())
                break;
            String strBadPath = st.nextToken();
            // if the URL starts with a disallowed path, it is not safe
            if (strURL.indexOf(strBadPath) == 0)
                return false;
        }
        return true;
    }

    public void paint(Graphics g) {
        // Draw a rectangle around the applet's display area.
        g.drawRect(0, 0, getSize().width - 1, getSize().height - 1);
        panelMain.paint(g);
        panelMain.paintComponents(g);
    }

    /**
     * Worker-thread entry point: breadth-first crawl from the URL in the
     * text field. Polls searchThread to notice a Stop request.
     */
    public void run() {
        String strURL = textURL.getText();
        String strTargetType = choiceType.getSelectedItem();
        int numberSearched = 0;
        int numberFound = 0;
        if (strURL.length() == 0) {
            setStatus("ERROR: must enter a starting URL");
            return;
        }
        // initialize search data structures
        vectorToSearch.removeAllElements();
        vectorSearched.removeAllElements();
        vectorMatches.removeAllElements();
        listMatches.removeAll();
        vectorToSearch.addElement(strURL);
        while ((vectorToSearch.size() > 0)
                && (Thread.currentThread() == searchThread)) {
            // get the first element from the to-be-searched list
            strURL = (String) vectorToSearch.elementAt(0);
            setStatus("searching " + strURL);
            URL url;
            try {
                url = new URL(strURL);
            } catch (MalformedURLException e) {
                setStatus("ERROR: invalid URL " + strURL);
                break;
            }
            // mark the URL as searched (we want this one way or the other)
            vectorToSearch.removeElementAt(0);
            vectorSearched.addElement(strURL);
            // can only search http: protocol URLs.
            // NOTE(review): a non-http URL aborts the whole crawl (break);
            // original behavior preserved, though skipping might be nicer.
            if (!url.getProtocol().equals("http"))
                break;
            // test to make sure it is robot-safe before searching
            if (!robotSafe(url))
                break;
            try {
                // try opening the URL
                URLConnection urlConnection = url.openConnection();
                urlConnection.setAllowUserInteraction(false);
                InputStream urlStream = url.openStream();
                // FIX: guessContentTypeFromStream is static — call through
                // the class, not an instance.
                String type = URLConnection.guessContentTypeFromStream(urlStream);
                if (type == null)
                    break;
                if (!type.equals("text/html"))
                    break;
                // read in the entire page; FIX: guard the -1 first read
                // (see robotSafe) before constructing a String from it.
                byte b[] = new byte[1000];
                StringBuffer pageBuffer = new StringBuffer();
                int numRead = urlStream.read(b);
                while (numRead != -1) {
                    if (Thread.currentThread() != searchThread)
                        break;
                    pageBuffer.append(new String(b, 0, numRead));
                    numRead = urlStream.read(b);
                }
                urlStream.close();
                if (Thread.currentThread() != searchThread)
                    break;
                String content = pageBuffer.toString();
                String lowerCaseContent = content.toLowerCase();
                // scan the page for <a href=...> links
                int index = 0;
                while ((index = lowerCaseContent.indexOf("<a", index)) != -1) {
                    if ((index = lowerCaseContent.indexOf("href", index)) == -1)
                        break;
                    if ((index = lowerCaseContent.indexOf("=", index)) == -1)
                        break;
                    if (Thread.currentThread() != searchThread)
                        break;
                    index++;
                    String remaining = content.substring(index);
                    StringTokenizer st
                        = new StringTokenizer(remaining, "\t\n\r\">#");
                    // FIX: an empty href made the original's unguarded
                    // nextToken() throw NoSuchElementException.
                    if (!st.hasMoreTokens())
                        continue;
                    String strLink = st.nextToken();
                    URL urlLink;
                    try {
                        urlLink = new URL(url, strLink);
                        strLink = urlLink.toString();
                    } catch (MalformedURLException e) {
                        setStatus("ERROR: bad URL " + strLink);
                        continue;
                    }
                    // only look at http links
                    if (!urlLink.getProtocol().equals("http"))
                        break;
                    if (Thread.currentThread() != searchThread)
                        break;
                    try {
                        // try opening the URL
                        URLConnection urlLinkConnection
                            = urlLink.openConnection();
                        urlLinkConnection.setAllowUserInteraction(false);
                        InputStream linkStream = urlLink.openStream();
                        String strType
                            = URLConnection.guessContentTypeFromStream(linkStream);
                        linkStream.close();
                        // if another page, add to the end of search list
                        if (strType == null)
                            break;
                        if (strType.equals("text/html")) {
                            // check to see if this URL has already been
                            // searched or is going to be searched
                            if ((!vectorSearched.contains(strLink))
                                    && (!vectorToSearch.contains(strLink))) {
                                // test to make sure it is robot-safe!
                                if (robotSafe(urlLink))
                                    vectorToSearch.addElement(strLink);
                            }
                        }
                        // if the proper type, add it to the results list
                        // unless we have already seen it
                        if (strType.equals(strTargetType)) {
                            if (!vectorMatches.contains(strLink)) {
                                listMatches.add(strLink);
                                vectorMatches.addElement(strLink);
                                numberFound++;
                                if (numberFound >= SEARCH_LIMIT)
                                    break;
                            }
                        }
                    } catch (IOException e) {
                        setStatus("ERROR: couldn't open URL " + strLink);
                        continue;
                    }
                }
            } catch (IOException e) {
                setStatus("ERROR: couldn't open URL " + strURL);
                break;
            }
            numberSearched++;
            if (numberSearched >= SEARCH_LIMIT)
                break;
        }
        if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
            setStatus("reached search limit of " + SEARCH_LIMIT);
        else
            setStatus("done");
        searchThread = null;
    }

    void setStatus(String status) {
        labelStatus.setText(status);
    }

    /** Handles the Search and Stop buttons. */
    public void actionPerformed(ActionEvent event) {
        String command = event.getActionCommand();
        if (command.equals(SEARCH)) {
            setStatus("searching...");
            // FIX: the original called start() unconditionally, so a second
            // Search click on a finished-but-not-nulled or already-running
            // thread threw IllegalThreadStateException. Only start a thread
            // we just created.
            if (searchThread == null) {
                searchThread = new Thread(this);
                searchThread.start();
            }
        } else if (command.equals(STOP)) {
            stop();
        }
    }

    public static void main(String argv[]) {
        Frame f = new Frame("WebFrame");
        WebCrawler applet = new WebCrawler();
        f.add("Center", applet);
        /*
         * Behind a firewall set your proxy and port here!
         */
        Properties props = new Properties(System.getProperties());
        props.put("http.proxySet", "true");
        props.put("http.proxyHost", "webcache-cup");
        props.put("http.proxyPort", "8080");
        System.setProperties(props);
        applet.init();
        applet.start();
        f.pack();
        // FIX: Frame.show() has been deprecated since Java 1.1.
        f.setVisible(true);
    }
}