Java/Network Protocol/Crawler


Search Crawler

   <source lang="java">

// The SearchCrawler class is shown here and is examined in detail in the
// following sections. Notice that it extends JFrame:

/*
* Chapter 6 - Crawling the Web with Java 
* The Art of Java 
* by Herbert Schildt and James Holmes 
* McGraw-Hill/Osborne 2003
*  
*/

import java.awt.BorderLayout;
import java.awt.Cursor;
import java.awt.Font;
import java.awt.GridBagConstraints;
import java.awt.GridBagLayout;
import java.awt.Insets;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.KeyEvent;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.BorderFactory;
import javax.swing.JButton;
import javax.swing.JCheckBox;
import javax.swing.JComboBox;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JMenu;
import javax.swing.JMenuBar;
import javax.swing.JMenuItem;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JProgressBar;
import javax.swing.JScrollPane;
import javax.swing.JSeparator;
import javax.swing.JTable;
import javax.swing.JTextField;
import javax.swing.table.DefaultTableModel;

// The Search Web Crawler
public class SearchCrawler extends JFrame {

 // Max URLs drop-down values.
 private static final String[] MAX_URLS = { "50", "100", "500", "1000" };
 // Cache of robot disallow lists.
 private HashMap disallowListCache = new HashMap();
 // Search GUI controls.
 private JTextField startTextField;
 private JComboBox maxComboBox;
 private JCheckBox limitCheckBox;
 private JTextField logTextField;
 private JTextField searchTextField;
 private JCheckBox caseCheckBox;
 private JButton searchButton;
 // Search stats GUI controls.
 private JLabel crawlingLabel2;
 private JLabel crawledLabel2;
 private JLabel toCrawlLabel2;
 private JProgressBar progressBar;
 private JLabel matchesLabel2;
 // Table listing search matches.
 private JTable table;
 // Flag for whether or not crawling is underway.
 private boolean crawling;
 // Matches log file print writer.
 private PrintWriter logFileWriter;
 // Constructor for Search Web Crawler.
 public SearchCrawler() {
   // Set application title.
   setTitle("Search Crawler");
   // Set window size.
   setSize(600, 600);
   // Handle window closing events.
   addWindowListener(new WindowAdapter() {
     public void windowClosing(WindowEvent e) {
       actionExit();
     }
   });
   // Set up File menu.
   JMenuBar menuBar = new JMenuBar();
   JMenu fileMenu = new JMenu("File");
   fileMenu.setMnemonic(KeyEvent.VK_F);
   JMenuItem fileExitMenuItem = new JMenuItem("Exit", KeyEvent.VK_X);
   fileExitMenuItem.addActionListener(new ActionListener() {
     public void actionPerformed(ActionEvent e) {
       actionExit();
     }
   });
   fileMenu.add(fileExitMenuItem);
   menuBar.add(fileMenu);
   setJMenuBar(menuBar);
   // Set up search panel.
   JPanel searchPanel = new JPanel();
   GridBagConstraints constraints;
   GridBagLayout layout = new GridBagLayout();
   searchPanel.setLayout(layout);
   JLabel startLabel = new JLabel("Start URL:");
   constraints = new GridBagConstraints();
   constraints.anchor = GridBagConstraints.EAST;
   constraints.insets = new Insets(5, 5, 0, 0);
   layout.setConstraints(startLabel, constraints);
   searchPanel.add(startLabel);
   startTextField = new JTextField();
   constraints = new GridBagConstraints();
   constraints.fill = GridBagConstraints.HORIZONTAL;
   constraints.gridwidth = GridBagConstraints.REMAINDER;
   constraints.insets = new Insets(5, 5, 0, 5);
   layout.setConstraints(startTextField, constraints);
   searchPanel.add(startTextField);
   JLabel maxLabel = new JLabel("Max URLs to Crawl:");
   constraints = new GridBagConstraints();
   constraints.anchor = GridBagConstraints.EAST;
   constraints.insets = new Insets(5, 5, 0, 0);
   layout.setConstraints(maxLabel, constraints);
   searchPanel.add(maxLabel);
   maxComboBox = new JComboBox(MAX_URLS);
   maxComboBox.setEditable(true);
   constraints = new GridBagConstraints();
   constraints.insets = new Insets(5, 5, 0, 0);
   layout.setConstraints(maxComboBox, constraints);
   searchPanel.add(maxComboBox);
   limitCheckBox = new JCheckBox("Limit crawling to Start URL site");
   constraints = new GridBagConstraints();
   constraints.anchor = GridBagConstraints.WEST;
   constraints.insets = new Insets(0, 10, 0, 0);
   layout.setConstraints(limitCheckBox, constraints);
   searchPanel.add(limitCheckBox);
   JLabel blankLabel = new JLabel();
   constraints = new GridBagConstraints();
   constraints.gridwidth = GridBagConstraints.REMAINDER;
   layout.setConstraints(blankLabel, constraints);
   searchPanel.add(blankLabel);
   JLabel logLabel = new JLabel("Matches Log File:");
   constraints = new GridBagConstraints();
   constraints.anchor = GridBagConstraints.EAST;
   constraints.insets = new Insets(5, 5, 0, 0);
   layout.setConstraints(logLabel, constraints);
   searchPanel.add(logLabel);
   String file = System.getProperty("user.dir")
       + System.getProperty("file.separator") + "crawler.log";
   logTextField = new JTextField(file);
   constraints = new GridBagConstraints();
   constraints.fill = GridBagConstraints.HORIZONTAL;
   constraints.gridwidth = GridBagConstraints.REMAINDER;
   constraints.insets = new Insets(5, 5, 0, 5);
   layout.setConstraints(logTextField, constraints);
   searchPanel.add(logTextField);
   JLabel searchLabel = new JLabel("Search String:");
   constraints = new GridBagConstraints();
   constraints.anchor = GridBagConstraints.EAST;
   constraints.insets = new Insets(5, 5, 0, 0);
   layout.setConstraints(searchLabel, constraints);
   searchPanel.add(searchLabel);
   searchTextField = new JTextField();
   constraints = new GridBagConstraints();
   constraints.fill = GridBagConstraints.HORIZONTAL;
   constraints.insets = new Insets(5, 5, 0, 0);
   constraints.gridwidth = 2;
   constraints.weightx = 1.0d;
   layout.setConstraints(searchTextField, constraints);
   searchPanel.add(searchTextField);
   caseCheckBox = new JCheckBox("Case Sensitive");
   constraints = new GridBagConstraints();
   constraints.insets = new Insets(5, 5, 0, 5);
   constraints.gridwidth = GridBagConstraints.REMAINDER;
   layout.setConstraints(caseCheckBox, constraints);
   searchPanel.add(caseCheckBox);
   searchButton = new JButton("Search");
   searchButton.addActionListener(new ActionListener() {
     public void actionPerformed(ActionEvent e) {
       actionSearch();
     }
   });
   constraints = new GridBagConstraints();
   constraints.gridwidth = GridBagConstraints.REMAINDER;
   constraints.insets = new Insets(5, 5, 5, 5);
   layout.setConstraints(searchButton, constraints);
   searchPanel.add(searchButton);
   JSeparator separator = new JSeparator();
   constraints = new GridBagConstraints();
   constraints.fill = GridBagConstraints.HORIZONTAL;
   constraints.gridwidth = GridBagConstraints.REMAINDER;
   constraints.insets = new Insets(5, 5, 5, 5);
   layout.setConstraints(separator, constraints);
   searchPanel.add(separator);
   JLabel crawlingLabel1 = new JLabel("Crawling:");
   constraints = new GridBagConstraints();
   constraints.anchor = GridBagConstraints.EAST;
   constraints.insets = new Insets(5, 5, 0, 0);
   layout.setConstraints(crawlingLabel1, constraints);
   searchPanel.add(crawlingLabel1);
   crawlingLabel2 = new JLabel();
   crawlingLabel2.setFont(crawlingLabel2.getFont().deriveFont(Font.PLAIN));
   constraints = new GridBagConstraints();
   constraints.fill = GridBagConstraints.HORIZONTAL;
   constraints.gridwidth = GridBagConstraints.REMAINDER;
   constraints.insets = new Insets(5, 5, 0, 5);
   layout.setConstraints(crawlingLabel2, constraints);
   searchPanel.add(crawlingLabel2);
   JLabel crawledLabel1 = new JLabel("Crawled URLs:");
   constraints = new GridBagConstraints();
   constraints.anchor = GridBagConstraints.EAST;
   constraints.insets = new Insets(5, 5, 0, 0);
   layout.setConstraints(crawledLabel1, constraints);
   searchPanel.add(crawledLabel1);
   crawledLabel2 = new JLabel();
   crawledLabel2.setFont(crawledLabel2.getFont().deriveFont(Font.PLAIN));
   constraints = new GridBagConstraints();
   constraints.fill = GridBagConstraints.HORIZONTAL;
   constraints.gridwidth = GridBagConstraints.REMAINDER;
   constraints.insets = new Insets(5, 5, 0, 5);
   layout.setConstraints(crawledLabel2, constraints);
   searchPanel.add(crawledLabel2);
   JLabel toCrawlLabel1 = new JLabel("URLs to Crawl:");
   constraints = new GridBagConstraints();
   constraints.anchor = GridBagConstraints.EAST;
   constraints.insets = new Insets(5, 5, 0, 0);
   layout.setConstraints(toCrawlLabel1, constraints);
   searchPanel.add(toCrawlLabel1);
   toCrawlLabel2 = new JLabel();
   toCrawlLabel2.setFont(toCrawlLabel2.getFont().deriveFont(Font.PLAIN));
   constraints = new GridBagConstraints();
   constraints.fill = GridBagConstraints.HORIZONTAL;
   constraints.gridwidth = GridBagConstraints.REMAINDER;
   constraints.insets = new Insets(5, 5, 0, 5);
   layout.setConstraints(toCrawlLabel2, constraints);
   searchPanel.add(toCrawlLabel2);
   JLabel progressLabel = new JLabel("Crawling Progress:");
   constraints = new GridBagConstraints();
   constraints.anchor = GridBagConstraints.EAST;
   constraints.insets = new Insets(5, 5, 0, 0);
   layout.setConstraints(progressLabel, constraints);
   searchPanel.add(progressLabel);
   progressBar = new JProgressBar();
   progressBar.setMinimum(0);
   progressBar.setStringPainted(true);
   constraints = new GridBagConstraints();
   constraints.fill = GridBagConstraints.HORIZONTAL;
   constraints.gridwidth = GridBagConstraints.REMAINDER;
   constraints.insets = new Insets(5, 5, 0, 5);
   layout.setConstraints(progressBar, constraints);
   searchPanel.add(progressBar);
   JLabel matchesLabel1 = new JLabel("Search Matches:");
   constraints = new GridBagConstraints();
   constraints.anchor = GridBagConstraints.EAST;
   constraints.insets = new Insets(5, 5, 10, 0);
   layout.setConstraints(matchesLabel1, constraints);
   searchPanel.add(matchesLabel1);
   matchesLabel2 = new JLabel();
   matchesLabel2.setFont(matchesLabel2.getFont().deriveFont(Font.PLAIN));
   constraints = new GridBagConstraints();
   constraints.fill = GridBagConstraints.HORIZONTAL;
   constraints.gridwidth = GridBagConstraints.REMAINDER;
   constraints.insets = new Insets(5, 5, 10, 5);
   layout.setConstraints(matchesLabel2, constraints);
   searchPanel.add(matchesLabel2);
   // Set up matches table.
   table = new JTable(new DefaultTableModel(new Object[][] {},
       new String[] { "URL" }) {
     public boolean isCellEditable(int row, int column) {
       return false;
     }
   });
   // Set up Matches panel.
   JPanel matchesPanel = new JPanel();
   matchesPanel.setBorder(BorderFactory.createTitledBorder("Matches"));
   matchesPanel.setLayout(new BorderLayout());
   matchesPanel.add(new JScrollPane(table), BorderLayout.CENTER);
   // Add panels to display.
   getContentPane().setLayout(new BorderLayout());
   getContentPane().add(searchPanel, BorderLayout.NORTH);
   getContentPane().add(matchesPanel, BorderLayout.CENTER);
 }
 // Exit this program.
 private void actionExit() {
   System.exit(0);
 }
 // Handle Search/Stop button being clicked.
 private void actionSearch() {
   // If stop button clicked, turn crawling flag off.
   if (crawling) {
     crawling = false;
     return;
   }
   ArrayList errorList = new ArrayList();
   // Validate that start URL has been entered.
   String startUrl = startTextField.getText().trim();
   if (startUrl.length() < 1) {
     errorList.add("Missing Start URL.");
   }
   // Verify start URL.
   else if (verifyUrl(startUrl) == null) {
     errorList.add("Invalid Start URL.");
   }
   // Validate that Max URLs is either empty or is a number.
   int maxUrls = 0;
   String max = ((String) maxComboBox.getSelectedItem()).trim();
   if (max.length() > 0) {
     try {
       maxUrls = Integer.parseInt(max);
     } catch (NumberFormatException e) {
     }
     if (maxUrls < 1) {
       errorList.add("Invalid Max URLs value.");
     }
   }
   // Validate that matches log file has been entered.
   String logFile = logTextField.getText().trim();
   if (logFile.length() < 1) {
     errorList.add("Missing Matches Log File.");
   }
   // Validate that search string has been entered.
   String searchString = searchTextField.getText().trim();
   if (searchString.length() < 1) {
     errorList.add("Missing Search String.");
   }
   // Show errors, if any, and return.
   if (errorList.size() > 0) {
     StringBuffer message = new StringBuffer();
     // Concatenate errors into single message.
     for (int i = 0; i < errorList.size(); i++) {
       message.append(errorList.get(i));
       if (i + 1 < errorList.size()) {
         message.append("\n");
       }
     }
     showError(message.toString());
     return;
   }
   // Remove "www" from start URL if present.
   startUrl = removeWwwFromUrl(startUrl);
   // Start the Search Crawler.
   search(logFile, startUrl, maxUrls, searchString);
 }
 private void search(final String logFile, final String startUrl,
     final int maxUrls, final String searchString) {
   // Start the search in a new thread.
   Thread thread = new Thread(new Runnable() {
     public void run() {
       // Show hour glass cursor while crawling is under way.
       setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));
       // Disable search controls.
       startTextField.setEnabled(false);
       maxComboBox.setEnabled(false);
       limitCheckBox.setEnabled(false);
       logTextField.setEnabled(false);
       searchTextField.setEnabled(false);
       caseCheckBox.setEnabled(false);
       // Switch Search button to "Stop."
       searchButton.setText("Stop");
       // Reset stats.
       table.setModel(new DefaultTableModel(new Object[][] {},
           new String[] { "URL" }) {
         public boolean isCellEditable(int row, int column) {
           return false;
         }
       });
       updateStats(startUrl, 0, 0, maxUrls);
       // Open matches log file.
       try {
         logFileWriter = new PrintWriter(new FileWriter(logFile));
       } catch (Exception e) {
         showError("Unable to open matches log file.");
         return;
       }
       // Turn crawling flag on.
       crawling = true;
       // Perform the actual crawling.
       crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
           searchString, caseCheckBox.isSelected());
       // Turn crawling flag off.
       crawling = false;
       // Close matches log file.
       try {
         logFileWriter.close();
       } catch (Exception e) {
         showError("Unable to close matches log file.");
       }
       // Mark search as done.
       crawlingLabel2.setText("Done");
       // Enable search controls.
       startTextField.setEnabled(true);
       maxComboBox.setEnabled(true);
       limitCheckBox.setEnabled(true);
       logTextField.setEnabled(true);
       searchTextField.setEnabled(true);
       caseCheckBox.setEnabled(true);
       // Switch search button back to "Search."
       searchButton.setText("Search");
       // Return to default cursor.
       setCursor(Cursor.getDefaultCursor());
       // Show message if search string not found.
       if (table.getRowCount() == 0) {
         JOptionPane
             .showMessageDialog(
                 SearchCrawler.this,
                 "Your Search String was not found. Please try another.",
                 "Search String Not Found",
                 JOptionPane.WARNING_MESSAGE);
       }
     }
   });
   thread.start();
 }
 // Show dialog box with error message.
 private void showError(String message) {
   JOptionPane.showMessageDialog(this, message, "Error",
       JOptionPane.ERROR_MESSAGE);
 }
 // Update crawling stats.
 private void updateStats(String crawling, int crawled, int toCrawl,
     int maxUrls) {
   crawlingLabel2.setText(crawling);
   crawledLabel2.setText("" + crawled);
   toCrawlLabel2.setText("" + toCrawl);
   // Update progress bar.
   if (maxUrls == -1) {
     progressBar.setMaximum(crawled + toCrawl);
   } else {
     progressBar.setMaximum(maxUrls);
   }
   progressBar.setValue(crawled);
   matchesLabel2.setText("" + table.getRowCount());
 }
 // Add match to matches table and log file.
 private void addMatch(String url) {
   // Add URL to matches table.
   DefaultTableModel model = (DefaultTableModel) table.getModel();
   model.addRow(new Object[] { url });
   // Add URL to matches log file.
   try {
     logFileWriter.println(url);
   } catch (Exception e) {
     showError("Unable to log match.");
   }
 }
 // Verify URL format.
 private URL verifyUrl(String url) {
   // Only allow HTTP URLs.
   if (!url.toLowerCase().startsWith("http://"))
     return null;
   // Verify format of URL.
   URL verifiedUrl = null;
   try {
     verifiedUrl = new URL(url);
   } catch (Exception e) {
     return null;
   }
   return verifiedUrl;
 }
 // Check if robot is allowed to access the given URL.
 private boolean isRobotAllowed(URL urlToCheck) {
   String host = urlToCheck.getHost().toLowerCase();
    // Retrieve host's disallow list from cache.
   ArrayList disallowList = (ArrayList) disallowListCache.get(host);
   // If list is not in the cache, download and cache it.
   if (disallowList == null) {
     disallowList = new ArrayList();
     try {
       URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
       // Open connection to robot file URL for reading.
       BufferedReader reader = new BufferedReader(
           new InputStreamReader(robotsFileUrl.openStream()));
       // Read robot file, creating list of disallowed paths.
       String line;
       while ((line = reader.readLine()) != null) {
         if (line.indexOf("Disallow:") == 0) {
           String disallowPath = line.substring("Disallow:"
               .length());
           // Check disallow path for comments and remove if
           // present.
           int commentIndex = disallowPath.indexOf("#");
           if (commentIndex != -1) {
             disallowPath = disallowPath.substring(0,
                 commentIndex);
           }
           // Remove leading or trailing spaces from disallow path.
           disallowPath = disallowPath.trim();
           // Add disallow path to list.
           disallowList.add(disallowPath);
         }
       }
       // Add new disallow list to cache.
       disallowListCache.put(host, disallowList);
     } catch (Exception e) {
       /*
        * Assume robot is allowed since an exception is thrown if the
         * robot file doesn't exist.
        */
       return true;
     }
   }
   /*
    * Loop through disallow list to see if crawling is allowed for the
    * given URL.
    */
   String file = urlToCheck.getFile();
   for (int i = 0; i < disallowList.size(); i++) {
     String disallow = (String) disallowList.get(i);
     if (file.startsWith(disallow)) {
       return false;
     }
   }
   return true;
 }
 // Download page at given URL.
 private String downloadPage(URL pageUrl) {
   try {
     // Open connection to URL for reading.
     BufferedReader reader = new BufferedReader(new InputStreamReader(
         pageUrl.openStream()));
     // Read page into buffer.
     String line;
     StringBuffer pageBuffer = new StringBuffer();
     while ((line = reader.readLine()) != null) {
       pageBuffer.append(line);
     }
     return pageBuffer.toString();
   } catch (Exception e) {
   }
   return null;
 }
  // Remove leading "www" from a URL's host if present.
 private String removeWwwFromUrl(String url) {
   int index = url.indexOf("://www.");
   if (index != -1) {
     return url.substring(0, index + 3) + url.substring(index + 7);
   }
   return (url);
 }
 // Parse through page contents and retrieve links.
 private ArrayList retrieveLinks(URL pageUrl, String pageContents,
     HashSet crawledList, boolean limitHost) {
   // Compile link matching pattern.
    Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\" |>]",
       Pattern.CASE_INSENSITIVE);
   Matcher m = p.matcher(pageContents);
   // Create list of link matches.
   ArrayList linkList = new ArrayList();
   while (m.find()) {
     String link = m.group(1).trim();
     // Skip empty links.
     if (link.length() < 1) {
       continue;
     }
     // Skip links that are just page anchors.
      if (link.charAt(0) == '#') {
       continue;
     }
     // Skip mailto links.
     if (link.indexOf("mailto:") != -1) {
       continue;
     }
     // Skip JavaScript links.
     if (link.toLowerCase().indexOf("javascript") != -1) {
       continue;
     }
     // Prefix absolute and relative URLs if necessary.
     if (link.indexOf("://") == -1) {
       // Handle absolute URLs.
        if (link.charAt(0) == '/') {
         link = "http://" + pageUrl.getHost() + link;
         // Handle relative URLs.
       } else {
         String file = pageUrl.getFile();
         if (file.indexOf("/") == -1) {
           link = "http://" + pageUrl.getHost() + "/" + link;
         } else {
           String path = file.substring(0,
               file.lastIndexOf("/") + 1);
           link = "http://" + pageUrl.getHost() + path + link;
         }
       }
     }
     // Remove anchors from link.
     int index = link.indexOf("#");
     if (index != -1) {
       link = link.substring(0, index);
     }
      // Remove leading "www" from URL's host if present.
     link = removeWwwFromUrl(link);
     // Verify link and skip if invalid.
     URL verifiedLink = verifyUrl(link);
     if (verifiedLink == null) {
       continue;
     }
     /*
      * If specified, limit links to those having the same host as the
      * start URL.
      */
     if (limitHost
         && !pageUrl.getHost().toLowerCase().equals(
             verifiedLink.getHost().toLowerCase())) {
       continue;
     }
     // Skip link if it has already been crawled.
     if (crawledList.contains(link)) {
       continue;
     }
     // Add link to list.
     linkList.add(link);
   }
   return (linkList);
 }
 /*
  * Determine whether or not search string is matched in the given page
  * contents.
  */
 private boolean searchStringMatches(String pageContents,
     String searchString, boolean caseSensitive) {
   String searchContents = pageContents;
    /*
     * If this is not a case-sensitive search, lowercase the page contents
     * for comparison.
     */
   if (!caseSensitive) {
     searchContents = pageContents.toLowerCase();
   }
   // Split search string into individual terms.
    Pattern p = Pattern.compile("[\\s]+");
   String[] terms = p.split(searchString);
   // Check to see if each term matches.
   for (int i = 0; i < terms.length; i++) {
     if (caseSensitive) {
       if (searchContents.indexOf(terms[i]) == -1) {
         return false;
       }
     } else {
       if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
         return false;
       }
     }
   }
   return true;
 }
 // Perform the actual crawling, searching for the search string.
 public void crawl(String startUrl, int maxUrls, boolean limitHost,
     String searchString, boolean caseSensitive) {
   // Set up crawl lists.
   HashSet crawledList = new HashSet();
   LinkedHashSet toCrawlList = new LinkedHashSet();
   // Add start URL to the to crawl list.
   toCrawlList.add(startUrl);
   /*
    * Perform actual crawling by looping through the To Crawl list.
    */
   while (crawling && toCrawlList.size() > 0) {
     /*
      * Check to see if the max URL count has been reached, if it was
      * specified.
      */
     if (maxUrls != -1) {
       if (crawledList.size() == maxUrls) {
         break;
       }
     }
     // Get URL at bottom of the list.
     String url = (String) toCrawlList.iterator().next();
     // Remove URL from the To Crawl list.
     toCrawlList.remove(url);
     // Convert string url to URL object.
     URL verifiedUrl = verifyUrl(url);
     // Skip URL if robots are not allowed to access it.
     if (!isRobotAllowed(verifiedUrl)) {
       continue;
     }
     // Update crawling stats.
     updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
     // Add page to the crawled list.
     crawledList.add(url);
     // Download the page at the given URL.
     String pageContents = downloadPage(verifiedUrl);
     /*
      * If the page was downloaded successfully, retrieve all its links
      * and then see if it contains the search string.
      */
     if (pageContents != null && pageContents.length() > 0) {
       // Retrieve list of valid links from page.
       ArrayList links = retrieveLinks(verifiedUrl, pageContents,
           crawledList, limitHost);
       // Add links to the To Crawl list.
       toCrawlList.addAll(links);
       /*
        * Check if search string is present in page, and if so, record
        * a match.
        */
       if (searchStringMatches(pageContents, searchString,
           caseSensitive)) {
         addMatch(url);
       }
     }
     // Update crawling stats.
     updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
   }
 }
 // Run the Search Crawler.
 public static void main(String[] args) {
   SearchCrawler crawler = new SearchCrawler();
   crawler.show();
 }

}

/* A quantifier determines how many times an expression is matched. The
   quantifiers are shown here:

   +   Match one or more.
   *   Match zero or more.
   ?   Match zero or one.
*/

/* Character Sequence   Explanation

   <a       Look for the characters "<a".
   \\s+     Look for one or more space characters.
   href     Look for the characters "href".
   \\s*     Look for zero or more space characters.
   =        Look for the character "=".
   \\s*     Look for zero or more space characters.
   \"?      Look for zero or one quote character.
   (.*?)    Look for zero or more of any character until the next part of
            the pattern is matched, and place the results in a group.
   [\">]    Look for a quote character or a greater-than (">") character.
*/


 </source>
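
The pattern handed to Pattern.compile() in retrieveLinks() is the one described by the quantifier and character-sequence notes above. The short sketch below shows how that pattern behaves on a sample HTML fragment; the fragment and the class name LinkPatternDemo are illustrative only and are not part of the original program.

   <source lang="java">

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LinkPatternDemo {

  public static void main(String[] args) {
    // Illustrative HTML fragment containing two anchor tags.
    String html = "<p><a href=\"http://example.com/page.html\">Page</a>"
        + " <A HREF=docs/index.html>Docs</A></p>";
    // The same link-matching pattern used in SearchCrawler.retrieveLinks().
    Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\" |>]",
        Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(html);
    while (m.find()) {
      // Group 1 holds the raw link target captured by (.*?).
      System.out.println(m.group(1));
    }
  }

}
 </source>

Against this fragment the matcher prints http://example.com/page.html and docs/index.html; SearchCrawler then resolves relative targets such as the second one against the page's host and path before adding them to the To Crawl list.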
   
  
 
  



Web crawler

   <source lang="java">

// Revised from: http://java.sun.com/developer/technicalArticles/ThirdParty/WebCrawler/

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Vector;

public class WebCrawler implements Runnable {

 public static final String SEARCH = "Search";
 public static final String STOP = "Stop";
 public static final String DISALLOW = "Disallow:";
 public static final int SEARCH_LIMIT = 50;
 Vector vectorToSearch = new Vector();
 Vector vectorSearched = new Vector();
 Vector vectorMatches = new Vector();
 Thread searchThread;
 public WebCrawler() {
   // ("text/html");
   // ("audio/basic");
   // ("audio/au");
   // ("audio/aiff");
   // ("audio/wav");
   // ("video/mpeg");
   // ("video/x-avi");
   URLConnection.setDefaultAllowUserInteraction(false);
   searchThread = new Thread(this);
   searchThread.start();
 }
 public void run() {
   String strURL = "http://www.google.ru";
   String strTargetType = "text/html";
   int numberSearched = 0;
   int numberFound = 0;
   if (strURL.length() == 0) {
     System.out.println("ERROR: must enter a starting URL");
     return;
   }
   vectorToSearch = new Vector();
   vectorSearched = new Vector();
   vectorMatches = new Vector();
   vectorToSearch.addElement(strURL);
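    // Keep crawling while URLs remain to be searched and this thread is
    // still the active search thread.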
   while ((vectorToSearch.size() > 0)
       && (Thread.currentThread() == searchThread)) {
     strURL = (String) vectorToSearch.elementAt(0);
     System.out.println("searching " + strURL);
     URL url = null;
     try {
       url = new URL(strURL);
     } catch (MalformedURLException e1) {
       // TODO Auto-generated catch block
       e1.printStackTrace();
     }
     vectorToSearch.removeElementAt(0);
     vectorSearched.addElement(strURL);
     try {
       URLConnection urlConnection = url.openConnection();
       urlConnection.setAllowUserInteraction(false);
       InputStream urlStream = url.openStream();
       String type = urlConnection.guessContentTypeFromStream(urlStream);
       if (type == null)
         break;
        if (type.compareTo("text/html") != 0)
         break;
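        // Read the entire page into a string, one buffer at a time.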
       byte b[] = new byte[5000];
       int numRead = urlStream.read(b);
       String content = new String(b, 0, numRead);
       while (numRead != -1) {
         if (Thread.currentThread() != searchThread)
           break;
         numRead = urlStream.read(b);
         if (numRead != -1) {
           String newContent = new String(b, 0, numRead);
           content += newContent;
         }
       }
       urlStream.close();
       if (Thread.currentThread() != searchThread)
         break;
       String lowerCaseContent = content.toLowerCase();
       int index = 0;
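        // Scan the page for anchor tags and pull out each href target.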
       while ((index = lowerCaseContent.indexOf("<a", index)) != -1) {
         if ((index = lowerCaseContent.indexOf("href", index)) == -1)
           break;
         if ((index = lowerCaseContent.indexOf("=", index)) == -1)
           break;
         if (Thread.currentThread() != searchThread)
           break;
         index++;
         String remaining = content.substring(index);
         StringTokenizer st = new StringTokenizer(remaining, "\t\n\r\">#");
         String strLink = st.nextToken();
         URL urlLink;
         try {
           urlLink = new URL(url, strLink);
           strLink = urlLink.toString();
         } catch (MalformedURLException e) {
           System.out.println("ERROR: bad URL " + strLink);
           continue;
         }
          if (urlLink.getProtocol().compareTo("http") != 0)
           break;
         if (Thread.currentThread() != searchThread)
           break;
         try {
           URLConnection urlLinkConnection = urlLink.openConnection();
           urlLinkConnection.setAllowUserInteraction(false);
           InputStream linkStream = urlLink.openStream();
           String strType = urlLinkConnection
               .guessContentTypeFromStream(linkStream);
           linkStream.close();
           if (strType == null)
             break;
            if (strType.compareTo("text/html") == 0) {
             if ((!vectorSearched.contains(strLink))
                 && (!vectorToSearch.contains(strLink))) {
               vectorToSearch.addElement(strLink);
             }
           }
            if (strType.compareTo(strTargetType) == 0) {
             if (vectorMatches.contains(strLink) == false) {
               System.out.println(strLink);
               vectorMatches.addElement(strLink);
               numberFound++;
               if (numberFound >= SEARCH_LIMIT)
                 break;
             }
           }
         } catch (IOException e) {
            System.out.println("ERROR: couldn't open URL " + strLink);
           continue;
         }
       }
     } catch (IOException e) {
        System.out.println("ERROR: couldn't open URL " + strURL);
       break;
     }
     numberSearched++;
     if (numberSearched >= SEARCH_LIMIT)
       break;
   }
   if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
     System.out.println("reached search limit of " + SEARCH_LIMIT);
   else
     System.out.println("done");
   searchThread = null;
 }
 public static void main(String argv[]) {
   WebCrawler applet = new WebCrawler();
   /*
    * Behind a firewall set your proxy and port here!
    */
   Properties props = new Properties(System.getProperties());
   props.put("http.proxySet", "true");
   props.put("http.proxyHost", "webcache-cup");
   props.put("http.proxyPort", "8080");
   Properties newprops = new Properties(props);
   System.setProperties(newprops);
 }

}


 </source>
   
  
 
  



Web Crawler from Sun Microsystems

   <source lang="java">

/*
 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. You may not
 * modify, use, reproduce, or distribute this software except in compliance
 * with the terms of the License at:
 */

import java.applet.Applet;
import java.awt.BorderLayout;
import java.awt.Button;
import java.awt.Choice;
import java.awt.FlowLayout;
import java.awt.Frame;
import java.awt.Graphics;
import java.awt.Label;
import java.awt.List;
import java.awt.Panel;
import java.awt.TextField;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Vector;

public class WebCrawler extends Applet implements ActionListener, Runnable {

   public static final String SEARCH = "Search";
   public static final String STOP = "Stop";
   public static final String DISALLOW = "Disallow:";
   public static final int    SEARCH_LIMIT = 50;
   Panel   panelMain;
   List    listMatches;
   Label   labelStatus;
   // URLs to be searched
   Vector vectorToSearch;
   // URLs already searched
   Vector vectorSearched;
   // URLs which match
   Vector vectorMatches;
   Thread searchThread;
   TextField textURL;
   Choice    choiceType;
   public void init() {
 // set up the main UI panel
 panelMain = new Panel();
 panelMain.setLayout(new BorderLayout(5, 5));
 // text entry components
 Panel panelEntry = new Panel();
 panelEntry.setLayout(new BorderLayout(5, 5));
 Panel panelURL = new Panel();
 panelURL.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
 Label labelURL = new Label("Starting URL: ", Label.RIGHT);
 panelURL.add(labelURL);
 textURL = new TextField("", 40);
 panelURL.add(textURL);
 panelEntry.add("North", panelURL);
 Panel panelType = new Panel();
 panelType.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
 Label labelType = new Label("Content type: ", Label.RIGHT);
 panelType.add(labelType);
 choiceType = new Choice();
 choiceType.addItem("text/html");
 choiceType.addItem("audio/basic");
 choiceType.addItem("audio/au");
 choiceType.addItem("audio/aiff");
 choiceType.addItem("audio/wav");
 choiceType.addItem("video/mpeg");
 choiceType.addItem("video/x-avi");
 panelType.add(choiceType);
 panelEntry.add("South", panelType);
 panelMain.add("North", panelEntry);
 // list of result URLs
 Panel panelListButtons = new Panel();
 panelListButtons.setLayout(new BorderLayout(5, 5));
 Panel panelList = new Panel();
 panelList.setLayout(new BorderLayout(5, 5));
 Label labelResults = new Label("Search results");
 panelList.add("North", labelResults);
 Panel panelListCurrent = new Panel();
 panelListCurrent.setLayout(new BorderLayout(5, 5));
 listMatches = new List(10);
 panelListCurrent.add("North", listMatches);
 labelStatus = new Label("");
 panelListCurrent.add("South", labelStatus);
 panelList.add("South", panelListCurrent);
 panelListButtons.add("North", panelList);
 // control buttons
 Panel panelButtons = new Panel();
 Button buttonSearch = new Button(SEARCH);
 buttonSearch.addActionListener(this);
 panelButtons.add(buttonSearch);
 Button buttonStop = new Button(STOP);
 buttonStop.addActionListener(this);
 panelButtons.add(buttonStop);
 panelListButtons.add("South", panelButtons);
 panelMain.add("South", panelListButtons);
 add(panelMain);
 setVisible(true);
 repaint(); 
 // initialize search data structures
 vectorToSearch = new Vector();
 vectorSearched = new Vector();
 vectorMatches = new Vector();
 // set default for URL access
 URLConnection.setDefaultAllowUserInteraction(false);
   }
   public void start() {
   }
   public void stop() {
 if (searchThread != null) {
     setStatus("stopping...");
     searchThread = null;
 }
   }
   public void destroy() {
   }
   boolean robotSafe(URL url) {
 String strHost = url.getHost();
 // form URL of the robots.txt file
 String strRobot = "http://" + strHost + "/robots.txt";
 URL urlRobot;
 try { 
     urlRobot = new URL(strRobot);
 } catch (MalformedURLException e) {
      // something weird is happening, so don't trust it
     return false;
 }
 String strCommands;
 try {
     InputStream urlRobotStream = urlRobot.openStream();
     // read in entire file
     byte b[] = new byte[1000];
     int numRead = urlRobotStream.read(b);
     strCommands = new String(b, 0, numRead);
     while (numRead != -1) {
   if (Thread.currentThread() != searchThread)
       break;
   numRead = urlRobotStream.read(b);
   if (numRead != -1) {
       String newCommands = new String(b, 0, numRead);
       strCommands += newCommands;
   }
     }
     urlRobotStream.close();
 } catch (IOException e) {
     // if there is no robots.txt file, it is OK to search
     return true;
 }
 // assume that this robots.txt refers to us and 
 // search for "Disallow:" commands.
 String strURL = url.getFile();
 int index = 0;
 while ((index = strCommands.indexOf(DISALLOW, index)) != -1) {
     index += DISALLOW.length();
     String strPath = strCommands.substring(index);
     StringTokenizer st = new StringTokenizer(strPath);
     if (!st.hasMoreTokens())
   break;
     
     String strBadPath = st.nextToken();
     // if the URL starts with a disallowed path, it is not safe
     if (strURL.indexOf(strBadPath) == 0)
   return false;
 }
 return true;
   }
   public void paint(Graphics g) {
        //Draw a Rectangle around the applet's display area.
       g.drawRect(0, 0, getSize().width - 1, getSize().height - 1);
 panelMain.paint(g);
 panelMain.paintComponents(g);
 // update(g);
 // panelMain.update(g);
   }
   public void run() {
 String strURL = textURL.getText();
 String strTargetType = choiceType.getSelectedItem();
 int numberSearched = 0;
 int numberFound = 0;
 if (strURL.length() == 0) {
     setStatus("ERROR: must enter a starting URL");
     return;
 }
 // initialize search data structures
 vectorToSearch.removeAllElements();
 vectorSearched.removeAllElements();
 vectorMatches.removeAllElements();
 listMatches.removeAll();
 vectorToSearch.addElement(strURL);
 while ((vectorToSearch.size() > 0) 
   && (Thread.currentThread() == searchThread)) {
     // get the first element from the to be searched list
     strURL = (String) vectorToSearch.elementAt(0);
     setStatus("searching " + strURL);
     URL url;
     try { 
   url = new URL(strURL);
     } catch (MalformedURLException e) {
   setStatus("ERROR: invalid URL " + strURL);
   break;
     }
     // mark the URL as searched (we want this one way or the other)
     vectorToSearch.removeElementAt(0);
     vectorSearched.addElement(strURL);
     // can only search http: protocol URLs
      if (url.getProtocol().compareTo("http") != 0)
   break;
     // test to make sure it is before searching
     if (!robotSafe(url))
   break;
     try {
   // try opening the URL
   URLConnection urlConnection = url.openConnection();
   urlConnection.setAllowUserInteraction(false);
   InputStream urlStream = url.openStream();
   String type 
     = urlConnection.guessContentTypeFromStream(urlStream);
   if (type == null)
       break;
    if (type.compareTo("text/html") != 0)
       break;
   // search the input stream for links
   // first, read in the entire URL
   byte b[] = new byte[1000];
   int numRead = urlStream.read(b);
   String content = new String(b, 0, numRead);
   while (numRead != -1) {
       if (Thread.currentThread() != searchThread)
     break;
       numRead = urlStream.read(b);
       if (numRead != -1) {
     String newContent = new String(b, 0, numRead);
     content += newContent;
       }
   }
   urlStream.close();
   if (Thread.currentThread() != searchThread)
       break;
   String lowerCaseContent = content.toLowerCase();
   int index = 0;
   while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
   {
       if ((index = lowerCaseContent.indexOf("href", index)) == -1) 
     break;
       if ((index = lowerCaseContent.indexOf("=", index)) == -1) 
     break;
       
       if (Thread.currentThread() != searchThread)
     break;
       index++;
       String remaining = content.substring(index);
       StringTokenizer st 
         = new StringTokenizer(remaining, "\t\n\r\">#");
       String strLink = st.nextToken();
       URL urlLink;
       try {
     urlLink = new URL(url, strLink);
     strLink = urlLink.toString();
       } catch (MalformedURLException e) {
     setStatus("ERROR: bad URL " + strLink);
     continue;
       }
       // only look at http links
        if (urlLink.getProtocol().compareTo("http") != 0)
     break;
       if (Thread.currentThread() != searchThread)
     break;
       try {
     // try opening the URL
     URLConnection urlLinkConnection 
       = urlLink.openConnection();
     urlLinkConnection.setAllowUserInteraction(false);
     InputStream linkStream = urlLink.openStream();
     String strType 
       = urlLinkConnection.guessContentTypeFromStream(linkStream);
     linkStream.close();
     // if another page, add to the end of search list
     if (strType == null)
         break;
      if (strType.compareTo("text/html") == 0) {
         // check to see if this URL has already been 
         // searched or is going to be searched
         if ((!vectorSearched.contains(strLink)) 
           && (!vectorToSearch.contains(strLink))) {
       // test to make sure it is robot-safe!
       if (robotSafe(urlLink))
           vectorToSearch.addElement(strLink);
         }
     }
     // if the proper type, add it to the results list
     // unless we have already seen it
      if (strType.compareTo(strTargetType) == 0) {
         if (vectorMatches.contains(strLink) == false) {
       listMatches.add(strLink);
       vectorMatches.addElement(strLink);
       numberFound++;
       if (numberFound >= SEARCH_LIMIT)
           break;
         }
     }
       } catch (IOException e) {
      setStatus("ERROR: couldn't open URL " + strLink);
     continue;
       }
   }
     } catch (IOException e) {
    setStatus("ERROR: couldn't open URL " + strURL);
   break;
     }
     numberSearched++;
     if (numberSearched >= SEARCH_LIMIT)
   break;
 }
 if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
     setStatus("reached search limit of " + SEARCH_LIMIT);
 else
     setStatus("done");
 searchThread = null;
 // searchThread.stop();
   }
   void setStatus(String status) {
 labelStatus.setText(status);
   }
   public void actionPerformed(ActionEvent event) {
 String command = event.getActionCommand();
  if (command.rupareTo(SEARCH) == 0) {
     setStatus("searching...");
     // launch a thread to do the search
     if (searchThread == null) {
   searchThread = new Thread(this);
     }
     searchThread.start();
 }
  else if (command.compareTo(STOP) == 0) {
     stop();
 }
   }
       public static void main (String argv[])
       {
               Frame f = new Frame("WebFrame");
               WebCrawler applet = new WebCrawler();
   f.add("Center", applet);

                /*
                 * Behind a firewall set your proxy and port here!
                 */
                Properties props = new Properties(System.getProperties());
                props.put("http.proxySet", "true");
                props.put("http.proxyHost", "webcache-cup");
                props.put("http.proxyPort", "8080");
                Properties newprops = new Properties(props);
                System.setProperties(newprops);

               applet.init();
               applet.start();
               f.pack();
               f.show();
       }

}

 </source>