Java/Network Protocol/Crawler
Search Crawler
<source lang="java">
// The SearchCrawler class is shown here and is examined in detail in the
// following sections. Notice that it extends JFrame:

/*
 * Chapter 6 - Crawling the Web with Java
 * The Art of Java
 * by Herbert Schildt and James Holmes
 * McGraw-Hill/Osborne 2003
 */
import java.awt.BorderLayout;
import java.awt.Component;
import java.awt.Cursor;
import java.awt.Font;
import java.awt.GridBagConstraints;
import java.awt.GridBagLayout;
import java.awt.Insets;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.KeyEvent;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.BorderFactory;
import javax.swing.JButton;
import javax.swing.JCheckBox;
import javax.swing.JComboBox;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JMenu;
import javax.swing.JMenuBar;
import javax.swing.JMenuItem;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JProgressBar;
import javax.swing.JScrollPane;
import javax.swing.JSeparator;
import javax.swing.JTable;
import javax.swing.JTextField;
import javax.swing.table.DefaultTableModel;

/**
 * The Search Web Crawler.
 *
 * A Swing window that crawls outward from a start URL, honours each host's
 * robots.txt "Disallow:" lines, and lists (and logs to a file) every crawled
 * page that contains all of the search terms.
 *
 * NOTE: the worker thread started by search() updates Swing components
 * directly instead of going through the event dispatch thread.
 */
public class SearchCrawler extends JFrame {
  // Max URLs drop-down values.
  private static final String[] MAX_URLS = { "50", "100", "500", "1000" };

  // Value of maxUrls that crawl() and updateStats() treat as "no limit".
  private static final int NO_URL_LIMIT = -1;

  // Link matching pattern; compiled once because it never changes.
  private static final Pattern LINK_PATTERN = Pattern.compile(
      "<a\\s+href\\s*=\\s*\"?(.*?)[\" |>]", Pattern.CASE_INSENSITIVE);

  // Pattern used to split the search string into individual terms.
  private static final Pattern WHITESPACE_PATTERN = Pattern.compile("[\\s]+");

  // Cache of robot disallow lists, keyed by lower-cased host name.
  private HashMap disallowListCache = new HashMap();

  // Search GUI controls.
  private JTextField startTextField;
  private JComboBox maxComboBox;
  private JCheckBox limitCheckBox;
  private JTextField logTextField;
  private JTextField searchTextField;
  private JCheckBox caseCheckBox;
  private JButton searchButton;

  // Search stats GUI controls.
  private JLabel crawlingLabel2;
  private JLabel crawledLabel2;
  private JLabel toCrawlLabel2;
  private JProgressBar progressBar;
  private JLabel matchesLabel2;

  // Table listing search matches.
  private JTable table;

  // Flag for whether or not crawling is underway. Written by the GUI thread
  // (Stop button) and read by the crawler thread, hence volatile.
  private volatile boolean crawling;

  // Matches log file print writer.
  private PrintWriter logFileWriter;

  /** Constructor for Search Web Crawler: builds the whole window. */
  public SearchCrawler() {
    // Set application title.
    setTitle("Search Crawler");

    // Set window size.
    setSize(600, 600);

    // Handle window closing events.
    addWindowListener(new WindowAdapter() {
      public void windowClosing(WindowEvent e) {
        actionExit();
      }
    });

    // Set up File menu.
    setJMenuBar(createMenuBar());

    // Set up search panel.
    JPanel searchPanel = createSearchPanel();

    // Set up matches table.
    table = new JTable(createMatchesTableModel());

    // Set up Matches panel.
    JPanel matchesPanel = new JPanel();
    matchesPanel.setBorder(BorderFactory.createTitledBorder("Matches"));
    matchesPanel.setLayout(new BorderLayout());
    matchesPanel.add(new JScrollPane(table), BorderLayout.CENTER);

    // Add panels to display.
    getContentPane().setLayout(new BorderLayout());
    getContentPane().add(searchPanel, BorderLayout.NORTH);
    getContentPane().add(matchesPanel, BorderLayout.CENTER);
  }

  /** Builds the menu bar holding the File > Exit item. */
  private JMenuBar createMenuBar() {
    JMenuBar menuBar = new JMenuBar();
    JMenu fileMenu = new JMenu("File");
    fileMenu.setMnemonic(KeyEvent.VK_F);
    JMenuItem fileExitMenuItem = new JMenuItem("Exit", KeyEvent.VK_X);
    fileExitMenuItem.addActionListener(new ActionListener() {
      public void actionPerformed(ActionEvent e) {
        actionExit();
      }
    });
    fileMenu.add(fileExitMenuItem);
    menuBar.add(fileMenu);
    return menuBar;
  }

  /**
   * Builds the search form and the statistics rows, assigning the control
   * fields of this class along the way. Components are added in row order
   * because the GridBagLayout positions them relative to one another.
   */
  private JPanel createSearchPanel() {
    JPanel panel = new JPanel();
    GridBagLayout layout = new GridBagLayout();
    panel.setLayout(layout);
    GridBagConstraints constraints;

    // Row: start URL.
    addComponent(panel, layout, new JLabel("Start URL:"), labelConstraints(0));
    startTextField = new JTextField();
    addComponent(panel, layout, startTextField, rowEndConstraints(0));

    // Row: max URLs, host limit check box, and a blank cell ending the row.
    addComponent(panel, layout, new JLabel("Max URLs to Crawl:"),
        labelConstraints(0));

    maxComboBox = new JComboBox(MAX_URLS);
    maxComboBox.setEditable(true);
    constraints = new GridBagConstraints();
    constraints.insets = new Insets(5, 5, 0, 0);
    addComponent(panel, layout, maxComboBox, constraints);

    limitCheckBox = new JCheckBox("Limit crawling to Start URL site");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.WEST;
    constraints.insets = new Insets(0, 10, 0, 0);
    addComponent(panel, layout, limitCheckBox, constraints);

    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    addComponent(panel, layout, new JLabel(), constraints);

    // Row: matches log file, defaulting to crawler.log in the working dir.
    addComponent(panel, layout, new JLabel("Matches Log File:"),
        labelConstraints(0));
    String file = System.getProperty("user.dir")
        + System.getProperty("file.separator") + "crawler.log";
    logTextField = new JTextField(file);
    addComponent(panel, layout, logTextField, rowEndConstraints(0));

    // Row: search string and case-sensitivity check box.
    addComponent(panel, layout, new JLabel("Search String:"),
        labelConstraints(0));

    searchTextField = new JTextField();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.insets = new Insets(5, 5, 0, 0);
    constraints.gridwidth = 2;
    constraints.weightx = 1.0d;
    addComponent(panel, layout, searchTextField, constraints);

    caseCheckBox = new JCheckBox("Case Sensitive");
    constraints = new GridBagConstraints();
    constraints.insets = new Insets(5, 5, 0, 5);
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    addComponent(panel, layout, caseCheckBox, constraints);

    // Row: Search/Stop button.
    searchButton = new JButton("Search");
    searchButton.addActionListener(new ActionListener() {
      public void actionPerformed(ActionEvent e) {
        actionSearch();
      }
    });
    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 5, 5);
    addComponent(panel, layout, searchButton, constraints);

    // Row: separator between the form and the statistics.
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 5, 5);
    addComponent(panel, layout, new JSeparator(), constraints);

    // Rows: crawling statistics.
    addComponent(panel, layout, new JLabel("Crawling:"), labelConstraints(0));
    crawlingLabel2 = createValueLabel();
    addComponent(panel, layout, crawlingLabel2, rowEndConstraints(0));

    addComponent(panel, layout, new JLabel("Crawled URLs:"),
        labelConstraints(0));
    crawledLabel2 = createValueLabel();
    addComponent(panel, layout, crawledLabel2, rowEndConstraints(0));

    addComponent(panel, layout, new JLabel("URLs to Crawl:"),
        labelConstraints(0));
    toCrawlLabel2 = createValueLabel();
    addComponent(panel, layout, toCrawlLabel2, rowEndConstraints(0));

    addComponent(panel, layout, new JLabel("Crawling Progress:"),
        labelConstraints(0));
    progressBar = new JProgressBar();
    progressBar.setMinimum(0);
    progressBar.setStringPainted(true);
    addComponent(panel, layout, progressBar, rowEndConstraints(0));

    addComponent(panel, layout, new JLabel("Search Matches:"),
        labelConstraints(10));
    matchesLabel2 = createValueLabel();
    addComponent(panel, layout, matchesLabel2, rowEndConstraints(10));

    return panel;
  }

  /** Registers the constraints for a component and adds it to the panel. */
  private static void addComponent(JPanel panel, GridBagLayout layout,
      Component component, GridBagConstraints constraints) {
    layout.setConstraints(component, constraints);
    panel.add(component);
  }

  /** Constraints for a right-aligned caption label starting a row. */
  private static GridBagConstraints labelConstraints(int bottomInset) {
    GridBagConstraints constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, bottomInset, 0);
    return constraints;
  }

  /** Constraints for a horizontally stretched component ending a row. */
  private static GridBagConstraints rowEndConstraints(int bottomInset) {
    GridBagConstraints constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, bottomInset, 5);
    return constraints;
  }

  /** Creates an empty label that uses the plain variant of its font. */
  private static JLabel createValueLabel() {
    JLabel label = new JLabel();
    label.setFont(label.getFont().deriveFont(Font.PLAIN));
    return label;
  }

  /** Creates an empty, read-only table model with a single "URL" column. */
  private static DefaultTableModel createMatchesTableModel() {
    return new DefaultTableModel(new Object[][] {}, new String[] { "URL" }) {
      public boolean isCellEditable(int row, int column) {
        return false;
      }
    };
  }

  /** Enables or disables every search input control at once. */
  private void setSearchControlsEnabled(boolean enabled) {
    startTextField.setEnabled(enabled);
    maxComboBox.setEnabled(enabled);
    limitCheckBox.setEnabled(enabled);
    logTextField.setEnabled(enabled);
    searchTextField.setEnabled(enabled);
    caseCheckBox.setEnabled(enabled);
  }

  /** Puts the form back into its idle state after a search ends or fails. */
  private void restoreSearchControls() {
    // Enable search controls.
    setSearchControlsEnabled(true);

    // Switch search button back to "Search."
    searchButton.setText("Search");

    // Return to default cursor.
    setCursor(Cursor.getDefaultCursor());
  }

  // Exit this program.
  private void actionExit() {
    System.exit(0);
  }

  /**
   * Handle Search/Stop button being clicked. While a crawl is running the
   * click only clears the crawling flag; otherwise the form is validated and,
   * if valid, a new search is started.
   */
  private void actionSearch() {
    // If stop button clicked, turn crawling flag off.
    if (crawling) {
      crawling = false;
      return;
    }

    ArrayList errorList = new ArrayList();

    // Validate that start URL has been entered and is well formed.
    String startUrl = startTextField.getText().trim();
    if (startUrl.length() < 1) {
      errorList.add("Missing Start URL.");
    } else if (verifyUrl(startUrl) == null) {
      errorList.add("Invalid Start URL.");
    }

    // Validate that Max URLs is either empty or is a number. An empty value
    // means "no limit"; it must not become 0, because crawl() stops as soon
    // as the crawled count equals maxUrls.
    int maxUrls = NO_URL_LIMIT;
    Object selectedMax = maxComboBox.getSelectedItem();
    String max = (selectedMax == null) ? "" : selectedMax.toString().trim();
    if (max.length() > 0) {
      maxUrls = 0;
      try {
        maxUrls = Integer.parseInt(max);
      } catch (NumberFormatException e) {
        // Leave maxUrls at 0 so the range check below reports the error.
      }
      if (maxUrls < 1) {
        errorList.add("Invalid Max URLs value.");
      }
    }

    // Validate that matches log file has been entered.
    String logFile = logTextField.getText().trim();
    if (logFile.length() < 1) {
      errorList.add("Missing Matches Log File.");
    }

    // Validate that search string has been entered.
    String searchString = searchTextField.getText().trim();
    if (searchString.length() < 1) {
      errorList.add("Missing Search String.");
    }

    // Show errors, if any, and return.
    if (errorList.size() > 0) {
      StringBuffer message = new StringBuffer();
      // Concatenate errors into single message, one per line.
      for (int i = 0; i < errorList.size(); i++) {
        if (i > 0) {
          message.append("\n");
        }
        message.append(errorList.get(i));
      }
      showError(message.toString());
      return;
    }

    // Remove "www" from start URL if present.
    startUrl = removeWwwFromUrl(startUrl);

    // Start the Search Crawler.
    search(logFile, startUrl, maxUrls, searchString);
  }

  /**
   * Runs a complete search in a new thread: locks the form, opens the log
   * file, crawls, and finally restores the form.
   */
  private void search(final String logFile, final String startUrl,
      final int maxUrls, final String searchString) {
    Thread thread = new Thread(new Runnable() {
      public void run() {
        // Show hour glass cursor while crawling is under way.
        setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));

        // Disable search controls.
        setSearchControlsEnabled(false);

        // Switch Search button to "Stop."
        searchButton.setText("Stop");

        // Reset stats.
        table.setModel(createMatchesTableModel());
        updateStats(startUrl, 0, 0, maxUrls);

        // Open matches log file.
        try {
          logFileWriter = new PrintWriter(new FileWriter(logFile));
        } catch (Exception e) {
          // Undo the UI changes above; otherwise the form would stay
          // disabled with a wait cursor and a "Stop" button.
          restoreSearchControls();
          showError("Unable to open matches log file.");
          return;
        }

        // Turn crawling flag on.
        crawling = true;

        // Perform the actual crawling.
        crawl(startUrl, maxUrls, limitCheckBox.isSelected(), searchString,
            caseCheckBox.isSelected());

        // Turn crawling flag off.
        crawling = false;

        // Close matches log file.
        try {
          logFileWriter.close();
        } catch (Exception e) {
          showError("Unable to close matches log file.");
        }

        // Mark search as done.
        crawlingLabel2.setText("Done");

        restoreSearchControls();

        // Show message if search string not found.
        if (table.getRowCount() == 0) {
          JOptionPane.showMessageDialog(SearchCrawler.this,
              "Your Search String was not found. Please try another.",
              "Search String Not Found", JOptionPane.WARNING_MESSAGE);
        }
      }
    });
    thread.start();
  }

  // Show dialog box with error message.
  private void showError(String message) {
    JOptionPane.showMessageDialog(this, message, "Error",
        JOptionPane.ERROR_MESSAGE);
  }

  /**
   * Update crawling stats. When maxUrls is -1 (no limit) the progress bar
   * maximum tracks the number of known URLs instead.
   */
  private void updateStats(String currentUrl, int crawled, int toCrawl,
      int maxUrls) {
    crawlingLabel2.setText(currentUrl);
    crawledLabel2.setText(String.valueOf(crawled));
    toCrawlLabel2.setText(String.valueOf(toCrawl));

    // Update progress bar.
    if (maxUrls == NO_URL_LIMIT) {
      progressBar.setMaximum(crawled + toCrawl);
    } else {
      progressBar.setMaximum(maxUrls);
    }
    progressBar.setValue(crawled);

    matchesLabel2.setText(String.valueOf(table.getRowCount()));
  }

  // Add match to matches table and log file.
  private void addMatch(String url) {
    // Add URL to matches table.
    DefaultTableModel model = (DefaultTableModel) table.getModel();
    model.addRow(new Object[] { url });

    // Add URL to matches log file.
    try {
      logFileWriter.println(url);
    } catch (Exception e) {
      showError("Unable to log match.");
    }
  }

  /**
   * Verify URL format.
   *
   * @return the parsed URL, or null if the text does not start with
   *         "http://" or cannot be parsed
   */
  private static URL verifyUrl(String url) {
    // Only allow HTTP URLs.
    if (!url.toLowerCase().startsWith("http://")) {
      return null;
    }

    // Verify format of URL.
    try {
      return new URL(url);
    } catch (Exception e) {
      return null;
    }
  }

  /**
   * Check if robot is allowed to access the given URL, downloading and
   * caching the host's disallow list on first use.
   */
  private boolean isRobotAllowed(URL urlToCheck) {
    String host = urlToCheck.getHost().toLowerCase();

    // Retrieve host's disallow list from cache.
    ArrayList disallowList = (ArrayList) disallowListCache.get(host);

    // If list is not in the cache, download and cache it.
    if (disallowList == null) {
      disallowList = downloadDisallowList(host);
      disallowListCache.put(host, disallowList);
    }

    // Loop through disallow list to see if crawling is allowed for the
    // given URL.
    String file = urlToCheck.getFile();
    for (int i = 0; i < disallowList.size(); i++) {
      String disallow = (String) disallowList.get(i);
      if (file.startsWith(disallow)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Downloads http://host/robots.txt and returns the paths named on its
   * "Disallow:" lines. Returns an empty list if the file cannot be read, so
   * the robot is assumed to be allowed in that case.
   */
  private static ArrayList downloadDisallowList(String host) {
    ArrayList disallowList = new ArrayList();
    BufferedReader reader = null;
    try {
      URL robotsFileUrl = new URL("http://" + host + "/robots.txt");

      // Open connection to robot file URL for reading.
      reader = new BufferedReader(
          new InputStreamReader(robotsFileUrl.openStream()));

      // Read robot file, creating list of disallowed paths.
      String line;
      while ((line = reader.readLine()) != null) {
        if (line.indexOf("Disallow:") == 0) {
          String disallowPath = line.substring("Disallow:".length());

          // Check disallow path for comments and remove if present.
          int commentIndex = disallowPath.indexOf("#");
          if (commentIndex != -1) {
            disallowPath = disallowPath.substring(0, commentIndex);
          }

          // Remove leading or trailing spaces from disallow path.
          disallowPath = disallowPath.trim();

          // An empty "Disallow:" value disallows nothing. Storing "" would
          // make startsWith("") match every path and block the whole host.
          if (disallowPath.length() > 0) {
            disallowList.add(disallowPath);
          }
        }
      }
    } catch (Exception e) {
      // Assume robot is allowed since an exception is thrown if the robot
      // file doesn't exist; discard anything that was read before failing.
      disallowList.clear();
    } finally {
      closeQuietly(reader);
    }
    return disallowList;
  }

  /** Closes the reader if it was opened, ignoring any failure to close. */
  private static void closeQuietly(BufferedReader reader) {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ignored) {
        // Nothing useful can be done if closing fails.
      }
    }
  }

  /**
   * Download page at given URL.
   *
   * @return the page text with its lines separated by "\n", or null if the
   *         page could not be read
   */
  private static String downloadPage(URL pageUrl) {
    BufferedReader reader = null;
    try {
      // Open connection to URL for reading.
      reader = new BufferedReader(
          new InputStreamReader(pageUrl.openStream()));

      // Read page into buffer. The line separator is kept so that tokens on
      // adjacent lines (e.g. "<a" and "href") do not run together.
      String line;
      StringBuffer pageBuffer = new StringBuffer();
      while ((line = reader.readLine()) != null) {
        pageBuffer.append(line).append('\n');
      }
      return pageBuffer.toString();
    } catch (Exception e) {
      // Any failure means the page is unavailable; the caller skips it.
      return null;
    } finally {
      closeQuietly(reader);
    }
  }

  // Remove leading "www" from a URL's host if present.
  private static String removeWwwFromUrl(String url) {
    int index = url.indexOf("://www.");
    if (index != -1) {
      return url.substring(0, index + 3) + url.substring(index + 7);
    }
    return url;
  }

  /**
   * Parse through page contents and retrieve links.
   *
   * @param pageUrl      URL of the page; used to resolve relative links
   * @param pageContents text of the page
   * @param crawledList  URLs already crawled; links found in it are skipped
   * @param limitHost    if true, keep only links on the same host as pageUrl
   * @return list of absolute link strings, in document order
   */
  private static ArrayList retrieveLinks(URL pageUrl, String pageContents,
      HashSet crawledList, boolean limitHost) {
    Matcher m = LINK_PATTERN.matcher(pageContents);

    // Create list of link matches.
    ArrayList linkList = new ArrayList();
    while (m.find()) {
      String link = m.group(1).trim();

      // Skip empty links.
      if (link.length() < 1) {
        continue;
      }

      // Skip links that are just page anchors.
      if (link.charAt(0) == '#') {
        continue;
      }

      // Skip mailto links.
      if (link.indexOf("mailto:") != -1) {
        continue;
      }

      // Skip JavaScript links.
      if (link.toLowerCase().indexOf("javascript") != -1) {
        continue;
      }

      // Prefix absolute and relative URLs if necessary.
      if (link.indexOf("://") == -1) {
        if (link.charAt(0) == '/') {
          // Handle absolute URLs.
          link = "http://" + pageUrl.getHost() + link;
        } else {
          // Handle relative URLs.
          String file = pageUrl.getFile();
          if (file.indexOf("/") == -1) {
            link = "http://" + pageUrl.getHost() + "/" + link;
          } else {
            String path = file.substring(0, file.lastIndexOf("/") + 1);
            link = "http://" + pageUrl.getHost() + path + link;
          }
        }
      }

      // Remove anchors from link.
      int index = link.indexOf("#");
      if (index != -1) {
        link = link.substring(0, index);
      }

      // Remove leading "www" from URL's host if present.
      link = removeWwwFromUrl(link);

      // Verify link and skip if invalid.
      URL verifiedLink = verifyUrl(link);
      if (verifiedLink == null) {
        continue;
      }

      // If specified, limit links to those having the same host as the
      // start URL.
      if (limitHost && !pageUrl.getHost().toLowerCase().equals(
          verifiedLink.getHost().toLowerCase())) {
        continue;
      }

      // Skip link if it has already been crawled.
      if (crawledList.contains(link)) {
        continue;
      }

      // Add link to list.
      linkList.add(link);
    }
    return linkList;
  }

  /**
   * Determine whether or not search string is matched in the given page
   * contents. The search string is split on whitespace and every resulting
   * term must occur somewhere in the page.
   */
  private static boolean searchStringMatches(String pageContents,
      String searchString, boolean caseSensitive) {
    // If case-insensitive search, lowercase page contents for comparison.
    String searchContents = pageContents;
    if (!caseSensitive) {
      searchContents = pageContents.toLowerCase();
    }

    // Split search string into individual terms.
    String[] terms = WHITESPACE_PATTERN.split(searchString);

    // Check to see if each term matches.
    for (int i = 0; i < terms.length; i++) {
      String term = caseSensitive ? terms[i] : terms[i].toLowerCase();
      if (searchContents.indexOf(term) == -1) {
        return false;
      }
    }
    return true;
  }

  /**
   * Perform the actual crawling, searching for the search string. Runs until
   * the crawling flag is cleared, the To Crawl list is empty, or maxUrls
   * pages have been crawled (-1 means no limit).
   */
  public void crawl(String startUrl, int maxUrls, boolean limitHost,
      String searchString, boolean caseSensitive) {
    // Set up crawl lists.
    HashSet crawledList = new HashSet();
    LinkedHashSet toCrawlList = new LinkedHashSet();

    // Add start URL to the to crawl list.
    toCrawlList.add(startUrl);

    // Perform actual crawling by looping through the To Crawl list.
    while (crawling && toCrawlList.size() > 0) {
      // Check to see if the max URL count has been reached, if it was
      // specified.
      if (maxUrls != NO_URL_LIMIT && crawledList.size() == maxUrls) {
        break;
      }

      // Take the oldest URL off the To Crawl list.
      String url = (String) toCrawlList.iterator().next();
      toCrawlList.remove(url);

      // Convert string url to URL object; skip it if it is not a valid
      // HTTP URL (possible when this public method is called directly).
      URL verifiedUrl = verifyUrl(url);
      if (verifiedUrl == null) {
        continue;
      }

      // Skip URL if robots are not allowed to access it.
      if (!isRobotAllowed(verifiedUrl)) {
        continue;
      }

      // Update crawling stats.
      updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);

      // Add page to the crawled list.
      crawledList.add(url);

      // Download the page at the given URL.
      String pageContents = downloadPage(verifiedUrl);

      // If the page was downloaded successfully, retrieve all its links
      // and then see if it contains the search string.
      if (pageContents != null && pageContents.length() > 0) {
        // Retrieve list of valid links from page.
        ArrayList links = retrieveLinks(verifiedUrl, pageContents,
            crawledList, limitHost);

        // Add links to the To Crawl list.
        toCrawlList.addAll(links);

        // Check if search string is present in page, and if so, record a
        // match.
        if (searchStringMatches(pageContents, searchString, caseSensitive)) {
          addMatch(url);
        }
      }

      // Update crawling stats.
      updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
    }
  }

  // Run the Search Crawler.
  public static void main(String[] args) {
    SearchCrawler crawler = new SearchCrawler();
    crawler.setVisible(true);
  }
}

/*
 * A quantifier determines how many times an expression is matched. The
 * quantifiers are shown here:
 *
 *   +   Match one or more.
 *   *   Match zero or more.
 *   ?   Match zero or one.
 */

/*
 * Character Sequence   Explanation
 *
 *   <a        Look for the characters "<a".
 *   \\s+      Look for one or more space characters.
 *   href      Look for the characters "href".
 *   \\s*      Look for zero or more space characters.
 *   =         Look for the character "=".
 *   \\s*      Look for zero or more space characters.
 *   \"?       Look for zero or one quote character.
 *   (.*?)     Look for zero or more of any character until the next part of
 *             the pattern is matched, and place the results in a group.
 *   [\" |>]   Look for a quote, space, vertical bar, or greater than (">")
 *             character.
 */
</source>
Web crawler
<source lang="java">
// Revised from: http://java.sun.com/developer/technicalArticles/ThirdParty/WebCrawler/
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Vector;

/**
 * A console web crawler. Constructing an instance starts a background thread
 * that crawls outward from a fixed start URL, printing every linked URL whose
 * content type equals the target type, until SEARCH_LIMIT pages have been
 * visited or SEARCH_LIMIT matches have been found.
 */
public class WebCrawler implements Runnable {
  public static final String SEARCH = "Search";
  public static final String STOP = "Stop";
  public static final String DISALLOW = "Disallow:";
  public static final int SEARCH_LIMIT = 50;

  // URLs to be searched.
  Vector vectorToSearch = new Vector();
  // URLs already searched.
  Vector vectorSearched = new Vector();
  // URLs which match the target content type.
  Vector vectorMatches = new Vector();

  // Thread running the crawl. The crawl stops as soon as this field no
  // longer refers to the thread executing run().
  Thread searchThread;

  /** Creates the crawler and immediately starts crawling in a new thread. */
  public WebCrawler() {
    // Content types listed by the original source:
    // "text/html", "audio/basic", "audio/au", "audio/aiff", "audio/wav",
    // "video/mpeg", "video/x-avi"
    URLConnection.setDefaultAllowUserInteraction(false);
    searchThread = new Thread(this);
    searchThread.start();
  }

  /** Returns true while the calling thread is still the active crawl thread. */
  private boolean isActive() {
    return Thread.currentThread() == searchThread;
  }

  /**
   * Crawl loop. Pages that are malformed, unreachable, or not HTML are
   * reported and skipped; they do not end the crawl.
   */
  public void run() {
    String strURL = "http://www.google.ru";
    String strTargetType = "text/html";
    int numberSearched = 0;
    int numberFound = 0;

    if (strURL.length() == 0) {
      System.out.println("ERROR: must enter a starting URL");
      return;
    }

    vectorToSearch = new Vector();
    vectorSearched = new Vector();
    vectorMatches = new Vector();
    vectorToSearch.addElement(strURL);

    while (vectorToSearch.size() > 0 && isActive()
        && numberSearched < SEARCH_LIMIT && numberFound < SEARCH_LIMIT) {
      // Move the next URL from the to-search list to the searched list.
      strURL = (String) vectorToSearch.elementAt(0);
      vectorToSearch.removeElementAt(0);
      vectorSearched.addElement(strURL);
      numberSearched++;
      System.out.println("searching " + strURL);

      URL url;
      String content;
      try {
        url = new URL(strURL);
        content = downloadHtml(url);
      } catch (MalformedURLException e) {
        System.out.println("ERROR: bad URL " + strURL);
        continue;
      } catch (IOException e) {
        System.out.println("ERROR: couldn't open URL " + strURL);
        continue;
      }
      if (content == null) {
        // Not an HTML page, or the crawl was stopped while downloading.
        continue;
      }

      Vector links = extractLinks(url, content);
      for (int i = 0; i < links.size() && isActive()
          && numberFound < SEARCH_LIMIT; i++) {
        String strLink = (String) links.elementAt(i);

        // Nothing below could change for a link that is already both
        // queued/searched and recorded as a match, so skip the network probe.
        boolean alreadyQueued = vectorSearched.contains(strLink)
            || vectorToSearch.contains(strLink);
        if (alreadyQueued && vectorMatches.contains(strLink)) {
          continue;
        }

        String strType;
        try {
          strType = probeContentType(new URL(strLink));
        } catch (IOException e) {
          System.out.println("ERROR: couldn't open URL " + strLink);
          continue;
        }
        if (strType == null) {
          continue;
        }

        // HTML pages are queued so their own links get followed later.
        if (strType.equals("text/html") && !alreadyQueued) {
          vectorToSearch.addElement(strLink);
        }

        // Links of the target type are reported as matches.
        if (strType.equals(strTargetType) && !vectorMatches.contains(strLink)) {
          System.out.println(strLink);
          vectorMatches.addElement(strLink);
          numberFound++;
        }
      }
    }

    if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT) {
      System.out.println("reached search limit of " + SEARCH_LIMIT);
    } else {
      System.out.println("done");
    }
    searchThread = null;
  }

  /**
   * Downloads the page at the given URL if it is HTML.
   *
   * @return the page text decoded with the platform default charset, or null
   *         if the content type is not text/html or the crawl was stopped
   * @throws IOException if the page cannot be opened or read
   */
  private String downloadHtml(URL url) throws IOException {
    URLConnection connection = url.openConnection();
    connection.setAllowUserInteraction(false);
    // Buffered so that guessContentTypeFromStream can mark/reset the stream.
    InputStream in = new BufferedInputStream(connection.getInputStream());
    try {
      if (!"text/html".equals(contentTypeOf(connection, in))) {
        return null;
      }
      // Collect all bytes before decoding so that a multi-byte character
      // split across two reads is not corrupted.
      ByteArrayOutputStream bytes = new ByteArrayOutputStream();
      byte[] buffer = new byte[5000];
      int numRead;
      while ((numRead = in.read(buffer)) != -1) {
        if (!isActive()) {
          return null;
        }
        bytes.write(buffer, 0, numRead);
      }
      return bytes.toString();
    } finally {
      in.close();
    }
  }

  /**
   * Opens the link just long enough to determine its content type.
   *
   * @return the content type, or null if it cannot be determined
   * @throws IOException if the link cannot be opened
   */
  private static String probeContentType(URL link) throws IOException {
    URLConnection connection = link.openConnection();
    connection.setAllowUserInteraction(false);
    InputStream in = new BufferedInputStream(connection.getInputStream());
    try {
      return contentTypeOf(connection, in);
    } finally {
      in.close();
    }
  }

  /**
   * Determines the content type by inspecting the first bytes of the stream,
   * falling back to the type reported by the connection (with any parameters
   * such as "; charset=..." removed) when the bytes are inconclusive.
   */
  private static String contentTypeOf(URLConnection connection, InputStream in)
      throws IOException {
    String type = URLConnection.guessContentTypeFromStream(in);
    if (type == null) {
      type = connection.getContentType();
      if (type != null) {
        int semicolon = type.indexOf(';');
        if (semicolon != -1) {
          type = type.substring(0, semicolon);
        }
        type = type.trim().toLowerCase();
      }
    }
    return type;
  }

  /**
   * Extracts the targets of the anchor tags in an HTML page.
   *
   * For every "<a" the value following the next "href" and "=" is read up to
   * the first tab, line break, quote, '>' or '#', and resolved against the
   * page URL. Malformed links are reported and skipped; links whose protocol
   * is not "http" are skipped silently.
   *
   * @param base    URL of the page, used to resolve relative links
   * @param content text of the page
   * @return Vector of absolute link strings in document order
   */
  static Vector extractLinks(URL base, String content) {
    Vector links = new Vector();
    String lowerCaseContent = content.toLowerCase();
    int index = 0;
    while ((index = lowerCaseContent.indexOf("<a", index)) != -1) {
      if ((index = lowerCaseContent.indexOf("href", index)) == -1) {
        break;
      }
      if ((index = lowerCaseContent.indexOf("=", index)) == -1) {
        break;
      }
      index++;

      // Allow blanks between '=' and the value, as in href = "x".
      while (index < content.length() && content.charAt(index) == ' ') {
        index++;
      }

      StringTokenizer st = new StringTokenizer(content.substring(index),
          "\t\n\r\">#");
      if (!st.hasMoreTokens()) {
        // Nothing but delimiters follows, so there are no more links.
        break;
      }
      String strLink = st.nextToken().trim();

      URL urlLink;
      try {
        urlLink = new URL(base, strLink);
      } catch (MalformedURLException e) {
        System.out.println("ERROR: bad URL " + strLink);
        continue;
      }

      // Only http links are crawled; other links (mailto:, ftp:, ...) are
      // skipped without abandoning the rest of the page.
      if (!"http".equals(urlLink.getProtocol())) {
        continue;
      }
      links.addElement(urlLink.toString());
    }
    return links;
  }

  public static void main(String argv[]) {
    /*
     * Behind a firewall set your proxy and port here!
     *
     * The properties are set before the crawler is created because its
     * constructor starts the crawl thread right away.
     */
    System.setProperty("http.proxySet", "true");
    System.setProperty("http.proxyHost", "webcache-cup");
    System.setProperty("http.proxyPort", "8080");

    new WebCrawler();
  }
}
</source>
Web Crawler from Sun Microsystems
<source lang="java">
/* Copyright 2004 Sun Microsystems, Inc. All rights reserved. You may not modify, use, reproduce, or distribute this software except in compliance with the terms of the License at:*/
import java.applet.Applet;
import java.awt.BorderLayout;
import java.awt.Button;
import java.awt.Choice;
import java.awt.FlowLayout;
import java.awt.Frame;
import java.awt.Graphics;
import java.awt.Label;
import java.awt.List;
import java.awt.Panel;
import java.awt.TextField;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Vector;

/**
 * A small AWT web crawler that can run as an applet or, through
 * {@link #main(String[])}, inside a stand-alone frame.
 *
 * <p>Starting from the URL typed by the user, the crawler downloads HTML
 * pages, follows the {@code <a href=...>} links it finds, and lists every
 * linked resource whose guessed content type equals the type selected in the
 * "Content type" choice. Hosts are only visited when their
 * {@code robots.txt} does not disallow the path. The crawl stops after
 * {@link #SEARCH_LIMIT} pages or {@link #SEARCH_LIMIT} matches.
 *
 * <p>The crawl runs on a background thread stored in {@code searchThread};
 * clearing that field (see {@link #stop()}) asks the worker to wind down at
 * its next check.
 */
public class WebCrawler extends Applet implements ActionListener, Runnable {

    /** Action command / label of the button that starts a crawl. */
    public static final String SEARCH = "Search";

    /** Action command / label of the button that stops a crawl. */
    public static final String STOP = "Stop";

    /** The robots.txt directive this crawler looks for. */
    public static final String DISALLOW = "Disallow:";

    /** Maximum number of pages crawled and maximum number of matches listed. */
    public static final int SEARCH_LIMIT = 50;

    /** Characters that end (or precede) a link target following {@code href=}. */
    private static final String LINK_DELIMITERS = " \t\n\r\"'>#";

    Panel panelMain;

    List listMatches;

    Label labelStatus;

    // URLs to be searched
    Vector vectorToSearch;

    // URLs already searched
    Vector vectorSearched;

    // URLs which match
    Vector vectorMatches;

    // Written by the UI thread and polled by the worker, hence volatile.
    volatile Thread searchThread;

    TextField textURL;

    Choice choiceType;

    /**
     * Builds the user interface and the (initially empty) crawl data
     * structures.
     */
    public void init() {
        // set up the main UI panel
        panelMain = new Panel();
        panelMain.setLayout(new BorderLayout(5, 5));

        // text entry components
        Panel panelEntry = new Panel();
        panelEntry.setLayout(new BorderLayout(5, 5));

        Panel panelURL = new Panel();
        panelURL.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
        Label labelURL = new Label("Starting URL: ", Label.RIGHT);
        panelURL.add(labelURL);
        textURL = new TextField("", 40);
        panelURL.add(textURL);
        panelEntry.add(panelURL, BorderLayout.NORTH);

        Panel panelType = new Panel();
        panelType.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
        Label labelType = new Label("Content type: ", Label.RIGHT);
        panelType.add(labelType);
        choiceType = new Choice();
        choiceType.addItem("text/html");
        choiceType.addItem("audio/basic");
        choiceType.addItem("audio/au");
        choiceType.addItem("audio/aiff");
        choiceType.addItem("audio/wav");
        choiceType.addItem("video/mpeg");
        choiceType.addItem("video/x-avi");
        panelType.add(choiceType);
        panelEntry.add(panelType, BorderLayout.SOUTH);

        panelMain.add(panelEntry, BorderLayout.NORTH);

        // list of result URLs
        Panel panelListButtons = new Panel();
        panelListButtons.setLayout(new BorderLayout(5, 5));

        Panel panelList = new Panel();
        panelList.setLayout(new BorderLayout(5, 5));
        Label labelResults = new Label("Search results");
        panelList.add(labelResults, BorderLayout.NORTH);

        Panel panelListCurrent = new Panel();
        panelListCurrent.setLayout(new BorderLayout(5, 5));
        listMatches = new List(10);
        panelListCurrent.add(listMatches, BorderLayout.NORTH);
        labelStatus = new Label("");
        panelListCurrent.add(labelStatus, BorderLayout.SOUTH);
        panelList.add(panelListCurrent, BorderLayout.SOUTH);

        panelListButtons.add(panelList, BorderLayout.NORTH);

        // control buttons
        Panel panelButtons = new Panel();
        Button buttonSearch = new Button(SEARCH);
        buttonSearch.addActionListener(this);
        panelButtons.add(buttonSearch);
        Button buttonStop = new Button(STOP);
        buttonStop.addActionListener(this);
        panelButtons.add(buttonStop);
        panelListButtons.add(panelButtons, BorderLayout.SOUTH);

        panelMain.add(panelListButtons, BorderLayout.SOUTH);

        add(panelMain);
        setVisible(true);
        repaint();

        // initialize search data structures
        vectorToSearch = new Vector();
        vectorSearched = new Vector();
        vectorMatches = new Vector();

        // set default for URL access
        URLConnection.setDefaultAllowUserInteraction(false);
    }

    /** Applet life-cycle hook; nothing to do. */
    public void start() {
    }

    /**
     * Asks a running crawl to stop. The worker notices that it is no longer
     * the registered search thread and exits at its next check.
     */
    public void stop() {
        if (searchThread != null) {
            setStatus("stopping...");
            searchThread = null;
        }
    }

    /** Applet life-cycle hook; nothing to do. */
    public void destroy() {
    }

    /**
     * Tells whether the host's robots.txt permits fetching {@code url}.
     *
     * @param url the page about to be fetched
     * @return {@code false} if the robots.txt URL cannot be formed or a
     *         {@code Disallow:} entry is a prefix of the URL's file part;
     *         {@code true} otherwise, including when robots.txt cannot be read
     */
    boolean robotSafe(URL url) {
        URL urlRobot;
        try {
            // Keep the protocol and port of the page so that hosts on a
            // non-default port are asked for their own robots.txt.
            urlRobot = new URL(url.getProtocol(), url.getHost(), url.getPort(),
                    "/robots.txt");
        } catch (MalformedURLException e) {
            // something weird is happening, so don't trust it
            return false;
        }

        String strCommands;
        try {
            InputStream urlRobotStream = urlRobot.openStream();
            try {
                strCommands = readAll(urlRobotStream);
            } finally {
                urlRobotStream.close();
            }
        } catch (IOException e) {
            // if there is no robots.txt file, it is OK to search
            return true;
        }

        // assume that this robots.txt refers to us
        return isPathAllowed(strCommands, url.getFile());
    }

    /**
     * Checks a path against every {@code Disallow:} entry of a robots.txt
     * body, ignoring which user agent each entry belongs to.
     *
     * @param robotsCommands full text of the robots.txt file
     * @param path file part of the URL; {@code null} or empty is treated as
     *        {@code "/"} so that a bare host URL is covered by
     *        {@code Disallow: /}
     * @return {@code false} if the path starts with a disallowed prefix
     */
    static boolean isPathAllowed(String robotsCommands, String path) {
        String target = (path == null || path.length() == 0) ? "/" : path;
        int index = 0;
        while ((index = robotsCommands.indexOf(DISALLOW, index)) != -1) {
            index += DISALLOW.length();
            StringTokenizer st = new StringTokenizer(robotsCommands.substring(index));
            if (!st.hasMoreTokens()) {
                break;
            }
            // if the URL starts with a disallowed path, it is not safe
            if (target.startsWith(st.nextToken())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Extracts the link target from the text that follows {@code href=}.
     *
     * @param afterEquals page text starting right after the {@code =} sign
     * @return the first run of characters that contains no space, tab, line
     *         break, quote, {@code >} or {@code #}; {@code null} if there is
     *         no such run
     */
    static String firstLinkToken(String afterEquals) {
        StringTokenizer st = new StringTokenizer(afterEquals, LINK_DELIMITERS);
        return st.hasMoreTokens() ? st.nextToken() : null;
    }

    /**
     * Tells whether the crawler fetches URLs of the given protocol.
     *
     * @param protocol a URL protocol name such as {@code "http"}
     * @return {@code true} for {@code http} and {@code https}
     */
    static boolean isCrawlableProtocol(String protocol) {
        return "http".equals(protocol) || "https".equals(protocol);
    }

    /** Returns whether the calling thread is still the registered crawl worker. */
    private boolean isCurrentSearch() {
        return Thread.currentThread() == searchThread;
    }

    /**
     * Reads the stream to its end, or until the crawl is stopped, and decodes
     * the bytes with the platform default charset. An empty stream yields an
     * empty string.
     */
    private String readAll(InputStream in) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[1000];
        int numRead;
        while (isCurrentSearch() && (numRead = in.read(chunk)) != -1) {
            buffer.write(chunk, 0, numRead);
        }
        return buffer.toString();
    }

    /**
     * Opens {@code url} and guesses its content type from the first bytes.
     *
     * @return the guessed MIME type, or {@code null} if none could be guessed
     */
    private static String probeContentType(URL url) throws IOException {
        URLConnection connection = url.openConnection();
        connection.setAllowUserInteraction(false);
        // guessContentTypeFromStream needs a stream that supports mark/reset.
        InputStream in = new BufferedInputStream(connection.getInputStream());
        try {
            return URLConnection.guessContentTypeFromStream(in);
        } finally {
            in.close();
        }
    }

    /**
     * Downloads {@code url} if its content looks like HTML.
     *
     * @return the page text, or {@code null} if the guessed content type is
     *         not {@code text/html}
     */
    private String fetchHtml(URL url) throws IOException {
        URLConnection connection = url.openConnection();
        connection.setAllowUserInteraction(false);
        InputStream in = new BufferedInputStream(connection.getInputStream());
        try {
            String type = URLConnection.guessContentTypeFromStream(in);
            if (!"text/html".equals(type)) {
                return null;
            }
            return readAll(in);
        } finally {
            in.close();
        }
    }

    /**
     * Scans one page for anchors, queues unseen HTML links that are
     * robot-safe, and records links whose type equals {@code strTargetType}.
     * Links that are malformed, use another protocol, cannot be opened or
     * have no guessable type are skipped; scanning stops early when the crawl
     * is stopped or the match limit is reached.
     */
    private void collectLinks(URL page, String content, String strTargetType) {
        String lowerCaseContent = content.toLowerCase();
        int index = 0;
        while ((index = lowerCaseContent.indexOf("<a", index)) != -1) {
            if ((index = lowerCaseContent.indexOf("href", index)) == -1) {
                break;
            }
            if ((index = lowerCaseContent.indexOf("=", index)) == -1) {
                break;
            }
            if (!isCurrentSearch()) {
                break;
            }
            index++;

            String strLink = firstLinkToken(content.substring(index));
            if (strLink == null) {
                // "href=" was the last thing on the page
                break;
            }

            URL urlLink;
            try {
                urlLink = new URL(page, strLink);
                strLink = urlLink.toString();
            } catch (MalformedURLException e) {
                setStatus("ERROR: bad URL " + strLink);
                continue;
            }

            // skip mailto:, ftp:, javascript: ... but keep scanning the page
            if (!isCrawlableProtocol(urlLink.getProtocol())) {
                continue;
            }
            if (!isCurrentSearch()) {
                break;
            }

            try {
                String strType = probeContentType(urlLink);
                if (strType == null) {
                    continue;
                }

                // if another page, add to the end of search list unless it
                // has already been searched or is going to be searched
                if (strType.equals("text/html")
                        && !vectorSearched.contains(strLink)
                        && !vectorToSearch.contains(strLink)
                        && robotSafe(urlLink)) {
                    vectorToSearch.addElement(strLink);
                }

                // if the proper type, add it to the results list
                // unless we have already seen it
                if (strType.equals(strTargetType) && !vectorMatches.contains(strLink)) {
                    listMatches.add(strLink);
                    vectorMatches.addElement(strLink);
                    if (vectorMatches.size() >= SEARCH_LIMIT) {
                        break;
                    }
                }
            } catch (IOException e) {
                setStatus("ERROR: couldn't open URL " + strLink);
            }
        }
    }

    /**
     * Draws a border around the component and then paints the main panel.
     *
     * @param g the graphics context to paint on
     */
    public void paint(Graphics g) {
        // Draw a Rectangle around the applet's display area.
        g.drawRect(0, 0, getSize().width - 1, getSize().height - 1);
        panelMain.paint(g);
        panelMain.paintComponents(g);
    }

    /**
     * Crawl worker. Processes the to-search queue breadth-first until it is
     * empty, the crawl is stopped, or a limit is reached, then reports the
     * outcome in the status label.
     */
    public void run() {
        int numberSearched = 0;
        String lastError = null;
        try {
            String strURL = textURL.getText();
            String strTargetType = choiceType.getSelectedItem();

            if (strURL.length() == 0) {
                setStatus("ERROR: must enter a starting URL");
                return;
            }

            // initialize search data structures
            vectorToSearch.removeAllElements();
            vectorSearched.removeAllElements();
            vectorMatches.removeAllElements();
            listMatches.removeAll();

            vectorToSearch.addElement(strURL);

            while (vectorToSearch.size() > 0
                    && isCurrentSearch()
                    && numberSearched < SEARCH_LIMIT
                    && vectorMatches.size() < SEARCH_LIMIT) {
                // take the first element of the to-be-searched list and mark
                // it as searched (we want this one way or the other)
                strURL = (String) vectorToSearch.elementAt(0);
                vectorToSearch.removeElementAt(0);
                vectorSearched.addElement(strURL);
                setStatus("searching " + strURL);

                URL url;
                try {
                    url = new URL(strURL);
                } catch (MalformedURLException e) {
                    lastError = "ERROR: invalid URL " + strURL;
                    setStatus(lastError);
                    continue;
                }

                // only web protocols, and only where robots.txt permits it
                if (!isCrawlableProtocol(url.getProtocol()) || !robotSafe(url)) {
                    continue;
                }

                try {
                    String content = fetchHtml(url);
                    if (content == null) {
                        continue;
                    }
                    if (!isCurrentSearch()) {
                        break;
                    }
                    collectLinks(url, content, strTargetType);
                } catch (IOException e) {
                    lastError = "ERROR: couldn't open URL " + strURL;
                    setStatus(lastError);
                    continue;
                }

                numberSearched++;
            }

            if (numberSearched >= SEARCH_LIMIT || vectorMatches.size() >= SEARCH_LIMIT) {
                setStatus("reached search limit of " + SEARCH_LIMIT);
            } else if (numberSearched == 0 && lastError != null) {
                // nothing was crawled: keep the reason visible instead of "done"
                setStatus(lastError);
            } else {
                setStatus("done");
            }
        } finally {
            // Release the slot only if it is still ours, so that a newer
            // search started after stop() is not cancelled by this thread.
            if (searchThread == Thread.currentThread()) {
                searchThread = null;
            }
        }
    }

    /**
     * Shows a message in the status label.
     *
     * @param status the text to display
     */
    void setStatus(String status) {
        labelStatus.setText(status);
    }

    /**
     * Handles the Search and Stop buttons. Search is ignored while a crawl
     * is still registered, because a thread cannot be started twice.
     *
     * @param event the button event
     */
    public void actionPerformed(ActionEvent event) {
        String command = event.getActionCommand();

        if (SEARCH.equals(command)) {
            if (searchThread == null) {
                setStatus("searching...");
                // launch a thread to do the search
                Thread worker = new Thread(this);
                searchThread = worker;
                worker.start();
            }
        } else if (STOP.equals(command)) {
            stop();
        }
    }

    /**
     * Runs the crawler in its own window.
     *
     * <p>Behind a firewall, pass the proxy as arguments:
     * {@code java WebCrawler <proxyHost> <proxyPort>}. Without arguments no
     * proxy settings are touched.
     *
     * @param argv optional proxy host and port
     */
    public static void main(String argv[]) {
        if (argv.length >= 2) {
            System.setProperty("http.proxyHost", argv[0]);
            System.setProperty("http.proxyPort", argv[1]);
            System.setProperty("https.proxyHost", argv[0]);
            System.setProperty("https.proxyPort", argv[1]);
        }

        final Frame f = new Frame("WebFrame");
        final WebCrawler applet = new WebCrawler();
        f.add(applet, BorderLayout.CENTER);

        // Without this the window's close button does nothing.
        f.addWindowListener(new WindowAdapter() {
            public void windowClosing(WindowEvent e) {
                applet.stop();
                f.dispose();
                System.exit(0);
            }
        });

        applet.init();
        applet.start();
        f.pack();
        f.setVisible(true);
    }
}
</source>