Java/Network Protocol/Crawler
Версия от 18:01, 31 мая 2010; (обсуждение)
Search Crawler
// The SearchCrawler class is shown here and is examined in detail in the
// following sections. Notice that it extends JFrame:
/*
* Chapter 6 - Crawling the Web with Java
* The Art of Java
* by Herbert Schildt and James Holmes
* McGraw-Hill/Osborne 2003
*
*/
import java.awt.BorderLayout;
import java.awt.Cursor;
import java.awt.Font;
import java.awt.GridBagConstraints;
import java.awt.GridBagLayout;
import java.awt.Insets;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.KeyEvent;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.BorderFactory;
import javax.swing.JButton;
import javax.swing.JCheckBox;
import javax.swing.JComboBox;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JMenu;
import javax.swing.JMenuBar;
import javax.swing.JMenuItem;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JProgressBar;
import javax.swing.JScrollPane;
import javax.swing.JSeparator;
import javax.swing.JTable;
import javax.swing.JTextField;
import javax.swing.table.DefaultTableModel;
// The Search Web Crawler
/**
 * Swing application that crawls the web from a start URL and lists every
 * page containing all terms of a search string.
 *
 * The crawl runs on a background thread started by the Search button; the
 * same button (relabelled "Stop") clears the {@code crawling} flag to end it.
 * Matching URLs are shown in a table and written to a log file.
 *
 * NOTE(review): the crawl thread updates Swing components directly instead
 * of going through the event dispatch thread.
 */
public class SearchCrawler extends JFrame {
  // Max URLs drop-down values.
  private static final String[] MAX_URLS = { "50", "100", "500", "1000" };

  // Value of maxUrls meaning "crawl without an upper limit".
  private static final int NO_URL_LIMIT = -1;

  // robots.txt directive that introduces a disallowed path.
  private static final String DISALLOW = "Disallow:";

  // Captures the href value of an anchor tag; compiled once and reused.
  private static final Pattern LINK_PATTERN = Pattern.compile(
      "<a\\s+href\\s*=\\s*\"?(.*?)[\" |>]", Pattern.CASE_INSENSITIVE);

  // Splits a search string into whitespace-separated terms.
  private static final Pattern TERM_SEPARATOR = Pattern.compile("[\\s]+");

  // Cache of robot disallow lists, keyed by lowercase host name.
  private HashMap disallowListCache = new HashMap();

  // Search GUI controls.
  private JTextField startTextField;
  private JComboBox maxComboBox;
  private JCheckBox limitCheckBox;
  private JTextField logTextField;
  private JTextField searchTextField;
  private JCheckBox caseCheckBox;
  private JButton searchButton;

  // Search stats GUI controls.
  private JLabel crawlingLabel2;
  private JLabel crawledLabel2;
  private JLabel toCrawlLabel2;
  private JProgressBar progressBar;
  private JLabel matchesLabel2;

  // Table listing search matches.
  private JTable table;

  // Flag for whether or not crawling is underway. Written by the UI thread
  // and read by the crawl thread, hence volatile.
  private volatile boolean crawling;

  // Matches log file print writer.
  private PrintWriter logFileWriter;

  /** Builds the window: File menu, search/stats panel and matches table. */
  public SearchCrawler() {
    setTitle("Search Crawler");
    setSize(600, 600);

    // Handle window closing events.
    addWindowListener(new WindowAdapter() {
      public void windowClosing(WindowEvent e) {
        actionExit();
      }
    });

    setJMenuBar(createMenuBar());

    getContentPane().setLayout(new BorderLayout());
    getContentPane().add(createSearchPanel(), BorderLayout.NORTH);
    getContentPane().add(createMatchesPanel(), BorderLayout.CENTER);
  }

  // Creates the menu bar holding the File > Exit item.
  private JMenuBar createMenuBar() {
    JMenuItem fileExitMenuItem = new JMenuItem("Exit", KeyEvent.VK_X);
    fileExitMenuItem.addActionListener(new ActionListener() {
      public void actionPerformed(ActionEvent e) {
        actionExit();
      }
    });

    JMenu fileMenu = new JMenu("File");
    fileMenu.setMnemonic(KeyEvent.VK_F);
    fileMenu.add(fileExitMenuItem);

    JMenuBar menuBar = new JMenuBar();
    menuBar.add(fileMenu);
    return menuBar;
  }

  // Creates the panel with the search inputs and the crawl statistics.
  private JPanel createSearchPanel() {
    JPanel panel = new JPanel(new GridBagLayout());

    panel.add(new JLabel("Start URL:"), captionConstraints(0));
    startTextField = new JTextField();
    panel.add(startTextField, valueConstraints(0));

    panel.add(new JLabel("Max URLs to Crawl:"), captionConstraints(0));
    maxComboBox = new JComboBox(MAX_URLS);
    maxComboBox.setEditable(true);
    panel.add(maxComboBox, constraints(GridBagConstraints.CENTER,
        GridBagConstraints.NONE, 1, new Insets(5, 5, 0, 0)));
    limitCheckBox = new JCheckBox("Limit crawling to Start URL site");
    panel.add(limitCheckBox, constraints(GridBagConstraints.WEST,
        GridBagConstraints.NONE, 1, new Insets(0, 10, 0, 0)));
    // Invisible filler that terminates the row.
    panel.add(new JLabel(), constraints(GridBagConstraints.CENTER,
        GridBagConstraints.NONE, GridBagConstraints.REMAINDER,
        new Insets(0, 0, 0, 0)));

    panel.add(new JLabel("Matches Log File:"), captionConstraints(0));
    String file = System.getProperty("user.dir")
        + System.getProperty("file.separator") + "crawler.log";
    logTextField = new JTextField(file);
    panel.add(logTextField, valueConstraints(0));

    panel.add(new JLabel("Search String:"), captionConstraints(0));
    searchTextField = new JTextField();
    GridBagConstraints searchFieldConstraints = constraints(
        GridBagConstraints.CENTER, GridBagConstraints.HORIZONTAL, 2,
        new Insets(5, 5, 0, 0));
    searchFieldConstraints.weightx = 1.0d;
    panel.add(searchTextField, searchFieldConstraints);
    caseCheckBox = new JCheckBox("Case Sensitive");
    panel.add(caseCheckBox, constraints(GridBagConstraints.CENTER,
        GridBagConstraints.NONE, GridBagConstraints.REMAINDER,
        new Insets(5, 5, 0, 5)));

    searchButton = new JButton("Search");
    searchButton.addActionListener(new ActionListener() {
      public void actionPerformed(ActionEvent e) {
        actionSearch();
      }
    });
    panel.add(searchButton, constraints(GridBagConstraints.CENTER,
        GridBagConstraints.NONE, GridBagConstraints.REMAINDER,
        new Insets(5, 5, 5, 5)));

    panel.add(new JSeparator(), constraints(GridBagConstraints.CENTER,
        GridBagConstraints.HORIZONTAL, GridBagConstraints.REMAINDER,
        new Insets(5, 5, 5, 5)));

    panel.add(new JLabel("Crawling:"), captionConstraints(0));
    crawlingLabel2 = createPlainLabel();
    panel.add(crawlingLabel2, valueConstraints(0));

    panel.add(new JLabel("Crawled URLs:"), captionConstraints(0));
    crawledLabel2 = createPlainLabel();
    panel.add(crawledLabel2, valueConstraints(0));

    panel.add(new JLabel("URLs to Crawl:"), captionConstraints(0));
    toCrawlLabel2 = createPlainLabel();
    panel.add(toCrawlLabel2, valueConstraints(0));

    panel.add(new JLabel("Crawling Progress:"), captionConstraints(0));
    progressBar = new JProgressBar();
    progressBar.setMinimum(0);
    progressBar.setStringPainted(true);
    panel.add(progressBar, valueConstraints(0));

    panel.add(new JLabel("Search Matches:"), captionConstraints(10));
    matchesLabel2 = createPlainLabel();
    panel.add(matchesLabel2, valueConstraints(10));

    return panel;
  }

  // Creates the titled panel holding the scrollable matches table.
  private JPanel createMatchesPanel() {
    table = new JTable(createMatchesModel());
    JPanel matchesPanel = new JPanel();
    matchesPanel.setBorder(BorderFactory.createTitledBorder("Matches"));
    matchesPanel.setLayout(new BorderLayout());
    matchesPanel.add(new JScrollPane(table), BorderLayout.CENTER);
    return matchesPanel;
  }

  // Creates an empty, read-only, single-column ("URL") table model.
  private static DefaultTableModel createMatchesModel() {
    return new DefaultTableModel(new Object[][] {}, new String[] { "URL" }) {
      public boolean isCellEditable(int row, int column) {
        return false;
      }
    };
  }

  // Creates an empty label that uses the plain variant of its default font.
  private static JLabel createPlainLabel() {
    JLabel label = new JLabel();
    label.setFont(label.getFont().deriveFont(Font.PLAIN));
    return label;
  }

  // Builds constraints from the four properties this layout varies.
  private static GridBagConstraints constraints(int anchor, int fill,
      int gridwidth, Insets insets) {
    GridBagConstraints result = new GridBagConstraints();
    result.anchor = anchor;
    result.fill = fill;
    result.gridwidth = gridwidth;
    result.insets = insets;
    return result;
  }

  // Constraints for a right-aligned caption in the first column.
  private static GridBagConstraints captionConstraints(int bottomInset) {
    return constraints(GridBagConstraints.EAST, GridBagConstraints.NONE, 1,
        new Insets(5, 5, bottomInset, 0));
  }

  // Constraints for a component that fills the remainder of its row.
  private static GridBagConstraints valueConstraints(int bottomInset) {
    return constraints(GridBagConstraints.CENTER,
        GridBagConstraints.HORIZONTAL, GridBagConstraints.REMAINDER,
        new Insets(5, 5, bottomInset, 5));
  }

  // Exit this program.
  private void actionExit() {
    System.exit(0);
  }

  // Handle Search/Stop button being clicked: stop a running crawl, or
  // validate the inputs and start a new one.
  private void actionSearch() {
    // If stop button clicked, turn crawling flag off.
    if (crawling) {
      crawling = false;
      return;
    }

    ArrayList errorList = new ArrayList();

    // Validate that a well-formed start URL has been entered.
    String startUrl = startTextField.getText().trim();
    if (startUrl.length() < 1) {
      errorList.add("Missing Start URL.");
    } else if (verifyUrl(startUrl) == null) {
      errorList.add("Invalid Start URL.");
    }

    // Validate that Max URLs is either empty (no limit) or a positive number.
    // The default must be NO_URL_LIMIT: crawl() stops as soon as the crawled
    // count reaches maxUrls, so a default of 0 would end the crawl at once.
    int maxUrls = NO_URL_LIMIT;
    Object selectedMax = maxComboBox.getSelectedItem();
    String max = (selectedMax == null) ? "" : selectedMax.toString().trim();
    if (max.length() > 0) {
      try {
        maxUrls = Integer.parseInt(max);
      } catch (NumberFormatException e) {
        maxUrls = 0; // rejected by the range check below
      }
      if (maxUrls < 1) {
        errorList.add("Invalid Max URLs value.");
      }
    }

    // Validate that matches log file has been entered.
    String logFile = logTextField.getText().trim();
    if (logFile.length() < 1) {
      errorList.add("Missing Matches Log File.");
    }

    // Validate that search string has been entered.
    String searchString = searchTextField.getText().trim();
    if (searchString.length() < 1) {
      errorList.add("Missing Search String.");
    }

    // Show errors, if any, one per line, and return.
    if (errorList.size() > 0) {
      StringBuffer message = new StringBuffer();
      for (int i = 0; i < errorList.size(); i++) {
        if (i > 0) {
          message.append("\n");
        }
        message.append(errorList.get(i));
      }
      showError(message.toString());
      return;
    }

    // Remove "www" from start URL if present, then start the crawl.
    search(logFile, removeWwwFromUrl(startUrl), maxUrls, searchString);
  }

  // Runs a crawl on a new thread, switching the GUI into and out of
  // "search in progress" mode around it.
  private void search(final String logFile, final String startUrl,
      final int maxUrls, final String searchString) {
    // Raise the flag before the thread starts so that a second click on the
    // button is treated as "Stop" rather than launching a second crawl.
    crawling = true;

    Thread thread = new Thread(new Runnable() {
      public void run() {
        setSearchInProgress(true);

        // Reset stats.
        table.setModel(createMatchesModel());
        updateStats(startUrl, 0, 0, maxUrls);

        // Open matches log file.
        try {
          logFileWriter = new PrintWriter(new FileWriter(logFile));
        } catch (Exception e) {
          // Put the GUI back into its idle state before reporting, otherwise
          // the controls would stay disabled with the wait cursor showing.
          crawling = false;
          setSearchInProgress(false);
          showError("Unable to open matches log file.");
          return;
        }

        try {
          crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
              searchString, caseCheckBox.isSelected());
        } finally {
          // Runs even if crawl() fails unexpectedly, so the log is always
          // closed and the controls are always re-enabled.
          crawling = false;
          try {
            logFileWriter.close();
          } catch (Exception e) {
            showError("Unable to close matches log file.");
          }
          crawlingLabel2.setText("Done");
          setSearchInProgress(false);
        }

        // Show message if search string not found.
        if (table.getRowCount() == 0) {
          JOptionPane.showMessageDialog(SearchCrawler.this,
              "Your Search String was not found. Please try another.",
              "Search String Not Found", JOptionPane.WARNING_MESSAGE);
        }
      }
    });
    thread.start();
  }

  // Switches cursor, input controls and button label between the
  // "crawling" and "idle" states.
  private void setSearchInProgress(boolean inProgress) {
    setCursor(inProgress ? Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR)
        : Cursor.getDefaultCursor());
    boolean inputsEnabled = !inProgress;
    startTextField.setEnabled(inputsEnabled);
    maxComboBox.setEnabled(inputsEnabled);
    limitCheckBox.setEnabled(inputsEnabled);
    logTextField.setEnabled(inputsEnabled);
    searchTextField.setEnabled(inputsEnabled);
    caseCheckBox.setEnabled(inputsEnabled);
    searchButton.setText(inProgress ? "Stop" : "Search");
  }

  // Show dialog box with error message.
  private void showError(String message) {
    JOptionPane.showMessageDialog(this, message, "Error",
        JOptionPane.ERROR_MESSAGE);
  }

  // Update crawling stats: current URL, counters, progress bar and the
  // number of matches currently in the table.
  private void updateStats(String currentUrl, int crawled, int toCrawl,
      int maxUrls) {
    crawlingLabel2.setText(currentUrl);
    crawledLabel2.setText("" + crawled);
    toCrawlLabel2.setText("" + toCrawl);
    // Without a limit, progress is measured against every URL known so far.
    if (maxUrls == NO_URL_LIMIT) {
      progressBar.setMaximum(crawled + toCrawl);
    } else {
      progressBar.setMaximum(maxUrls);
    }
    progressBar.setValue(crawled);
    matchesLabel2.setText("" + table.getRowCount());
  }

  // Add match to matches table and log file.
  private void addMatch(String url) {
    DefaultTableModel model = (DefaultTableModel) table.getModel();
    model.addRow(new Object[] { url });
    try {
      logFileWriter.println(url);
    } catch (Exception e) {
      showError("Unable to log match.");
    }
  }

  // Verify URL format. Returns the parsed URL, or null when the string is
  // not an "http://" URL or cannot be parsed.
  private static URL verifyUrl(String url) {
    // Only allow HTTP URLs.
    if (!url.toLowerCase().startsWith("http://")) {
      return null;
    }
    try {
      return new URL(url);
    } catch (Exception e) {
      return null;
    }
  }

  // Check if robot is allowed to access the given URL, based on the
  // Disallow lines of the host's robots.txt (downloaded once per host).
  private boolean isRobotAllowed(URL urlToCheck) {
    String host = urlToCheck.getHost().toLowerCase();

    // Retrieve host's disallow list from cache.
    ArrayList disallowList = (ArrayList) disallowListCache.get(host);

    // If list is not in the cache, download and cache it.
    if (disallowList == null) {
      disallowList = new ArrayList();
      BufferedReader reader = null;
      try {
        URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
        reader = new BufferedReader(
            new InputStreamReader(robotsFileUrl.openStream()));

        String line;
        while ((line = reader.readLine()) != null) {
          String directive = line.trim();
          // Directive names are matched without regard to case.
          if (!directive.regionMatches(true, 0, DISALLOW, 0,
              DISALLOW.length())) {
            continue;
          }
          String disallowPath = directive.substring(DISALLOW.length());
          // Strip a trailing comment, if any.
          int commentIndex = disallowPath.indexOf('#');
          if (commentIndex != -1) {
            disallowPath = disallowPath.substring(0, commentIndex);
          }
          disallowPath = disallowPath.trim();
          // An empty Disallow value disallows nothing. Storing "" would make
          // the startsWith() test below reject every URL on the host.
          if (disallowPath.length() > 0) {
            disallowList.add(disallowPath);
          }
        }
        disallowListCache.put(host, disallowList);
      } catch (Exception e) {
        /*
         * Assume robot is allowed since an exception is thrown if the
         * robot file doesn't exist.
         */
        return true;
      } finally {
        closeQuietly(reader);
      }
    }

    // The URL is off limits if its file part starts with a disallowed path.
    String file = urlToCheck.getFile();
    for (int i = 0; i < disallowList.size(); i++) {
      if (file.startsWith((String) disallowList.get(i))) {
        return false;
      }
    }
    return true;
  }

  // Download page at given URL. Returns the page text, or null on failure.
  private String downloadPage(URL pageUrl) {
    BufferedReader reader = null;
    try {
      reader = new BufferedReader(new InputStreamReader(
          pageUrl.openStream()));
      StringBuffer pageBuffer = new StringBuffer();
      String line;
      while ((line = reader.readLine()) != null) {
        // Keep a separator where the line break was; joining lines directly
        // would glue together words and tags that the break separated.
        pageBuffer.append(line).append(' ');
      }
      return pageBuffer.toString();
    } catch (Exception e) {
      // Any failure to fetch or read is reported to the caller as null.
      return null;
    } finally {
      closeQuietly(reader);
    }
  }

  // Closes the reader if it was opened, ignoring any failure to close.
  private static void closeQuietly(BufferedReader reader) {
    if (reader == null) {
      return;
    }
    try {
      reader.close();
    } catch (Exception ignored) {
      // Nothing useful can be done if closing fails.
    }
  }

  // Remove leading "www" from a URL's host if present.
  private static String removeWwwFromUrl(String url) {
    int index = url.indexOf("://www.");
    if (index == -1) {
      return url;
    }
    // Keep everything up to and including "://", then skip "www.".
    return url.substring(0, index + 3) + url.substring(index + 7);
  }

  // Parse through page contents and retrieve links. Returns the normalized
  // http links that are valid, not yet crawled and, when limitHost is set,
  // on the same host as pageUrl.
  private static ArrayList retrieveLinks(URL pageUrl, String pageContents,
      HashSet crawledList, boolean limitHost) {
    Matcher m = LINK_PATTERN.matcher(pageContents);
    ArrayList linkList = new ArrayList();

    while (m.find()) {
      String link = m.group(1).trim();

      // Skip empty links.
      if (link.length() < 1) {
        continue;
      }
      // Skip links that are just page anchors.
      if (link.charAt(0) == '#') {
        continue;
      }
      // Skip mailto links.
      if (link.indexOf("mailto:") != -1) {
        continue;
      }
      // Skip JavaScript links.
      if (link.toLowerCase().indexOf("javascript") != -1) {
        continue;
      }

      // Prefix host-absolute and relative links with scheme and host.
      if (link.indexOf("://") == -1) {
        if (link.charAt(0) == '/') {
          link = "http://" + pageUrl.getHost() + link;
        } else {
          // Use the path only: a '/' inside the query string must not be
          // mistaken for the end of the page's directory.
          String path = pageUrl.getPath();
          if (path.indexOf('/') == -1) {
            link = "http://" + pageUrl.getHost() + "/" + link;
          } else {
            String directory = path.substring(0, path.lastIndexOf('/') + 1);
            link = "http://" + pageUrl.getHost() + directory + link;
          }
        }
      }

      // Remove anchors from link.
      int anchorIndex = link.indexOf('#');
      if (anchorIndex != -1) {
        link = link.substring(0, anchorIndex);
      }

      // Remove leading "www" from URL's host if present.
      link = removeWwwFromUrl(link);

      // Verify link and skip if invalid.
      URL verifiedLink = verifyUrl(link);
      if (verifiedLink == null) {
        continue;
      }

      // If specified, limit links to those on the same host as the page.
      if (limitHost
          && !pageUrl.getHost().toLowerCase().equals(
              verifiedLink.getHost().toLowerCase())) {
        continue;
      }

      // Skip link if it has already been crawled.
      if (crawledList.contains(link)) {
        continue;
      }

      linkList.add(link);
    }
    return linkList;
  }

  /*
   * Determine whether or not every whitespace-separated term of the search
   * string occurs in the given page contents.
   */
  private static boolean searchStringMatches(String pageContents,
      String searchString, boolean caseSensitive) {
    // For a case-insensitive search, compare in lowercase on both sides.
    String searchContents = caseSensitive ? pageContents
        : pageContents.toLowerCase();

    String[] terms = TERM_SEPARATOR.split(searchString);
    for (int i = 0; i < terms.length; i++) {
      String term = caseSensitive ? terms[i] : terms[i].toLowerCase();
      if (searchContents.indexOf(term) == -1) {
        return false;
      }
    }
    return true;
  }

  /**
   * Perform the actual crawling, searching for the search string.
   *
   * The loop runs only while the crawling flag is set; search() sets it
   * before calling this method and the Stop button clears it.
   *
   * @param startUrl first URL to visit
   * @param maxUrls maximum number of pages to crawl, or -1 for no limit
   * @param limitHost whether to follow only links on the host of the page
   *        they were found on
   * @param searchString whitespace-separated terms that must all be present
   * @param caseSensitive whether term matching is case-sensitive
   */
  public void crawl(String startUrl, int maxUrls, boolean limitHost,
      String searchString, boolean caseSensitive) {
    // Set up crawl lists.
    HashSet crawledList = new HashSet();
    LinkedHashSet toCrawlList = new LinkedHashSet();
    toCrawlList.add(startUrl);

    while (crawling && toCrawlList.size() > 0) {
      // Stop once the max URL count has been reached, if one was specified.
      if (maxUrls != NO_URL_LIMIT && crawledList.size() >= maxUrls) {
        break;
      }

      // Take the oldest URL off the To Crawl list.
      String url = (String) toCrawlList.iterator().next();
      toCrawlList.remove(url);

      // Convert string url to URL object; skip anything that is not valid.
      URL verifiedUrl = verifyUrl(url);
      if (verifiedUrl == null) {
        continue;
      }

      // Skip URL if robots are not allowed to access it.
      if (!isRobotAllowed(verifiedUrl)) {
        continue;
      }

      updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
      crawledList.add(url);

      String pageContents = downloadPage(verifiedUrl);

      /*
       * If the page was downloaded successfully, queue its links and then
       * see if it contains the search string.
       */
      if (pageContents != null && pageContents.length() > 0) {
        toCrawlList.addAll(retrieveLinks(verifiedUrl, pageContents,
            crawledList, limitHost));
        if (searchStringMatches(pageContents, searchString, caseSensitive)) {
          addMatch(url);
        }
      }

      updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
    }
  }

  // Run the Search Crawler.
  public static void main(String[] args) {
    SearchCrawler crawler = new SearchCrawler();
    // setVisible(true) replaces the deprecated show().
    crawler.setVisible(true);
  }
}
/**
A quantifier determines how many times an expression is matched. The quantifiers are shown here:
+ Match one or more.
* Match zero or more.
? Match zero or one.
*/
/*
Character Sequence Explanation
<a Look for the characters "<a".
\\s+ Look for one or more space characters.
href Look for the characters "href".
\\s* Look for zero or more space characters.
= Look for the character "=".
\\s* Look for zero or more space characters.
\"? Look for zero or one quote character.
(.*?) Look for zero or more of any character until the next part of the pattern is matched, and place the results in a group.
[\" |>] Look for a quote character, a space, a vertical bar ("|"), or a greater-than (">") character.
*/
Web crawler
Revised from: http://java.sun.com/developer/technicalArticles/ThirdParty/WebCrawler/
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Vector;
/**
 * Console web crawler: starting from a fixed URL it follows http links
 * breadth-first and prints every linked URL whose guessed content type is
 * the target type, until SEARCH_LIMIT pages or matches have been reached.
 *
 * The crawl runs on searchThread, which the constructor starts. The loops
 * stop as soon as the running thread is no longer searchThread.
 */
public class WebCrawler implements Runnable {
  public static final String SEARCH = "Search";
  public static final String STOP = "Stop";
  public static final String DISALLOW = "Disallow:";
  public static final int SEARCH_LIMIT = 50;

  // Content type of pages that are scanned for further links.
  private static final String HTML_TYPE = "text/html";

  // URLs to be searched.
  Vector vectorToSearch = new Vector();
  // URLs already searched.
  Vector vectorSearched = new Vector();
  // URLs which match the target content type.
  Vector vectorMatches = new Vector();
  Thread searchThread;

  /** Disables user interaction for URL connections and starts the crawl. */
  public WebCrawler() {
    URLConnection.setDefaultAllowUserInteraction(false);
    searchThread = new Thread(this);
    searchThread.start();
  }

  /** Crawls from the start URL, printing progress and matches to stdout. */
  public void run() {
    String strURL = "http://www.google.ru";
    String strTargetType = HTML_TYPE;
    int numberSearched = 0;
    int numberFound = 0;
    if (strURL.length() == 0) {
      System.out.println("ERROR: must enter a starting URL");
      return;
    }

    vectorToSearch = new Vector();
    vectorSearched = new Vector();
    vectorMatches = new Vector();
    vectorToSearch.addElement(strURL);

    while ((vectorToSearch.size() > 0)
        && (Thread.currentThread() == searchThread)) {
      // Take the first queued URL and mark it as searched, whether or not
      // it can actually be read.
      strURL = (String) vectorToSearch.elementAt(0);
      System.out.println("searching " + strURL);
      vectorToSearch.removeElementAt(0);
      vectorSearched.addElement(strURL);

      URL url;
      try {
        url = new URL(strURL);
      } catch (MalformedURLException e) {
        // Skip this entry; using a null URL below would throw.
        System.out.println("ERROR: invalid URL " + strURL);
        continue;
      }

      InputStream urlStream = null;
      try {
        URLConnection urlConnection = url.openConnection();
        urlConnection.setAllowUserInteraction(false);
        // Read through the configured connection rather than opening a
        // second, unconfigured one with url.openStream().
        urlStream = urlConnection.getInputStream();

        // Only HTML pages are scanned for links. Anything else is skipped
        // without ending the whole crawl.
        String type = URLConnection.guessContentTypeFromStream(urlStream);
        if (!HTML_TYPE.equals(type)) {
          continue;
        }

        // Read the whole page. The loop also copes with an empty stream,
        // where the very first read returns -1.
        StringBuffer contentBuffer = new StringBuffer();
        byte[] b = new byte[5000];
        int numRead;
        while ((numRead = urlStream.read(b)) != -1) {
          contentBuffer.append(new String(b, 0, numRead));
          if (Thread.currentThread() != searchThread) {
            break;
          }
        }
        if (Thread.currentThread() != searchThread) {
          break;
        }

        String content = contentBuffer.toString();
        String lowerCaseContent = content.toLowerCase();
        int index = 0;
        while ((index = lowerCaseContent.indexOf("<a", index)) != -1) {
          if ((index = lowerCaseContent.indexOf("href", index)) == -1) {
            break;
          }
          if ((index = lowerCaseContent.indexOf("=", index)) == -1) {
            break;
          }
          if (Thread.currentThread() != searchThread) {
            break;
          }
          index++;

          // The link is the first token after '='.
          String remaining = content.substring(index);
          StringTokenizer st = new StringTokenizer(remaining, "\t\n\r\">#");
          if (!st.hasMoreTokens()) {
            break; // nothing but delimiters left on the page
          }
          String strLink = st.nextToken();

          URL urlLink;
          try {
            urlLink = new URL(url, strLink);
            strLink = urlLink.toString();
          } catch (MalformedURLException e) {
            System.out.println("ERROR: bad URL " + strLink);
            continue;
          }

          // Ignore non-http links but keep scanning the rest of the page.
          if (!"http".equals(urlLink.getProtocol())) {
            continue;
          }
          if (Thread.currentThread() != searchThread) {
            break;
          }

          InputStream linkStream = null;
          try {
            URLConnection urlLinkConnection = urlLink.openConnection();
            urlLinkConnection.setAllowUserInteraction(false);
            linkStream = urlLinkConnection.getInputStream();
            String strType = URLConnection
                .guessContentTypeFromStream(linkStream);
            if (strType == null) {
              continue; // type unknown: neither queued nor matched
            }

            // HTML links are queued for crawling unless already known.
            if (HTML_TYPE.equals(strType)
                && !vectorSearched.contains(strLink)
                && !vectorToSearch.contains(strLink)) {
              vectorToSearch.addElement(strLink);
            }

            // Links of the target type are reported once each.
            if (strTargetType.equals(strType)
                && !vectorMatches.contains(strLink)) {
              System.out.println(strLink);
              vectorMatches.addElement(strLink);
              numberFound++;
              if (numberFound >= SEARCH_LIMIT) {
                break;
              }
            }
          } catch (IOException e) {
            System.out.println("ERROR: couldn't open URL " + strLink);
            continue;
          } finally {
            closeQuietly(linkStream);
          }
        }
      } catch (IOException e) {
        // One unreachable page should not end the whole crawl.
        System.out.println("ERROR: couldn't open URL " + strURL);
        continue;
      } finally {
        closeQuietly(urlStream);
      }

      numberSearched++;
      if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT) {
        break;
      }
    }

    if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT) {
      System.out.println("reached search limit of " + SEARCH_LIMIT);
    } else {
      System.out.println("done");
    }
    searchThread = null;
  }

  // Closes the stream if it was opened, ignoring any failure to close.
  private static void closeQuietly(InputStream stream) {
    if (stream == null) {
      return;
    }
    try {
      stream.close();
    } catch (IOException ignored) {
      // Nothing useful can be done if closing fails.
    }
  }

  /** Configures the HTTP proxy, then starts the crawler. */
  public static void main(String argv[]) {
    /*
     * Behind a firewall set your proxy and port here!
     * The properties are set before the crawler is created, because its
     * constructor starts the crawl thread immediately.
     */
    System.setProperty("http.proxySet", "true");
    System.setProperty("http.proxyHost", "webcache-cup");
    System.setProperty("http.proxyPort", "8080");
    new WebCrawler();
  }
}
Web Crawler from Sun Microsystems
/* Copyright 2004 Sun Microsystems, Inc. All rights reserved. You may not modify, use, reproduce, or distribute this software except in compliance with the terms of the License at:*/
//import java.applet.Applet;
import java.awt.BorderLayout;
import java.awt.Button;
import java.awt.Choice;
import java.awt.FlowLayout;
import java.awt.Frame;
import java.awt.Graphics;
import java.awt.Label;
import java.awt.List;
import java.awt.Panel;
import java.awt.TextField;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Vector;
public class WebCrawler extends Applet implements ActionListener, Runnable {
// Labels of the two control buttons created in init().
public static final String SEARCH = "Search";
public static final String STOP = "Stop";
// robots.txt directive that robotSafe() searches for.
public static final String DISALLOW = "Disallow:";
// NOTE(review): presumably the maximum number of pages/matches per search;
// its use is in run(), which continues beyond this excerpt - confirm.
public static final int SEARCH_LIMIT = 50;
// Root panel holding the whole user interface.
Panel panelMain;
// List component placed under the "Search results" label.
List listMatches;
// Status line shown below the results list.
Label labelStatus;
// URLs to be searched
Vector vectorToSearch;
// URLs already searched
Vector vectorSearched;
// URLs which match
Vector vectorMatches;
// Thread running the search; stop() sets it to null, and the loops in
// robotSafe() and run() end once the current thread is no longer this one.
Thread searchThread;
// Input field for the starting URL.
TextField textURL;
// Drop-down with the content type to search for.
Choice choiceType;
/**
 * Builds the user interface (URL and content-type inputs, results list,
 * status line, Search/Stop buttons) and creates the empty search vectors.
 */
public void init() {
  // Root panel.
  panelMain = new Panel(new BorderLayout(5, 5));

  // Entry area: starting URL on top, content type below.
  Panel panelEntry = new Panel(new BorderLayout(5, 5));

  Panel panelURL = new Panel(new FlowLayout(FlowLayout.LEFT, 5, 5));
  panelURL.add(new Label("Starting URL: ", Label.RIGHT));
  textURL = new TextField("", 40);
  panelURL.add(textURL);
  panelEntry.add(panelURL, BorderLayout.NORTH);

  Panel panelType = new Panel(new FlowLayout(FlowLayout.LEFT, 5, 5));
  panelType.add(new Label("Content type: ", Label.RIGHT));
  choiceType = new Choice();
  String[] contentTypes = { "text/html", "audio/basic", "audio/au",
      "audio/aiff", "audio/wav", "video/mpeg", "video/x-avi" };
  for (int i = 0; i < contentTypes.length; i++) {
    choiceType.addItem(contentTypes[i]);
  }
  panelType.add(choiceType);
  panelEntry.add(panelType, BorderLayout.SOUTH);

  panelMain.add(panelEntry, BorderLayout.NORTH);

  // Results area: caption, list of matches and status line.
  Panel panelListButtons = new Panel(new BorderLayout(5, 5));
  Panel panelList = new Panel(new BorderLayout(5, 5));
  panelList.add(new Label("Search results"), BorderLayout.NORTH);

  Panel panelListCurrent = new Panel(new BorderLayout(5, 5));
  listMatches = new List(10);
  panelListCurrent.add(listMatches, BorderLayout.NORTH);
  labelStatus = new Label("");
  panelListCurrent.add(labelStatus, BorderLayout.SOUTH);

  panelList.add(panelListCurrent, BorderLayout.SOUTH);
  panelListButtons.add(panelList, BorderLayout.NORTH);

  // Control buttons; this object listens to both.
  Panel panelButtons = new Panel();
  Button buttonSearch = new Button(SEARCH);
  buttonSearch.addActionListener(this);
  panelButtons.add(buttonSearch);
  Button buttonStop = new Button(STOP);
  buttonStop.addActionListener(this);
  panelButtons.add(buttonStop);
  panelListButtons.add(panelButtons, BorderLayout.SOUTH);

  panelMain.add(panelListButtons, BorderLayout.SOUTH);

  add(panelMain);
  setVisible(true);
  repaint();

  // Search data structures start out empty.
  vectorToSearch = new Vector();
  vectorSearched = new Vector();
  vectorMatches = new Vector();

  // Default for URL access: no user interaction.
  URLConnection.setDefaultAllowUserInteraction(false);
}
// Intentionally empty: nothing is done when start() is called.
public void start() {
}
// Requests that a running search end by clearing searchThread; does nothing
// when no search thread is set.
public void stop() {
  if (searchThread == null) {
    return;
  }
  setStatus("stopping...");
  searchThread = null;
}
// Intentionally empty: nothing is done when destroy() is called.
public void destroy() {
}
/**
 * Checks the host's robots.txt to decide whether the URL may be searched.
 *
 * @param url the URL about to be searched
 * @return false if the URL's file part starts with a path named in any
 *         "Disallow:" line, or if the robots.txt URL cannot be formed;
 *         true otherwise, including when robots.txt cannot be read
 */
boolean robotSafe(URL url) {
  String strHost = url.getHost();

  // form URL of the robots.txt file
  String strRobot = "http://" + strHost + "/robots.txt";
  URL urlRobot;
  try {
    urlRobot = new URL(strRobot);
  } catch (MalformedURLException e) {
    // something weird is happening, so don't trust it
    return false;
  }

  // Read in the entire file. The loop also copes with an empty file, where
  // the very first read returns -1.
  StringBuffer commandBuffer = new StringBuffer();
  InputStream urlRobotStream = null;
  try {
    urlRobotStream = urlRobot.openStream();
    byte b[] = new byte[1000];
    int numRead;
    while ((numRead = urlRobotStream.read(b)) != -1) {
      commandBuffer.append(new String(b, 0, numRead));
      // Stop reading once this thread is no longer the search thread.
      if (Thread.currentThread() != searchThread) {
        break;
      }
    }
  } catch (IOException e) {
    // if there is no robots.txt file, it is OK to search
    return true;
  } finally {
    // Release the connection on every path, including read failures.
    if (urlRobotStream != null) {
      try {
        urlRobotStream.close();
      } catch (IOException ignored) {
        // Nothing useful can be done if closing fails.
      }
    }
  }
  String strCommands = commandBuffer.toString();

  // assume that this robots.txt refers to us and
  // search for "Disallow:" commands.
  String strURL = url.getFile();
  int index = 0;
  while ((index = strCommands.indexOf(DISALLOW, index)) != -1) {
    index += DISALLOW.length();

    // Only the rest of the current line belongs to this directive. Reading
    // further would pick up the first word of the next line when the
    // Disallow value is empty.
    int lineEnd = strCommands.indexOf('\n', index);
    String strPath = (lineEnd == -1) ? strCommands.substring(index)
        : strCommands.substring(index, lineEnd);
    StringTokenizer st = new StringTokenizer(strPath);
    if (!st.hasMoreTokens()) {
      continue; // an empty Disallow value disallows nothing
    }
    String strBadPath = st.nextToken();

    // if the URL starts with a disallowed path, it is not safe
    if (strURL.indexOf(strBadPath) == 0) {
      return false;
    }
  }
  return true;
}
/**
 * Paints a one-pixel border around the component and then the main panel
 * together with its child components.
 *
 * @param g the graphics context to draw on
 */
public void paint(Graphics g) {
    // Draw a rectangle around the applet's display area.
    int right = getSize().width - 1;
    int bottom = getSize().height - 1;
    g.drawRect(0, 0, right, bottom);

    // Then render the panel itself, followed by everything it contains.
    panelMain.paint(g);
    panelMain.paintComponents(g);
}
public void run() {
String strURL = textURL.getText();
String strTargetType = choiceType.getSelectedItem();
int numberSearched = 0;
int numberFound = 0;
if (strURL.length() == 0) {
setStatus("ERROR: must enter a starting URL");
return;
}
// initialize search data structures
vectorToSearch.removeAllElements();
vectorSearched.removeAllElements();
vectorMatches.removeAllElements();
listMatches.removeAll();
vectorToSearch.addElement(strURL);
while ((vectorToSearch.size() > 0)
&& (Thread.currentThread() == searchThread)) {
// get the first element from the to be searched list
strURL = (String) vectorToSearch.elementAt(0);
setStatus("searching " + strURL);
URL url;
try {
url = new URL(strURL);
} catch (MalformedURLException e) {
setStatus("ERROR: invalid URL " + strURL);
break;
}
// mark the URL as searched (we want this one way or the other)
vectorToSearch.removeElementAt(0);
vectorSearched.addElement(strURL);
// can only search http: protocol URLs
if (url.getProtocol().rupareTo("http") != 0)
break;
// test to make sure it is before searching
if (!robotSafe(url))
break;
try {
// try opening the URL
URLConnection urlConnection = url.openConnection();
urlConnection.setAllowUserInteraction(false);
InputStream urlStream = url.openStream();
String type
= urlConnection.guessContentTypeFromStream(urlStream);
if (type == null)
break;
if (type.rupareTo("text/html") != 0)
break;
// search the input stream for links
// first, read in the entire URL
byte b[] = new byte[1000];
int numRead = urlStream.read(b);
String content = new String(b, 0, numRead);
while (numRead != -1) {
if (Thread.currentThread() != searchThread)
break;
numRead = urlStream.read(b);
if (numRead != -1) {
String newContent = new String(b, 0, numRead);
content += newContent;
}
}
urlStream.close();
if (Thread.currentThread() != searchThread)
break;
String lowerCaseContent = content.toLowerCase();
int index = 0;
while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
{
if ((index = lowerCaseContent.indexOf("href", index)) == -1)
break;
if ((index = lowerCaseContent.indexOf("=", index)) == -1)
break;
if (Thread.currentThread() != searchThread)
break;
index++;
String remaining = content.substring(index);
StringTokenizer st
= new StringTokenizer(remaining, "\t\n\r\">#");
String strLink = st.nextToken();
URL urlLink;
try {
urlLink = new URL(url, strLink);
strLink = urlLink.toString();
} catch (MalformedURLException e) {
setStatus("ERROR: bad URL " + strLink);
continue;
}
// only look at http links
if (urlLink.getProtocol().rupareTo("http") != 0)
break;
if (Thread.currentThread() != searchThread)
break;
try {
// try opening the URL
URLConnection urlLinkConnection
= urlLink.openConnection();
urlLinkConnection.setAllowUserInteraction(false);
InputStream linkStream = urlLink.openStream();
String strType
= urlLinkConnection.guessContentTypeFromStream(linkStream);
linkStream.close();
// if another page, add to the end of search list
if (strType == null)
break;
if (strType.rupareTo("text/html") == 0) {
// check to see if this URL has already been
// searched or is going to be searched
if ((!vectorSearched.contains(strLink))
&& (!vectorToSearch.contains(strLink))) {
// test to make sure it is robot-safe!
if (robotSafe(urlLink))
vectorToSearch.addElement(strLink);
}
}
// if the proper type, add it to the results list
// unless we have already seen it
if (strType.rupareTo(strTargetType) == 0) {
if (vectorMatches.contains(strLink) == false) {
listMatches.add(strLink);
vectorMatches.addElement(strLink);
numberFound++;
if (numberFound >= SEARCH_LIMIT)
break;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn"t open URL " + strLink);
continue;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn"t open URL " + strURL);
break;
}
numberSearched++;
if (numberSearched >= SEARCH_LIMIT)
break;
}
if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
setStatus("reached search limit of " + SEARCH_LIMIT);
else
setStatus("done");
searchThread = null;
// searchThread.stop();
}
/**
 * Shows the given message in the status label (labelStatus).
 * Called both from the button handler and from the search thread in run().
 *
 * @param status the text to display
 */
void setStatus(String status) {
labelStatus.setText(status);
}
/**
 * Handles the Search and Stop buttons.
 *
 * Search launches a new search thread running run(); the click is ignored
 * while a search is still active, because a Thread object can only be
 * started once. Stop delegates to stop().
 *
 * @param event the button event; its action command is compared with the
 *              SEARCH and STOP labels
 */
public void actionPerformed(ActionEvent event) {
    String command = event.getActionCommand();
    if (SEARCH.equals(command)) {
        // launch a thread to do the search, unless one is already running
        if (searchThread == null) {
            setStatus("searching...");
            searchThread = new Thread(this);
            searchThread.start();
        }
    } else if (STOP.equals(command)) {
        stop();
    }
}
/**
 * Runs the crawler as a stand-alone application by hosting it in a Frame.
 *
 * @param argv command-line arguments (unused)
 */
public static void main(String argv[]) {
    Frame f = new Frame("WebFrame");
    WebCrawler applet = new WebCrawler();
    f.add(applet, BorderLayout.CENTER);

    // Behind a firewall set your proxy and port here!
    // A proxy supplied on the command line (-Dhttp.proxyHost=...) wins;
    // the built-in default is applied only when none is configured.
    if (System.getProperty("http.proxyHost") == null) {
        System.setProperty("http.proxySet", "true");
        System.setProperty("http.proxyHost", "webcache-cup");
        System.setProperty("http.proxyPort", "8080");
    }

    applet.init();
    applet.start();
    f.pack();
    f.setVisible(true);
}
}