java left logo
java middle logo
java right logo
 

Home > Java SE Tips
 
 
Main Menu
Home
Java Tutorials
Book Reviews
Java SE Tips
Java ME Tips
Java EE Tips
Other API Tips
Java Applications
Java Libraries
Java Games
Sitemap
Java Network
Java Forums
Java Tips Blog




Most Visited Tips
Java SE Tips
Java ME Tips
Java EE Tips
Other API Tips
Java Applications
Java Libraries
Java Games
Book Reviews
Top Rated Tips
Java SE Tips
Java ME Tips
Java EE Tips
Other API Tips
Java Applications
Java Libraries
Java Games
Book Reviews


Statistics
Registered Users: 769
Java SE Tips: 614
Java ME Tips: 201
Java EE Tips: 184
Other API Tips: 779
Java Applications: 298
Java Libraries: 209
Java Games: 16
Book Reviews:
 
 
 
How to create a Search Crawler in Java E-mail
User Rating: / 12
Poor / Best
Search Crawler in Java

With Search Crawler shown in this tip, you can enter search criteria and then search the Web in real time, URL by URL, looking for matches to the criteria.

The content of the SearchCrawler.java file is written below:

import java.awt.*;
import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import javax.swing.*;
import javax.swing.table.*;

// The Search Web Crawler
public class SearchCrawler extends JFrame {
    
    // Max URLs drop down values.
    private static final String[] MAX_URLS =
    {"50""100""500""1000"};
    
    // Cache of robot disallow lists.
    private HashMap disallowListCache = new HashMap();
    
    // Search GUI controls.
    private JTextField startTextField;
    private JComboBox maxComboBox;
    private JCheckBox limitCheckBox;
    private JTextField logTextField;
    private JTextField searchTextField;
    private JCheckBox caseCheckBox;
    private JButton searchButton;
    
    // Search stats GUI controls.
    private JLabel crawlingLabel2;
    private JLabel crawledLabel2;
    private JLabel toCrawlLabel2;
    private JProgressBar progressBar;
    private JLabel matchesLabel2;
    
    // Table listing search matches.
    private JTable table;
    
    // Flag for whether or not crawling is underway.
    private boolean crawling;
    
    // Matches log file print writer.
    private PrintWriter logFileWriter;
    
    // Constructor for Search Web Crawler.
    public SearchCrawler() {
        // Set application title.
        setTitle("Search Crawler");
        
        // Set window size.
        setSize(600600);
        
        // Handle window closing events.
        addWindowListener(new WindowAdapter() {
            public void windowClosing(WindowEvent e) {
                actionExit();
            }
        });
        
        // Set up file menu.
        JMenuBar menuBar = new JMenuBar();
        JMenu fileMenu = new JMenu("File");
        fileMenu.setMnemonic(KeyEvent.VK_F);
        JMenuItem fileExitMenuItem = new JMenuItem("Exit",
                KeyEvent.VK_X);
        fileExitMenuItem.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent e) {
                actionExit();
            }
        });
        fileMenu.add(fileExitMenuItem);
        menuBar.add(fileMenu);
        setJMenuBar(menuBar);
        
        // Set up search panel.
        JPanel searchPanel = new JPanel();
        GridBagConstraints constraints;
        GridBagLayout layout = new GridBagLayout();
        searchPanel.setLayout(layout);
        
        JLabel startLabel = new JLabel("Start URL:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5500);
        layout.setConstraints(startLabel, constraints);
        searchPanel.add(startLabel);
        
        startTextField = new JTextField();
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5505);
        layout.setConstraints(startTextField, constraints);
        searchPanel.add(startTextField);
        
        JLabel maxLabel = new JLabel("Max URLs to Crawl:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5500);
        layout.setConstraints(maxLabel, constraints);
        searchPanel.add(maxLabel);
        
        maxComboBox = new JComboBox(MAX_URLS);
        maxComboBox.setEditable(true);
        constraints = new GridBagConstraints();
        constraints.insets = new Insets(5500);
        layout.setConstraints(maxComboBox, constraints);
        searchPanel.add(maxComboBox);
        
        limitCheckBox =
                new JCheckBox("Limit crawling to Start URL site");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.WEST;
        constraints.insets = new Insets(01000);
        layout.setConstraints(limitCheckBox, constraints);
        searchPanel.add(limitCheckBox);
        
        JLabel blankLabel = new JLabel();
        constraints = new GridBagConstraints();
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        layout.setConstraints(blankLabel, constraints);
        searchPanel.add(blankLabel);
        
        JLabel logLabel = new JLabel("Matches Log File:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5500);
        layout.setConstraints(logLabel, constraints);
        searchPanel.add(logLabel);
        
        String file =
                System.getProperty("user.dir"+
                System.getProperty("file.separator"+
                "crawler.log";
        logTextField = new JTextField(file);
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5505);
        layout.setConstraints(logTextField, constraints);
        searchPanel.add(logTextField);
        
        JLabel searchLabel = new JLabel("Search String:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5500);
        layout.setConstraints(searchLabel, constraints);
        searchPanel.add(searchLabel);
        
        searchTextField = new JTextField();
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.insets = new Insets(5500);
        constraints.gridwidth= 2;
        constraints.weightx = 1.0d;
        layout.setConstraints(searchTextField, constraints);
        searchPanel.add(searchTextField);
        
        caseCheckBox = new JCheckBox("Case Sensitive");
        constraints = new GridBagConstraints();
        constraints.insets = new Insets(5505);
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        layout.setConstraints(caseCheckBox, constraints);
        searchPanel.add(caseCheckBox);
        
        searchButton = new JButton("Search");
        searchButton.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent e) {
                actionSearch();
            }
        });
        constraints = new GridBagConstraints();
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5555);
        layout.setConstraints(searchButton, constraints);
        searchPanel.add(searchButton);
        
        JSeparator separator = new JSeparator();
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5555);
        layout.setConstraints(separator, constraints);
        searchPanel.add(separator);
        
        JLabel crawlingLabel1 = new JLabel("Crawling:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5500);
        layout.setConstraints(crawlingLabel1, constraints);
        searchPanel.add(crawlingLabel1);
        
        crawlingLabel2 = new JLabel();
        crawlingLabel2.setFont(
                crawlingLabel2.getFont().deriveFont(Font.PLAIN));
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5505);
        layout.setConstraints(crawlingLabel2, constraints);
        searchPanel.add(crawlingLabel2);
        
        JLabel crawledLabel1 = new JLabel("Crawled URLs:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5500);
        layout.setConstraints(crawledLabel1, constraints);
        searchPanel.add(crawledLabel1);
        
        crawledLabel2 = new JLabel();
        crawledLabel2.setFont(
                crawledLabel2.getFont().deriveFont(Font.PLAIN));
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5505);
        layout.setConstraints(crawledLabel2, constraints);
        searchPanel.add(crawledLabel2);
        
        JLabel toCrawlLabel1 = new JLabel("URLs to Crawl:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5500);
        layout.setConstraints(toCrawlLabel1, constraints);
        searchPanel.add(toCrawlLabel1);
        
        toCrawlLabel2 = new JLabel();
        toCrawlLabel2.setFont(
                toCrawlLabel2.getFont().deriveFont(Font.PLAIN));
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5505);
        layout.setConstraints(toCrawlLabel2, constraints);
        searchPanel.add(toCrawlLabel2);
        
        JLabel progressLabel = new JLabel("Crawling Progress:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5500);
        layout.setConstraints(progressLabel, constraints);
        searchPanel.add(progressLabel);
        
        progressBar = new JProgressBar();
        progressBar.setMinimum(0);
        progressBar.setStringPainted(true);
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5505);
        layout.setConstraints(progressBar, constraints);
        searchPanel.add(progressBar);
        
        JLabel matchesLabel1 = new JLabel("Search Matches:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(55100);
        layout.setConstraints(matchesLabel1, constraints);
        searchPanel.add(matchesLabel1);
        
        matchesLabel2 = new JLabel();
        matchesLabel2.setFont(
                matchesLabel2.getFont().deriveFont(Font.PLAIN));
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(55105);
        layout.setConstraints(matchesLabel2, constraints);
        searchPanel.add(matchesLabel2);
        
        // Set up matches table.
        table =
                new JTable(new DefaultTableModel(new Object[][]{},
                new String[]{"URL"}) {
            public boolean isCellEditable(int row, int column) {
                return false;
            }
        });
        
        // Set up matches panel.
        JPanel matchesPanel = new JPanel();
        matchesPanel.setBorder(
                BorderFactory.createTitledBorder("Matches"));
        matchesPanel.setLayout(new BorderLayout());
        matchesPanel.add(new JScrollPane(table),
                BorderLayout.CENTER);
        
        // Add panels to display.
        getContentPane().setLayout(new BorderLayout());
        getContentPane().add(searchPanel, BorderLayout.NORTH);
        getContentPane().add(matchesPanel, BorderLayout.CENTER);
    }
    
    // Exit this program.
    private void actionExit() {
        System.exit(0);
    }
    
    // Handle search/stop button being clicked.
    private void actionSearch() {
        // If stop button clicked, turn crawling flag off.
        if (crawling) {
            crawling = false;
            return;
        }
        
        ArrayList errorList = new ArrayList();
        
        // Validate that start URL has been entered.
        String startUrl = startTextField.getText().trim();
        if (startUrl.length() 1) {
            errorList.add("Missing Start URL.");
        }
        // Verify start URL.
        else if (verifyUrl(startUrl== null) {
            errorList.add("Invalid Start URL.");
        }
        
        // Validate that max URLs is either empty or is a number.
        int maxUrls = 0;
        String max = ((StringmaxComboBox.getSelectedItem()).trim();
        if (max.length() 0) {
            try {
                maxUrls = Integer.parseInt(max);
            catch (NumberFormatException e) {
            }
            if (maxUrls < 1) {
                errorList.add("Invalid Max URLs value.");
            }
        }
        
        // Validate that matches log file has been entered.
        String logFile = logTextField.getText().trim();
        if (logFile.length() 1) {
            errorList.add("Missing Matches Log File.");
        }
        
        // Validate that search string has been entered.
        String searchString = searchTextField.getText().trim();
        if (searchString.length() 1) {
            errorList.add("Missing Search String.");
        }
        
        // Show errors, if any, and return.
        if (errorList.size() 0) {
            StringBuffer message = new StringBuffer();
            
            // Concatenate errors into single message.
            for (int i = 0; i < errorList.size(); i++) {
                message.append(errorList.get(i));
                if (i + < errorList.size()) {
                    message.append("\n");
                }
            }
            
            showError(message.toString());
            return;
        }
        
        // Remove "www" from start URL if present.
        startUrl = removeWwwFromUrl(startUrl);
        
        // Start the search crawler.
        search(logFile, startUrl, maxUrls, searchString);
    }
    
    private void search(final String logFile, final String startUrl,
            final int maxUrls, final String searchString) {
        // Start the search in a new thread.
        Thread thread = new Thread(new Runnable() {
            public void run() {
                // Show hour glass cursor while crawling is under way.
                setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));
                
                // Disable search controls.
                startTextField.setEnabled(false);
                maxComboBox.setEnabled(false);
                limitCheckBox.setEnabled(false);
                logTextField.setEnabled(false);
                searchTextField.setEnabled(false);
                caseCheckBox.setEnabled(false);
                
                // Switch search button to "Stop."
                searchButton.setText("Stop");
                
                // Reset stats.
                table.setModel(new DefaultTableModel(new Object[][]{},
                        new String[]{"URL"}) {
                    public boolean isCellEditable(int row, int column) {
                        return false;
                    }
                });
                updateStats(startUrl, 00, maxUrls);
                
                // Open matches log file.
                try {
                    logFileWriter = new PrintWriter(new FileWriter(logFile));
                catch (Exception e) {
                    showError("Unable to open matches log file.");
                    return;
                }
                
                // Turn crawling flag on.
                crawling = true;
                
                // Perform the actual crawling.
                crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
                        searchString, caseCheckBox.isSelected());
                
                // Turn crawling flag off.
                crawling = false;
                
                // Close matches log file.
                try {
                    logFileWriter.close();
                catch (Exception e) {
                    showError("Unable to close matches log file.");
                }
                
                // Mark search as done.
                crawlingLabel2.setText("Done");
                
                // Enable search controls.
                startTextField.setEnabled(true);
                maxComboBox.setEnabled(true);
                limitCheckBox.setEnabled(true);
                logTextField.setEnabled(true);
                searchTextField.setEnabled(true);
                caseCheckBox.setEnabled(true);
                
                // Switch search button back to "Search."
                searchButton.setText("Search");
                
                // Return to default cursor.
                setCursor(Cursor.getDefaultCursor());
                
                // Show message if search string not found.
                if (table.getRowCount() == 0) {
                    JOptionPane.showMessageDialog(SearchCrawler.this,
                     "Your Search String was not found. Please try another.",
                     "Search String Not Found",
                     JOptionPane.WARNING_MESSAGE);
                }
            }
        });
        thread.start();
    }
    
    // Show dialog box with error message.
    private void showError(String message) {
        JOptionPane.showMessageDialog(this, message, "Error",
                JOptionPane.ERROR_MESSAGE);
    }
    
    // Update crawling stats.
    private void updateStats(
            String crawling, int crawled, int toCrawl, int maxUrls) {
        crawlingLabel2.setText(crawling);
        crawledLabel2.setText("" + crawled);
        toCrawlLabel2.setText("" + toCrawl);
        
        // Update progress bar.
        if (maxUrls == -1) {
            progressBar.setMaximum(crawled + toCrawl);
        else {
            progressBar.setMaximum(maxUrls);
        }
        progressBar.setValue(crawled);
        
        matchesLabel2.setText("" + table.getRowCount());
    }
    
    // Add match to matches table and log file.
    private void addMatch(String url) {
        // Add URL to matches table.
        DefaultTableModel model =
                (DefaultTableModeltable.getModel();