Lecture Videos
  1  import java.util.Scanner;
  2  import java.util.ArrayList;
  3  
  /**
   * A small breadth-first web crawler.
   *
   * <p>Starting from a URL typed by the user, the crawler prints each page it
   * visits, scans the page text for further URLs, and queues every URL it has
   * not seen before.
   */
  public class WebCrawler {
    /**
     * Threshold for the number of traversed URLs. Crawling continues while the
     * traversed count is less than or equal to this value.
     */
    private static final int MAX_TRAVERSED_URLS = 100;

    /** URL prefixes recognised when scanning a page for links. */
    private static final String HTTP_PREFIX = "http:";
    private static final String HTTPS_PREFIX = "https:";

    /**
     * Prompts for a starting URL on standard input and crawls from it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
      Scanner input = new Scanner(System.in);
      System.out.print("Enter a URL: ");
      String url = input.nextLine();
      crawler(url); // Traverse the Web from the starting URL
    }

    /**
     * Crawls breadth-first from {@code startingURL}, printing
     * {@code "Crawl <url>"} for every page visited.
     *
     * <p>The crawl stops when no URLs are pending or when more than
     * {@link #MAX_TRAVERSED_URLS} URLs have been traversed.
     *
     * @param startingURL the URL at which the crawl begins
     */
    public static void crawler(String startingURL) {
      ArrayList<String> listOfPendingURLs = new ArrayList<>();
      ArrayList<String> listOfTraversedURLs = new ArrayList<>();

      listOfPendingURLs.add(startingURL);
      // NOTE(review): "<=" lets the loop body run while the size is 0..100,
      // i.e. up to 101 pages are crawled. Kept as in the original; confirm
      // whether a strict limit of 100 was intended.
      while (!listOfPendingURLs.isEmpty()
          && listOfTraversedURLs.size() <= MAX_TRAVERSED_URLS) {
        // Taking from the front of the list gives breadth-first order.
        String urlString = listOfPendingURLs.remove(0);
        listOfTraversedURLs.add(urlString);
        System.out.println("Crawl " + urlString);

        for (String s : getSubURLs(urlString)) {
          // Queue a URL only once: skip it if already visited or already queued.
          if (!listOfTraversedURLs.contains(s)
              && !listOfPendingURLs.contains(s)) {
            listOfPendingURLs.add(s);
          }
        }
      }
    }

    /**
     * Reads the document at {@code urlString} line by line and collects every
     * substring that starts with {@code http:} or {@code https:} and runs up to
     * (but not including) the next double quote on the same line.
     *
     * <p>Any failure while opening or reading the document is reported on
     * standard output as {@code "Error: <message>"}; the URLs collected up to
     * that point (possibly none) are still returned.
     *
     * @param urlString the address of the document to scan
     * @return the URLs found, in order of appearance (duplicates included);
     *     never {@code null}
     */
    public static ArrayList<String> getSubURLs(String urlString) {
      ArrayList<String> list = new ArrayList<>();

      try {
        java.net.URL url = new java.net.URL(urlString);
        // try-with-resources closes the Scanner, and with it the underlying
        // stream, even when reading fails part-way through.
        try (Scanner input = new Scanner(url.openStream())) {
          while (input.hasNextLine()) {
            String line = input.nextLine();
            // ">= 0" so that a URL starting at column 0 is not skipped.
            int current = indexOfUrlStart(line, 0);
            while (current >= 0) {
              int endIndex = line.indexOf('"', current);
              if (endIndex < 0) {
                break; // No closing quote: not a complete URL on this line
              }
              list.add(line.substring(current, endIndex));
              current = indexOfUrlStart(line, endIndex);
            }
          }
        }
      }
      catch (Exception ex) {
        // Deliberately broad: crawling is best-effort, and strings scraped from
        // pages can trigger unchecked exceptions as well as IOExceptions when
        // they are opened as URLs.
        System.out.println("Error: " + ex.getMessage());
      }

      return list;
    }

    /**
     * Returns the index of the first {@code http:} or {@code https:} prefix in
     * {@code line} at or after {@code fromIndex}, or -1 if there is none.
     */
    private static int indexOfUrlStart(String line, int fromIndex) {
      int http = line.indexOf(HTTP_PREFIX, fromIndex);
      int https = line.indexOf(HTTPS_PREFIX, fromIndex);
      if (http < 0) {
        return https;
      }
      if (https < 0) {
        return http;
      }
      return Math.min(http, https);
    }
  }