import java.util.Scanner;
import java.util.ArrayList;

/** A simple web crawler that starts from a user-supplied URL and follows links it finds. */
public class WebCrawler {
    public static void main(String[] args) {
        Scanner input = new Scanner(System.in);
        System.out.print("Enter a URL: ");
        String url = input.nextLine();
        crawler(url); // Begin crawling from the entered URL
    }

    /** Crawls the Web starting from startingURL, traversing at most 100 pages. */
    public static void crawler(String startingURL) {
        ArrayList<String> listOfPendingURLs = new ArrayList<>();   // URLs waiting to be crawled
        ArrayList<String> listOfTraversedURLs = new ArrayList<>(); // URLs already crawled

        listOfPendingURLs.add(startingURL);
        while (!listOfPendingURLs.isEmpty() &&
                listOfTraversedURLs.size() < 100) { // stop once 100 pages have been traversed
            String urlString = listOfPendingURLs.remove(0); // take the next pending URL (FIFO order)
            listOfTraversedURLs.add(urlString);
            System.out.println("Crawl " + urlString);

            // Queue every link on this page that has not been seen yet
            for (String s : getSubURLs(urlString)) {
                if (!listOfTraversedURLs.contains(s) &&
                        !listOfPendingURLs.contains(s))
                    listOfPendingURLs.add(s);
            }
        }
    }

    /** Returns the http: links found in the page at urlString. */
    public static ArrayList<String> getSubURLs(String urlString) {
        ArrayList<String> list = new ArrayList<>();

        // try-with-resources so the page's input stream is closed even if reading fails
        try (Scanner input = new Scanner(new java.net.URL(urlString).openStream())) {
            while (input.hasNextLine()) {
                String line = input.nextLine();
                int current = line.indexOf("http:"); // search each line from the beginning
                while (current >= 0) { // >= 0 so a link at the very start of a line is not missed
                    int endIndex = line.indexOf("\"", current);
                    if (endIndex > 0) { // the link ends at the closing quotation mark
                        list.add(line.substring(current, endIndex));
                        current = line.indexOf("http:", endIndex);
                    }
                    else
                        current = -1; // no closing quote; stop scanning this line
                }
            }
        }
        catch (Exception ex) {
            System.out.println("Error: " + ex.getMessage());
        }

        return list;
    }
}
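
// The sketch below is an optional, hypothetical companion class; it is not part of the
// original listing. It shows one way to exercise getSubURLs on a single page without
// running the full crawl. The class name SubURLDemo and the fallback URL
// "http://example.com" are assumptions made for illustration only.
class SubURLDemo {
    public static void main(String[] args) {
        // Use the first command-line argument as the page to scan, or a placeholder URL.
        String page = args.length > 0 ? args[0] : "http://example.com";
        for (String link : WebCrawler.getSubURLs(page)) {
            System.out.println(link);
        }
    }
}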