从web读取数据
// package TEST_ALL;
import java.net.URL;
import java.util.Scanner;
/**
 * Prompts for a URL on standard input, reads the resource it names and
 * reports how many characters it contains.
 *
 * <p>The reported size is the sum of the lengths of all lines; line
 * terminators are not counted. The content is decoded as UTF-8.
 */
public class ReadWebData2 {
    /**
     * Program entry point.
     *
     * <p>Prints {@code "Invalid URL"} when the entered text is not a
     * well-formed URL, and the exception message when the resource cannot be
     * opened or read.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        System.out.print("Enter a URL: ");
        // Deliberately not closed: closing this Scanner would also close System.in.
        Scanner console = new Scanner(System.in);
        if (!console.hasNext()) {
            System.out.println("No URL entered");
            return;
        }
        String urlString = console.next();

        try {
            java.net.URL url = new java.net.URL(urlString);
            long count;
            // try-with-resources closes the Scanner and the underlying
            // connection stream even when reading fails part-way.
            try (Scanner input = new Scanner(url.openStream(), "UTF-8")) {
                count = countCharacters(input);
            }
            System.out.println("The file size is " + count + " characters");
        } catch (java.net.MalformedURLException ex) {
            System.out.println("Invalid URL");
        } catch (java.io.IOException ex) {
            System.out.println(ex.getMessage());
        }
    }

    /**
     * Sums the lengths of all remaining lines of {@code input}.
     *
     * @param input the scanner to drain; the caller remains responsible for closing it
     * @return the total number of characters, excluding line terminators
     * @throws java.io.IOException if the scanner's underlying source failed while reading
     */
    private static long countCharacters(Scanner input) throws java.io.IOException {
        long count = 0;
        // hasNextLine() (not hasNext()) so that lines consisting only of
        // whitespace are counted instead of ending the loop early.
        while (input.hasNextLine()) {
            count += input.nextLine().length();
        }
        // Scanner swallows read errors and just stops; surface them so a
        // truncated download is not reported as a valid size.
        java.io.IOException failure = input.ioException();
        if (failure != null) {
            throw failure;
        }
        return count;
    }
}
但奇怪的是，连接要么被拒绝，要么被重置，能正常读取的网站并不多。
Enter a URL: http://www.ni.com/zh-cn/support.html
Connection reset
或者
Enter a URL: https://www.baidu.com/index.html
Connection refused: connect
这大概是所用网络被封（访问受限）的缘故，换成自己的PC运行就没有这个问题。
Enter a URL: http://www.ni.com/zh-cn/support.html
The file size is 73317 characters
网络爬虫
// package mytest;
import java.util.Scanner;
import java.util.ArrayList;
/**
 * A simple breadth-first web crawler.
 *
 * <p>Starting from a URL entered by the user, it reads each page, collects
 * the {@code http:} / {@code https:} links found in it and visits them in
 * first-in, first-out order, printing every URL it crawls.
 */
public class WebCrawler {
    /**
     * Crawling continues only while the number of URLs already traversed is
     * at most this value (so up to one more than this many URLs are crawled).
     */
    private static final int TRAVERSED_LIMIT = 100;

    /**
     * Matches a link that starts with {@code http:} or {@code https:} and
     * runs up to the next double quote; group 1 is the link without the quote.
     */
    private static final java.util.regex.Pattern QUOTED_LINK =
            java.util.regex.Pattern.compile("(https?:[^\"]*)\"");

    /**
     * Prompts for a starting URL and crawls from it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        java.util.Scanner input = new java.util.Scanner(System.in);
        System.out.print("Enter a URL: ");
        String url = input.nextLine();
        crawler(url); // Traverse the Web from the starting url
    }

    /**
     * Crawls pages breadth-first beginning at {@code startingURL}, printing
     * each URL as it is visited. A URL is visited at most once.
     *
     * @param startingURL the URL at which to start crawling
     */
    public static void crawler(String startingURL) {
        ArrayList<String> listOfPendingURLs = new ArrayList<>();
        // A hash set makes the "already visited?" checks constant-time.
        java.util.Set<String> traversedURLs = new java.util.HashSet<>();

        listOfPendingURLs.add(startingURL);
        while (!listOfPendingURLs.isEmpty() && traversedURLs.size() <= TRAVERSED_LIMIT) {
            String urlString = listOfPendingURLs.remove(0);
            // add() returns false when the URL was already traversed.
            if (traversedURLs.add(urlString)) {
                System.out.println("Craw " + urlString);
                for (String s : getSubURLs(urlString)) {
                    if (!traversedURLs.contains(s)) {
                        listOfPendingURLs.add(s);
                    }
                }
            }
        }
    }

    /**
     * Reads the resource at {@code urlString} line by line and returns every
     * link found in it, in order of appearance.
     *
     * <p>A link is any text that begins with {@code http:} or {@code https:}
     * and is terminated by a double quote on the same line. The content is
     * decoded as UTF-8. On any failure an {@code "Error: ..."} line is
     * printed and the links collected so far (possibly none) are returned.
     *
     * @param urlString the URL of the page to scan
     * @return the links found; never {@code null}
     */
    public static ArrayList<String> getSubURLs(String urlString) {
        ArrayList<String> list = new ArrayList<>();
        try {
            java.net.URL url = new java.net.URL(urlString);
            // try-with-resources closes the Scanner and the connection
            // stream for every page, even when reading fails.
            try (Scanner input = new Scanner(url.openStream(), "UTF-8")) {
                while (input.hasNextLine()) {
                    java.util.regex.Matcher matcher = QUOTED_LINK.matcher(input.nextLine());
                    // find() starts at column 0, so a link at the very
                    // beginning of a line is picked up as well.
                    while (matcher.find()) {
                        list.add(matcher.group(1));
                    }
                }
                // Scanner swallows read errors and just stops; report them.
                java.io.IOException failure = input.ioException();
                if (failure != null) {
                    System.out.println("Error: " + failure.getMessage());
                }
            }
        }
        // Broad on purpose: one unreadable or malformed link must not
        // abort the whole crawl.
        catch (Exception ex) {
            System.out.println("Error: " + ex.getMessage());
        }
        return list;
    }
}
以上都是书上的例子[1]，运行起来特别慢，运行结果从略。
[1] Y. Daniel Liang, Introduction to Java Programming, Chapter 12, Sections 12.12–12.13