// Spider framework demo: jsoup-based crawlers for Douban movies/books and weather.com.cn.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
// 爬虫基类
|
|
abstract class Spider {
|
|
protected String baseUrl;
|
|
|
|
public Spider(String baseUrl) {
|
|
this.baseUrl = baseUrl;
|
|
}
|
|
|
|
// 发送HTTP请求获取页面内容
|
|
protected Document fetchPage(String url) throws IOException {
|
|
return Jsoup.connect(url)
|
|
.userAgent("Mozilla/5.0")
|
|
.timeout(10000)
|
|
.get();
|
|
}
|
|
|
|
// 抽象方法:爬取数据
|
|
public abstract List<?> crawl() throws IOException;
|
|
|
|
// 抽象方法:解析页面
|
|
protected abstract List<?> parsePage(Document doc);
|
|
|
|
// 重载方法:解析页面(用于原生HTML解析)
|
|
protected List<?> parsePage(String html) {
|
|
// 默认实现,子类可以选择性重写
|
|
return new ArrayList<>();
|
|
}
|
|
}
|
|
|
|
// Data model for one movie scraped from the Douban Top 250 chart.
// Fields are final (immutable value object); getters added for callers
// that need more than toString().
class Movie {
    private final String title;   // movie title as scraped
    private final String rating;  // rating text, e.g. "9.7"
    private final String url;     // detail-page URL

    public Movie(String title, String rating, String url) {
        this.title = title;
        this.rating = rating;
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public String getRating() {
        return rating;
    }

    public String getUrl() {
        return url;
    }

    @Override
    public String toString() {
        return "Movie{" +
                "title='" + title + '\'' +
                ", rating='" + rating + '\'' +
                ", url='" + url + '\'' +
                '}';
    }
}
// Data model for one book scraped from the Douban book chart.
// Fields are final (immutable value object); getters added for callers
// that need more than toString().
class Book {
    private final String title;   // book title (from the cover image's alt text)
    private final String author;  // author/publisher line as scraped
    private final String rating;  // rating text, e.g. "9.4"
    private final String url;     // detail-page URL

    public Book(String title, String author, String rating, String url) {
        this.title = title;
        this.author = author;
        this.rating = rating;
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public String getAuthor() {
        return author;
    }

    public String getRating() {
        return rating;
    }

    public String getUrl() {
        return url;
    }

    @Override
    public String toString() {
        return "Book{" +
                "title='" + title + '\'' +
                ", author='" + author + '\'' +
                ", rating='" + rating + '\'' +
                ", url='" + url + '\'' +
                '}';
    }
}
// Data model for one city's weather snapshot scraped from weather.com.cn.
// Fields are final (immutable value object); getters added for callers
// that need more than toString().
class Weather {
    private final String province;          // region/province label
    private final String city;              // city name
    private final String temperature;       // temperature text as scraped
    private final String weatherCondition;  // condition text, e.g. sunny/rain
    private final String url;               // source page URL

    public Weather(String province, String city, String temperature, String weatherCondition, String url) {
        this.province = province;
        this.city = city;
        this.temperature = temperature;
        this.weatherCondition = weatherCondition;
        this.url = url;
    }

    public String getProvince() {
        return province;
    }

    public String getCity() {
        return city;
    }

    public String getTemperature() {
        return temperature;
    }

    public String getWeatherCondition() {
        return weatherCondition;
    }

    public String getUrl() {
        return url;
    }

    @Override
    public String toString() {
        return "Weather{" +
                "province='" + province + '\'' +
                ", city='" + city + '\'' +
                ", temperature='" + temperature + '\'' +
                ", weatherCondition='" + weatherCondition + '\'' +
                ", url='" + url + '\'' +
                '}';
    }
}
// 豆瓣电影爬虫 - 爬取Top250
|
|
class DoubanMovieSpider extends Spider {
|
|
public DoubanMovieSpider() {
|
|
super("https://movie.douban.com/top250");
|
|
}
|
|
|
|
@Override
|
|
public List<Movie> crawl() throws IOException {
|
|
List<Movie> allMovies = new ArrayList<>();
|
|
|
|
// 豆瓣Top250有10页,每页25部电影
|
|
for (int start = 0; start < 250; start += 25) {
|
|
String pageUrl = baseUrl + "?start=" + start;
|
|
System.out.println("正在爬取第 " + (start / 25 + 1) + " 页...");
|
|
Document doc = fetchPage(pageUrl);
|
|
List<Movie> movies = parsePage(doc);
|
|
allMovies.addAll(movies);
|
|
|
|
// 添加延迟,避免请求过快
|
|
try {
|
|
Thread.sleep(1000);
|
|
} catch (InterruptedException e) {
|
|
Thread.currentThread().interrupt();
|
|
}
|
|
}
|
|
|
|
return allMovies;
|
|
}
|
|
|
|
@Override
|
|
protected List<Movie> parsePage(Document doc) {
|
|
List<Movie> movies = new ArrayList<>();
|
|
|
|
// 使用Jsoup选择器解析页面
|
|
Elements movieItems = doc.select(".info");
|
|
for (Element item : movieItems) {
|
|
// 提取标题
|
|
Element titleElement = item.selectFirst(".title");
|
|
if (titleElement != null) {
|
|
String title = titleElement.text().trim();
|
|
|
|
// 提取评分
|
|
Element ratingElement = item.selectFirst(".rating_num");
|
|
String rating = "";
|
|
if (ratingElement != null) {
|
|
rating = ratingElement.text().trim();
|
|
}
|
|
|
|
// 提取链接
|
|
Element linkElement = item.selectFirst("a");
|
|
String url = "";
|
|
if (linkElement != null) {
|
|
url = linkElement.attr("href");
|
|
}
|
|
|
|
movies.add(new Movie(title, rating, url));
|
|
}
|
|
}
|
|
|
|
return movies;
|
|
}
|
|
}
|
|
|
|
// 豆瓣读书爬虫 - 爬取评分前100
|
|
class DoubanBookSpider extends Spider {
|
|
public DoubanBookSpider() {
|
|
super("https://book.douban.com/top250");
|
|
}
|
|
|
|
@Override
|
|
public List<Book> crawl() throws IOException {
|
|
List<Book> allBooks = new ArrayList<>();
|
|
|
|
// 豆瓣读书Top250,每页25本,爬取前4页(100本)
|
|
for (int start = 0; start < 100; start += 25) {
|
|
String pageUrl = baseUrl + "?start=" + start;
|
|
System.out.println("正在爬取第 " + (start / 25 + 1) + " 页...");
|
|
Document doc = fetchPage(pageUrl);
|
|
List<Book> books = parsePage(doc);
|
|
allBooks.addAll(books);
|
|
|
|
// 添加延迟,避免请求过快
|
|
try {
|
|
Thread.sleep(1000);
|
|
} catch (InterruptedException e) {
|
|
Thread.currentThread().interrupt();
|
|
}
|
|
}
|
|
|
|
return allBooks;
|
|
}
|
|
|
|
@Override
|
|
protected List<Book> parsePage(Document doc) {
|
|
List<Book> books = new ArrayList<>();
|
|
|
|
// 使用Jsoup选择器解析页面
|
|
Elements bookItems = doc.select("tr.item");
|
|
for (Element item : bookItems) {
|
|
// 提取链接
|
|
Element linkElement = item.selectFirst(".nbg");
|
|
if (linkElement != null) {
|
|
String url = linkElement.attr("href");
|
|
|
|
// 提取标题
|
|
Element titleElement = linkElement.selectFirst("img");
|
|
String title = "";
|
|
if (titleElement != null) {
|
|
title = titleElement.attr("alt").trim();
|
|
}
|
|
|
|
// 提取作者信息
|
|
Element authorElement = item.selectFirst(".pl");
|
|
String author = "";
|
|
if (authorElement != null) {
|
|
author = authorElement.text().trim();
|
|
}
|
|
|
|
// 提取评分
|
|
Element ratingElement = item.selectFirst(".rating_nums");
|
|
String rating = "";
|
|
if (ratingElement != null) {
|
|
rating = ratingElement.text().trim();
|
|
}
|
|
|
|
books.add(new Book(title, author, rating, url));
|
|
}
|
|
}
|
|
|
|
return books;
|
|
}
|
|
}
|
|
|
|
// 中国天气网爬虫 - 爬取各省份天气
|
|
class WeatherSpider extends Spider {
|
|
// 中国各省份代码映射
|
|
private static final String[] PROVINCES = {
|
|
"北京", "上海", "天津", "重庆", "河北", "山西", "辽宁", "吉林", "黑龙江",
|
|
"江苏", "浙江", "安徽", "福建", "江西", "山东", "河南", "湖北", "湖南",
|
|
"广东", "海南", "四川", "贵州", "云南", "陕西", "甘肃", "青海", "台湾",
|
|
"内蒙古", "广西", "西藏", "宁夏", "新疆", "香港", "澳门"
|
|
};
|
|
|
|
public WeatherSpider() {
|
|
super("http://www.weather.com.cn");
|
|
}
|
|
|
|
@Override
|
|
public List<Weather> crawl() throws IOException {
|
|
List<Weather> allWeather = new ArrayList<>();
|
|
|
|
System.out.println("正在爬取中国各省份天气数据...");
|
|
|
|
// 爬取主要城市天气(使用中国天气网的API)
|
|
String[] majorCities = {
|
|
"101010100", // 北京
|
|
"101020100", // 上海
|
|
"101030100", // 天津
|
|
"101040100", // 重庆
|
|
"101050101", // 哈尔滨
|
|
"101060101", // 长春
|
|
"101070101", // 沈阳
|
|
"101080101", // 呼和浩特
|
|
"101090101", // 石家庄
|
|
"101100101", // 太原
|
|
"101110101", // 西安
|
|
"101120101", // 济南
|
|
"101130101", // 乌鲁木齐
|
|
"101140101", // 拉萨
|
|
"101150101", // 西宁
|
|
"101160101", // 兰州
|
|
"101170101", // 银川
|
|
"101180101", // 郑州
|
|
"101190101", // 南京
|
|
"101200101", // 武汉
|
|
"101210101", // 杭州
|
|
"101220101", // 合肥
|
|
"101230101", // 福州
|
|
"101240101", // 南昌
|
|
"101250101", // 长沙
|
|
"101260101", // 贵阳
|
|
"101270101", // 成都
|
|
"101280101", // 广州
|
|
"101290101", // 昆明
|
|
"101300101", // 南宁
|
|
"101310101", // 海口
|
|
"101320101", // 香港
|
|
"101330101", // 澳门
|
|
"101340101" // 台北
|
|
};
|
|
|
|
String[] cityNames = {
|
|
"北京", "上海", "天津", "重庆", "哈尔滨", "长春", "沈阳", "呼和浩特",
|
|
"石家庄", "太原", "西安", "济南", "乌鲁木齐", "拉萨", "西宁", "兰州",
|
|
"银川", "郑州", "南京", "武汉", "杭州", "合肥", "福州", "南昌",
|
|
"长沙", "贵阳", "成都", "广州", "昆明", "南宁", "海口", "香港", "澳门", "台北"
|
|
};
|
|
|
|
for (int i = 0; i < majorCities.length; i++) {
|
|
String cityCode = majorCities[i];
|
|
String cityName = cityNames[i];
|
|
String province = getProvinceByCity(cityName);
|
|
|
|
try {
|
|
String weatherUrl = "http://www.weather.com.cn/weather1d/" + cityCode + ".shtml";
|
|
Document doc = fetchPage(weatherUrl);
|
|
Weather weather = parseWeatherPage(doc, province, cityName, weatherUrl);
|
|
if (weather != null) {
|
|
allWeather.add(weather);
|
|
System.out.println("已获取 " + cityName + " 的天气数据");
|
|
}
|
|
|
|
// 添加延迟,避免请求过快
|
|
Thread.sleep(500);
|
|
} catch (Exception e) {
|
|
System.err.println("获取 " + cityName + " 天气数据失败: " + e.getMessage());
|
|
}
|
|
}
|
|
|
|
return allWeather;
|
|
}
|
|
|
|
private String getProvinceByCity(String city) {
|
|
// 简化的省份映射
|
|
if (city.equals("北京") || city.equals("上海") || city.equals("天津") || city.equals("重庆")) {
|
|
return city;
|
|
} else if (city.equals("哈尔滨") || city.equals("长春") || city.equals("沈阳")) {
|
|
return "东北";
|
|
} else if (city.equals("呼和浩特") || city.equals("石家庄") || city.equals("太原")) {
|
|
return "华北";
|
|
} else if (city.equals("西安") || city.equals("兰州") || city.equals("西宁") || city.equals("银川")) {
|
|
return "西北";
|
|
} else if (city.equals("济南") || city.equals("郑州") || city.equals("南京")) {
|
|
return "华东";
|
|
} else if (city.equals("武汉") || city.equals("长沙") || city.equals("南昌")) {
|
|
return "华中";
|
|
} else if (city.equals("杭州") || city.equals("合肥") || city.equals("福州")) {
|
|
return "华东";
|
|
} else if (city.equals("成都") || city.equals("贵阳") || city.equals("昆明")) {
|
|
return "西南";
|
|
} else if (city.equals("广州") || city.equals("南宁") || city.equals("海口")) {
|
|
return "华南";
|
|
} else if (city.equals("乌鲁木齐") || city.equals("拉萨")) {
|
|
return "西北";
|
|
} else if (city.equals("香港") || city.equals("澳门") || city.equals("台北")) {
|
|
return "港澳台";
|
|
}
|
|
return "其他";
|
|
}
|
|
|
|
private Weather parseWeatherPage(Document doc, String province, String city, String url) {
|
|
try {
|
|
// 提取温度
|
|
Element tempElement = doc.selectFirst(".tem");
|
|
String temperature = "";
|
|
if (tempElement != null) {
|
|
temperature = tempElement.text().trim();
|
|
}
|
|
|
|
// 提取天气状况
|
|
Element weatherElement = doc.selectFirst(".wea");
|
|
String weatherCondition = "";
|
|
if (weatherElement != null) {
|
|
weatherCondition = weatherElement.text().trim();
|
|
}
|
|
|
|
return new Weather(province, city, temperature, weatherCondition, url);
|
|
} catch (Exception e) {
|
|
System.err.println("解析 " + city + " 天气数据失败: " + e.getMessage());
|
|
return null;
|
|
}
|
|
}
|
|
|
|
@Override
|
|
protected List<Weather> parsePage(Document doc) {
|
|
// 这个方法在crawl中被直接调用,不需要实现
|
|
return new ArrayList<>();
|
|
}
|
|
}
|
|
|
|
// 测试类
|
|
public class SpiderFramework {
|
|
public static void main(String[] args) {
|
|
List<Spider> spiders = new ArrayList<>();
|
|
spiders.add(new DoubanMovieSpider());
|
|
spiders.add(new DoubanBookSpider());
|
|
spiders.add(new WeatherSpider());
|
|
|
|
for (Spider spider : spiders) {
|
|
try {
|
|
System.out.println("\n爬取 " + spider.getClass().getSimpleName() + " 数据:");
|
|
List<?> data = spider.crawl();
|
|
System.out.println("共爬取 " + data.size() + " 条数据");
|
|
|
|
// 只显示前5条数据
|
|
int displayCount = Math.min(5, data.size());
|
|
System.out.println("显示前 " + displayCount + " 条:");
|
|
for (int i = 0; i < displayCount; i++) {
|
|
System.out.println(data.get(i));
|
|
}
|
|
} catch (IOException e) {
|
|
System.err.println("爬取失败: " + e.getMessage());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|