You cannot select more than 25 topics. Topics must start with a letter or number, may include dashes ('-'), and may be up to 35 characters long.

422 lines
14 KiB

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Base class for the scrapers in this file.
 *
 * <p>Holds the site's base URL, provides a shared page-download helper, and
 * declares the crawl/parse hooks that concrete scrapers implement.
 */
abstract class Spider {
    /** User-Agent header value sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0";

    /** Timeout passed to jsoup for each request (10000, i.e. ten seconds in jsoup's millisecond unit). */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Root URL of the site this spider targets. */
    protected String baseUrl;

    /**
     * @param baseUrl root URL of the site to crawl
     */
    public Spider(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /**
     * Downloads {@code url} with an HTTP GET and returns the parsed document.
     *
     * @param url address of the page to download
     * @return the document jsoup produced for the response
     * @throws IOException if the request fails
     */
    protected Document fetchPage(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(TIMEOUT_MILLIS)
                .get();
    }

    /**
     * Runs the whole crawl and returns every record that was collected.
     *
     * @return the collected records
     * @throws IOException if a page cannot be downloaded
     */
    public abstract List<?> crawl() throws IOException;

    /**
     * Extracts records from one already-parsed page.
     *
     * @param doc the page to read
     * @return the records found on the page
     */
    protected abstract List<?> parsePage(Document doc);

    /**
     * Overload for parsing raw HTML text. The default implementation ignores
     * its argument and returns a new empty list; subclasses may override it.
     *
     * @param html raw page markup
     * @return a new, empty, mutable list
     */
    protected List<?> parsePage(String html) {
        List<Object> nothingParsed = new ArrayList<>();
        return nothingParsed;
    }
}
/**
 * Data holder for one scraped movie: its title, rating text and detail-page URL.
 */
class Movie {
    private String title;
    private String rating;
    private String url;

    /**
     * @param title  movie title
     * @param rating rating as text, exactly as scraped
     * @param url    link to the movie's page
     */
    public Movie(String title, String rating, String url) {
        this.title = title;
        this.rating = rating;
        this.url = url;
    }

    /**
     * Renders the movie as {@code Movie{title='...', rating='...', url='...'}}.
     * A {@code null} field is rendered as the text {@code null}.
     */
    @Override
    public String toString() {
        return String.format("Movie{title='%s', rating='%s', url='%s'}", title, rating, url);
    }
}
/**
 * Data holder for one scraped book: title, author text, rating text and detail-page URL.
 */
class Book {
    private String title;
    private String author;
    private String rating;
    private String url;

    /**
     * @param title  book title
     * @param author author information, exactly as scraped
     * @param rating rating as text, exactly as scraped
     * @param url    link to the book's page
     */
    public Book(String title, String author, String rating, String url) {
        this.title = title;
        this.author = author;
        this.rating = rating;
        this.url = url;
    }

    /**
     * Renders the book as {@code Book{title='...', author='...', rating='...', url='...'}}.
     * A {@code null} field is rendered as the text {@code null}.
     */
    @Override
    public String toString() {
        return String.format(
                "Book{title='%s', author='%s', rating='%s', url='%s'}",
                title, author, rating, url);
    }
}
/**
 * Data holder for one scraped weather reading: the province label, city,
 * temperature text, weather-condition text and the source page URL.
 */
class Weather {
    private String province;
    private String city;
    private String temperature;
    private String weatherCondition;
    private String url;

    /**
     * @param province         province (or region) label for the city
     * @param city             city name
     * @param temperature      temperature text, exactly as scraped
     * @param weatherCondition weather description, exactly as scraped
     * @param url              page the reading was taken from
     */
    public Weather(String province, String city, String temperature, String weatherCondition, String url) {
        this.province = province;
        this.city = city;
        this.temperature = temperature;
        this.weatherCondition = weatherCondition;
        this.url = url;
    }

    /**
     * Renders the reading as
     * {@code Weather{province='...', city='...', temperature='...', weatherCondition='...', url='...'}}.
     * A {@code null} field is rendered as the text {@code null}.
     */
    @Override
    public String toString() {
        return String.format(
                "Weather{province='%s', city='%s', temperature='%s', weatherCondition='%s', url='%s'}",
                province, city, temperature, weatherCondition, url);
    }
}
/**
 * Scraper for the Douban movie Top 250 list.
 *
 * <p>The list is requested in pages of 25 entries via the {@code start} query
 * parameter, with a one-second pause after each page.
 */
class DoubanMovieSpider extends Spider {
    /** Number of entries requested per page. */
    private static final int PAGE_SIZE = 25;

    /** Total number of entries to crawl (10 pages of 25). */
    private static final int TOTAL_ITEMS = 250;

    /** Pause after each page so requests are not sent too quickly. */
    private static final long REQUEST_DELAY_MILLIS = 1000L;

    public DoubanMovieSpider() {
        super("https://movie.douban.com/top250");
    }

    /**
     * Crawls the list page by page.
     *
     * <p>If the thread is interrupted during the pause between pages, the
     * interrupt flag is restored and the crawl stops early, returning the
     * movies collected so far. Previously the loop carried on, and because the
     * flag stayed set every later pause was skipped.
     *
     * @return the movies collected, in page order
     * @throws IOException if a page cannot be downloaded
     */
    @Override
    public List<Movie> crawl() throws IOException {
        List<Movie> allMovies = new ArrayList<>();
        for (int start = 0; start < TOTAL_ITEMS; start += PAGE_SIZE) {
            String pageUrl = baseUrl + "?start=" + start;
            System.out.println("正在爬取第 " + (start / PAGE_SIZE + 1) + " 页...");
            Document doc = fetchPage(pageUrl);
            allMovies.addAll(parsePage(doc));
            try {
                Thread.sleep(REQUEST_DELAY_MILLIS);
            } catch (InterruptedException e) {
                // Honour the cancellation request instead of continuing without delays.
                Thread.currentThread().interrupt();
                break;
            }
        }
        return allMovies;
    }

    /**
     * Extracts the movies from one list page.
     *
     * <p>Each {@code .info} element is one entry. Entries without a
     * {@code .title} element are skipped; a missing rating or link is stored
     * as an empty string.
     *
     * @param doc a parsed list page
     * @return the movies found on the page
     */
    @Override
    protected List<Movie> parsePage(Document doc) {
        List<Movie> movies = new ArrayList<>();
        for (Element item : doc.select(".info")) {
            Element titleElement = item.selectFirst(".title");
            if (titleElement == null) {
                continue;
            }
            String title = titleElement.text().trim();

            Element ratingElement = item.selectFirst(".rating_num");
            String rating = ratingElement != null ? ratingElement.text().trim() : "";

            // The first anchor inside the entry is taken as the link to the movie.
            Element linkElement = item.selectFirst("a");
            String url = linkElement != null ? linkElement.attr("href") : "";

            movies.add(new Movie(title, rating, url));
        }
        return movies;
    }
}
/**
 * Scraper for the first 100 entries of the Douban book Top 250 list.
 *
 * <p>The list is requested in pages of 25 entries via the {@code start} query
 * parameter, with a one-second pause after each page.
 */
class DoubanBookSpider extends Spider {
    /** Number of entries requested per page. */
    private static final int PAGE_SIZE = 25;

    /** Total number of entries to crawl (the first 4 pages). */
    private static final int TOTAL_ITEMS = 100;

    /** Pause after each page so requests are not sent too quickly. */
    private static final long REQUEST_DELAY_MILLIS = 1000L;

    public DoubanBookSpider() {
        super("https://book.douban.com/top250");
    }

    /**
     * Crawls the list page by page.
     *
     * <p>If the thread is interrupted during the pause between pages, the
     * interrupt flag is restored and the crawl stops early, returning the
     * books collected so far. Previously the loop carried on, and because the
     * flag stayed set every later pause was skipped.
     *
     * @return the books collected, in page order
     * @throws IOException if a page cannot be downloaded
     */
    @Override
    public List<Book> crawl() throws IOException {
        List<Book> allBooks = new ArrayList<>();
        for (int start = 0; start < TOTAL_ITEMS; start += PAGE_SIZE) {
            String pageUrl = baseUrl + "?start=" + start;
            System.out.println("正在爬取第 " + (start / PAGE_SIZE + 1) + " 页...");
            Document doc = fetchPage(pageUrl);
            allBooks.addAll(parsePage(doc));
            try {
                Thread.sleep(REQUEST_DELAY_MILLIS);
            } catch (InterruptedException e) {
                // Honour the cancellation request instead of continuing without delays.
                Thread.currentThread().interrupt();
                break;
            }
        }
        return allBooks;
    }

    /**
     * Extracts the books from one list page.
     *
     * <p>Each {@code tr.item} row is one entry. Rows without a {@code .nbg}
     * link are skipped. The title is read from the {@code alt} attribute of
     * the image inside that link; a missing title, author or rating is stored
     * as an empty string.
     *
     * @param doc a parsed list page
     * @return the books found on the page
     */
    @Override
    protected List<Book> parsePage(Document doc) {
        List<Book> books = new ArrayList<>();
        for (Element item : doc.select("tr.item")) {
            Element linkElement = item.selectFirst(".nbg");
            if (linkElement == null) {
                continue;
            }
            String url = linkElement.attr("href");

            Element coverElement = linkElement.selectFirst("img");
            String title = coverElement != null ? coverElement.attr("alt").trim() : "";

            // The first ".pl" element in the row is used as the author information.
            Element authorElement = item.selectFirst(".pl");
            String author = authorElement != null ? authorElement.text().trim() : "";

            Element ratingElement = item.selectFirst(".rating_nums");
            String rating = ratingElement != null ? ratingElement.text().trim() : "";

            books.add(new Book(title, author, rating, url));
        }
        return books;
    }
}
/**
 * Scraper for weather.com.cn that reads the one-day forecast page of a fixed
 * list of major Chinese cities.
 *
 * <p>A failure for one city is reported on {@code System.err} and does not
 * stop the crawl of the remaining cities.
 */
class WeatherSpider extends Spider {
    /**
     * Cities to crawl, as {@code {cityCode, cityName}} pairs. The code and the
     * name live in the same row so the two can never get out of step.
     */
    private static final String[][] CITIES = {
        {"101010100", "北京"},
        {"101020100", "上海"},
        {"101030100", "天津"},
        {"101040100", "重庆"},
        {"101050101", "哈尔滨"},
        {"101060101", "长春"},
        {"101070101", "沈阳"},
        {"101080101", "呼和浩特"},
        {"101090101", "石家庄"},
        {"101100101", "太原"},
        {"101110101", "西安"},
        {"101120101", "济南"},
        {"101130101", "乌鲁木齐"},
        {"101140101", "拉萨"},
        {"101150101", "西宁"},
        {"101160101", "兰州"},
        {"101170101", "银川"},
        {"101180101", "郑州"},
        {"101190101", "南京"},
        {"101200101", "武汉"},
        {"101210101", "杭州"},
        {"101220101", "合肥"},
        {"101230101", "福州"},
        {"101240101", "南昌"},
        {"101250101", "长沙"},
        {"101260101", "贵阳"},
        {"101270101", "成都"},
        {"101280101", "广州"},
        {"101290101", "昆明"},
        {"101300101", "南宁"},
        {"101310101", "海口"},
        {"101320101", "香港"},
        {"101330101", "澳门"},
        {"101340101", "台北"}
    };

    /** Pause after each city so requests are not sent too quickly. */
    private static final long REQUEST_DELAY_MILLIS = 500L;

    public WeatherSpider() {
        super("http://www.weather.com.cn");
    }

    /**
     * Fetches and parses the one-day forecast page of every city in
     * {@link #CITIES}.
     *
     * <p>Download and parsing errors for a single city are logged and that
     * city is skipped. If the thread is interrupted during the pause between
     * cities, the interrupt flag is restored and the crawl stops early with
     * the readings collected so far. Previously the interruption was caught by
     * a generic handler, which lost the interrupt status and kept crawling.
     *
     * @return the readings collected, in the order of {@link #CITIES}
     * @throws IOException declared by the base class; per-city I/O failures
     *                     are handled inside this method
     */
    @Override
    public List<Weather> crawl() throws IOException {
        List<Weather> allWeather = new ArrayList<>();
        System.out.println("正在爬取中国各省份天气数据...");
        for (String[] entry : CITIES) {
            String cityCode = entry[0];
            String cityName = entry[1];
            String province = getProvinceByCity(cityName);
            try {
                String weatherUrl = baseUrl + "/weather1d/" + cityCode + ".shtml";
                Document doc = fetchPage(weatherUrl);
                Weather weather = parseWeatherPage(doc, province, cityName, weatherUrl);
                if (weather != null) {
                    allWeather.add(weather);
                    System.out.println("已获取 " + cityName + " 的天气数据");
                }
                Thread.sleep(REQUEST_DELAY_MILLIS);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller and stop crawling.
                Thread.currentThread().interrupt();
                break;
            } catch (IOException | RuntimeException e) {
                // Best effort: report this city and move on to the next one.
                System.err.println("获取 " + cityName + " 天气数据失败: " + e.getMessage());
            }
        }
        return allWeather;
    }

    /**
     * Maps a city name to the label stored in {@link Weather}'s province
     * field. This is a simplified mapping: the four municipalities map to
     * themselves and every other known city maps to a broad region name.
     *
     * @param city city name, one of the names in {@link #CITIES}
     * @return the label for the city, or {@code "其他"} for an unknown city
     */
    private String getProvinceByCity(String city) {
        switch (city) {
            case "北京":
            case "上海":
            case "天津":
            case "重庆":
                return city;
            case "哈尔滨":
            case "长春":
            case "沈阳":
                return "东北";
            case "呼和浩特":
            case "石家庄":
            case "太原":
                return "华北";
            case "西安":
            case "兰州":
            case "西宁":
            case "银川":
            case "乌鲁木齐":
            case "拉萨":
                return "西北";
            case "济南":
            case "郑州":
            case "南京":
            case "杭州":
            case "合肥":
            case "福州":
                return "华东";
            case "武汉":
            case "长沙":
            case "南昌":
                return "华中";
            case "成都":
            case "贵阳":
            case "昆明":
                return "西南";
            case "广州":
            case "南宁":
            case "海口":
                return "华南";
            case "香港":
            case "澳门":
            case "台北":
                return "港澳台";
            default:
                return "其他";
        }
    }

    /**
     * Builds a {@link Weather} from one forecast page.
     *
     * <p>The temperature is the text of the first {@code .tem} element and the
     * condition is the text of the first {@code .wea} element; either is an
     * empty string when the element is absent.
     *
     * @param doc      parsed forecast page
     * @param province label to store as the province
     * @param city     city name
     * @param url      address the page was downloaded from
     * @return the reading, or {@code null} if reading the page threw an
     *         unexpected runtime exception
     */
    private Weather parseWeatherPage(Document doc, String province, String city, String url) {
        try {
            Element tempElement = doc.selectFirst(".tem");
            String temperature = tempElement != null ? tempElement.text().trim() : "";

            Element weatherElement = doc.selectFirst(".wea");
            String weatherCondition = weatherElement != null ? weatherElement.text().trim() : "";

            return new Weather(province, city, temperature, weatherCondition, url);
        } catch (RuntimeException e) {
            System.err.println("解析 " + city + " 天气数据失败: " + e.getMessage());
            return null;
        }
    }

    /**
     * Not used by this spider: {@link #crawl()} parses each city page through
     * {@code parseWeatherPage} instead.
     *
     * @param doc ignored
     * @return a new empty list
     */
    @Override
    protected List<Weather> parsePage(Document doc) {
        return new ArrayList<>();
    }
}
/**
 * Demo entry point: runs every spider in turn and prints a short preview of
 * what each one collected.
 */
public class SpiderFramework {
    /** Maximum number of records printed per spider. */
    private static final int PREVIEW_LIMIT = 5;

    /**
     * Runs the movie, book and weather spiders one after another.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<Spider> spiders = new ArrayList<>();
        spiders.add(new DoubanMovieSpider());
        spiders.add(new DoubanBookSpider());
        spiders.add(new WeatherSpider());

        for (Spider spider : spiders) {
            runAndReport(spider);
        }
    }

    /**
     * Crawls with one spider and prints the record count followed by the first
     * few records. An {@link IOException} from the crawl is reported on
     * {@code System.err} so the remaining spiders still run.
     */
    private static void runAndReport(Spider spider) {
        try {
            System.out.println("\n爬取 " + spider.getClass().getSimpleName() + " 数据:");
            List<?> data = spider.crawl();
            System.out.println("共爬取 " + data.size() + " 条数据");

            // Show at most PREVIEW_LIMIT records.
            int displayCount = Math.min(PREVIEW_LIMIT, data.size());
            System.out.println("显示前 " + displayCount + " 条:");
            for (Object record : data.subList(0, displayCount)) {
                System.out.println(record);
            }
        } catch (IOException e) {
            System.err.println("爬取失败: " + e.getMessage());
        }
    }
}