1 changed files with 422 additions and 0 deletions
@ -0,0 +1,422 @@ |
|||||
|
import java.io.BufferedReader; |
||||
|
import java.io.IOException; |
||||
|
import java.io.InputStreamReader; |
||||
|
import java.net.HttpURLConnection; |
||||
|
import java.net.URL; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
// 爬虫基类
|
||||
|
abstract class Spider { |
||||
|
protected String baseUrl; |
||||
|
|
||||
|
public Spider(String baseUrl) { |
||||
|
this.baseUrl = baseUrl; |
||||
|
} |
||||
|
|
||||
|
// 发送HTTP请求获取页面内容
|
||||
|
protected Document fetchPage(String url) throws IOException { |
||||
|
return Jsoup.connect(url) |
||||
|
.userAgent("Mozilla/5.0") |
||||
|
.timeout(10000) |
||||
|
.get(); |
||||
|
} |
||||
|
|
||||
|
// 抽象方法:爬取数据
|
||||
|
public abstract List<?> crawl() throws IOException; |
||||
|
|
||||
|
// 抽象方法:解析页面
|
||||
|
protected abstract List<?> parsePage(Document doc); |
||||
|
|
||||
|
// 重载方法:解析页面(用于原生HTML解析)
|
||||
|
protected List<?> parsePage(String html) { |
||||
|
// 默认实现,子类可以选择性重写
|
||||
|
return new ArrayList<>(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 电影数据模型
|
||||
|
// Immutable data model for one movie scraped from Douban Top250.
// Fix: the original fields were write-only (no accessors), so scraped data
// could only be printed, never consumed; fields are now final with getters.
class Movie {
    private final String title;   // movie title text
    private final String rating;  // rating as text, e.g. "9.3"; may be empty
    private final String url;     // detail-page URL; may be empty

    public Movie(String title, String rating, String url) {
        this.title = title;
        this.rating = rating;
        this.url = url;
    }

    /** @return the movie title */
    public String getTitle() {
        return title;
    }

    /** @return the rating text (possibly empty) */
    public String getRating() {
        return rating;
    }

    /** @return the detail-page URL (possibly empty) */
    public String getUrl() {
        return url;
    }

    @Override
    public String toString() {
        return "Movie{" +
                "title='" + title + '\'' +
                ", rating='" + rating + '\'' +
                ", url='" + url + '\'' +
                '}';
    }
}
||||
|
|
||||
|
// 书籍数据模型
|
||||
|
// Immutable data model for one book scraped from Douban Books Top250.
// Fix: the original fields were write-only (no accessors), so scraped data
// could only be printed, never consumed; fields are now final with getters.
class Book {
    private final String title;   // book title text
    private final String author;  // author/publisher info line; may be empty
    private final String rating;  // rating as text, e.g. "8.8"; may be empty
    private final String url;     // detail-page URL

    public Book(String title, String author, String rating, String url) {
        this.title = title;
        this.author = author;
        this.rating = rating;
        this.url = url;
    }

    /** @return the book title */
    public String getTitle() {
        return title;
    }

    /** @return the author/publisher info line (possibly empty) */
    public String getAuthor() {
        return author;
    }

    /** @return the rating text (possibly empty) */
    public String getRating() {
        return rating;
    }

    /** @return the detail-page URL */
    public String getUrl() {
        return url;
    }

    @Override
    public String toString() {
        return "Book{" +
                "title='" + title + '\'' +
                ", author='" + author + '\'' +
                ", rating='" + rating + '\'' +
                ", url='" + url + '\'' +
                '}';
    }
}
||||
|
|
||||
|
// 天气数据模型
|
||||
|
// Immutable data model for one city's weather snapshot.
// Fix: the original fields were write-only (no accessors), so scraped data
// could only be printed, never consumed; fields are now final with getters.
class Weather {
    private final String province;          // coarse region label (e.g. "华东")
    private final String city;              // city name
    private final String temperature;       // temperature text as scraped; may be empty
    private final String weatherCondition;  // condition text as scraped; may be empty
    private final String url;               // source page URL

    public Weather(String province, String city, String temperature, String weatherCondition, String url) {
        this.province = province;
        this.city = city;
        this.temperature = temperature;
        this.weatherCondition = weatherCondition;
        this.url = url;
    }

    /** @return the region/province label */
    public String getProvince() {
        return province;
    }

    /** @return the city name */
    public String getCity() {
        return city;
    }

    /** @return the temperature text (possibly empty) */
    public String getTemperature() {
        return temperature;
    }

    /** @return the weather condition text (possibly empty) */
    public String getWeatherCondition() {
        return weatherCondition;
    }

    /** @return the source page URL */
    public String getUrl() {
        return url;
    }

    @Override
    public String toString() {
        return "Weather{" +
                "province='" + province + '\'' +
                ", city='" + city + '\'' +
                ", temperature='" + temperature + '\'' +
                ", weatherCondition='" + weatherCondition + '\'' +
                ", url='" + url + '\'' +
                '}';
    }
}
||||
|
|
||||
|
// 豆瓣电影爬虫 - 爬取Top250
|
||||
|
class DoubanMovieSpider extends Spider { |
||||
|
public DoubanMovieSpider() { |
||||
|
super("https://movie.douban.com/top250"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> crawl() throws IOException { |
||||
|
List<Movie> allMovies = new ArrayList<>(); |
||||
|
|
||||
|
// 豆瓣Top250有10页,每页25部电影
|
||||
|
for (int start = 0; start < 250; start += 25) { |
||||
|
String pageUrl = baseUrl + "?start=" + start; |
||||
|
System.out.println("正在爬取第 " + (start / 25 + 1) + " 页..."); |
||||
|
Document doc = fetchPage(pageUrl); |
||||
|
List<Movie> movies = parsePage(doc); |
||||
|
allMovies.addAll(movies); |
||||
|
|
||||
|
// 添加延迟,避免请求过快
|
||||
|
try { |
||||
|
Thread.sleep(1000); |
||||
|
} catch (InterruptedException e) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return allMovies; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Movie> parsePage(Document doc) { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
|
||||
|
// 使用Jsoup选择器解析页面
|
||||
|
Elements movieItems = doc.select(".info"); |
||||
|
for (Element item : movieItems) { |
||||
|
// 提取标题
|
||||
|
Element titleElement = item.selectFirst(".title"); |
||||
|
if (titleElement != null) { |
||||
|
String title = titleElement.text().trim(); |
||||
|
|
||||
|
// 提取评分
|
||||
|
Element ratingElement = item.selectFirst(".rating_num"); |
||||
|
String rating = ""; |
||||
|
if (ratingElement != null) { |
||||
|
rating = ratingElement.text().trim(); |
||||
|
} |
||||
|
|
||||
|
// 提取链接
|
||||
|
Element linkElement = item.selectFirst("a"); |
||||
|
String url = ""; |
||||
|
if (linkElement != null) { |
||||
|
url = linkElement.attr("href"); |
||||
|
} |
||||
|
|
||||
|
movies.add(new Movie(title, rating, url)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return movies; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 豆瓣读书爬虫 - 爬取评分前100
|
||||
|
class DoubanBookSpider extends Spider { |
||||
|
public DoubanBookSpider() { |
||||
|
super("https://book.douban.com/top250"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Book> crawl() throws IOException { |
||||
|
List<Book> allBooks = new ArrayList<>(); |
||||
|
|
||||
|
// 豆瓣读书Top250,每页25本,爬取前4页(100本)
|
||||
|
for (int start = 0; start < 100; start += 25) { |
||||
|
String pageUrl = baseUrl + "?start=" + start; |
||||
|
System.out.println("正在爬取第 " + (start / 25 + 1) + " 页..."); |
||||
|
Document doc = fetchPage(pageUrl); |
||||
|
List<Book> books = parsePage(doc); |
||||
|
allBooks.addAll(books); |
||||
|
|
||||
|
// 添加延迟,避免请求过快
|
||||
|
try { |
||||
|
Thread.sleep(1000); |
||||
|
} catch (InterruptedException e) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return allBooks; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Book> parsePage(Document doc) { |
||||
|
List<Book> books = new ArrayList<>(); |
||||
|
|
||||
|
// 使用Jsoup选择器解析页面
|
||||
|
Elements bookItems = doc.select("tr.item"); |
||||
|
for (Element item : bookItems) { |
||||
|
// 提取链接
|
||||
|
Element linkElement = item.selectFirst(".nbg"); |
||||
|
if (linkElement != null) { |
||||
|
String url = linkElement.attr("href"); |
||||
|
|
||||
|
// 提取标题
|
||||
|
Element titleElement = linkElement.selectFirst("img"); |
||||
|
String title = ""; |
||||
|
if (titleElement != null) { |
||||
|
title = titleElement.attr("alt").trim(); |
||||
|
} |
||||
|
|
||||
|
// 提取作者信息
|
||||
|
Element authorElement = item.selectFirst(".pl"); |
||||
|
String author = ""; |
||||
|
if (authorElement != null) { |
||||
|
author = authorElement.text().trim(); |
||||
|
} |
||||
|
|
||||
|
// 提取评分
|
||||
|
Element ratingElement = item.selectFirst(".rating_nums"); |
||||
|
String rating = ""; |
||||
|
if (ratingElement != null) { |
||||
|
rating = ratingElement.text().trim(); |
||||
|
} |
||||
|
|
||||
|
books.add(new Book(title, author, rating, url)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return books; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 中国天气网爬虫 - 爬取各省份天气
|
||||
|
class WeatherSpider extends Spider { |
||||
|
// 中国各省份代码映射
|
||||
|
private static final String[] PROVINCES = { |
||||
|
"北京", "上海", "天津", "重庆", "河北", "山西", "辽宁", "吉林", "黑龙江", |
||||
|
"江苏", "浙江", "安徽", "福建", "江西", "山东", "河南", "湖北", "湖南", |
||||
|
"广东", "海南", "四川", "贵州", "云南", "陕西", "甘肃", "青海", "台湾", |
||||
|
"内蒙古", "广西", "西藏", "宁夏", "新疆", "香港", "澳门" |
||||
|
}; |
||||
|
|
||||
|
public WeatherSpider() { |
||||
|
super("http://www.weather.com.cn"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Weather> crawl() throws IOException { |
||||
|
List<Weather> allWeather = new ArrayList<>(); |
||||
|
|
||||
|
System.out.println("正在爬取中国各省份天气数据..."); |
||||
|
|
||||
|
// 爬取主要城市天气(使用中国天气网的API)
|
||||
|
String[] majorCities = { |
||||
|
"101010100", // 北京
|
||||
|
"101020100", // 上海
|
||||
|
"101030100", // 天津
|
||||
|
"101040100", // 重庆
|
||||
|
"101050101", // 哈尔滨
|
||||
|
"101060101", // 长春
|
||||
|
"101070101", // 沈阳
|
||||
|
"101080101", // 呼和浩特
|
||||
|
"101090101", // 石家庄
|
||||
|
"101100101", // 太原
|
||||
|
"101110101", // 西安
|
||||
|
"101120101", // 济南
|
||||
|
"101130101", // 乌鲁木齐
|
||||
|
"101140101", // 拉萨
|
||||
|
"101150101", // 西宁
|
||||
|
"101160101", // 兰州
|
||||
|
"101170101", // 银川
|
||||
|
"101180101", // 郑州
|
||||
|
"101190101", // 南京
|
||||
|
"101200101", // 武汉
|
||||
|
"101210101", // 杭州
|
||||
|
"101220101", // 合肥
|
||||
|
"101230101", // 福州
|
||||
|
"101240101", // 南昌
|
||||
|
"101250101", // 长沙
|
||||
|
"101260101", // 贵阳
|
||||
|
"101270101", // 成都
|
||||
|
"101280101", // 广州
|
||||
|
"101290101", // 昆明
|
||||
|
"101300101", // 南宁
|
||||
|
"101310101", // 海口
|
||||
|
"101320101", // 香港
|
||||
|
"101330101", // 澳门
|
||||
|
"101340101" // 台北
|
||||
|
}; |
||||
|
|
||||
|
String[] cityNames = { |
||||
|
"北京", "上海", "天津", "重庆", "哈尔滨", "长春", "沈阳", "呼和浩特", |
||||
|
"石家庄", "太原", "西安", "济南", "乌鲁木齐", "拉萨", "西宁", "兰州", |
||||
|
"银川", "郑州", "南京", "武汉", "杭州", "合肥", "福州", "南昌", |
||||
|
"长沙", "贵阳", "成都", "广州", "昆明", "南宁", "海口", "香港", "澳门", "台北" |
||||
|
}; |
||||
|
|
||||
|
for (int i = 0; i < majorCities.length; i++) { |
||||
|
String cityCode = majorCities[i]; |
||||
|
String cityName = cityNames[i]; |
||||
|
String province = getProvinceByCity(cityName); |
||||
|
|
||||
|
try { |
||||
|
String weatherUrl = "http://www.weather.com.cn/weather1d/" + cityCode + ".shtml"; |
||||
|
Document doc = fetchPage(weatherUrl); |
||||
|
Weather weather = parseWeatherPage(doc, province, cityName, weatherUrl); |
||||
|
if (weather != null) { |
||||
|
allWeather.add(weather); |
||||
|
System.out.println("已获取 " + cityName + " 的天气数据"); |
||||
|
} |
||||
|
|
||||
|
// 添加延迟,避免请求过快
|
||||
|
Thread.sleep(500); |
||||
|
} catch (Exception e) { |
||||
|
System.err.println("获取 " + cityName + " 天气数据失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return allWeather; |
||||
|
} |
||||
|
|
||||
|
private String getProvinceByCity(String city) { |
||||
|
// 简化的省份映射
|
||||
|
if (city.equals("北京") || city.equals("上海") || city.equals("天津") || city.equals("重庆")) { |
||||
|
return city; |
||||
|
} else if (city.equals("哈尔滨") || city.equals("长春") || city.equals("沈阳")) { |
||||
|
return "东北"; |
||||
|
} else if (city.equals("呼和浩特") || city.equals("石家庄") || city.equals("太原")) { |
||||
|
return "华北"; |
||||
|
} else if (city.equals("西安") || city.equals("兰州") || city.equals("西宁") || city.equals("银川")) { |
||||
|
return "西北"; |
||||
|
} else if (city.equals("济南") || city.equals("郑州") || city.equals("南京")) { |
||||
|
return "华东"; |
||||
|
} else if (city.equals("武汉") || city.equals("长沙") || city.equals("南昌")) { |
||||
|
return "华中"; |
||||
|
} else if (city.equals("杭州") || city.equals("合肥") || city.equals("福州")) { |
||||
|
return "华东"; |
||||
|
} else if (city.equals("成都") || city.equals("贵阳") || city.equals("昆明")) { |
||||
|
return "西南"; |
||||
|
} else if (city.equals("广州") || city.equals("南宁") || city.equals("海口")) { |
||||
|
return "华南"; |
||||
|
} else if (city.equals("乌鲁木齐") || city.equals("拉萨")) { |
||||
|
return "西北"; |
||||
|
} else if (city.equals("香港") || city.equals("澳门") || city.equals("台北")) { |
||||
|
return "港澳台"; |
||||
|
} |
||||
|
return "其他"; |
||||
|
} |
||||
|
|
||||
|
private Weather parseWeatherPage(Document doc, String province, String city, String url) { |
||||
|
try { |
||||
|
// 提取温度
|
||||
|
Element tempElement = doc.selectFirst(".tem"); |
||||
|
String temperature = ""; |
||||
|
if (tempElement != null) { |
||||
|
temperature = tempElement.text().trim(); |
||||
|
} |
||||
|
|
||||
|
// 提取天气状况
|
||||
|
Element weatherElement = doc.selectFirst(".wea"); |
||||
|
String weatherCondition = ""; |
||||
|
if (weatherElement != null) { |
||||
|
weatherCondition = weatherElement.text().trim(); |
||||
|
} |
||||
|
|
||||
|
return new Weather(province, city, temperature, weatherCondition, url); |
||||
|
} catch (Exception e) { |
||||
|
System.err.println("解析 " + city + " 天气数据失败: " + e.getMessage()); |
||||
|
return null; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<Weather> parsePage(Document doc) { |
||||
|
// 这个方法在crawl中被直接调用,不需要实现
|
||||
|
return new ArrayList<>(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 测试类
|
||||
|
public class SpiderFramework { |
||||
|
public static void main(String[] args) { |
||||
|
List<Spider> spiders = new ArrayList<>(); |
||||
|
spiders.add(new DoubanMovieSpider()); |
||||
|
spiders.add(new DoubanBookSpider()); |
||||
|
spiders.add(new WeatherSpider()); |
||||
|
|
||||
|
for (Spider spider : spiders) { |
||||
|
try { |
||||
|
System.out.println("\n爬取 " + spider.getClass().getSimpleName() + " 数据:"); |
||||
|
List<?> data = spider.crawl(); |
||||
|
System.out.println("共爬取 " + data.size() + " 条数据"); |
||||
|
|
||||
|
// 只显示前5条数据
|
||||
|
int displayCount = Math.min(5, data.size()); |
||||
|
System.out.println("显示前 " + displayCount + " 条:"); |
||||
|
for (int i = 0; i < displayCount; i++) { |
||||
|
System.out.println(data.get(i)); |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
System.err.println("爬取失败: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue