1 changed files with 422 additions and 0 deletions
@ -0,0 +1,422 @@ |
|||
import java.io.BufferedReader; |
|||
import java.io.IOException; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
// 爬虫基类
|
|||
abstract class Spider { |
|||
protected String baseUrl; |
|||
|
|||
public Spider(String baseUrl) { |
|||
this.baseUrl = baseUrl; |
|||
} |
|||
|
|||
// 发送HTTP请求获取页面内容
|
|||
protected Document fetchPage(String url) throws IOException { |
|||
return Jsoup.connect(url) |
|||
.userAgent("Mozilla/5.0") |
|||
.timeout(10000) |
|||
.get(); |
|||
} |
|||
|
|||
// 抽象方法:爬取数据
|
|||
public abstract List<?> crawl() throws IOException; |
|||
|
|||
// 抽象方法:解析页面
|
|||
protected abstract List<?> parsePage(Document doc); |
|||
|
|||
// 重载方法:解析页面(用于原生HTML解析)
|
|||
protected List<?> parsePage(String html) { |
|||
// 默认实现,子类可以选择性重写
|
|||
return new ArrayList<>(); |
|||
} |
|||
} |
|||
|
|||
// Movie data model: one entry from the Douban Top 250 list.
class Movie {
    // Fields are final: instances are immutable value holders built once by the spider.
    private final String title;   // movie title as shown on the list page
    private final String rating;  // rating as scraped text, e.g. "9.7"
    private final String url;     // detail-page URL

    public Movie(String title, String rating, String url) {
        this.title = title;
        this.rating = rating;
        this.url = url;
    }

    // Accessors added so consumers can read the data instead of parsing toString().
    public String getTitle() {
        return title;
    }

    public String getRating() {
        return rating;
    }

    public String getUrl() {
        return url;
    }

    @Override
    public String toString() {
        return "Movie{" +
                "title='" + title + '\'' +
                ", rating='" + rating + '\'' +
                ", url='" + url + '\'' +
                '}';
    }
}
|||
|
|||
// Book data model: one entry from the Douban Books Top 250 list.
class Book {
    // Fields are final: instances are immutable value holders built once by the spider.
    private final String title;   // book title (taken from the cover image's alt text)
    private final String author;  // raw author/publisher line, e.g. "作者 / 出版社 / 年份"
    private final String rating;  // rating as scraped text, e.g. "9.0"
    private final String url;     // detail-page URL

    public Book(String title, String author, String rating, String url) {
        this.title = title;
        this.author = author;
        this.rating = rating;
        this.url = url;
    }

    // Accessors added so consumers can read the data instead of parsing toString().
    public String getTitle() {
        return title;
    }

    public String getAuthor() {
        return author;
    }

    public String getRating() {
        return rating;
    }

    public String getUrl() {
        return url;
    }

    @Override
    public String toString() {
        return "Book{" +
                "title='" + title + '\'' +
                ", author='" + author + '\'' +
                ", rating='" + rating + '\'' +
                ", url='" + url + '\'' +
                '}';
    }
}
|||
|
|||
// Weather data model: one city's conditions scraped from weather.com.cn.
class Weather {
    // Fields are final: instances are immutable value holders built once by the spider.
    private final String province;          // coarse region label (see WeatherSpider.getProvinceByCity)
    private final String city;              // city name
    private final String temperature;       // temperature as scraped text
    private final String weatherCondition;  // condition as scraped text, e.g. sunny/cloudy
    private final String url;               // source page URL

    public Weather(String province, String city, String temperature, String weatherCondition, String url) {
        this.province = province;
        this.city = city;
        this.temperature = temperature;
        this.weatherCondition = weatherCondition;
        this.url = url;
    }

    // Accessors added so consumers can read the data instead of parsing toString().
    public String getProvince() {
        return province;
    }

    public String getCity() {
        return city;
    }

    public String getTemperature() {
        return temperature;
    }

    public String getWeatherCondition() {
        return weatherCondition;
    }

    public String getUrl() {
        return url;
    }

    @Override
    public String toString() {
        return "Weather{" +
                "province='" + province + '\'' +
                ", city='" + city + '\'' +
                ", temperature='" + temperature + '\'' +
                ", weatherCondition='" + weatherCondition + '\'' +
                ", url='" + url + '\'' +
                '}';
    }
}
|||
|
|||
// 豆瓣电影爬虫 - 爬取Top250
|
|||
class DoubanMovieSpider extends Spider { |
|||
public DoubanMovieSpider() { |
|||
super("https://movie.douban.com/top250"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Movie> crawl() throws IOException { |
|||
List<Movie> allMovies = new ArrayList<>(); |
|||
|
|||
// 豆瓣Top250有10页,每页25部电影
|
|||
for (int start = 0; start < 250; start += 25) { |
|||
String pageUrl = baseUrl + "?start=" + start; |
|||
System.out.println("正在爬取第 " + (start / 25 + 1) + " 页..."); |
|||
Document doc = fetchPage(pageUrl); |
|||
List<Movie> movies = parsePage(doc); |
|||
allMovies.addAll(movies); |
|||
|
|||
// 添加延迟,避免请求过快
|
|||
try { |
|||
Thread.sleep(1000); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
} |
|||
} |
|||
|
|||
return allMovies; |
|||
} |
|||
|
|||
@Override |
|||
protected List<Movie> parsePage(Document doc) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
|
|||
// 使用Jsoup选择器解析页面
|
|||
Elements movieItems = doc.select(".info"); |
|||
for (Element item : movieItems) { |
|||
// 提取标题
|
|||
Element titleElement = item.selectFirst(".title"); |
|||
if (titleElement != null) { |
|||
String title = titleElement.text().trim(); |
|||
|
|||
// 提取评分
|
|||
Element ratingElement = item.selectFirst(".rating_num"); |
|||
String rating = ""; |
|||
if (ratingElement != null) { |
|||
rating = ratingElement.text().trim(); |
|||
} |
|||
|
|||
// 提取链接
|
|||
Element linkElement = item.selectFirst("a"); |
|||
String url = ""; |
|||
if (linkElement != null) { |
|||
url = linkElement.attr("href"); |
|||
} |
|||
|
|||
movies.add(new Movie(title, rating, url)); |
|||
} |
|||
} |
|||
|
|||
return movies; |
|||
} |
|||
} |
|||
|
|||
// 豆瓣读书爬虫 - 爬取评分前100
|
|||
class DoubanBookSpider extends Spider { |
|||
public DoubanBookSpider() { |
|||
super("https://book.douban.com/top250"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Book> crawl() throws IOException { |
|||
List<Book> allBooks = new ArrayList<>(); |
|||
|
|||
// 豆瓣读书Top250,每页25本,爬取前4页(100本)
|
|||
for (int start = 0; start < 100; start += 25) { |
|||
String pageUrl = baseUrl + "?start=" + start; |
|||
System.out.println("正在爬取第 " + (start / 25 + 1) + " 页..."); |
|||
Document doc = fetchPage(pageUrl); |
|||
List<Book> books = parsePage(doc); |
|||
allBooks.addAll(books); |
|||
|
|||
// 添加延迟,避免请求过快
|
|||
try { |
|||
Thread.sleep(1000); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
} |
|||
} |
|||
|
|||
return allBooks; |
|||
} |
|||
|
|||
@Override |
|||
protected List<Book> parsePage(Document doc) { |
|||
List<Book> books = new ArrayList<>(); |
|||
|
|||
// 使用Jsoup选择器解析页面
|
|||
Elements bookItems = doc.select("tr.item"); |
|||
for (Element item : bookItems) { |
|||
// 提取链接
|
|||
Element linkElement = item.selectFirst(".nbg"); |
|||
if (linkElement != null) { |
|||
String url = linkElement.attr("href"); |
|||
|
|||
// 提取标题
|
|||
Element titleElement = linkElement.selectFirst("img"); |
|||
String title = ""; |
|||
if (titleElement != null) { |
|||
title = titleElement.attr("alt").trim(); |
|||
} |
|||
|
|||
// 提取作者信息
|
|||
Element authorElement = item.selectFirst(".pl"); |
|||
String author = ""; |
|||
if (authorElement != null) { |
|||
author = authorElement.text().trim(); |
|||
} |
|||
|
|||
// 提取评分
|
|||
Element ratingElement = item.selectFirst(".rating_nums"); |
|||
String rating = ""; |
|||
if (ratingElement != null) { |
|||
rating = ratingElement.text().trim(); |
|||
} |
|||
|
|||
books.add(new Book(title, author, rating, url)); |
|||
} |
|||
} |
|||
|
|||
return books; |
|||
} |
|||
} |
|||
|
|||
// 中国天气网爬虫 - 爬取各省份天气
|
|||
class WeatherSpider extends Spider { |
|||
// 中国各省份代码映射
|
|||
private static final String[] PROVINCES = { |
|||
"北京", "上海", "天津", "重庆", "河北", "山西", "辽宁", "吉林", "黑龙江", |
|||
"江苏", "浙江", "安徽", "福建", "江西", "山东", "河南", "湖北", "湖南", |
|||
"广东", "海南", "四川", "贵州", "云南", "陕西", "甘肃", "青海", "台湾", |
|||
"内蒙古", "广西", "西藏", "宁夏", "新疆", "香港", "澳门" |
|||
}; |
|||
|
|||
public WeatherSpider() { |
|||
super("http://www.weather.com.cn"); |
|||
} |
|||
|
|||
@Override |
|||
public List<Weather> crawl() throws IOException { |
|||
List<Weather> allWeather = new ArrayList<>(); |
|||
|
|||
System.out.println("正在爬取中国各省份天气数据..."); |
|||
|
|||
// 爬取主要城市天气(使用中国天气网的API)
|
|||
String[] majorCities = { |
|||
"101010100", // 北京
|
|||
"101020100", // 上海
|
|||
"101030100", // 天津
|
|||
"101040100", // 重庆
|
|||
"101050101", // 哈尔滨
|
|||
"101060101", // 长春
|
|||
"101070101", // 沈阳
|
|||
"101080101", // 呼和浩特
|
|||
"101090101", // 石家庄
|
|||
"101100101", // 太原
|
|||
"101110101", // 西安
|
|||
"101120101", // 济南
|
|||
"101130101", // 乌鲁木齐
|
|||
"101140101", // 拉萨
|
|||
"101150101", // 西宁
|
|||
"101160101", // 兰州
|
|||
"101170101", // 银川
|
|||
"101180101", // 郑州
|
|||
"101190101", // 南京
|
|||
"101200101", // 武汉
|
|||
"101210101", // 杭州
|
|||
"101220101", // 合肥
|
|||
"101230101", // 福州
|
|||
"101240101", // 南昌
|
|||
"101250101", // 长沙
|
|||
"101260101", // 贵阳
|
|||
"101270101", // 成都
|
|||
"101280101", // 广州
|
|||
"101290101", // 昆明
|
|||
"101300101", // 南宁
|
|||
"101310101", // 海口
|
|||
"101320101", // 香港
|
|||
"101330101", // 澳门
|
|||
"101340101" // 台北
|
|||
}; |
|||
|
|||
String[] cityNames = { |
|||
"北京", "上海", "天津", "重庆", "哈尔滨", "长春", "沈阳", "呼和浩特", |
|||
"石家庄", "太原", "西安", "济南", "乌鲁木齐", "拉萨", "西宁", "兰州", |
|||
"银川", "郑州", "南京", "武汉", "杭州", "合肥", "福州", "南昌", |
|||
"长沙", "贵阳", "成都", "广州", "昆明", "南宁", "海口", "香港", "澳门", "台北" |
|||
}; |
|||
|
|||
for (int i = 0; i < majorCities.length; i++) { |
|||
String cityCode = majorCities[i]; |
|||
String cityName = cityNames[i]; |
|||
String province = getProvinceByCity(cityName); |
|||
|
|||
try { |
|||
String weatherUrl = "http://www.weather.com.cn/weather1d/" + cityCode + ".shtml"; |
|||
Document doc = fetchPage(weatherUrl); |
|||
Weather weather = parseWeatherPage(doc, province, cityName, weatherUrl); |
|||
if (weather != null) { |
|||
allWeather.add(weather); |
|||
System.out.println("已获取 " + cityName + " 的天气数据"); |
|||
} |
|||
|
|||
// 添加延迟,避免请求过快
|
|||
Thread.sleep(500); |
|||
} catch (Exception e) { |
|||
System.err.println("获取 " + cityName + " 天气数据失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
return allWeather; |
|||
} |
|||
|
|||
private String getProvinceByCity(String city) { |
|||
// 简化的省份映射
|
|||
if (city.equals("北京") || city.equals("上海") || city.equals("天津") || city.equals("重庆")) { |
|||
return city; |
|||
} else if (city.equals("哈尔滨") || city.equals("长春") || city.equals("沈阳")) { |
|||
return "东北"; |
|||
} else if (city.equals("呼和浩特") || city.equals("石家庄") || city.equals("太原")) { |
|||
return "华北"; |
|||
} else if (city.equals("西安") || city.equals("兰州") || city.equals("西宁") || city.equals("银川")) { |
|||
return "西北"; |
|||
} else if (city.equals("济南") || city.equals("郑州") || city.equals("南京")) { |
|||
return "华东"; |
|||
} else if (city.equals("武汉") || city.equals("长沙") || city.equals("南昌")) { |
|||
return "华中"; |
|||
} else if (city.equals("杭州") || city.equals("合肥") || city.equals("福州")) { |
|||
return "华东"; |
|||
} else if (city.equals("成都") || city.equals("贵阳") || city.equals("昆明")) { |
|||
return "西南"; |
|||
} else if (city.equals("广州") || city.equals("南宁") || city.equals("海口")) { |
|||
return "华南"; |
|||
} else if (city.equals("乌鲁木齐") || city.equals("拉萨")) { |
|||
return "西北"; |
|||
} else if (city.equals("香港") || city.equals("澳门") || city.equals("台北")) { |
|||
return "港澳台"; |
|||
} |
|||
return "其他"; |
|||
} |
|||
|
|||
private Weather parseWeatherPage(Document doc, String province, String city, String url) { |
|||
try { |
|||
// 提取温度
|
|||
Element tempElement = doc.selectFirst(".tem"); |
|||
String temperature = ""; |
|||
if (tempElement != null) { |
|||
temperature = tempElement.text().trim(); |
|||
} |
|||
|
|||
// 提取天气状况
|
|||
Element weatherElement = doc.selectFirst(".wea"); |
|||
String weatherCondition = ""; |
|||
if (weatherElement != null) { |
|||
weatherCondition = weatherElement.text().trim(); |
|||
} |
|||
|
|||
return new Weather(province, city, temperature, weatherCondition, url); |
|||
} catch (Exception e) { |
|||
System.err.println("解析 " + city + " 天气数据失败: " + e.getMessage()); |
|||
return null; |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
protected List<Weather> parsePage(Document doc) { |
|||
// 这个方法在crawl中被直接调用,不需要实现
|
|||
return new ArrayList<>(); |
|||
} |
|||
} |
|||
|
|||
// 测试类
|
|||
public class SpiderFramework { |
|||
public static void main(String[] args) { |
|||
List<Spider> spiders = new ArrayList<>(); |
|||
spiders.add(new DoubanMovieSpider()); |
|||
spiders.add(new DoubanBookSpider()); |
|||
spiders.add(new WeatherSpider()); |
|||
|
|||
for (Spider spider : spiders) { |
|||
try { |
|||
System.out.println("\n爬取 " + spider.getClass().getSimpleName() + " 数据:"); |
|||
List<?> data = spider.crawl(); |
|||
System.out.println("共爬取 " + data.size() + " 条数据"); |
|||
|
|||
// 只显示前5条数据
|
|||
int displayCount = Math.min(5, data.size()); |
|||
System.out.println("显示前 " + displayCount + " 条:"); |
|||
for (int i = 0; i < displayCount; i++) { |
|||
System.out.println(data.get(i)); |
|||
} |
|||
} catch (IOException e) { |
|||
System.err.println("爬取失败: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
Loading…
Reference in new issue