diff --git a/W11/CrawlerMain2/.gitignore b/W11/CrawlerMain2/.gitignore new file mode 100644 index 0000000..f68d109 --- /dev/null +++ b/W11/CrawlerMain2/.gitignore @@ -0,0 +1,29 @@ +### IntelliJ IDEA ### +out/ +!**/src/main/**/out/ +!**/src/test/**/out/ + +### Eclipse ### +.apt_generated +.classpath +.factorypath +.project +.settings +.springBeans +.sts4-cache +bin/ +!**/src/main/**/bin/ +!**/src/test/**/bin/ + +### NetBeans ### +/nbproject/private/ +/nbbuild/ +/dist/ +/nbdist/ +/.nb-gradle/ + +### VS Code ### +.vscode/ + +### Mac OS ### +.DS_Store \ No newline at end of file diff --git a/W11/CrawlerMain2/.idea/.gitignore b/W11/CrawlerMain2/.idea/.gitignore new file mode 100644 index 0000000..7d05e99 --- /dev/null +++ b/W11/CrawlerMain2/.idea/.gitignore @@ -0,0 +1,10 @@ +# 默认忽略的文件 +/shelf/ +/workspace.xml +# 基于编辑器的 HTTP 客户端请求 +/httpRequests/ +# 依赖于环境的 Maven 主目录路径 +/mavenHomeManager.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/W11/CrawlerMain2/.idea/libraries/jcommon_1_0_24.xml b/W11/CrawlerMain2/.idea/libraries/jcommon_1_0_24.xml new file mode 100644 index 0000000..cef0a8d --- /dev/null +++ b/W11/CrawlerMain2/.idea/libraries/jcommon_1_0_24.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/W11/CrawlerMain2/.idea/libraries/jfreechart_1_5_3.xml b/W11/CrawlerMain2/.idea/libraries/jfreechart_1_5_3.xml new file mode 100644 index 0000000..6fdf9d7 --- /dev/null +++ b/W11/CrawlerMain2/.idea/libraries/jfreechart_1_5_3.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/W11/CrawlerMain2/.idea/libraries/jsoup_1_17_2.xml b/W11/CrawlerMain2/.idea/libraries/jsoup_1_17_2.xml new file mode 100644 index 0000000..90ce41d --- /dev/null +++ b/W11/CrawlerMain2/.idea/libraries/jsoup_1_17_2.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/W11/CrawlerMain2/.idea/libraries/kumo_core_1_12.xml b/W11/CrawlerMain2/.idea/libraries/kumo_core_1_12.xml new file 
mode 100644 index 0000000..c74069d --- /dev/null +++ b/W11/CrawlerMain2/.idea/libraries/kumo_core_1_12.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/W11/CrawlerMain2/.idea/libraries/logback_classic_1_4_11.xml b/W11/CrawlerMain2/.idea/libraries/logback_classic_1_4_11.xml new file mode 100644 index 0000000..54a73cf --- /dev/null +++ b/W11/CrawlerMain2/.idea/libraries/logback_classic_1_4_11.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/W11/CrawlerMain2/.idea/libraries/logback_core_1_4_11.xml b/W11/CrawlerMain2/.idea/libraries/logback_core_1_4_11.xml new file mode 100644 index 0000000..fbdb3a1 --- /dev/null +++ b/W11/CrawlerMain2/.idea/libraries/logback_core_1_4_11.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/W11/CrawlerMain2/.idea/libraries/slf4j_api_2_0_9.xml b/W11/CrawlerMain2/.idea/libraries/slf4j_api_2_0_9.xml new file mode 100644 index 0000000..7c49634 --- /dev/null +++ b/W11/CrawlerMain2/.idea/libraries/slf4j_api_2_0_9.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/W11/CrawlerMain2/.idea/misc.xml b/W11/CrawlerMain2/.idea/misc.xml new file mode 100644 index 0000000..3653b1f --- /dev/null +++ b/W11/CrawlerMain2/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/W11/CrawlerMain2/.idea/modules.xml b/W11/CrawlerMain2/.idea/modules.xml new file mode 100644 index 0000000..8824534 --- /dev/null +++ b/W11/CrawlerMain2/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/W11/CrawlerMain2/CrawlerMain2.iml b/W11/CrawlerMain2/CrawlerMain2.iml new file mode 100644 index 0000000..e0317a0 --- /dev/null +++ b/W11/CrawlerMain2/CrawlerMain2.iml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/W11/CrawlerMain2/src/CrawlerMain.java b/W11/CrawlerMain2/src/CrawlerMain.java new file mode 100644 index 0000000..70eb78a --- /dev/null +++ 
b/W11/CrawlerMain2/src/CrawlerMain.java @@ -0,0 +1,440 @@ +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +// ===================== 1. 新增自定义异常包(对应作业要求第一条) ===================== +// 爬虫根异常 +class CrawlerException extends Exception { + public CrawlerException(String message) { + super(message); + } + public CrawlerException(String message, Throwable cause) { + super(message, cause); + } +} + +// 网络异常 +class NetworkException extends CrawlerException { + public NetworkException(String message) { + super(message); + } + public NetworkException(String message, Throwable cause) { + super(message, cause); + } +} + +// 解析异常 +class ParseException extends CrawlerException { + public ParseException(String message) { + super(message); + } + public ParseException(String message, Throwable cause) { + super(message, cause); + } +} + +// ===================== 2. 抽象策略接口(声明抛出ParseException) ===================== +interface Crawler { + List startCrawl() throws ParseException, NetworkException, CrawlerException; +} + +// ===================== 3. 
抽象模板父类 + 重试逻辑(对应作业要求第三条) ===================== +abstract class BaseCrawler implements Crawler { + protected final String baseUrl; + protected static final Logger logger = LoggerFactory.getLogger(BaseCrawler.class); + // 重试次数配置 + private static final int MAX_RETRY = 3; + private static final long RETRY_INTERVAL = 2000; // 2秒重试间隔 + + protected BaseCrawler(String baseUrl) { + this.baseUrl = baseUrl; + } + + // 带重试逻辑的通用页面请求方法 + protected Document getPage(String url) throws NetworkException { + int retryCount = 0; + while (retryCount < MAX_RETRY) { + try { + logger.info("第{}次请求页面:{}", retryCount + 1, url); + return Jsoup.connect(url) + .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)") + .timeout(15000) + .get(); + } catch (Exception e) { + retryCount++; + logger.error("请求页面失败,剩余重试次数:{}", MAX_RETRY - retryCount, e); + if (retryCount >= MAX_RETRY) { + throw new NetworkException("页面请求重试" + MAX_RETRY + "次后仍失败:" + url, e); + } + try { + Thread.sleep(RETRY_INTERVAL); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + } + } + } + throw new NetworkException("未知错误导致页面请求失败:" + url); + } + + @Override + public abstract List startCrawl() throws ParseException, NetworkException, CrawlerException; +} + +// ===================== 4. 
// ===================== 4. Entity classes =====================

/** Immutable value object for one crawled movie: its title and its rating as text. */
class Movie {
    private final String title;
    private final String rating;

    /**
     * @param title  movie title
     * @param rating rating exactly as it appeared on the page
     */
    public Movie(String title, String rating) {
        this.title = title;
        this.rating = rating;
    }

    /** @return the movie title */
    public String getTitle() {
        return title;
    }

    /**
     * Parses the textual rating into a number.
     *
     * @return the rating as a {@code double}
     * @throws IllegalArgumentException if the rating is {@code null} or is not a valid number
     */
    public double getRatingDouble() {
        // Double.parseDouble(null) throws NullPointerException, which callers that only
        // handle IllegalArgumentException would not catch; report it the same way as any
        // other malformed rating.
        if (rating == null) {
            throw new IllegalArgumentException("评分格式错误:" + rating);
        }
        try {
            return Double.parseDouble(rating);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("评分格式错误:" + rating, e);
        }
    }

    /** @return the rating exactly as it was supplied to the constructor */
    public String getRating() {
        return rating;
    }

    @Override
    public String toString() {
        return "电影:《" + title + "》 | 评分:" + rating;
    }
}

/** Immutable value object for one crawled hero name. */
class Hero {
    private final String name;

    /**
     * @param name hero name
     */
    public Hero(String name) {
        this.name = name;
    }

    /** @return the hero name */
    public String getName() {
        return name;
    }

    @Override
    public String toString() {
        return "英雄:" + name;
    }
}

/** Immutable value object for one city's weather entry. */
class Weather {
    private final String province;
    private final String city;
    private final String temperature;
    private final String condition;

    /**
     * @param province    province the city belongs to
     * @param city        city name
     * @param temperature temperature text as crawled
     * @param condition   weather condition text as crawled
     */
    public Weather(String province, String city, String temperature, String condition) {
        this.province = province;
        this.city = city;
        this.temperature = temperature;
        this.condition = condition;
    }

    /** @return the province */
    public String getProvince() {
        return province;
    }

    /** @return the city name */
    public String getCity() {
        return city;
    }

    /** @return the temperature text */
    public String getTemperature() {
        return temperature;
    }

    /** @return the weather condition text */
    public String getCondition() {
        return condition;
    }

    @Override
    public String toString() {
        return "省份:" + province + " | 城市:" + city + " | 天气:" + condition + " | 温度:" + temperature;
    }
}
具体策略类(抛出自定义异常) ===================== +class MovieCrawler extends BaseCrawler { + private static final Logger logger = LoggerFactory.getLogger(MovieCrawler.class); + + public MovieCrawler() { + super("https://movie.douban.com/top250"); + } + + @Override + public List startCrawl() throws ParseException, NetworkException, CrawlerException { + List list = new ArrayList<>(); + logger.info("开始爬取豆瓣电影Top250"); + try { + for (int i = 0; i < 250; i += 25) { + Document doc = getPage(baseUrl + "?start=" + i); + Elements items = doc.select(".item"); + if (items.isEmpty()) { + throw new ParseException("页面解析失败:未找到电影列表项"); + } + for (Element e : items) { + Element titleEle = e.select(".title").first(); + Element ratingEle = e.select(".rating_num").first(); + if (titleEle == null || ratingEle == null) { + logger.warn("单条电影数据解析失败,跳过"); + continue; + } + String title = titleEle.text().split("/")[0].trim(); + String rating = ratingEle.text(); + list.add(new Movie(title, rating)); + } + Thread.sleep(1000); + } + logger.info("豆瓣电影爬取完成,共{}条数据", list.size()); + } catch (NetworkException e) { + throw e; // 抛出网络异常 + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new CrawlerException("爬取被中断", e); + } catch (Exception e) { + throw new ParseException("电影数据解析异常", e); + } + return list; + } +} + +class HeroCrawler extends BaseCrawler { + private static final Logger logger = LoggerFactory.getLogger(HeroCrawler.class); + + public HeroCrawler() { + super("https://pvp.qq.com/web201605/herolist.shtml"); + } + + @Override + public List startCrawl() throws ParseException, NetworkException, CrawlerException { + List list = new ArrayList<>(); + logger.info("开始爬取王者荣耀英雄数据"); + try { + Document doc = getPage(baseUrl); + Elements heros = doc.select("ul.herolist li a"); + if (heros.isEmpty()) { + throw new ParseException("页面解析失败:未找到英雄列表项"); + } + for (Element h : heros) { + String name = h.text().trim(); + if (!name.isEmpty()) { + list.add(new Hero(name)); + } + } + 
logger.info("英雄爬取完成,共{}条数据", list.size()); + } catch (NetworkException e) { + throw e; + } catch (Exception e) { + throw new ParseException("英雄数据解析异常", e); + } + return list; + } +} + +class WeatherCrawler extends BaseCrawler { + private static final Logger logger = LoggerFactory.getLogger(WeatherCrawler.class); + + private static final String[][] cities = { + {"北京","北京","101010100"},{"上海","上海","101020100"},{"天津","天津","101030100"},{"重庆","重庆","101040100"}, + {"河北","石家庄","101090101"},{"山西","太原","101100101"},{"辽宁","沈阳","101070101"},{"吉林","长春","101060101"}, + {"黑龙江","哈尔滨","101050101"},{"江苏","南京","101190101"},{"浙江","杭州","101210101"},{"安徽","合肥","101220101"}, + {"福建","福州","101230101"},{"江西","南昌","101240101"},{"山东","济南","101120101"},{"河南","郑州","101180101"}, + {"湖北","武汉","101200101"},{"湖南","长沙","101250101"},{"广东","广州","101280101"},{"海南","海口","101310101"}, + {"四川","成都","101270101"},{"贵州","贵阳","101260101"},{"云南","昆明","101290101"},{"陕西","西安","101110101"}, + {"甘肃","兰州","101160101"},{"青海","西宁","101150101"},{"内蒙古","呼和浩特","101080101"},{"广西","南宁","101300101"}, + {"西藏","拉萨","101140101"},{"宁夏","银川","101170101"},{"新疆","乌鲁木齐","101130101"}, + {"香港","香港","101320101"},{"澳门","澳门","101330101"},{"台湾","台北","101340101"} + }; + + public WeatherCrawler() { + super("https://www.weather.com.cn/weather/"); + } + + @Override + public List startCrawl() throws ParseException, NetworkException, CrawlerException { + List list = new ArrayList<>(); + logger.info("开始爬取全国天气数据"); + try { + for (String[] city : cities) { + String province = city[0]; + String cityName = city[1]; + String code = city[2]; + Document doc = getPage(baseUrl + code + ".shtml"); + Element today = doc.select("ul.t li").first(); + if (today == null) { + throw new ParseException("天气页面解析失败:未找到今日天气数据"); + } + String temp = today.select(".tem").text(); + String wea = today.select(".wea").text(); + list.add(new Weather(province, cityName, temp, wea)); + Thread.sleep(500); + } + logger.info("天气数据爬取完成,共{}条数据", list.size()); + } catch 
(NetworkException e) { + throw e; + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new CrawlerException("爬取被中断", e); + } catch (Exception e) { + throw new ParseException("天气数据解析异常", e); + } + return list; + } +} + +// ===================== 6. 策略上下文Context ===================== +class CrawlerContext { + private Crawler crawlerStrategy; + private static final Logger logger = LoggerFactory.getLogger(CrawlerContext.class); + + public void setCrawlerStrategy(Crawler crawlerStrategy) { + this.crawlerStrategy = crawlerStrategy; + } + + public List executeCrawl() throws ParseException, NetworkException, CrawlerException { + if (crawlerStrategy == null) { + logger.error("未设置爬取策略"); + throw new CrawlerException("爬取策略未配置"); + } + return crawlerStrategy.startCrawl(); + } +} + +// ===================== 7. 工具类(增强防御检查,对应作业要求第六条) ===================== +final class DataUtil { + private static final String PATH = "D:\\Java爬虫\\"; + private static final Logger logger = LoggerFactory.getLogger(DataUtil.class); + + private DataUtil() {} + + public static void initFolder() { + File dir = new File(PATH); + if (!dir.exists()) { + boolean created = dir.mkdirs(); + if (created) { + logger.info("创建目录:{}", PATH); + } else { + logger.error("目录创建失败:{}", PATH); + } + } + } + + public static void saveText(String fileName, String content) throws IOException { + // 增强防御检查 + if (fileName == null || fileName.isBlank()) { + throw new IllegalArgumentException("文件名不能为空"); + } + if (content == null || content.isBlank()) { + logger.warn("保存文件内容为空,跳过:{}", fileName); + return; + } + try (FileWriter fw = new FileWriter(PATH + fileName)) { + fw.write(content); + } + logger.info("文件保存成功:{}", fileName); + } + + public static void addAll(String fileName, List dataList) throws IOException { + // 增强防御检查 + if (dataList == null) { + throw new NullPointerException("待保存数据列表不能为null"); + } + if (dataList.isEmpty()) { + logger.warn("批量数据为空,跳过保存:{}", fileName); + return; + } + 
StringBuilder sb = new StringBuilder(); + dataList.forEach(item -> { + if (item != null) { // 空元素防御 + sb.append(item).append("\r\n"); + } + }); + saveText(fileName, sb.toString()); + } + + public static void analyzeData(List movieList, List heroList) { + // 增强防御检查 + if (movieList == null) { + logger.error("电影数据列表为null"); + return; + } + if (heroList == null) { + logger.error("英雄数据列表为null"); + return; + } + if (movieList.isEmpty()) { + logger.warn("电影数据列表为空,跳过分析"); + return; + } + logger.info("===== 执行数据分析 ====="); + double sum = 0; + int validCount = 0; + for (Movie movie : movieList) { + try { + sum += movie.getRatingDouble(); + validCount++; + } catch (IllegalArgumentException e) { + logger.warn("电影评分解析失败,跳过:{}", movie.getTitle(), e); + } + } + if (validCount == 0) { + logger.error("无有效电影评分数据"); + return; + } + double avg = sum / validCount; + System.out.println("电影平均评分:" + String.format("%.2f", avg)); + long highScoreCount = movieList.stream() + .filter(m -> { + try { + return m.getRatingDouble() >= 8.5; + } catch (IllegalArgumentException e) { + return false; + } + }) + .count(); + System.out.println("8.5分以上电影数量:" + highScoreCount); + System.out.println("英雄总数量:" + heroList.size()); + logger.info("数据分析结束"); + } +} + +// ===================== 8. 
主程序(统一异常处理 + 完整日志) ===================== +public class CrawlerMain { + private static final Logger logger = LoggerFactory.getLogger(CrawlerMain.class); + + public static void main(String[] args) { + logger.info("===== 爬虫程序启动 ====="); + CrawlerContext context = new CrawlerContext(); + List movieList = new ArrayList<>(); + List heroList = new ArrayList<>(); + List weatherList = new ArrayList<>(); + + try { + // 爬取电影 + context.setCrawlerStrategy(new MovieCrawler()); + movieList = (List) context.executeCrawl(); + + // 爬取英雄 + context.setCrawlerStrategy(new HeroCrawler()); + heroList = (List) context.executeCrawl(); + + // 爬取天气 + context.setCrawlerStrategy(new WeatherCrawler()); + weatherList = (List) context.executeCrawl(); + + // 保存数据 + DataUtil.initFolder(); + DataUtil.addAll("电影数据.txt", movieList); + DataUtil.addAll("英雄数据.txt", heroList); + DataUtil.addAll("天气数据.txt", weatherList); + + // 数据分析 + DataUtil.analyzeData(movieList, heroList); + + logger.info("===== 全部任务执行完成 ====="); + System.out.println("✅ 数据已全部保存至 D:\\Java爬虫"); + } catch (NetworkException e) { + logger.error("网络异常:爬取失败", e); + } catch (ParseException e) { + logger.error("解析异常:数据解析失败", e); + } catch (CrawlerException e) { + logger.error("爬虫核心异常", e); + } catch (Exception e) { + logger.error("程序运行异常", e); + } + } +} \ No newline at end of file diff --git a/W11/CrawlerMain2/src/logback.xml b/W11/CrawlerMain2/src/logback.xml new file mode 100644 index 0000000..5f121a2 --- /dev/null +++ b/W11/CrawlerMain2/src/logback.xml @@ -0,0 +1,25 @@ + + + + + + %d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{50} - %msg%n + UTF-8 + + + + + + D:\Java爬虫\crawler.log + + %d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{50} - %msg%n + UTF-8 + + + + + + + + + \ No newline at end of file