Browse Source

提交W11

main
ZhengShiyi 1 month ago
parent
commit
afb20b879f
  1. 29
      W11/CrawlerMain2/.gitignore
  2. 10
      W11/CrawlerMain2/.idea/.gitignore
  3. 9
      W11/CrawlerMain2/.idea/libraries/jcommon_1_0_24.xml
  4. 9
      W11/CrawlerMain2/.idea/libraries/jfreechart_1_5_3.xml
  5. 9
      W11/CrawlerMain2/.idea/libraries/jsoup_1_17_2.xml
  6. 9
      W11/CrawlerMain2/.idea/libraries/kumo_core_1_12.xml
  7. 9
      W11/CrawlerMain2/.idea/libraries/logback_classic_1_4_11.xml
  8. 9
      W11/CrawlerMain2/.idea/libraries/logback_core_1_4_11.xml
  9. 9
      W11/CrawlerMain2/.idea/libraries/slf4j_api_2_0_9.xml
  10. 6
      W11/CrawlerMain2/.idea/misc.xml
  11. 8
      W11/CrawlerMain2/.idea/modules.xml
  12. 18
      W11/CrawlerMain2/CrawlerMain2.iml
  13. 440
      W11/CrawlerMain2/src/CrawlerMain.java
  14. 25
      W11/CrawlerMain2/src/logback.xml

29
W11/CrawlerMain2/.gitignore

@ -0,0 +1,29 @@
### IntelliJ IDEA ###
out/
!**/src/main/**/out/
!**/src/test/**/out/
### Eclipse ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
bin/
!**/src/main/**/bin/
!**/src/test/**/bin/
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
### VS Code ###
.vscode/
### Mac OS ###
.DS_Store

10
W11/CrawlerMain2/.idea/.gitignore

@ -0,0 +1,10 @@
# 默认忽略的文件
/shelf/
/workspace.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/
# 依赖于环境的 Maven 主目录路径
/mavenHomeManager.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

9
W11/CrawlerMain2/.idea/libraries/jcommon_1_0_24.xml

@ -0,0 +1,9 @@
<component name="libraryTable">
<library name="jcommon-1.0.24">
<CLASSES>
<root url="jar://$USER_HOME$/Downloads/jcommon-1.0.24.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>

9
W11/CrawlerMain2/.idea/libraries/jfreechart_1_5_3.xml

@ -0,0 +1,9 @@
<component name="libraryTable">
<library name="jfreechart-1.5.3">
<CLASSES>
<root url="jar://$USER_HOME$/Downloads/jfreechart-1.5.3.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>

9
W11/CrawlerMain2/.idea/libraries/jsoup_1_17_2.xml

@ -0,0 +1,9 @@
<component name="libraryTable">
<library name="jsoup-1.17.2">
<CLASSES>
<root url="jar://$USER_HOME$/Downloads/jsoup-1.17.2.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>

9
W11/CrawlerMain2/.idea/libraries/kumo_core_1_12.xml

@ -0,0 +1,9 @@
<component name="libraryTable">
<library name="kumo-core-1.12">
<CLASSES>
<root url="jar://$USER_HOME$/Downloads/kumo-core-1.12.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>

9
W11/CrawlerMain2/.idea/libraries/logback_classic_1_4_11.xml

@ -0,0 +1,9 @@
<component name="libraryTable">
<library name="logback-classic-1.4.11">
<CLASSES>
<root url="jar://$USER_HOME$/Downloads/logback-classic-1.4.11.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>

9
W11/CrawlerMain2/.idea/libraries/logback_core_1_4_11.xml

@ -0,0 +1,9 @@
<component name="libraryTable">
<library name="logback-core-1.4.11">
<CLASSES>
<root url="jar://$USER_HOME$/Downloads/logback-core-1.4.11.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>

9
W11/CrawlerMain2/.idea/libraries/slf4j_api_2_0_9.xml

@ -0,0 +1,9 @@
<component name="libraryTable">
<library name="slf4j-api-2.0.9">
<CLASSES>
<root url="jar://$USER_HOME$/Downloads/slf4j-api-2.0.9.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>

6
W11/CrawlerMain2/.idea/misc.xml

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="openjdk-26" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

8
W11/CrawlerMain2/.idea/modules.xml

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/CrawlerMain2.iml" filepath="$PROJECT_DIR$/CrawlerMain2.iml" />
</modules>
</component>
</project>

18
W11/CrawlerMain2/CrawlerMain2.iml

@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="jsoup-1.17.2" level="project" />
<orderEntry type="library" name="jfreechart-1.5.3" level="project" />
<orderEntry type="library" name="jcommon-1.0.24" level="project" />
<orderEntry type="library" name="kumo-core-1.12" level="project" />
<orderEntry type="library" name="logback-classic-1.4.11" level="project" />
<orderEntry type="library" name="logback-core-1.4.11" level="project" />
<orderEntry type="library" name="slf4j-api-2.0.9" level="project" />
</component>
</module>

440
W11/CrawlerMain2/src/CrawlerMain.java

@ -0,0 +1,440 @@
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
// ===================== 1. 新增自定义异常包(对应作业要求第一条) =====================
// 爬虫根异常
class CrawlerException extends Exception {
public CrawlerException(String message) {
super(message);
}
public CrawlerException(String message, Throwable cause) {
super(message, cause);
}
}
// 网络异常
class NetworkException extends CrawlerException {
public NetworkException(String message) {
super(message);
}
public NetworkException(String message, Throwable cause) {
super(message, cause);
}
}
// 解析异常
class ParseException extends CrawlerException {
public ParseException(String message) {
super(message);
}
public ParseException(String message, Throwable cause) {
super(message, cause);
}
}
// ===================== 2. 抽象策略接口(声明抛出ParseException) =====================
interface Crawler {
List<?> startCrawl() throws ParseException, NetworkException, CrawlerException;
}
// ===================== 3. 抽象模板父类 + 重试逻辑(对应作业要求第三条) =====================
abstract class BaseCrawler implements Crawler {
protected final String baseUrl;
protected static final Logger logger = LoggerFactory.getLogger(BaseCrawler.class);
// 重试次数配置
private static final int MAX_RETRY = 3;
private static final long RETRY_INTERVAL = 2000; // 2秒重试间隔
protected BaseCrawler(String baseUrl) {
this.baseUrl = baseUrl;
}
// 带重试逻辑的通用页面请求方法
protected Document getPage(String url) throws NetworkException {
int retryCount = 0;
while (retryCount < MAX_RETRY) {
try {
logger.info("第{}次请求页面:{}", retryCount + 1, url);
return Jsoup.connect(url)
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
.timeout(15000)
.get();
} catch (Exception e) {
retryCount++;
logger.error("请求页面失败,剩余重试次数:{}", MAX_RETRY - retryCount, e);
if (retryCount >= MAX_RETRY) {
throw new NetworkException("页面请求重试" + MAX_RETRY + "次后仍失败:" + url, e);
}
try {
Thread.sleep(RETRY_INTERVAL);
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
}
}
}
throw new NetworkException("未知错误导致页面请求失败:" + url);
}
@Override
public abstract List<?> startCrawl() throws ParseException, NetworkException, CrawlerException;
}
// ===================== 4. 实体类(完善封装) =====================
class Movie {
private final String title;
private final String rating;
public Movie(String title, String rating) {
this.title = title;
this.rating = rating;
}
public String getTitle() { return title; }
public double getRatingDouble() {
try {
return Double.parseDouble(rating);
} catch (NumberFormatException e) {
throw new IllegalArgumentException("评分格式错误:" + rating, e);
}
}
public String getRating() { return rating; }
@Override
public String toString() {
return "电影:《" + title + "》 | 评分:" + rating;
}
}
class Hero {
private final String name;
public Hero(String name) { this.name = name; }
public String getName() { return name; }
@Override
public String toString() { return "英雄:" + name; }
}
class Weather {
private final String province;
private final String city;
private final String temperature;
private final String condition;
public Weather(String province, String city, String temperature, String condition) {
this.province = province;
this.city = city;
this.temperature = temperature;
this.condition = condition;
}
public String getProvince() { return province; }
public String getCity() { return city; }
public String getTemperature() { return temperature; }
public String getCondition() { return condition; }
@Override
public String toString() {
return "省份:" + province + " | 城市:" + city + " | 天气:" + condition + " | 温度:" + temperature;
}
}
// ===================== 5. 具体策略类(抛出自定义异常) =====================
class MovieCrawler extends BaseCrawler {
private static final Logger logger = LoggerFactory.getLogger(MovieCrawler.class);
public MovieCrawler() {
super("https://movie.douban.com/top250");
}
@Override
public List<Movie> startCrawl() throws ParseException, NetworkException, CrawlerException {
List<Movie> list = new ArrayList<>();
logger.info("开始爬取豆瓣电影Top250");
try {
for (int i = 0; i < 250; i += 25) {
Document doc = getPage(baseUrl + "?start=" + i);
Elements items = doc.select(".item");
if (items.isEmpty()) {
throw new ParseException("页面解析失败:未找到电影列表项");
}
for (Element e : items) {
Element titleEle = e.select(".title").first();
Element ratingEle = e.select(".rating_num").first();
if (titleEle == null || ratingEle == null) {
logger.warn("单条电影数据解析失败,跳过");
continue;
}
String title = titleEle.text().split("/")[0].trim();
String rating = ratingEle.text();
list.add(new Movie(title, rating));
}
Thread.sleep(1000);
}
logger.info("豆瓣电影爬取完成,共{}条数据", list.size());
} catch (NetworkException e) {
throw e; // 抛出网络异常
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new CrawlerException("爬取被中断", e);
} catch (Exception e) {
throw new ParseException("电影数据解析异常", e);
}
return list;
}
}
class HeroCrawler extends BaseCrawler {
private static final Logger logger = LoggerFactory.getLogger(HeroCrawler.class);
public HeroCrawler() {
super("https://pvp.qq.com/web201605/herolist.shtml");
}
@Override
public List<Hero> startCrawl() throws ParseException, NetworkException, CrawlerException {
List<Hero> list = new ArrayList<>();
logger.info("开始爬取王者荣耀英雄数据");
try {
Document doc = getPage(baseUrl);
Elements heros = doc.select("ul.herolist li a");
if (heros.isEmpty()) {
throw new ParseException("页面解析失败:未找到英雄列表项");
}
for (Element h : heros) {
String name = h.text().trim();
if (!name.isEmpty()) {
list.add(new Hero(name));
}
}
logger.info("英雄爬取完成,共{}条数据", list.size());
} catch (NetworkException e) {
throw e;
} catch (Exception e) {
throw new ParseException("英雄数据解析异常", e);
}
return list;
}
}
class WeatherCrawler extends BaseCrawler {
private static final Logger logger = LoggerFactory.getLogger(WeatherCrawler.class);
private static final String[][] cities = {
{"北京","北京","101010100"},{"上海","上海","101020100"},{"天津","天津","101030100"},{"重庆","重庆","101040100"},
{"河北","石家庄","101090101"},{"山西","太原","101100101"},{"辽宁","沈阳","101070101"},{"吉林","长春","101060101"},
{"黑龙江","哈尔滨","101050101"},{"江苏","南京","101190101"},{"浙江","杭州","101210101"},{"安徽","合肥","101220101"},
{"福建","福州","101230101"},{"江西","南昌","101240101"},{"山东","济南","101120101"},{"河南","郑州","101180101"},
{"湖北","武汉","101200101"},{"湖南","长沙","101250101"},{"广东","广州","101280101"},{"海南","海口","101310101"},
{"四川","成都","101270101"},{"贵州","贵阳","101260101"},{"云南","昆明","101290101"},{"陕西","西安","101110101"},
{"甘肃","兰州","101160101"},{"青海","西宁","101150101"},{"内蒙古","呼和浩特","101080101"},{"广西","南宁","101300101"},
{"西藏","拉萨","101140101"},{"宁夏","银川","101170101"},{"新疆","乌鲁木齐","101130101"},
{"香港","香港","101320101"},{"澳门","澳门","101330101"},{"台湾","台北","101340101"}
};
public WeatherCrawler() {
super("https://www.weather.com.cn/weather/");
}
@Override
public List<Weather> startCrawl() throws ParseException, NetworkException, CrawlerException {
List<Weather> list = new ArrayList<>();
logger.info("开始爬取全国天气数据");
try {
for (String[] city : cities) {
String province = city[0];
String cityName = city[1];
String code = city[2];
Document doc = getPage(baseUrl + code + ".shtml");
Element today = doc.select("ul.t li").first();
if (today == null) {
throw new ParseException("天气页面解析失败:未找到今日天气数据");
}
String temp = today.select(".tem").text();
String wea = today.select(".wea").text();
list.add(new Weather(province, cityName, temp, wea));
Thread.sleep(500);
}
logger.info("天气数据爬取完成,共{}条数据", list.size());
} catch (NetworkException e) {
throw e;
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new CrawlerException("爬取被中断", e);
} catch (Exception e) {
throw new ParseException("天气数据解析异常", e);
}
return list;
}
}
// ===================== 6. 策略上下文Context =====================
class CrawlerContext {
private Crawler crawlerStrategy;
private static final Logger logger = LoggerFactory.getLogger(CrawlerContext.class);
public void setCrawlerStrategy(Crawler crawlerStrategy) {
this.crawlerStrategy = crawlerStrategy;
}
public List<?> executeCrawl() throws ParseException, NetworkException, CrawlerException {
if (crawlerStrategy == null) {
logger.error("未设置爬取策略");
throw new CrawlerException("爬取策略未配置");
}
return crawlerStrategy.startCrawl();
}
}
// ===================== 7. 工具类(增强防御检查,对应作业要求第六条) =====================
final class DataUtil {
private static final String PATH = "D:\\Java爬虫\\";
private static final Logger logger = LoggerFactory.getLogger(DataUtil.class);
private DataUtil() {}
public static void initFolder() {
File dir = new File(PATH);
if (!dir.exists()) {
boolean created = dir.mkdirs();
if (created) {
logger.info("创建目录:{}", PATH);
} else {
logger.error("目录创建失败:{}", PATH);
}
}
}
public static void saveText(String fileName, String content) throws IOException {
// 增强防御检查
if (fileName == null || fileName.isBlank()) {
throw new IllegalArgumentException("文件名不能为空");
}
if (content == null || content.isBlank()) {
logger.warn("保存文件内容为空,跳过:{}", fileName);
return;
}
try (FileWriter fw = new FileWriter(PATH + fileName)) {
fw.write(content);
}
logger.info("文件保存成功:{}", fileName);
}
public static <T> void addAll(String fileName, List<T> dataList) throws IOException {
// 增强防御检查
if (dataList == null) {
throw new NullPointerException("待保存数据列表不能为null");
}
if (dataList.isEmpty()) {
logger.warn("批量数据为空,跳过保存:{}", fileName);
return;
}
StringBuilder sb = new StringBuilder();
dataList.forEach(item -> {
if (item != null) { // 空元素防御
sb.append(item).append("\r\n");
}
});
saveText(fileName, sb.toString());
}
public static void analyzeData(List<Movie> movieList, List<Hero> heroList) {
// 增强防御检查
if (movieList == null) {
logger.error("电影数据列表为null");
return;
}
if (heroList == null) {
logger.error("英雄数据列表为null");
return;
}
if (movieList.isEmpty()) {
logger.warn("电影数据列表为空,跳过分析");
return;
}
logger.info("===== 执行数据分析 =====");
double sum = 0;
int validCount = 0;
for (Movie movie : movieList) {
try {
sum += movie.getRatingDouble();
validCount++;
} catch (IllegalArgumentException e) {
logger.warn("电影评分解析失败,跳过:{}", movie.getTitle(), e);
}
}
if (validCount == 0) {
logger.error("无有效电影评分数据");
return;
}
double avg = sum / validCount;
System.out.println("电影平均评分:" + String.format("%.2f", avg));
long highScoreCount = movieList.stream()
.filter(m -> {
try {
return m.getRatingDouble() >= 8.5;
} catch (IllegalArgumentException e) {
return false;
}
})
.count();
System.out.println("8.5分以上电影数量:" + highScoreCount);
System.out.println("英雄总数量:" + heroList.size());
logger.info("数据分析结束");
}
}
// ===================== 8. 主程序(统一异常处理 + 完整日志) =====================
public class CrawlerMain {
private static final Logger logger = LoggerFactory.getLogger(CrawlerMain.class);
public static void main(String[] args) {
logger.info("===== 爬虫程序启动 =====");
CrawlerContext context = new CrawlerContext();
List<Movie> movieList = new ArrayList<>();
List<Hero> heroList = new ArrayList<>();
List<Weather> weatherList = new ArrayList<>();
try {
// 爬取电影
context.setCrawlerStrategy(new MovieCrawler());
movieList = (List<Movie>) context.executeCrawl();
// 爬取英雄
context.setCrawlerStrategy(new HeroCrawler());
heroList = (List<Hero>) context.executeCrawl();
// 爬取天气
context.setCrawlerStrategy(new WeatherCrawler());
weatherList = (List<Weather>) context.executeCrawl();
// 保存数据
DataUtil.initFolder();
DataUtil.addAll("电影数据.txt", movieList);
DataUtil.addAll("英雄数据.txt", heroList);
DataUtil.addAll("天气数据.txt", weatherList);
// 数据分析
DataUtil.analyzeData(movieList, heroList);
logger.info("===== 全部任务执行完成 =====");
System.out.println("✅ 数据已全部保存至 D:\\Java爬虫");
} catch (NetworkException e) {
logger.error("网络异常:爬取失败", e);
} catch (ParseException e) {
logger.error("解析异常:数据解析失败", e);
} catch (CrawlerException e) {
logger.error("爬虫核心异常", e);
} catch (Exception e) {
logger.error("程序运行异常", e);
}
}
}

25
W11/CrawlerMain2/src/logback.xml

@ -0,0 +1,25 @@
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<!-- 控制台输出 -->
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{50} - %msg%n</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<!-- 文件输出(可选) -->
<appender name="FILE" class="ch.qos.logback.core.FileAppender">
<file>D:\Java爬虫\crawler.log</file>
<encoder>
<pattern>%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{50} - %msg%n</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<!-- 全局日志级别 -->
<root level="INFO">
<appender-ref ref="CONSOLE" />
<appender-ref ref="FILE" /> <!-- 如需日志文件则保留,否则删除 -->
</root>
</configuration>
Loading…
Cancel
Save