Wengxiyi 3 weeks ago
parent
commit
dbef5edd2b
  1. BIN
      project/202506050208-翁希怡-期末实验报告.docx
  2. 58
      project/src/main/java/BankAccount.java
  3. 338
      project/src/main/java/com/hnu/crawler/AdmissionCrawlerMain.java
  4. 6
      project/src/main/java/com/hnu/crawler/command/Command.java
  5. 73
      project/src/main/java/com/hnu/crawler/command/CrawlCommand.java
  6. 60
      project/src/main/java/com/hnu/crawler/command/DemoCommand.java
  7. 22
      project/src/main/java/com/hnu/crawler/command/ExitCommand.java
  8. 36
      project/src/main/java/com/hnu/crawler/command/HelpCommand.java
  9. 40
      project/src/main/java/com/hnu/crawler/command/ListCommand.java
  10. 65
      project/src/main/java/com/hnu/crawler/command/QueryCommand.java
  11. 135
      project/src/main/java/com/hnu/crawler/config/ConfigManager.java
  12. 219
      project/src/main/java/com/hnu/crawler/model/AdmissionInfo.java
  13. 125
      project/src/main/java/com/hnu/crawler/model/UniversityConfig.java
  14. 159
      project/src/main/java/com/hnu/crawler/parser/AdmissionParser.java
  15. 216
      project/src/main/java/com/hnu/crawler/query/DataQuery.java
  16. 114
      project/src/main/java/com/hnu/crawler/storage/DataStorage.java
  17. 75
      project/src/main/java/com/hnu/crawler/strategy/BlogStrategy.java
  18. 117
      project/src/main/java/com/hnu/crawler/strategy/ConfigBasedCrawler.java
  19. 11
      project/src/main/java/com/hnu/crawler/strategy/CrawlerStrategy.java
  20. 72
      project/src/main/java/com/hnu/crawler/strategy/NewsStrategy.java
  21. 67
      project/src/main/java/com/hnu/crawler/strategy/SinglePageCrawler.java
  22. 108
      project/src/main/java/com/hnu/crawler/strategy/UniversityStrategy.java
  23. 52
      project/src/main/java/com/hnu/crawler/util/HttpClientUtil.java
  24. 33
      project/src/main/java/com/hnu/crawler/view/ConsoleView.java
  25. 12
      project/src/main/resources/logback.xml
  26. 70
      project/src/test/java/BankAccountTest.java

BIN
project/202506050208-翁希怡-期末实验报告.docx

Binary file not shown.

58
project/src/main/java/BankAccount.java

@ -0,0 +1,58 @@
/**
 * A minimal in-memory bank account holding an account number, an owner name
 * and a balance that starts at zero.
 *
 * <p>Deposits and withdrawals report their outcome on standard output and
 * never throw; an invalid request leaves the balance unchanged.
 */
public class BankAccount {
    private final String accountNumber;
    private String ownerName;
    private double balance;

    /**
     * Creates an account with a zero balance.
     *
     * @param accountNumber identifier of the account
     * @param ownerName     name of the account holder
     */
    public BankAccount(String accountNumber, String ownerName) {
        this.accountNumber = accountNumber;
        this.ownerName = ownerName;
        this.balance = 0.0;
    }

    /** @return the account number given at construction */
    public String getAccountNumber() {
        return accountNumber;
    }

    /** @return the current owner name */
    public String getOwnerName() {
        return ownerName;
    }

    /** @param ownerName the new owner name */
    public void setOwnerName(String ownerName) {
        this.ownerName = ownerName;
    }

    /** @return the current balance */
    public double getBalance() {
        return balance;
    }

    /**
     * Adds {@code amount} to the balance.
     *
     * <p>The amount must be strictly positive, and the resulting balance must
     * remain a finite number. Without the second check an infinite deposit
     * (or an overflowing sum) would set the balance to Infinity, and a later
     * withdrawal could then turn it into NaN.
     *
     * @param amount the amount to deposit
     */
    public void deposit(double amount) {
        // Written as !(amount > 0) so that NaN is rejected as well.
        if (!(amount > 0)) {
            System.out.println("存款金额必须大于0");
            return;
        }
        if (!Double.isFinite(balance + amount)) {
            System.out.println("存款金额无效");
            return;
        }
        balance += amount;
        System.out.println("存款成功,当前余额:" + balance);
    }

    /**
     * Subtracts {@code amount} from the balance when it is strictly positive
     * and does not exceed the current balance.
     *
     * @param amount the amount to withdraw
     */
    public void withdraw(double amount) {
        if (amount > 0 && amount <= balance) {
            balance -= amount;
            System.out.println("取款成功,当前余额:" + balance);
        } else {
            System.out.println("余额不足或金额无效");
        }
    }

    /** Small console demonstration of the account operations. */
    public static void main(String[] args) {
        BankAccount account = new BankAccount("1234567890", "张三");
        System.out.println("账户创建成功!");
        System.out.println("账户号:" + account.getAccountNumber());
        System.out.println("户主:" + account.getOwnerName());
        System.out.println("初始余额:" + account.getBalance());
        account.deposit(1000);
        account.withdraw(500);
        account.withdraw(600);
        account.deposit(-100);
    }
}

338
project/src/main/java/com/hnu/crawler/AdmissionCrawlerMain.java

@ -0,0 +1,338 @@
package com.hnu.crawler;
import com.hnu.crawler.command.Command;
import com.hnu.crawler.command.CrawlCommand;
import com.hnu.crawler.command.DemoCommand;
import com.hnu.crawler.command.ExitCommand;
import com.hnu.crawler.command.HelpCommand;
import com.hnu.crawler.command.ListCommand;
import com.hnu.crawler.command.QueryCommand;
import com.hnu.crawler.config.ConfigManager;
import com.hnu.crawler.model.AdmissionInfo;
import com.hnu.crawler.model.UniversityConfig;
import com.hnu.crawler.query.DataQuery;
import com.hnu.crawler.storage.DataStorage;
import com.hnu.crawler.strategy.ConfigBasedCrawler;
import com.hnu.crawler.strategy.CrawlerStrategy;
import com.hnu.crawler.strategy.SinglePageCrawler;
import com.hnu.crawler.view.ConsoleView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
public class AdmissionCrawlerMain {
private static final Logger logger = LoggerFactory.getLogger(AdmissionCrawlerMain.class);
private static final String OUTPUT_DIR = "data";
private static final String CSV_FILE = OUTPUT_DIR + "/admission_info.csv";
private static final String JSON_FILE = OUTPUT_DIR + "/admission_info.json";
public static void main(String[] args) {
System.out.println("========================================");
System.out.println(" 高校招生信息爬虫系统 v1.0");
System.out.println("========================================");
try {
ConfigManager.createSampleConfig();
System.out.println("[INFO] 配置文件加载完成");
ConsoleView view = new ConsoleView();
Map<String, Command> commands = registerCommands(view);
System.out.println("[INFO] 命令注册完成");
if (args.length > 0) {
executeCommand(args, commands, view);
return;
}
runInteractiveMode(commands, view);
} catch (Exception e) {
System.err.println("[ERROR] 程序启动失败: " + e.getMessage());
e.printStackTrace();
}
}
private static Map<String, Command> registerCommands(ConsoleView view) {
Map<String, Command> commands = new HashMap<>();
commands.put("crawl", new CrawlCommand(view));
commands.put("list", new ListCommand(view));
commands.put("query", new QueryCommand(view));
commands.put("demo", new DemoCommand(view));
commands.put("help", new HelpCommand(view));
commands.put("exit", new ExitCommand(view));
return commands;
}
private static void executeCommand(String[] args, Map<String, Command> commands, ConsoleView view) {
String cmdName = args[0].toLowerCase();
Command command = commands.get(cmdName);
if (command != null) {
command.execute(args);
} else {
view.printError("未知命令: " + cmdName);
view.printInfo("输入 help 查看可用命令");
}
}
private static void runInteractiveMode(Map<String, Command> commands, ConsoleView view) {
view.printSuccess("欢迎使用高校招生信息爬虫系统!");
view.printInfo("输入 'help' 查看可用命令");
Scanner scanner = new Scanner(System.in);
while (true) {
String input = view.readLine().trim();
if (input.isEmpty()) continue;
String[] parts = input.split("\\s+");
String cmdName = parts[0].toLowerCase();
if (cmdName.equals("menu")) {
showMainMenu(view);
handleMenuSelection(scanner, view);
continue;
}
Command command = commands.get(cmdName);
if (command != null) {
command.execute(parts);
} else {
view.printError("未知命令: " + cmdName);
view.printInfo("输入 'help' 查看可用命令,或输入 'menu' 返回菜单模式");
}
}
}
private static void showMainMenu(ConsoleView view) {
view.print("\n");
view.print("╔══════════════════════════════════════════╗");
view.print("║ 高校本科招生信息爬虫系统 ║");
view.print("╠══════════════════════════════════════════╣");
view.print("║ 1. 单页面爬取 ║");
view.print("║ 2. 从配置文件批量爬取 ║");
view.print("║ 3. CLI命令模式 ║");
view.print("║ 4. 数据查询 ║");
view.print("║ 5. 演示模式(生成模拟数据) ║");
view.print("║ 6. 查看配置高校列表 ║");
view.print("║ 0. 退出程序 ║");
view.print("╚══════════════════════════════════════════╝");
view.print("请选择操作 (0-6): ");
}
private static void handleMenuSelection(Scanner scanner, ConsoleView view) {
int choice;
try {
choice = Integer.parseInt(scanner.nextLine().trim());
} catch (NumberFormatException e) {
view.printError("输入无效,请输入数字!");
return;
}
switch (choice) {
case 1:
crawlWithStrategy(new SinglePageCrawler(), scanner, view);
break;
case 2:
crawlWithStrategy(new ConfigBasedCrawler(), scanner, view);
break;
case 3:
view.printInfo("已切换到CLI命令模式");
view.printInfo("输入 'menu' 返回菜单模式,输入 'help' 查看命令");
break;
case 4:
queryData(scanner, view);
break;
case 5:
demoMode(view);
break;
case 6:
showConfig(view);
break;
case 0:
logger.info("程序退出");
scanner.close();
System.exit(0);
break;
default:
view.printError("无效选项,请重新选择!");
}
}
private static void crawlWithStrategy(CrawlerStrategy strategy, Scanner scanner, ConsoleView view) {
logger.info("使用{}策略进行爬取", strategy.getStrategyName());
List<AdmissionInfo> results = strategy.crawl(scanner);
if (!results.isEmpty()) {
DataStorage.saveToCsv(results, CSV_FILE);
DataStorage.saveToJson(results, JSON_FILE);
view.printSuccess("数据已保存到: " + CSV_FILE + " 和 " + JSON_FILE);
}
}
private static void queryData(Scanner scanner, ConsoleView view) {
view.print("\n=== 数据查询 ===");
List<AdmissionInfo> allData = DataQuery.loadAllData();
if (allData.isEmpty()) {
view.printInfo("暂无数据,请先进行爬取或使用演示模式!");
return;
}
view.print("当前共有 " + allData.size() + " 条数据");
while (true) {
view.print("\n查询选项:");
view.print(" 1. 按院校查询");
view.print(" 2. 按专业查询");
view.print(" 3. 按年份查询");
view.print(" 4. 按分数段查询");
view.print(" 5. 查看所有数据(按分数排序)");
view.print(" 0. 返回主菜单");
view.print("请选择查询方式: ");
int choice;
try {
choice = Integer.parseInt(scanner.nextLine().trim());
} catch (NumberFormatException e) {
view.printError("输入无效!");
continue;
}
List<AdmissionInfo> results = new ArrayList<>();
switch (choice) {
case 1:
view.print("请输入院校名称(支持模糊匹配): ");
String uniName = scanner.nextLine().trim();
results = DataQuery.queryByUniversity(allData, uniName);
break;
case 2:
view.print("请输入专业名称(支持模糊匹配): ");
String majorName = scanner.nextLine().trim();
results = DataQuery.queryByMajor(allData, majorName);
break;
case 3:
view.print("请输入年份: ");
String year = scanner.nextLine().trim();
results = DataQuery.queryByYear(allData, year);
break;
case 4:
view.print("请输入最低分数: ");
double minScore = Double.parseDouble(scanner.nextLine().trim());
view.print("请输入最高分数: ");
double maxScore = Double.parseDouble(scanner.nextLine().trim());
results = DataQuery.queryByScoreRange(allData, minScore, maxScore);
break;
case 5:
results = DataQuery.sortByScore(allData, false);
break;
case 0:
return;
default:
view.printError("无效选项!");
continue;
}
DataQuery.printResults(results);
}
}
private static void showConfig(ConsoleView view) {
view.print("\n=== 已配置高校列表 ===");
List<UniversityConfig> universities = ConfigManager.loadUniversities();
if (universities.isEmpty()) {
view.print("暂无配置,请编辑 config/universities.json 文件");
return;
}
for (UniversityConfig uni : universities) {
view.print(String.format("%n【%s】%s (%s)",
uni.isEnabled() ? "●" : "○",
uni.getName(),
uni.getProvince()));
view.print(String.format(" 院校代码: %s", uni.getCode()));
view.print(String.format(" 配置页面数: %d",
uni.getAdmissionPages() != null ? uni.getAdmissionPages().size() : 0));
if (uni.getAdmissionPages() != null) {
for (UniversityConfig.AdmissionPageConfig page : uni.getAdmissionPages()) {
view.print(String.format(" [%s] %s - %s",
page.isEnabled() ? "启用" : "禁用",
page.getYear(),
page.getDescription()));
}
}
}
view.print("\n提示: 编辑 config/universities.json 文件可添加更多高校配置");
}
private static void demoMode(ConsoleView view) {
view.print("\n=== 演示模式 ===");
logger.info("进入演示模式");
List<AdmissionInfo> demoData = createDemoData();
view.print("生成演示数据...");
view.print("共生成 " + demoData.size() + " 条演示数据");
DataStorage.saveToCsvOverwrite(demoData, CSV_FILE);
DataStorage.saveToJson(demoData, JSON_FILE);
view.printSuccess("\n演示数据已保存到:");
view.print(" - CSV: " + CSV_FILE);
view.print(" - JSON: " + JSON_FILE);
view.print("\n现在可以选择「数据查询」功能来查询演示数据!");
}
private static List<AdmissionInfo> createDemoData() {
List<AdmissionInfo> data = new ArrayList<>();
String[] universities = {"湖南大学", "中南大学", "湖南师范大学", "国防科技大学"};
String[] majors = {"计算机科学与技术", "软件工程", "电子信息工程", "机械工程", "土木工程", "金融学", "临床医学"};
String[] categories = {"物理类", "历史类"};
String[] batches = {"本科批", "本科提前批"};
String[] years = {"2022", "2023", "2024"};
int id = 1;
for (String year : years) {
for (String university : universities) {
for (String major : majors) {
for (String category : categories) {
AdmissionInfo info = new AdmissionInfo();
info.setUniversityName(university);
info.setUniversityCode(String.format("%04d", 10530 + id++ % 10));
info.setProvince("湖南省");
info.setCategory(category);
info.setMajorName(major);
info.setMajorCode(String.format("%06d", 800000 + id * 10));
info.setPlanCount((int) (Math.random() * 50 + 10));
double baseScore = 550;
if (university.equals("国防科技大学")) baseScore += 50;
if (university.equals("中南大学")) baseScore += 30;
if (major.equals("临床医学")) baseScore += 20;
if (major.equals("计算机科学与技术")) baseScore += 15;
info.setMinScore(baseScore + Math.random() * 60);
info.setMaxScore(info.getMinScore() + Math.random() * 30);
info.setAvgScore((info.getMinScore() + info.getMaxScore()) / 2);
info.setMinRank((int) (Math.random() * 10000 + 1000));
info.setMaxRank(info.getMinRank() + (int) (Math.random() * 500));
info.setYear(year);
info.setBatch(batches[(int) (Math.random() * batches.length)]);
info.setSourceUrl("https://example.edu/admission/" + year);
data.add(info);
}
}
}
}
return data;
}
}

6
project/src/main/java/com/hnu/crawler/command/Command.java

@ -0,0 +1,6 @@
package com.hnu.crawler.command;
/**
 * A console command that can be registered under a name and executed with
 * the tokens of a command line.
 */
public interface Command {
/** Returns the name of this command. */
String getName();
/**
 * Runs the command.
 *
 * @param args the command-line tokens; the callers visible in this project
 *             pass the full token array, so {@code args[0]} is the command
 *             name itself and the actual arguments start at index 1
 */
void execute(String[] args);
}

73
project/src/main/java/com/hnu/crawler/command/CrawlCommand.java

@ -0,0 +1,73 @@
package com.hnu.crawler.command;
import com.hnu.crawler.strategy.CrawlerStrategy;
import com.hnu.crawler.strategy.NewsStrategy;
import com.hnu.crawler.strategy.BlogStrategy;
import com.hnu.crawler.strategy.UniversityStrategy;
import com.hnu.crawler.storage.DataStorage;
import com.hnu.crawler.view.ConsoleView;
import com.hnu.crawler.model.AdmissionInfo;
import java.util.List;
import java.util.Scanner;
/**
 * {@code crawl <type> [URL]} - crawls a site with the strategy selected by
 * {@code type} and stores the result as CSV and JSON under {@code data/}.
 *
 * <p>When the URL is omitted it is requested interactively; an empty URL is
 * rejected instead of being passed to the crawler.
 */
public class CrawlCommand implements Command {
    private static final String SUPPORTED_TYPES = "支持的类型: news(新闻), blog(博客), university(高校)";
    private static final String CSV_PATH = "data/crawler_results.csv";
    private static final String JSON_PATH = "data/crawler_results.json";

    private final ConsoleView view;

    /** @param view console used for prompts and messages */
    public CrawlCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "crawl";
    }

    /**
     * @param args {@code args[0]} is the command name, {@code args[1]} the
     *             site type and the optional {@code args[2]} the target URL
     */
    @Override
    public void execute(String[] args) {
        if (args.length < 2) {
            // The URL is optional (it is prompted for below), matching the help text.
            view.printError("用法: crawl <类型> [URL]");
            view.printInfo(SUPPORTED_TYPES);
            return;
        }
        // Locale.ROOT: "UNIVERSITY" must lower-case to "university" in every locale.
        String type = args[1].toLowerCase(java.util.Locale.ROOT);
        CrawlerStrategy strategy = createStrategy(type);
        if (strategy == null) {
            view.printError("未知类型: " + type);
            view.printInfo(SUPPORTED_TYPES);
            return;
        }
        String url = args.length > 2 ? args[2] : "";
        if (url.isEmpty()) {
            // Not closed on purpose: closing the Scanner would close System.in.
            Scanner scanner = new Scanner(System.in);
            view.printInfo("请输入目标URL: ");
            url = scanner.hasNextLine() ? scanner.nextLine().trim() : "";
        }
        if (url.isEmpty()) {
            view.printError("URL不能为空");
            return;
        }
        view.printInfo("开始爬取 [" + type + "]: " + url);
        List<AdmissionInfo> results = strategy.crawl(url);
        if (results == null || results.isEmpty()) {
            view.printError("未获取到数据,请检查URL或网站结构");
            return;
        }
        DataStorage.saveToCsv(results, CSV_PATH);
        DataStorage.saveToJson(results, JSON_PATH);
        view.printSuccess("爬取完成!共获取 " + results.size() + " 条数据");
        view.printInfo("数据已保存到 data/crawler_results.csv 和 data/crawler_results.json");
    }

    /** Maps a lower-case type name to its strategy, or returns {@code null} when unknown. */
    private CrawlerStrategy createStrategy(String type) {
        switch (type) {
            case "news":
                return new NewsStrategy(view);
            case "blog":
                return new BlogStrategy(view);
            case "university":
                return new UniversityStrategy(view);
            default:
                return null;
        }
    }
}

60
project/src/main/java/com/hnu/crawler/command/DemoCommand.java

@ -0,0 +1,60 @@
package com.hnu.crawler.command;
import com.hnu.crawler.storage.DataStorage;
import com.hnu.crawler.view.ConsoleView;
import com.hnu.crawler.model.AdmissionInfo;
import java.util.ArrayList;
import java.util.List;
/**
 * {@code demo} - generates a small randomized data set and writes it to
 * {@code data/crawler_results.csv} (overwriting) and
 * {@code data/crawler_results.json}.
 */
public class DemoCommand implements Command {
    private static final String CSV_PATH = "data/crawler_results.csv";
    private static final String JSON_PATH = "data/crawler_results.json";

    private final ConsoleView view;

    /** @param view console used for progress messages */
    public DemoCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "demo";
    }

    /** @param args ignored */
    @Override
    public void execute(String[] args) {
        view.printInfo("生成演示数据...");
        List<AdmissionInfo> demoData = createDemoData();
        DataStorage.saveToCsvOverwrite(demoData, CSV_PATH);
        DataStorage.saveToJson(demoData, JSON_PATH);
        view.printSuccess("演示数据生成完成!共 " + demoData.size() + " 条");
        view.printInfo("数据已保存到 data/crawler_results.csv 和 data/crawler_results.json");
    }

    /**
     * Builds one record per (year, university, major) combination with a
     * random minimum score in [550, 630), a maximum up to 30 points above it
     * and their mean as the average.
     */
    private List<AdmissionInfo> createDemoData() {
        List<AdmissionInfo> data = new ArrayList<>();
        String[] universities = {"湖南大学", "中南大学", "湖南师范大学"};
        String[] majors = {"计算机科学与技术", "软件工程", "电子信息工程"};
        String[] years = {"2022", "2023", "2024"};
        for (String year : years) {
            for (String university : universities) {
                for (String major : majors) {
                    AdmissionInfo info = new AdmissionInfo();
                    info.setUniversityName(university);
                    info.setMajorName(major);
                    info.setYear(year);
                    info.setMinScore(550 + Math.random() * 80);
                    info.setMaxScore(info.getMinScore() + Math.random() * 30);
                    info.setAvgScore((info.getMinScore() + info.getMaxScore()) / 2);
                    data.add(info);
                }
            }
        }
        return data;
    }
}

22
project/src/main/java/com/hnu/crawler/command/ExitCommand.java

@ -0,0 +1,22 @@
package com.hnu.crawler.command;
import com.hnu.crawler.view.ConsoleView;
/** {@code exit} - prints a farewell message and terminates the JVM with status 0. */
public class ExitCommand implements Command {
    private static final String NAME = "exit";

    private final ConsoleView view;

    /** @param view console used for the farewell message */
    public ExitCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return NAME;
    }

    /** @param args ignored */
    @Override
    public void execute(String[] args) {
        this.view.printSuccess("程序退出");
        System.exit(0);
    }
}

36
project/src/main/java/com/hnu/crawler/command/HelpCommand.java

@ -0,0 +1,36 @@
package com.hnu.crawler.command;
import com.hnu.crawler.view.ConsoleView;
/** {@code help} - prints the list of available commands and supported site types. */
public class HelpCommand implements Command {
    /** Help text, one entry per printed line, in display order. */
    private static final String[] HELP_LINES = {
        "可用命令:",
        " crawl <类型> [URL] - 爬取网站数据",
        " 类型: news(新闻), blog(博客), university(高校)",
        " 示例: crawl news https://news.example.com",
        " list - 列出所有已爬取的数据",
        " query <选项> <关键词> - 查询数据",
        " 选项: university(院校), major(专业), year(年份)",
        " 示例: query university 湖南",
        " demo - 生成演示数据",
        " help - 显示此帮助信息",
        " exit - 退出程序",
        "",
        "支持的网站类型:",
        " 1. 新闻网站 (news) - 爬取新闻列表和内容",
        " 2. 博客网站 (blog) - 爬取博客文章",
        " 3. 高校网站 (university) - 爬取招生信息",
    };

    private final ConsoleView view;

    /** @param view console the help text is written to */
    public HelpCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "help";
    }

    /** @param args ignored */
    @Override
    public void execute(String[] args) {
        for (String helpLine : HELP_LINES) {
            view.printInfo(helpLine);
        }
    }
}

40
project/src/main/java/com/hnu/crawler/command/ListCommand.java

@ -0,0 +1,40 @@
package com.hnu.crawler.command;
import com.hnu.crawler.query.DataQuery;
import com.hnu.crawler.view.ConsoleView;
import com.hnu.crawler.model.AdmissionInfo;
import java.util.List;
/** {@code list} - prints every stored record with its score range and year. */
public class ListCommand implements Command {
    private static final String SEPARATOR = "=====================================";

    private final ConsoleView view;

    /** @param view console the listing is written to */
    public ListCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "list";
    }

    /** @param args ignored */
    @Override
    public void execute(String[] args) {
        List<AdmissionInfo> records = DataQuery.loadAllData();
        if (records.isEmpty()) {
            view.printInfo("暂无数据,请先使用 crawl 命令爬取数据");
            return;
        }
        view.printInfo("共 " + records.size() + " 条数据:");
        view.printInfo(SEPARATOR);
        int position = 0;
        for (AdmissionInfo record : records) {
            position++;
            view.printInfo(position + ". " + record.getUniversityName() + " - " + record.getMajorName());
            view.printInfo(" 分数: " + record.getMinScore() + " - " + record.getMaxScore());
            view.printInfo(" 年份: " + record.getYear());
        }
        view.printInfo(SEPARATOR);
    }
}

65
project/src/main/java/com/hnu/crawler/command/QueryCommand.java

@ -0,0 +1,65 @@
package com.hnu.crawler.command;
import com.hnu.crawler.query.DataQuery;
import com.hnu.crawler.view.ConsoleView;
import com.hnu.crawler.model.AdmissionInfo;
import java.util.List;
/**
 * {@code query <option> <keyword>} - filters the stored records by
 * university, major or year and prints the matches.
 */
public class QueryCommand implements Command {
    private final ConsoleView view;

    /** @param view console used for results and messages */
    public QueryCommand(ConsoleView view) {
        this.view = view;
    }

    @Override
    public String getName() {
        return "query";
    }

    /**
     * @param args {@code args[0]} is the command name, {@code args[1]} the
     *             option ({@code university}, {@code major} or {@code year})
     *             and everything from {@code args[2]} on forms the keyword
     */
    @Override
    public void execute(String[] args) {
        if (args.length < 3) {
            view.printError("用法: query <选项> <关键词>");
            view.printInfo("选项: university(院校), major(专业), year(年份)");
            return;
        }
        // Locale.ROOT: "UNIVERSITY" must lower-case to "university" in every locale.
        String option = args[1].toLowerCase(java.util.Locale.ROOT);
        // The interactive loop splits input on whitespace, so a keyword that
        // contains spaces arrives as several tokens; re-join them instead of
        // silently dropping everything after the first one.
        String keyword = String.join(" ", java.util.Arrays.copyOfRange(args, 2, args.length));
        List<AdmissionInfo> allData = DataQuery.loadAllData();
        if (allData.isEmpty()) {
            view.printInfo("暂无数据,请先爬取或使用 demo 命令生成演示数据");
            return;
        }
        List<AdmissionInfo> results;
        switch (option) {
            case "university":
                results = DataQuery.queryByUniversity(allData, keyword);
                break;
            case "major":
                results = DataQuery.queryByMajor(allData, keyword);
                break;
            case "year":
                results = DataQuery.queryByYear(allData, keyword);
                break;
            default:
                view.printError("未知选项: " + option);
                return;
        }
        if (results.isEmpty()) {
            view.printInfo("未找到匹配的数据");
            return;
        }
        view.printInfo("找到 " + results.size() + " 条匹配数据:");
        for (AdmissionInfo info : results) {
            view.printInfo("- " + info.getUniversityName() + " - " + info.getMajorName());
        }
    }
}

135
project/src/main/java/com/hnu/crawler/config/ConfigManager.java

@ -0,0 +1,135 @@
package com.hnu.crawler.config;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import com.hnu.crawler.model.UniversityConfig;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Loads and stores the university configuration kept in
 * {@code config/universities.json}.
 *
 * <p>All methods are static. I/O failures are logged and reported to the
 * caller as an empty result rather than as an exception.
 */
public class ConfigManager {
    private static final Logger logger = LoggerFactory.getLogger(ConfigManager.class);
    private static final ObjectMapper objectMapper = new ObjectMapper();
    private static final String CONFIG_DIR = "config";
    private static final String UNIVERSITIES_CONFIG = CONFIG_DIR + "/universities.json";

    static {
        objectMapper.registerModule(new JavaTimeModule());
        objectMapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
        objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
    }

    /** Creates the configuration directory when it does not exist yet. */
    public static void ensureConfigDir() {
        File dir = new File(CONFIG_DIR);
        if (dir.exists()) {
            return;
        }
        if (dir.mkdirs()) {
            logger.info("创建配置目录: {}", CONFIG_DIR);
        } else if (!dir.isDirectory()) {
            // mkdirs() also returns false when the directory appeared in the
            // meantime; only a directory that is still missing is an error.
            logger.error("创建配置目录失败: {}", CONFIG_DIR);
        }
    }

    /**
     * Reads all configured universities.
     *
     * @return the configured universities; an empty mutable list when the
     *         file is missing, unreadable or contains no list
     */
    public static List<UniversityConfig> loadUniversities() {
        ensureConfigDir();
        File configFile = new File(UNIVERSITIES_CONFIG);
        if (!configFile.exists()) {
            logger.warn("配置文件不存在: {}", UNIVERSITIES_CONFIG);
            return new ArrayList<>();
        }
        try {
            List<UniversityConfig> loaded = objectMapper.readValue(configFile,
                    new TypeReference<List<UniversityConfig>>() {});
            if (loaded == null) {
                // A file containing the JSON literal null deserializes to null.
                return new ArrayList<>();
            }
            return loaded;
        } catch (IOException e) {
            logger.error("加载高校配置失败", e);
            return new ArrayList<>();
        }
    }

    /**
     * Writes {@code universities} to the configuration file, replacing its
     * content. Failures are logged, not thrown.
     *
     * @param universities the configuration to store
     */
    public static void saveUniversities(List<UniversityConfig> universities) {
        ensureConfigDir();
        try {
            objectMapper.writeValue(new File(UNIVERSITIES_CONFIG), universities);
            logger.info("保存高校配置成功,共 {} 所高校", universities.size());
        } catch (IOException e) {
            logger.error("保存高校配置失败", e);
        }
    }

    /**
     * Finds the first university whose id equals {@code id}.
     *
     * @param id the id to look for
     * @return the matching configuration, or {@code null} when none matches
     */
    public static UniversityConfig findUniversityById(String id) {
        if (id == null) {
            return null;
        }
        // id.equals(u.getId()) tolerates entries whose id is missing in the file.
        return loadUniversities().stream()
                .filter(u -> u != null && id.equals(u.getId()))
                .findFirst()
                .orElse(null);
    }

    /**
     * Finds every university whose name contains {@code name}.
     *
     * @param name the substring to look for
     * @return the matching configurations; empty when {@code name} is
     *         {@code null} or nothing matches
     */
    public static List<UniversityConfig> findUniversityByName(String name) {
        List<UniversityConfig> result = new ArrayList<>();
        if (name == null) {
            return result;
        }
        for (UniversityConfig uni : loadUniversities()) {
            // Entries without a name cannot match and must not cause an NPE.
            if (uni != null && uni.getName() != null && uni.getName().contains(name)) {
                result.add(uni);
            }
        }
        return result;
    }

    /**
     * Writes a sample configuration with two universities unless the
     * configuration file already exists.
     */
    public static void createSampleConfig() {
        ensureConfigDir();
        File configFile = new File(UNIVERSITIES_CONFIG);
        if (configFile.exists()) {
            logger.info("示例配置已存在,跳过创建");
            return;
        }
        List<UniversityConfig> sampleConfigs = new ArrayList<>();
        sampleConfigs.add(buildSampleUniversity("hnu", "湖南大学", "10532",
                "https://admission.hnu.edu.cn",
                "https://admission.hnu.edu.cn/info/1008/3001.htm"));
        sampleConfigs.add(buildSampleUniversity("csu", "中南大学", "10533",
                "https://zhaosheng.csu.edu.cn",
                "https://zhaosheng.csu.edu.cn/xxfw/lnfs.htm"));
        saveUniversities(sampleConfigs);
        // saveUniversities only logs failures, so confirm the file exists
        // before claiming success.
        if (configFile.exists()) {
            logger.info("创建示例配置文件成功: {}", UNIVERSITIES_CONFIG);
        }
    }

    /** Builds a sample university in 湖南省 with a single 2024 admission page. */
    private static UniversityConfig buildSampleUniversity(String id, String name, String code,
                                                          String baseUrl, String pageUrl) {
        UniversityConfig.AdmissionPageConfig page = new UniversityConfig.AdmissionPageConfig();
        page.setYear("2024");
        page.setUrl(pageUrl);
        page.setDescription("2024年本科招生分数线");
        page.setTableSelector("table");
        List<UniversityConfig.AdmissionPageConfig> pages = new ArrayList<>();
        pages.add(page);

        UniversityConfig university = new UniversityConfig();
        university.setId(id);
        university.setName(name);
        university.setCode(code);
        university.setProvince("湖南省");
        university.setBaseUrl(baseUrl);
        university.setAdmissionPages(pages);
        return university;
    }
}

219
project/src/main/java/com/hnu/crawler/model/AdmissionInfo.java

@ -0,0 +1,219 @@
package com.hnu.crawler.model;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
/**
 * One admission record: university, major, plan count, score and rank
 * figures, plus the year, batch, remarks, crawl timestamp and source URL.
 *
 * <p>A plain mutable bean; every field except the crawl time starts as
 * {@code null}. The crawl time is set to the moment of construction.
 */
public class AdmissionInfo {
    /** CSV column titles, in the same order as {@link #toCsvRow()}. */
    private static final String[] CSV_HEADERS = {
        "院校名称", "院校代码", "省份", "科类", "专业名称", "专业代码",
        "计划数", "最低分", "最高分", "平均分", "最低位次", "最高位次",
        "年份", "批次", "备注", "爬取时间", "来源URL",
    };

    private String universityName;
    private String universityCode;
    private String province;
    private String category;
    private String majorName;
    private String majorCode;
    private Integer planCount;
    private Double minScore;
    private Double maxScore;
    private Double avgScore;
    private Integer minRank;
    private Integer maxRank;
    private String year;
    private String batch;
    private String remarks;
    private LocalDateTime crawlTime;
    private String sourceUrl;

    /** Creates an empty record stamped with the current local date-time. */
    public AdmissionInfo() {
        this.crawlTime = LocalDateTime.now();
    }

    // ---- university ----
    public String getUniversityName() { return universityName; }
    public void setUniversityName(String universityName) { this.universityName = universityName; }
    public String getUniversityCode() { return universityCode; }
    public void setUniversityCode(String universityCode) { this.universityCode = universityCode; }
    public String getProvince() { return province; }
    public void setProvince(String province) { this.province = province; }
    public String getCategory() { return category; }
    public void setCategory(String category) { this.category = category; }

    // ---- major ----
    public String getMajorName() { return majorName; }
    public void setMajorName(String majorName) { this.majorName = majorName; }
    public String getMajorCode() { return majorCode; }
    public void setMajorCode(String majorCode) { this.majorCode = majorCode; }
    public Integer getPlanCount() { return planCount; }
    public void setPlanCount(Integer planCount) { this.planCount = planCount; }

    // ---- scores and ranks ----
    public Double getMinScore() { return minScore; }
    public void setMinScore(Double minScore) { this.minScore = minScore; }
    public Double getMaxScore() { return maxScore; }
    public void setMaxScore(Double maxScore) { this.maxScore = maxScore; }
    public Double getAvgScore() { return avgScore; }
    public void setAvgScore(Double avgScore) { this.avgScore = avgScore; }
    public Integer getMinRank() { return minRank; }
    public void setMinRank(Integer minRank) { this.minRank = minRank; }
    public Integer getMaxRank() { return maxRank; }
    public void setMaxRank(Integer maxRank) { this.maxRank = maxRank; }

    // ---- metadata ----
    public String getYear() { return year; }
    public void setYear(String year) { this.year = year; }
    public String getBatch() { return batch; }
    public void setBatch(String batch) { this.batch = batch; }
    public String getRemarks() { return remarks; }
    public void setRemarks(String remarks) { this.remarks = remarks; }
    public LocalDateTime getCrawlTime() { return crawlTime; }
    public void setCrawlTime(LocalDateTime crawlTime) { this.crawlTime = crawlTime; }
    public String getSourceUrl() { return sourceUrl; }
    public void setSourceUrl(String sourceUrl) { this.sourceUrl = sourceUrl; }

    /**
     * Returns the CSV column titles.
     *
     * @return a new mutable list of the 17 column titles
     */
    public static List<String> getHeaders() {
        List<String> headers = new ArrayList<>();
        for (String title : CSV_HEADERS) {
            headers.add(title);
        }
        return headers;
    }

    /**
     * Renders this record as CSV cell values in header order.
     *
     * @return a new mutable list of 17 strings; a {@code null} field becomes
     *         the empty string
     */
    public List<String> toCsvRow() {
        Object[] values = {
            universityName, universityCode, province, category, majorName, majorCode,
            planCount, minScore, maxScore, avgScore, minRank, maxRank,
            year, batch, remarks, crawlTime, sourceUrl,
        };
        List<String> row = new ArrayList<>();
        for (Object value : values) {
            row.add(cellText(value));
        }
        return row;
    }

    /** Returns {@code value.toString()}, or the empty string for {@code null}. */
    private static String cellText(Object value) {
        if (value == null) {
            return "";
        }
        return value.toString();
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("AdmissionInfo{");
        text.append("universityName='").append(universityName).append('\'');
        text.append(", majorName='").append(majorName).append('\'');
        text.append(", year='").append(year).append('\'');
        text.append(", minScore=").append(minScore);
        text.append('}');
        return text.toString();
    }
}

125
project/src/main/java/com/hnu/crawler/model/UniversityConfig.java

@ -0,0 +1,125 @@
package com.hnu.crawler.model;
import java.util.List;
/**
 * Configuration entry for one university: identity, base URL and the list of
 * admission pages to crawl. New entries are enabled by default.
 */
public class UniversityConfig {
    private String id;
    private String name;
    private String code;
    private String province;
    private String baseUrl;
    private List<AdmissionPageConfig> admissionPages;
    private boolean enabled = true;

    /** Creates an enabled entry with all other fields unset. */
    public UniversityConfig() {
    }

    public String getId() { return id; }
    public void setId(String id) { this.id = id; }

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }

    public String getCode() { return code; }
    public void setCode(String code) { this.code = code; }

    public String getProvince() { return province; }
    public void setProvince(String province) { this.province = province; }

    public String getBaseUrl() { return baseUrl; }
    public void setBaseUrl(String baseUrl) { this.baseUrl = baseUrl; }

    /** @return the configured pages, or {@code null} when none were set */
    public List<AdmissionPageConfig> getAdmissionPages() { return admissionPages; }
    public void setAdmissionPages(List<AdmissionPageConfig> admissionPages) { this.admissionPages = admissionPages; }

    public boolean isEnabled() { return enabled; }
    public void setEnabled(boolean enabled) { this.enabled = enabled; }

    /**
     * One admission page of a university: the year it covers, its URL, a
     * description and the selector of the table holding the data. New pages
     * are enabled by default.
     */
    public static class AdmissionPageConfig {
        private String year;
        private String url;
        private String description;
        private String tableSelector;
        private boolean enabled = true;

        /** Creates an enabled page with all other fields unset. */
        public AdmissionPageConfig() {
        }

        public String getYear() { return year; }
        public void setYear(String year) { this.year = year; }

        public String getUrl() { return url; }
        public void setUrl(String url) { this.url = url; }

        public String getDescription() { return description; }
        public void setDescription(String description) { this.description = description; }

        public String getTableSelector() { return tableSelector; }
        public void setTableSelector(String tableSelector) { this.tableSelector = tableSelector; }

        public boolean isEnabled() { return enabled; }
        public void setEnabled(boolean enabled) { this.enabled = enabled; }
    }
}

159
project/src/main/java/com/hnu/crawler/parser/AdmissionParser.java

@ -0,0 +1,159 @@
package com.hnu.crawler.parser;
import com.hnu.crawler.model.AdmissionInfo;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
public class AdmissionParser {
private static final Logger logger = LoggerFactory.getLogger(AdmissionParser.class);
/**
 * Extracts admission records from every {@code <table>} in {@code html}.
 *
 * <p>The first row of each table supplies the column headers; each later row
 * with at least as many {@code <td>} cells as there are headers becomes one
 * record, which is kept only when a major name or a minimum score was
 * recognised. Any exception during parsing is logged and the records
 * collected so far are returned.
 *
 * @param html           the page markup
 * @param sourceUrl      stored on every record as its source URL
 * @param universityName stored on every record
 * @param year           stored on every record
 * @return the parsed records; empty when no table or no usable row was found
 */
public static List<AdmissionInfo> parseTable(String html, String sourceUrl, String universityName, String year) {
    List<AdmissionInfo> parsed = new ArrayList<>();
    try {
        Elements tables = Jsoup.parse(html).select("table");
        if (tables.isEmpty()) {
            logger.warn("未找到表格数据");
            return parsed;
        }
        for (Element table : tables) {
            Elements rows = table.select("tr");
            if (rows.size() <= 1) {
                // Header only (or nothing): no data rows to read.
                continue;
            }
            List<String> headers = new ArrayList<>();
            for (Element headerCell : rows.first().select("th, td")) {
                headers.add(headerCell.text().trim());
            }
            for (Element row : rows.subList(1, rows.size())) {
                Elements cells = row.select("td");
                if (cells.size() < headers.size()) {
                    continue;
                }
                AdmissionInfo info = new AdmissionInfo();
                info.setUniversityName(universityName);
                info.setYear(year);
                info.setSourceUrl(sourceUrl);
                // The row has at least one cell per header, so walking the
                // headers visits exactly the paired columns.
                int column = 0;
                for (String header : headers) {
                    parseField(info, header, cells.get(column).text().trim());
                    column++;
                }
                if (info.getMajorName() == null && info.getMinScore() == null) {
                    continue;
                }
                parsed.add(info);
            }
        }
        logger.info("解析到 {} 条招生信息", parsed.size());
    } catch (Exception e) {
        logger.error("解析HTML失败", e);
    }
    return parsed;
}
private static void parseField(AdmissionInfo info, String header, String value) {
if (value == null || value.isEmpty()) {
return;
}
header = header.toLowerCase();
if (header.contains("专业") && (header.contains("名称") || header.contains("专业名"))) {
info.setMajorName(value);
} else if (header.contains("专业") && header.contains("代码")) {
info.setMajorCode(value);
} else if (header.contains("院校") && header.contains("代码")) {
info.setUniversityCode(value);
} else if (header.contains("省份")) {
info.setProvince(value);
} else if (header.contains("科类") || header.contains("文理") || header.contains("科目")) {
info.setCategory(value);
} else if (header.contains("计划") || header.contains("人数")) {
try {
info.setPlanCount(Integer.parseInt(value));
} catch (NumberFormatException e) {
}
} else if (header.contains("最低") && header.contains("分")) {
try {
info.setMinScore(Double.parseDouble(value));
} catch (NumberFormatException e) {
}
} else if (header.contains("最高") && header.contains("分")) {
try {
info.setMaxScore(Double.parseDouble(value));
} catch (NumberFormatException e) {
}
} else if (header.contains("平均") && header.contains("分")) {
try {
info.setAvgScore(Double.parseDouble(value));
} catch (NumberFormatException e) {
}
} else if (header.contains("最低") && header.contains("位次")) {
try {
info.setMinRank(Integer.parseInt(value));
} catch (NumberFormatException e) {
}
} else if (header.contains("最高") && header.contains("位次")) {
try {
info.setMaxRank(Integer.parseInt(value));
} catch (NumberFormatException e) {
}
} else if (header.contains("批次")) {
info.setBatch(value);
} else if (header.contains("备注") || header.contains("说明")) {
info.setRemarks(value);
}
}
public static List<String> extractUrls(String html, String baseUrl) {
List<String> urls = new ArrayList<>();
try {
Document doc = Jsoup.parse(html);
Elements links = doc.select("a[href]");
for (Element link : links) {
String href = link.attr("abs:href");
if (href.isEmpty()) {
href = link.attr("href");
if (!href.startsWith("http") && baseUrl != null) {
if (href.startsWith("/")) {
href = baseUrl + href;
} else {
href = baseUrl + "/" + href;
}
}
}
if (!href.isEmpty() && (href.contains("zhaosheng") || href.contains("zs") ||
href.contains("admission") || href.contains("fenshu") ||
href.contains("score") || href.contains("lishi"))) {
urls.add(href);
}
}
} catch (Exception e) {
logger.error("提取URL失败", e);
}
return urls;
}
}

216
project/src/main/java/com/hnu/crawler/query/DataQuery.java

@ -0,0 +1,216 @@
package com.hnu.crawler.query;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import com.hnu.crawler.model.AdmissionInfo;
import com.opencsv.CSVReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Loads previously saved admission records from the {@code data} directory and
 * offers simple in-memory filtering, sorting and console printing.
 */
public class DataQuery {
    private static final Logger logger = LoggerFactory.getLogger(DataQuery.class);
    private static final ObjectMapper objectMapper = new ObjectMapper();
    private static final String DATA_DIR = "data";

    static {
        objectMapper.registerModule(new JavaTimeModule());
    }

    /**
     * Loads the records of {@code data/admission_info.csv} followed by those of
     * {@code data/admission_info.json}; files that do not exist are skipped.
     *
     * <p>NOTE(review): when both files hold the same records the result contains each
     * record twice — confirm with the callers whether that is intended.
     *
     * @return all loaded records; empty when the data directory does not exist
     */
    public static List<AdmissionInfo> loadAllData() {
        List<AdmissionInfo> allData = new ArrayList<>();
        Path dataPath = Paths.get(DATA_DIR);
        if (!Files.exists(dataPath)) {
            logger.warn("数据目录不存在: {}", DATA_DIR);
            return allData;
        }
        Path csvFile = dataPath.resolve("admission_info.csv");
        if (Files.exists(csvFile)) {
            allData.addAll(loadFromCsv(csvFile.toString()));
        }
        Path jsonFile = dataPath.resolve("admission_info.json");
        if (Files.exists(jsonFile)) {
            allData.addAll(loadFromJson(jsonFile.toString()));
        }
        return allData;
    }

    /**
     * Reads records from a CSV file whose first row holds the column headers.
     * Completely blank rows are skipped instead of producing empty records.
     *
     * @param filePath path of the CSV file
     * @return the parsed records; empty when the file has no data rows or cannot be read
     */
    public static List<AdmissionInfo> loadFromCsv(String filePath) {
        List<AdmissionInfo> data = new ArrayList<>();
        try (CSVReader reader = new CSVReader(new FileReader(filePath))) {
            List<String[]> rows = reader.readAll();
            if (rows.size() <= 1) {
                return data;
            }
            String[] headers = rows.get(0);
            for (int i = 1; i < rows.size(); i++) {
                String[] row = rows.get(i);
                if (isBlankRow(row)) {
                    continue;
                }
                AdmissionInfo info = new AdmissionInfo();
                for (int j = 0; j < headers.length && j < row.length; j++) {
                    setField(info, headers[j], row[j]);
                }
                data.add(info);
            }
        } catch (Exception e) {
            logger.error("从CSV加载数据失败: {}", filePath, e);
        }
        return data;
    }

    /** Returns whether every cell of the row is {@code null} or whitespace only. */
    private static boolean isBlankRow(String[] row) {
        if (row == null) {
            return true;
        }
        for (String cell : row) {
            if (cell != null && !cell.trim().isEmpty()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Reads a JSON array of records.
     *
     * @param filePath path of the JSON file
     * @return the deserialized records; empty when the file cannot be read or parsed
     */
    public static List<AdmissionInfo> loadFromJson(String filePath) {
        try {
            return objectMapper.readValue(Paths.get(filePath).toFile(),
                    new TypeReference<List<AdmissionInfo>>() {});
        } catch (IOException e) {
            logger.error("从JSON加载数据失败: {}", filePath, e);
            return new ArrayList<>();
        }
    }

    /**
     * Assigns one CSV cell to the property named by its column header. Unknown headers
     * and empty values are ignored; numeric cells that fail to parse leave the field unset.
     */
    private static void setField(AdmissionInfo info, String header, String value) {
        if (header == null || value == null || value.isEmpty()) {
            return;
        }
        String key = header.trim();
        // A UTF-8 byte-order mark would otherwise stick to the first header and defeat the match.
        if (!key.isEmpty() && key.charAt(0) == '\uFEFF') {
            key = key.substring(1).trim();
        }
        try {
            switch (key) {
                case "院校名称":
                    info.setUniversityName(value);
                    break;
                case "院校代码":
                    info.setUniversityCode(value);
                    break;
                case "省份":
                    info.setProvince(value);
                    break;
                case "科类":
                    info.setCategory(value);
                    break;
                case "专业名称":
                    info.setMajorName(value);
                    break;
                case "专业代码":
                    info.setMajorCode(value);
                    break;
                case "计划数":
                    info.setPlanCount(Integer.parseInt(value.trim()));
                    break;
                case "最低分":
                    info.setMinScore(Double.parseDouble(value.trim()));
                    break;
                case "最高分":
                    info.setMaxScore(Double.parseDouble(value.trim()));
                    break;
                case "平均分":
                    info.setAvgScore(Double.parseDouble(value.trim()));
                    break;
                case "最低位次":
                    info.setMinRank(Integer.parseInt(value.trim()));
                    break;
                case "最高位次":
                    info.setMaxRank(Integer.parseInt(value.trim()));
                    break;
                case "年份":
                    info.setYear(value);
                    break;
                case "批次":
                    info.setBatch(value);
                    break;
                case "备注":
                    info.setRemarks(value);
                    break;
                default:
                    // Columns this loader does not know are ignored on purpose.
                    break;
            }
        } catch (NumberFormatException e) {
            logger.debug("忽略无法解析的数值字段 [{}]: {}", key, value);
        }
    }

    /**
     * Keeps the records whose university name contains the given text.
     *
     * @return a new list; empty when {@code universityName} is {@code null}
     */
    public static List<AdmissionInfo> queryByUniversity(List<AdmissionInfo> data, String universityName) {
        if (universityName == null) {
            return new ArrayList<>();
        }
        return data.stream()
                .filter(info -> info.getUniversityName() != null
                        && info.getUniversityName().contains(universityName))
                .collect(Collectors.toList());
    }

    /**
     * Keeps the records whose major name contains the given text.
     *
     * @return a new list; empty when {@code majorName} is {@code null}
     */
    public static List<AdmissionInfo> queryByMajor(List<AdmissionInfo> data, String majorName) {
        if (majorName == null) {
            return new ArrayList<>();
        }
        return data.stream()
                .filter(info -> info.getMajorName() != null
                        && info.getMajorName().contains(majorName))
                .collect(Collectors.toList());
    }

    /** Keeps the records whose year equals the given year exactly. */
    public static List<AdmissionInfo> queryByYear(List<AdmissionInfo> data, String year) {
        return data.stream()
                .filter(info -> info.getYear() != null && info.getYear().equals(year))
                .collect(Collectors.toList());
    }

    /**
     * Keeps the records whose minimum score lies within {@code [minScore, maxScore]};
     * records without a minimum score are dropped.
     */
    public static List<AdmissionInfo> queryByScoreRange(List<AdmissionInfo> data,
                                                        double minScore, double maxScore) {
        return data.stream()
                .filter(info -> info.getMinScore() != null
                        && info.getMinScore() >= minScore && info.getMinScore() <= maxScore)
                .collect(Collectors.toList());
    }

    /**
     * Returns a copy of the records sorted by minimum score. Records without a
     * minimum score are placed last for both sort directions.
     *
     * @param ascending {@code true} for lowest score first, {@code false} for highest first
     */
    public static List<AdmissionInfo> sortByScore(List<AdmissionInfo> data, boolean ascending) {
        return data.stream()
                .sorted((a, b) -> {
                    Double scoreA = a.getMinScore();
                    Double scoreB = b.getMinScore();
                    if (scoreA == null && scoreB == null) return 0;
                    if (scoreA == null) return 1;
                    if (scoreB == null) return -1;
                    return ascending ? scoreA.compareTo(scoreB) : scoreB.compareTo(scoreA);
                })
                .collect(Collectors.toList());
    }

    /**
     * Prints the records as a fixed-width table on standard output. Missing scores are
     * shown as {@code 0.0}; a {@code null} or empty list prints a "nothing found" message.
     */
    public static void printResults(List<AdmissionInfo> results) {
        if (results == null || results.isEmpty()) {
            System.out.println("未找到匹配的结果");
            return;
        }
        System.out.println("\n查询结果 (共 " + results.size() + " 条):");
        System.out.println("=".repeat(120));
        System.out.printf("%-15s %-10s %-20s %-8s %-8s %-8s %s%n",
                "院校", "年份", "专业", "最低分", "最高分", "平均分", "批次");
        System.out.println("-".repeat(120));
        for (AdmissionInfo info : results) {
            System.out.printf("%-15s %-10s %-20s %-8.1f %-8.1f %-8.1f %s%n",
                    truncate(info.getUniversityName(), 15),
                    info.getYear() != null ? info.getYear() : "",
                    truncate(info.getMajorName(), 20),
                    info.getMinScore() != null ? info.getMinScore() : 0,
                    info.getMaxScore() != null ? info.getMaxScore() : 0,
                    info.getAvgScore() != null ? info.getAvgScore() : 0,
                    info.getBatch() != null ? info.getBatch() : "");
        }
        System.out.println("=".repeat(120));
    }

    /** Shortens the text to at most {@code maxLen} characters, ending in ".." when cut. */
    private static String truncate(String str, int maxLen) {
        if (str == null) return "";
        return str.length() > maxLen ? str.substring(0, maxLen - 2) + ".." : str;
    }
}

114
project/src/main/java/com/hnu/crawler/storage/DataStorage.java

@ -0,0 +1,114 @@
package com.hnu.crawler.storage;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import com.hnu.crawler.model.AdmissionInfo;
import com.opencsv.CSVWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
/**
 * Persists admission records as CSV or JSON files, creating parent directories on demand.
 */
public class DataStorage {
    private static final Logger logger = LoggerFactory.getLogger(DataStorage.class);
    private static final ObjectMapper objectMapper = new ObjectMapper();

    static {
        objectMapper.registerModule(new JavaTimeModule());
        objectMapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
        objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
    }

    /**
     * Creates the directory (and any missing parents) if it does not exist yet.
     * Failures are logged, not thrown.
     *
     * @param directoryPath directory to create; {@code null} or empty means "current
     *                      directory" (the parent of a bare file name) and is a no-op
     */
    public static void ensureDirectoryExists(String directoryPath) {
        if (directoryPath == null || directoryPath.isEmpty()) {
            // File.getParent() is null for a bare file name; there is nothing to create then.
            return;
        }
        try {
            Path path = Paths.get(directoryPath);
            if (!Files.exists(path)) {
                Files.createDirectories(path);
                logger.info("创建目录: {}", directoryPath);
            }
        } catch (IOException e) {
            logger.error("创建目录失败: {}", directoryPath, e);
        }
    }

    /**
     * Appends the records to a CSV file, writing the header row first when the file is
     * new or empty. Nothing is written for a {@code null} or empty list.
     *
     * @param infoList records to append
     * @param filePath target CSV file
     */
    public static void saveToCsv(List<AdmissionInfo> infoList, String filePath) {
        if (infoList == null || infoList.isEmpty()) {
            logger.warn("数据为空,跳过CSV保存");
            return;
        }
        File file = new File(filePath);
        ensureDirectoryExists(file.getParent());
        // Decide before opening the writer, because opening in append mode creates the file.
        boolean needsHeader = !file.exists() || file.length() == 0;
        try (CSVWriter writer = new CSVWriter(new FileWriter(filePath, true))) {
            if (needsHeader) {
                writer.writeNext(AdmissionInfo.getHeaders().toArray(new String[0]));
            }
            for (AdmissionInfo info : infoList) {
                writer.writeNext(info.toCsvRow().toArray(new String[0]));
            }
            logger.info("成功保存 {} 条数据到CSV: {}", infoList.size(), filePath);
        } catch (IOException e) {
            logger.error("保存CSV失败: {}", filePath, e);
        }
    }

    /**
     * Appends the records to a JSON array file by reading the existing array, adding the
     * new records and rewriting the file. A missing or zero-length file is treated as
     * having no existing records. Nothing is written for a {@code null} or empty list.
     *
     * @param infoList records to append
     * @param filePath target JSON file
     */
    public static void saveToJson(List<AdmissionInfo> infoList, String filePath) {
        if (infoList == null || infoList.isEmpty()) {
            logger.warn("数据为空,跳过JSON保存");
            return;
        }
        File file = new File(filePath);
        ensureDirectoryExists(file.getParent());
        try {
            List<AdmissionInfo> existingData = null;
            // An empty file is not valid JSON; reading it would abort the save and lose the new data.
            if (file.exists() && file.length() > 0) {
                existingData = objectMapper.readValue(file,
                        objectMapper.getTypeFactory().constructCollectionType(List.class, AdmissionInfo.class));
            }
            if (existingData != null) {
                existingData.addAll(infoList);
                objectMapper.writeValue(file, existingData);
            } else {
                objectMapper.writeValue(file, infoList);
            }
            logger.info("成功保存 {} 条数据到JSON: {}", infoList.size(), filePath);
        } catch (IOException e) {
            logger.error("保存JSON失败: {}", filePath, e);
        }
    }

    /**
     * Replaces the CSV file with a header row followed by the given records.
     * Nothing is written for a {@code null} or empty list.
     *
     * @param infoList records to write
     * @param filePath target CSV file
     */
    public static void saveToCsvOverwrite(List<AdmissionInfo> infoList, String filePath) {
        if (infoList == null || infoList.isEmpty()) {
            logger.warn("数据为空,跳过CSV保存");
            return;
        }
        ensureDirectoryExists(new File(filePath).getParent());
        try (CSVWriter writer = new CSVWriter(new FileWriter(filePath))) {
            writer.writeNext(AdmissionInfo.getHeaders().toArray(new String[0]));
            for (AdmissionInfo info : infoList) {
                writer.writeNext(info.toCsvRow().toArray(new String[0]));
            }
            logger.info("成功覆盖保存 {} 条数据到CSV: {}", infoList.size(), filePath);
        } catch (IOException e) {
            logger.error("保存CSV失败: {}", filePath, e);
        }
    }
}

75
project/src/main/java/com/hnu/crawler/strategy/BlogStrategy.java

@ -0,0 +1,75 @@
package com.hnu.crawler.strategy;
import com.hnu.crawler.model.AdmissionInfo;
import com.hnu.crawler.util.HttpClientUtil;
import com.hnu.crawler.view.ConsoleView;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler strategy that turns the post entries of a blog page into
 * {@link AdmissionInfo} records (post title stored as the major name).
 */
public class BlogStrategy implements CrawlerStrategy {
    private final ConsoleView view;

    public BlogStrategy(ConsoleView view) {
        this.view = view;
    }

    /**
     * Fetches the page and creates one record per post element that has a title.
     *
     * @param url blog page to fetch
     * @return the records found; empty when the page could not be fetched or parsing failed
     */
    @Override
    public List<AdmissionInfo> crawl(String url) {
        List<AdmissionInfo> posts = new ArrayList<>();
        try {
            String html = HttpClientUtil.fetchHtml(url);
            if (html == null) {
                return posts;
            }
            Document page = Jsoup.parse(html);
            for (Element entry : page.select(".post, .blog-post, .entry, article")) {
                Element heading = entry.select("h1, h2, .post-title, .entry-title, .title").first();
                String title = heading == null ? "" : heading.text();
                Element anchor = entry.select("a").first();
                // Entries without a link fall back to the page URL itself.
                String articleUrl = anchor == null ? url : anchor.attr("abs:href");
                String author = entry.select(".author, .post-author, .byline").text();
                if (title.isEmpty()) {
                    continue;
                }
                AdmissionInfo record = new AdmissionInfo();
                record.setUniversityName("博客文章");
                record.setMajorName(title);
                record.setYear(String.valueOf(java.time.LocalDate.now().getYear()));
                record.setSourceUrl(articleUrl);
                if (!author.isEmpty()) {
                    record.setRemarks("作者: " + author);
                }
                posts.add(record);
            }
            view.printInfo("解析到 " + posts.size() + " 篇博客文章");
        } catch (Exception e) {
            view.printError("爬取博客失败: " + e.getMessage());
        }
        return posts;
    }

    /** Prompts for a blog URL on the console and crawls it. */
    @Override
    public List<AdmissionInfo> crawl(java.util.Scanner scanner) {
        view.printInfo("请输入博客网站URL: ");
        return crawl(scanner.nextLine().trim());
    }

    @Override
    public String getStrategyName() {
        return "博客网站爬取";
    }
}

117
project/src/main/java/com/hnu/crawler/strategy/ConfigBasedCrawler.java

@ -0,0 +1,117 @@
package com.hnu.crawler.strategy;
import com.hnu.crawler.config.ConfigManager;
import com.hnu.crawler.model.AdmissionInfo;
import com.hnu.crawler.model.UniversityConfig;
import com.hnu.crawler.parser.AdmissionParser;
import com.hnu.crawler.util.HttpClientUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
/**
 * Crawler strategy that walks the universities returned by
 * {@link ConfigManager#loadUniversities()} and parses every enabled admission page.
 */
public class ConfigBasedCrawler implements CrawlerStrategy {
    private static final Logger logger = LoggerFactory.getLogger(ConfigBasedCrawler.class);

    /** Pause between two page requests when the user enters nothing or something invalid. */
    private static final long DEFAULT_DELAY_MS = 2000;

    /**
     * Interactive batch crawl: lists the configured universities, asks which one to crawl
     * (0 = all) and for the delay between requests, then crawls every enabled page.
     *
     * @param scanner source of the user's answers
     * @return all parsed records; empty when there is no configuration or the choice is
     *         invalid; partial when the crawl was interrupted
     */
    @Override
    public List<AdmissionInfo> crawl(Scanner scanner) {
        System.out.println("\n=== 从配置文件批量爬取 ===");
        List<UniversityConfig> universities = ConfigManager.loadUniversities();
        if (universities.isEmpty()) {
            System.out.println("未找到配置的高校,请先编辑 config/universities.json 文件");
            return List.of();
        }
        System.out.println("已配置的高校列表:");
        for (int i = 0; i < universities.size(); i++) {
            UniversityConfig uni = universities.get(i);
            System.out.printf(" %d. %s (%s)%n", i + 1, uni.getName(), uni.getProvince());
        }
        System.out.print("\n请选择要爬取的高校编号(0表示全部): ");
        int choice;
        try {
            choice = Integer.parseInt(scanner.nextLine().trim());
        } catch (NumberFormatException e) {
            System.out.println("输入无效!");
            return List.of();
        }
        List<UniversityConfig> toCrawl = new ArrayList<>();
        if (choice == 0) {
            toCrawl = universities;
        } else if (choice > 0 && choice <= universities.size()) {
            toCrawl.add(universities.get(choice - 1));
        } else {
            System.out.println("无效的选择!");
            return List.of();
        }
        System.out.print("请输入请求间隔(毫秒,默认2000): ");
        long delay = parseDelay(scanner.nextLine().trim());

        List<AdmissionInfo> allResults = new ArrayList<>();
        for (UniversityConfig uni : toCrawl) {
            if (!uni.isEnabled()) {
                System.out.printf("跳过已禁用的高校: %s%n", uni.getName());
                continue;
            }
            System.out.printf("%n正在处理: %s%n", uni.getName());
            if (!crawlPages(uni, delay, allResults)) {
                // Interrupted: stop issuing requests and hand back what was collected so far.
                System.out.println("爬取被中断,提前结束");
                break;
            }
        }
        System.out.printf("%n批量爬取完成!共获取 %d 条数据%n", allResults.size());
        return allResults;
    }

    /**
     * Parses the delay entered by the user. Empty input, non-numeric input and negative
     * values all fall back to {@link #DEFAULT_DELAY_MS} instead of aborting the crawl.
     */
    private static long parseDelay(String delayStr) {
        if (delayStr.isEmpty()) {
            return DEFAULT_DELAY_MS;
        }
        try {
            long delay = Long.parseLong(delayStr);
            if (delay >= 0) {
                return delay;
            }
        } catch (NumberFormatException e) {
            logger.debug("无效的请求间隔输入: {}", delayStr);
        }
        System.out.println("请求间隔无效,使用默认值 " + DEFAULT_DELAY_MS + " 毫秒");
        return DEFAULT_DELAY_MS;
    }

    /**
     * Crawls every enabled page of one university and appends the parsed records to
     * {@code sink}, sleeping {@code delay} milliseconds after each request.
     *
     * @return {@code false} when the thread was interrupted while sleeping, {@code true} otherwise
     */
    private static boolean crawlPages(UniversityConfig uni, long delay, List<AdmissionInfo> sink) {
        if (uni.getAdmissionPages() == null) {
            System.out.println(" 未配置招生页面");
            return true;
        }
        for (UniversityConfig.AdmissionPageConfig page : uni.getAdmissionPages()) {
            if (!page.isEnabled()) {
                System.out.printf(" 跳过已禁用的页面: %s%n", page.getDescription());
                continue;
            }
            System.out.printf(" 爬取 %s: %s%n", page.getYear(), page.getDescription());
            String html = HttpClientUtil.fetchHtml(page.getUrl());
            if (html != null) {
                List<AdmissionInfo> infoList = AdmissionParser.parseTable(
                        html, page.getUrl(), uni.getName(), page.getYear());
                // NOTE(review): this overwrites any code/province the parser took from the table — confirm intended.
                for (AdmissionInfo info : infoList) {
                    info.setUniversityCode(uni.getCode());
                    info.setProvince(uni.getProvince());
                }
                if (!infoList.isEmpty()) {
                    sink.addAll(infoList);
                    System.out.printf(" 成功获取 %d 条数据%n", infoList.size());
                } else {
                    System.out.println(" 未解析到数据");
                }
            } else {
                System.out.println(" 获取页面失败");
            }
            try {
                Thread.sleep(delay);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                logger.warn("批量爬取被中断");
                return false;
            }
        }
        return true;
    }

    /**
     * Not supported for this strategy: the URLs come from the configuration file.
     *
     * @return always an empty list
     */
    @Override
    public List<AdmissionInfo> crawl(String url) {
        System.out.println("配置文件爬取策略需要从配置文件读取URL列表");
        System.out.println("请使用菜单模式或配置文件进行批量爬取");
        return List.of();
    }

    @Override
    public String getStrategyName() {
        return "配置文件批量爬取";
    }
}

11
project/src/main/java/com/hnu/crawler/strategy/CrawlerStrategy.java

@ -0,0 +1,11 @@
package com.hnu.crawler.strategy;
import com.hnu.crawler.model.AdmissionInfo;
import java.util.List;
import java.util.Scanner;
/**
 * Common contract of the crawling strategies: each one can be driven either
 * interactively through a {@link Scanner} or directly with a URL.
 */
public interface CrawlerStrategy {
/**
 * Runs the strategy interactively, reading whatever input it needs from the scanner.
 *
 * @param scanner source of the user's console input
 * @return the records produced by the crawl
 */
List<AdmissionInfo> crawl(Scanner scanner);
/**
 * Runs the strategy against the given URL without prompting.
 *
 * @param url address to crawl
 * @return the records produced by the crawl
 */
List<AdmissionInfo> crawl(String url);
/**
 * @return the display name of this strategy
 */
String getStrategyName();
}

72
project/src/main/java/com/hnu/crawler/strategy/NewsStrategy.java

@ -0,0 +1,72 @@
package com.hnu.crawler.strategy;
import com.hnu.crawler.model.AdmissionInfo;
import com.hnu.crawler.util.HttpClientUtil;
import com.hnu.crawler.view.ConsoleView;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler strategy that turns the entries of a news listing page into
 * {@link AdmissionInfo} records (headline stored as the major name).
 */
public class NewsStrategy implements CrawlerStrategy {
    private final ConsoleView view;

    public NewsStrategy(ConsoleView view) {
        this.view = view;
    }

    /**
     * Fetches the page and creates one record per news element that has a headline.
     *
     * @param url news page to fetch
     * @return the records found; empty when the page could not be fetched or parsing failed
     */
    @Override
    public List<AdmissionInfo> crawl(String url) {
        List<AdmissionInfo> results = new ArrayList<>();
        try {
            String html = HttpClientUtil.fetchHtml(url);
            if (html == null) {
                return results;
            }
            Document doc = Jsoup.parse(html);
            Elements newsItems = doc.select("article, .news-item, .list-item, .item, .article-item");
            for (Element item : newsItems) {
                // Each selector is evaluated once and reused instead of being run twice.
                Element headline = item.select("h2, h3, .title, a").first();
                String title = headline != null ? headline.text() : "";
                Element anchor = item.select("a").first();
                // Entries without a link fall back to the page URL itself.
                String articleUrl = anchor != null ? anchor.attr("abs:href") : url;
                // The summary text used to be extracted here but was never stored, so it is no longer computed.
                if (!title.isEmpty()) {
                    AdmissionInfo info = new AdmissionInfo();
                    info.setUniversityName("新闻网站");
                    info.setMajorName(title);
                    info.setYear(java.time.LocalDate.now().getYear() + "");
                    info.setSourceUrl(articleUrl);
                    results.add(info);
                }
            }
            view.printInfo("解析到 " + results.size() + " 条新闻");
        } catch (Exception e) {
            view.printError("爬取新闻失败: " + e.getMessage());
        }
        return results;
    }

    /** Prompts for a news site URL on the console and crawls it. */
    @Override
    public List<AdmissionInfo> crawl(java.util.Scanner scanner) {
        view.printInfo("请输入新闻网站URL: ");
        String url = scanner.nextLine().trim();
        return crawl(url);
    }

    @Override
    public String getStrategyName() {
        return "新闻网站爬取";
    }
}

67
project/src/main/java/com/hnu/crawler/strategy/SinglePageCrawler.java

@ -0,0 +1,67 @@
package com.hnu.crawler.strategy;
import com.hnu.crawler.model.AdmissionInfo;
import com.hnu.crawler.parser.AdmissionParser;
import com.hnu.crawler.util.HttpClientUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Scanner;
/**
 * Crawler strategy that fetches a single page and parses its tables with
 * {@link AdmissionParser#parseTable}.
 */
public class SinglePageCrawler implements CrawlerStrategy {
    private static final Logger logger = LoggerFactory.getLogger(SinglePageCrawler.class);

    /**
     * Interactive crawl: asks for URL, university name, year and request interval,
     * then fetches and parses the page.
     *
     * <p>Only one request is issued, so the interval is never applied. It is still read
     * to keep the prompt sequence unchanged, and an invalid value is reported instead
     * of aborting the crawl with a {@link NumberFormatException}.
     *
     * @param scanner source of the user's answers
     * @return the parsed records; empty when the page could not be fetched or held no data
     */
    @Override
    public List<AdmissionInfo> crawl(Scanner scanner) {
        System.out.println("\n=== 单页面爬取 ===");
        System.out.print("请输入目标URL: ");
        String url = scanner.nextLine().trim();
        System.out.print("请输入院校名称: ");
        String universityName = scanner.nextLine().trim();
        System.out.print("请输入年份: ");
        String year = scanner.nextLine().trim();
        System.out.print("请输入请求间隔(毫秒,默认1000): ");
        String delayStr = scanner.nextLine().trim();
        if (!delayStr.isEmpty() && !isValidDelay(delayStr)) {
            System.out.println("请求间隔无效,已忽略");
        }
        logger.info("开始爬取: {}", url);
        String html = HttpClientUtil.fetchHtml(url);
        if (html == null) {
            System.out.println("获取页面失败,请检查URL是否正确");
            return List.of();
        }
        List<AdmissionInfo> infoList = AdmissionParser.parseTable(html, url, universityName, year);
        if (infoList.isEmpty()) {
            System.out.println("未解析到数据,请检查页面结构");
            return List.of();
        }
        System.out.println("爬取完成!共获取 " + infoList.size() + " 条数据");
        return infoList;
    }

    /** Returns whether the text is a non-negative whole number of milliseconds. */
    private static boolean isValidDelay(String text) {
        try {
            return Long.parseLong(text) >= 0;
        } catch (NumberFormatException e) {
            return false;
        }
    }

    /**
     * Non-interactive crawl of one URL; records get the placeholder university name
     * "未知院校" and an empty year.
     *
     * @param url page to fetch
     * @return the parsed records; empty when the page could not be fetched or held no data
     */
    @Override
    public List<AdmissionInfo> crawl(String url) {
        logger.info("开始爬取: {}", url);
        String html = HttpClientUtil.fetchHtml(url);
        if (html != null) {
            List<AdmissionInfo> infoList = AdmissionParser.parseTable(html, url, "未知院校", "");
            if (!infoList.isEmpty()) {
                System.out.println("爬取完成!共获取 " + infoList.size() + " 条数据");
                return infoList;
            }
        }
        return List.of();
    }

    @Override
    public String getStrategyName() {
        return "单页面爬取";
    }
}

108
project/src/main/java/com/hnu/crawler/strategy/UniversityStrategy.java

@ -0,0 +1,108 @@
package com.hnu.crawler.strategy;
import com.hnu.crawler.model.AdmissionInfo;
import com.hnu.crawler.util.HttpClientUtil;
import com.hnu.crawler.view.ConsoleView;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler strategy for university admission pages: reads table rows first and,
 * when no row yields a record, falls back to list-style link items.
 */
public class UniversityStrategy implements CrawlerStrategy {
    private final ConsoleView view;

    public UniversityStrategy(ConsoleView view) {
        this.view = view;
    }

    /**
     * Fetches the page and extracts records from it. The page title (cut to 50
     * characters) is used as the university name. For every table row after the first
     * one with at least two cells, cell 0 becomes the major name, the digits and dots of
     * cell 1 the minimum score, and the digits of cell 2 (if present) the year.
     *
     * @param url page to fetch
     * @return the records found; empty when the page could not be fetched or parsing failed
     */
    @Override
    public List<AdmissionInfo> crawl(String url) {
        List<AdmissionInfo> records = new ArrayList<>();
        try {
            String html = HttpClientUtil.fetchHtml(url);
            if (html == null) {
                return records;
            }
            Document page = Jsoup.parse(html);
            String universityName = page.select("title").text();
            if (universityName.length() > 50) {
                universityName = universityName.substring(0, 50);
            }

            for (Element table : page.select("table")) {
                Elements rows = table.select("tr");
                // Row 0 is skipped as the header row.
                for (int rowIndex = 1; rowIndex < rows.size(); rowIndex++) {
                    Elements cells = rows.get(rowIndex).select("td, th");
                    if (cells.size() < 2) {
                        continue;
                    }
                    AdmissionInfo record = new AdmissionInfo();
                    record.setUniversityName(universityName);
                    record.setMajorName(cells.get(0).text());
                    try {
                        String digits = cells.get(1).text().replaceAll("[^0-9.]", "");
                        if (!digits.isEmpty()) {
                            record.setMinScore(Double.parseDouble(digits));
                        }
                    } catch (Exception ignored) {
                    }
                    if (cells.size() > 2) {
                        record.setYear(cells.get(2).text().replaceAll("[^0-9]", ""));
                    }
                    record.setSourceUrl(url);
                    records.add(record);
                }
            }

            if (records.isEmpty()) {
                for (Element entry : page.select(".list-item, .news-item, .notice-item")) {
                    String title = entry.select("a, .title").text();
                    String link = entry.select("a").attr("abs:href");
                    if (title.isEmpty()) {
                        continue;
                    }
                    AdmissionInfo record = new AdmissionInfo();
                    record.setUniversityName(universityName);
                    record.setMajorName(title);
                    record.setSourceUrl(link.isEmpty() ? url : link);
                    records.add(record);
                }
            }
            view.printInfo("解析到 " + records.size() + " 条招生信息");
        } catch (Exception e) {
            view.printError("爬取高校网站失败: " + e.getMessage());
        }
        return records;
    }

    /** Prompts for a university admission site URL on the console and crawls it. */
    @Override
    public List<AdmissionInfo> crawl(java.util.Scanner scanner) {
        view.printInfo("请输入高校招生网站URL: ");
        return crawl(scanner.nextLine().trim());
    }

    @Override
    public String getStrategyName() {
        return "高校网站爬取";
    }
}

52
project/src/main/java/com/hnu/crawler/util/HttpClientUtil.java

@ -0,0 +1,52 @@
package com.hnu.crawler.util;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.ParseException;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
public class HttpClientUtil {
private static final Logger logger = LoggerFactory.getLogger(HttpClientUtil.class);
private static final int TIMEOUT = 30000;
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
public static String fetchHtml(String url) {
try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
HttpGet request = new HttpGet(url);
request.setHeader("User-Agent", USER_AGENT);
request.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
request.setHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
request.setHeader("Connection", "keep-alive");
try (CloseableHttpResponse response = httpClient.execute(request)) {
int statusCode = response.getCode();
if (statusCode == 200) {
String html = EntityUtils.toString(response.getEntity(), "UTF-8");
logger.info("成功获取页面: {}", url);
return html;
} else {
logger.error("请求失败,状态码: {}, URL: {}", statusCode, url);
return null;
}
}
} catch (IOException | ParseException e) {
logger.error("获取页面失败: {}", url, e);
return null;
}
}
public static void sleep(long milliseconds) {
try {
Thread.sleep(milliseconds);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
logger.warn("睡眠被中断", e);
}
}
}

33
project/src/main/java/com/hnu/crawler/view/ConsoleView.java

@ -0,0 +1,33 @@
package com.hnu.crawler.view;
import java.util.Scanner;
/**
 * Thin console front end: reads lines from standard input and prints messages,
 * optionally wrapped in ANSI colour escape sequences.
 */
public class ConsoleView {
    private static final String ANSI_RESET = "\u001B[0m";
    private static final String ANSI_GREEN = "\u001B[32m";
    private static final String ANSI_RED = "\u001B[31m";
    private static final String ANSI_BLUE = "\u001B[34m";

    private final Scanner scanner = new Scanner(System.in);

    /**
     * Shows the {@code "> "} prompt and reads the next line from standard input.
     *
     * @return the line that was entered
     */
    public String readLine() {
        System.out.print("> ");
        return scanner.nextLine();
    }

    /** Prints the message in green. */
    public void printSuccess(String msg) {
        printColored(ANSI_GREEN, msg);
    }

    /** Prints the message in red. */
    public void printError(String msg) {
        printColored(ANSI_RED, msg);
    }

    /** Prints the message in blue. */
    public void printInfo(String msg) {
        printColored(ANSI_BLUE, msg);
    }

    /** Prints the message without any colouring. */
    public void print(String msg) {
        System.out.println(msg);
    }

    /** Writes one line consisting of the colour code, the message and the reset code. */
    private static void printColored(String colorCode, String msg) {
        System.out.println(colorCode + msg + ANSI_RESET);
    }
}

12
project/src/main/resources/logback.xml

@ -0,0 +1,12 @@
<!-- Logback setup: a single console appender (UTF-8 encoded) attached to the root logger at INFO level. -->
<configuration>
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<!-- Each line: time of day, thread name, level, logger name (abbreviated to 36 chars), message. -->
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<root level="INFO">
<appender-ref ref="STDOUT" />
</root>
</configuration>

70
project/src/test/java/BankAccountTest.java

@ -0,0 +1,70 @@
import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.DisplayName;
/**
 * Unit tests for {@code BankAccount}: initial state, deposits, withdrawals and
 * renaming the owner. Every test starts from a fresh account created in {@link #setUp()}.
 */
class BankAccountTest {
    /** Tolerance used when comparing balances. */
    private static final double DELTA = 0.001;

    private BankAccount account;

    @BeforeEach
    void setUp() {
        account = new BankAccount("1234567890", "张三");
    }

    /** A new account exposes its number and owner and starts with a zero balance. */
    @Test
    @DisplayName("测试账户初始化")
    void testAccountInitialization() {
        assertEquals("1234567890", account.getAccountNumber());
        assertEquals("张三", account.getOwnerName());
        assertEquals(0.0, account.getBalance(), DELTA);
    }

    /** Depositing a positive amount increases the balance by that amount. */
    @Test
    @DisplayName("测试存款功能")
    void testDeposit() {
        account.deposit(1000.0);

        assertEquals(1000.0, account.getBalance(), DELTA);
    }

    /** Depositing a negative amount leaves the balance untouched. */
    @Test
    @DisplayName("测试存款负数")
    void testDepositNegativeAmount() {
        final double before = account.getBalance();

        account.deposit(-100.0);

        assertEquals(before, account.getBalance(), DELTA);
    }

    /** Withdrawing less than the balance reduces it by the withdrawn amount. */
    @Test
    @DisplayName("测试取款功能")
    void testWithdraw() {
        account.deposit(1000.0);

        account.withdraw(500.0);

        assertEquals(500.0, account.getBalance(), DELTA);
    }

    /** Withdrawing more than the balance leaves the balance untouched. */
    @Test
    @DisplayName("测试取款超过余额")
    void testWithdrawInsufficientBalance() {
        account.deposit(500.0);
        final double before = account.getBalance();

        account.withdraw(1000.0);

        assertEquals(before, account.getBalance(), DELTA);
    }

    /** Withdrawing a negative amount leaves the balance untouched. */
    @Test
    @DisplayName("测试取款负数")
    void testWithdrawNegativeAmount() {
        account.deposit(1000.0);
        final double before = account.getBalance();

        account.withdraw(-100.0);

        assertEquals(before, account.getBalance(), DELTA);
    }

    /** The owner name can be replaced after construction. */
    @Test
    @DisplayName("测试设置户主姓名")
    void testSetOwnerName() {
        account.setOwnerName("李四");

        assertEquals("李四", account.getOwnerName());
    }
}
Loading…
Cancel
Save