/**
 * A minimal in-memory bank account that tracks an owner, an account number
 * and a balance.
 *
 * <p>Both {@link #deposit(double)} and {@link #withdraw(double)} report their
 * outcome on standard output and never throw; a rejected request leaves the
 * balance unchanged.
 */
public class BankAccount {
    /** Account identifier; assigned once in the constructor and never changed. */
    private final String accountNumber;
    private String ownerName;
    private double balance;

    /**
     * Creates an account with a balance of {@code 0.0}.
     *
     * @param accountNumber identifier returned by {@link #getAccountNumber()}
     * @param ownerName     initial owner name
     */
    public BankAccount(String accountNumber, String ownerName) {
        this.accountNumber = accountNumber;
        this.ownerName = ownerName;
        this.balance = 0.0;
    }

    /** @return the account number supplied at construction */
    public String getAccountNumber() {
        return accountNumber;
    }

    /** @return the current owner name */
    public String getOwnerName() {
        return ownerName;
    }

    /** @param ownerName replacement owner name */
    public void setOwnerName(String ownerName) {
        this.ownerName = ownerName;
    }

    /** @return the current balance */
    public double getBalance() {
        return balance;
    }

    /**
     * Adds {@code amount} to the balance when it is a finite, strictly
     * positive number; otherwise prints a rejection message and changes
     * nothing.
     *
     * @param amount amount to deposit
     */
    public void deposit(double amount) {
        // Double.isFinite rejects Infinity (NaN already fails "> 0"); without it
        // a single infinite deposit would poison the balance permanently.
        if (Double.isFinite(amount) && amount > 0) {
            balance += amount;
            System.out.println("存款成功,当前余额:" + balance);
        } else {
            System.out.println("存款金额必须大于0");
        }
    }

    /**
     * Subtracts {@code amount} from the balance when it is finite, strictly
     * positive and not larger than the balance; otherwise prints a rejection
     * message and changes nothing.
     *
     * @param amount amount to withdraw
     */
    public void withdraw(double amount) {
        if (Double.isFinite(amount) && amount > 0 && amount <= balance) {
            balance -= amount;
            System.out.println("取款成功,当前余额:" + balance);
        } else {
            System.out.println("余额不足或金额无效");
        }
    }

    /**
     * Small console demonstration: creates an account, then performs a valid
     * deposit, a valid withdrawal, an overdraw attempt and a negative deposit.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        BankAccount account = new BankAccount("1234567890", "张三");
        System.out.println("账户创建成功!");
        System.out.println("账户号:" + account.getAccountNumber());
        System.out.println("户主:" + account.getOwnerName());
        System.out.println("初始余额:" + account.getBalance());

        account.deposit(1000);
        account.withdraw(500);
        account.withdraw(600);
        account.deposit(-100);
    }
}
b/project/src/main/java/com/hnu/crawler/AdmissionCrawlerMain.java @@ -0,0 +1,338 @@ +package com.hnu.crawler; + +import com.hnu.crawler.command.Command; +import com.hnu.crawler.command.CrawlCommand; +import com.hnu.crawler.command.DemoCommand; +import com.hnu.crawler.command.ExitCommand; +import com.hnu.crawler.command.HelpCommand; +import com.hnu.crawler.command.ListCommand; +import com.hnu.crawler.command.QueryCommand; +import com.hnu.crawler.config.ConfigManager; +import com.hnu.crawler.model.AdmissionInfo; +import com.hnu.crawler.model.UniversityConfig; +import com.hnu.crawler.query.DataQuery; +import com.hnu.crawler.storage.DataStorage; +import com.hnu.crawler.strategy.ConfigBasedCrawler; +import com.hnu.crawler.strategy.CrawlerStrategy; +import com.hnu.crawler.strategy.SinglePageCrawler; +import com.hnu.crawler.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Scanner; + +public class AdmissionCrawlerMain { + private static final Logger logger = LoggerFactory.getLogger(AdmissionCrawlerMain.class); + private static final String OUTPUT_DIR = "data"; + private static final String CSV_FILE = OUTPUT_DIR + "/admission_info.csv"; + private static final String JSON_FILE = OUTPUT_DIR + "/admission_info.json"; + + public static void main(String[] args) { + System.out.println("========================================"); + System.out.println(" 高校招生信息爬虫系统 v1.0"); + System.out.println("========================================"); + + try { + ConfigManager.createSampleConfig(); + System.out.println("[INFO] 配置文件加载完成"); + + ConsoleView view = new ConsoleView(); + Map commands = registerCommands(view); + System.out.println("[INFO] 命令注册完成"); + + if (args.length > 0) { + executeCommand(args, commands, view); + return; + } + + runInteractiveMode(commands, view); + } catch (Exception e) { + System.err.println("[ERROR] 程序启动失败: " + 
e.getMessage()); + e.printStackTrace(); + } + } + + private static Map registerCommands(ConsoleView view) { + Map commands = new HashMap<>(); + commands.put("crawl", new CrawlCommand(view)); + commands.put("list", new ListCommand(view)); + commands.put("query", new QueryCommand(view)); + commands.put("demo", new DemoCommand(view)); + commands.put("help", new HelpCommand(view)); + commands.put("exit", new ExitCommand(view)); + return commands; + } + + private static void executeCommand(String[] args, Map commands, ConsoleView view) { + String cmdName = args[0].toLowerCase(); + Command command = commands.get(cmdName); + + if (command != null) { + command.execute(args); + } else { + view.printError("未知命令: " + cmdName); + view.printInfo("输入 help 查看可用命令"); + } + } + + private static void runInteractiveMode(Map commands, ConsoleView view) { + view.printSuccess("欢迎使用高校招生信息爬虫系统!"); + view.printInfo("输入 'help' 查看可用命令"); + + Scanner scanner = new Scanner(System.in); + + while (true) { + String input = view.readLine().trim(); + if (input.isEmpty()) continue; + + String[] parts = input.split("\\s+"); + String cmdName = parts[0].toLowerCase(); + + if (cmdName.equals("menu")) { + showMainMenu(view); + handleMenuSelection(scanner, view); + continue; + } + + Command command = commands.get(cmdName); + if (command != null) { + command.execute(parts); + } else { + view.printError("未知命令: " + cmdName); + view.printInfo("输入 'help' 查看可用命令,或输入 'menu' 返回菜单模式"); + } + } + } + + private static void showMainMenu(ConsoleView view) { + view.print("\n"); + view.print("╔══════════════════════════════════════════╗"); + view.print("║ 高校本科招生信息爬虫系统 ║"); + view.print("╠══════════════════════════════════════════╣"); + view.print("║ 1. 单页面爬取 ║"); + view.print("║ 2. 从配置文件批量爬取 ║"); + view.print("║ 3. CLI命令模式 ║"); + view.print("║ 4. 数据查询 ║"); + view.print("║ 5. 演示模式(生成模拟数据) ║"); + view.print("║ 6. 查看配置高校列表 ║"); + view.print("║ 0. 
退出程序 ║"); + view.print("╚══════════════════════════════════════════╝"); + view.print("请选择操作 (0-6): "); + } + + private static void handleMenuSelection(Scanner scanner, ConsoleView view) { + int choice; + try { + choice = Integer.parseInt(scanner.nextLine().trim()); + } catch (NumberFormatException e) { + view.printError("输入无效,请输入数字!"); + return; + } + + switch (choice) { + case 1: + crawlWithStrategy(new SinglePageCrawler(), scanner, view); + break; + case 2: + crawlWithStrategy(new ConfigBasedCrawler(), scanner, view); + break; + case 3: + view.printInfo("已切换到CLI命令模式"); + view.printInfo("输入 'menu' 返回菜单模式,输入 'help' 查看命令"); + break; + case 4: + queryData(scanner, view); + break; + case 5: + demoMode(view); + break; + case 6: + showConfig(view); + break; + case 0: + logger.info("程序退出"); + scanner.close(); + System.exit(0); + break; + default: + view.printError("无效选项,请重新选择!"); + } + } + + private static void crawlWithStrategy(CrawlerStrategy strategy, Scanner scanner, ConsoleView view) { + logger.info("使用{}策略进行爬取", strategy.getStrategyName()); + List results = strategy.crawl(scanner); + + if (!results.isEmpty()) { + DataStorage.saveToCsv(results, CSV_FILE); + DataStorage.saveToJson(results, JSON_FILE); + view.printSuccess("数据已保存到: " + CSV_FILE + " 和 " + JSON_FILE); + } + } + + private static void queryData(Scanner scanner, ConsoleView view) { + view.print("\n=== 数据查询 ==="); + List allData = DataQuery.loadAllData(); + + if (allData.isEmpty()) { + view.printInfo("暂无数据,请先进行爬取或使用演示模式!"); + return; + } + + view.print("当前共有 " + allData.size() + " 条数据"); + + while (true) { + view.print("\n查询选项:"); + view.print(" 1. 按院校查询"); + view.print(" 2. 按专业查询"); + view.print(" 3. 按年份查询"); + view.print(" 4. 按分数段查询"); + view.print(" 5. 查看所有数据(按分数排序)"); + view.print(" 0. 
返回主菜单"); + view.print("请选择查询方式: "); + + int choice; + try { + choice = Integer.parseInt(scanner.nextLine().trim()); + } catch (NumberFormatException e) { + view.printError("输入无效!"); + continue; + } + + List results = new ArrayList<>(); + + switch (choice) { + case 1: + view.print("请输入院校名称(支持模糊匹配): "); + String uniName = scanner.nextLine().trim(); + results = DataQuery.queryByUniversity(allData, uniName); + break; + case 2: + view.print("请输入专业名称(支持模糊匹配): "); + String majorName = scanner.nextLine().trim(); + results = DataQuery.queryByMajor(allData, majorName); + break; + case 3: + view.print("请输入年份: "); + String year = scanner.nextLine().trim(); + results = DataQuery.queryByYear(allData, year); + break; + case 4: + view.print("请输入最低分数: "); + double minScore = Double.parseDouble(scanner.nextLine().trim()); + view.print("请输入最高分数: "); + double maxScore = Double.parseDouble(scanner.nextLine().trim()); + results = DataQuery.queryByScoreRange(allData, minScore, maxScore); + break; + case 5: + results = DataQuery.sortByScore(allData, false); + break; + case 0: + return; + default: + view.printError("无效选项!"); + continue; + } + + DataQuery.printResults(results); + } + } + + private static void showConfig(ConsoleView view) { + view.print("\n=== 已配置高校列表 ==="); + List universities = ConfigManager.loadUniversities(); + + if (universities.isEmpty()) { + view.print("暂无配置,请编辑 config/universities.json 文件"); + return; + } + + for (UniversityConfig uni : universities) { + view.print(String.format("%n【%s】%s (%s)", + uni.isEnabled() ? "●" : "○", + uni.getName(), + uni.getProvince())); + view.print(String.format(" 院校代码: %s", uni.getCode())); + view.print(String.format(" 配置页面数: %d", + uni.getAdmissionPages() != null ? uni.getAdmissionPages().size() : 0)); + + if (uni.getAdmissionPages() != null) { + for (UniversityConfig.AdmissionPageConfig page : uni.getAdmissionPages()) { + view.print(String.format(" [%s] %s - %s", + page.isEnabled() ? 
"启用" : "禁用", + page.getYear(), + page.getDescription())); + } + } + } + view.print("\n提示: 编辑 config/universities.json 文件可添加更多高校配置"); + } + + private static void demoMode(ConsoleView view) { + view.print("\n=== 演示模式 ==="); + logger.info("进入演示模式"); + + List demoData = createDemoData(); + + view.print("生成演示数据..."); + view.print("共生成 " + demoData.size() + " 条演示数据"); + + DataStorage.saveToCsvOverwrite(demoData, CSV_FILE); + DataStorage.saveToJson(demoData, JSON_FILE); + + view.printSuccess("\n演示数据已保存到:"); + view.print(" - CSV: " + CSV_FILE); + view.print(" - JSON: " + JSON_FILE); + view.print("\n现在可以选择「数据查询」功能来查询演示数据!"); + } + + private static List createDemoData() { + List data = new ArrayList<>(); + + String[] universities = {"湖南大学", "中南大学", "湖南师范大学", "国防科技大学"}; + String[] majors = {"计算机科学与技术", "软件工程", "电子信息工程", "机械工程", "土木工程", "金融学", "临床医学"}; + String[] categories = {"物理类", "历史类"}; + String[] batches = {"本科批", "本科提前批"}; + String[] years = {"2022", "2023", "2024"}; + + int id = 1; + for (String year : years) { + for (String university : universities) { + for (String major : majors) { + for (String category : categories) { + AdmissionInfo info = new AdmissionInfo(); + info.setUniversityName(university); + info.setUniversityCode(String.format("%04d", 10530 + id++ % 10)); + info.setProvince("湖南省"); + info.setCategory(category); + info.setMajorName(major); + info.setMajorCode(String.format("%06d", 800000 + id * 10)); + info.setPlanCount((int) (Math.random() * 50 + 10)); + + double baseScore = 550; + if (university.equals("国防科技大学")) baseScore += 50; + if (university.equals("中南大学")) baseScore += 30; + if (major.equals("临床医学")) baseScore += 20; + if (major.equals("计算机科学与技术")) baseScore += 15; + + info.setMinScore(baseScore + Math.random() * 60); + info.setMaxScore(info.getMinScore() + Math.random() * 30); + info.setAvgScore((info.getMinScore() + info.getMaxScore()) / 2); + info.setMinRank((int) (Math.random() * 10000 + 1000)); + info.setMaxRank(info.getMinRank() + (int) 
(Math.random() * 500)); + info.setYear(year); + info.setBatch(batches[(int) (Math.random() * batches.length)]); + info.setSourceUrl("https://example.edu/admission/" + year); + data.add(info); + } + } + } + } + + return data; + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/command/Command.java b/project/src/main/java/com/hnu/crawler/command/Command.java new file mode 100644 index 0000000..907bbc3 --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/command/Command.java @@ -0,0 +1,6 @@ +package com.hnu.crawler.command; + +public interface Command { + String getName(); + void execute(String[] args); +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/command/CrawlCommand.java b/project/src/main/java/com/hnu/crawler/command/CrawlCommand.java new file mode 100644 index 0000000..7e7a588 --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/command/CrawlCommand.java @@ -0,0 +1,73 @@ +package com.hnu.crawler.command; + +import com.hnu.crawler.strategy.CrawlerStrategy; +import com.hnu.crawler.strategy.NewsStrategy; +import com.hnu.crawler.strategy.BlogStrategy; +import com.hnu.crawler.strategy.UniversityStrategy; +import com.hnu.crawler.storage.DataStorage; +import com.hnu.crawler.view.ConsoleView; +import com.hnu.crawler.model.AdmissionInfo; + +import java.util.List; +import java.util.Scanner; + +public class CrawlCommand implements Command { + private final ConsoleView view; + + public CrawlCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "crawl"; + } + + @Override + public void execute(String[] args) { + if (args.length < 2) { + view.printError("用法: crawl <类型> "); + view.printInfo("支持的类型: news(新闻), blog(博客), university(高校)"); + return; + } + + String type = args[1].toLowerCase(); + String url = args.length > 2 ? 
args[2] : ""; + + CrawlerStrategy strategy = null; + + switch (type) { + case "news": + strategy = new NewsStrategy(view); + break; + case "blog": + strategy = new BlogStrategy(view); + break; + case "university": + strategy = new UniversityStrategy(view); + break; + default: + view.printError("未知类型: " + type); + view.printInfo("支持的类型: news(新闻), blog(博客), university(高校)"); + return; + } + + if (url.isEmpty()) { + Scanner scanner = new Scanner(System.in); + view.printInfo("请输入目标URL: "); + url = scanner.nextLine().trim(); + } + + view.printInfo("开始爬取 [" + type + "]: " + url); + List results = strategy.crawl(url); + + if (!results.isEmpty()) { + DataStorage.saveToCsv(results, "data/crawler_results.csv"); + DataStorage.saveToJson(results, "data/crawler_results.json"); + view.printSuccess("爬取完成!共获取 " + results.size() + " 条数据"); + view.printInfo("数据已保存到 data/crawler_results.csv 和 data/crawler_results.json"); + } else { + view.printError("未获取到数据,请检查URL或网站结构"); + } + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/command/DemoCommand.java b/project/src/main/java/com/hnu/crawler/command/DemoCommand.java new file mode 100644 index 0000000..dc52676 --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/command/DemoCommand.java @@ -0,0 +1,60 @@ +package com.hnu.crawler.command; + +import com.hnu.crawler.storage.DataStorage; +import com.hnu.crawler.view.ConsoleView; +import com.hnu.crawler.model.AdmissionInfo; + +import java.util.ArrayList; +import java.util.List; + +public class DemoCommand implements Command { + private final ConsoleView view; + + public DemoCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "demo"; + } + + @Override + public void execute(String[] args) { + view.printInfo("生成演示数据..."); + + List demoData = createDemoData(); + + DataStorage.saveToCsvOverwrite(demoData, "data/crawler_results.csv"); + DataStorage.saveToJson(demoData, "data/crawler_results.json"); + + 
view.printSuccess("演示数据生成完成!共 " + demoData.size() + " 条"); + view.printInfo("数据已保存到 data/crawler_results.csv 和 data/crawler_results.json"); + } + + private List createDemoData() { + List data = new ArrayList<>(); + + String[] universities = {"湖南大学", "中南大学", "湖南师范大学"}; + String[] majors = {"计算机科学与技术", "软件工程", "电子信息工程"}; + String[] years = {"2022", "2023", "2024"}; + + int id = 1; + for (String year : years) { + for (String university : universities) { + for (String major : majors) { + AdmissionInfo info = new AdmissionInfo(); + info.setUniversityName(university); + info.setMajorName(major); + info.setYear(year); + info.setMinScore(550 + Math.random() * 80); + info.setMaxScore(info.getMinScore() + Math.random() * 30); + info.setAvgScore((info.getMinScore() + info.getMaxScore()) / 2); + data.add(info); + } + } + } + + return data; + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/command/ExitCommand.java b/project/src/main/java/com/hnu/crawler/command/ExitCommand.java new file mode 100644 index 0000000..5ae47ab --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/command/ExitCommand.java @@ -0,0 +1,22 @@ +package com.hnu.crawler.command; + +import com.hnu.crawler.view.ConsoleView; + +public class ExitCommand implements Command { + private final ConsoleView view; + + public ExitCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "exit"; + } + + @Override + public void execute(String[] args) { + view.printSuccess("程序退出"); + System.exit(0); + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/command/HelpCommand.java b/project/src/main/java/com/hnu/crawler/command/HelpCommand.java new file mode 100644 index 0000000..fe06782 --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/command/HelpCommand.java @@ -0,0 +1,36 @@ +package com.hnu.crawler.command; + +import com.hnu.crawler.view.ConsoleView; + +public class HelpCommand implements Command { + 
private final ConsoleView view; + + public HelpCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "help"; + } + + @Override + public void execute(String[] args) { + view.printInfo("可用命令:"); + view.printInfo(" crawl <类型> [URL] - 爬取网站数据"); + view.printInfo(" 类型: news(新闻), blog(博客), university(高校)"); + view.printInfo(" 示例: crawl news https://news.example.com"); + view.printInfo(" list - 列出所有已爬取的数据"); + view.printInfo(" query <选项> <关键词> - 查询数据"); + view.printInfo(" 选项: university(院校), major(专业), year(年份)"); + view.printInfo(" 示例: query university 湖南"); + view.printInfo(" demo - 生成演示数据"); + view.printInfo(" help - 显示此帮助信息"); + view.printInfo(" exit - 退出程序"); + view.printInfo(""); + view.printInfo("支持的网站类型:"); + view.printInfo(" 1. 新闻网站 (news) - 爬取新闻列表和内容"); + view.printInfo(" 2. 博客网站 (blog) - 爬取博客文章"); + view.printInfo(" 3. 高校网站 (university) - 爬取招生信息"); + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/command/ListCommand.java b/project/src/main/java/com/hnu/crawler/command/ListCommand.java new file mode 100644 index 0000000..cb0228e --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/command/ListCommand.java @@ -0,0 +1,40 @@ +package com.hnu.crawler.command; + +import com.hnu.crawler.query.DataQuery; +import com.hnu.crawler.view.ConsoleView; +import com.hnu.crawler.model.AdmissionInfo; + +import java.util.List; + +public class ListCommand implements Command { + private final ConsoleView view; + + public ListCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "list"; + } + + @Override + public void execute(String[] args) { + List allData = DataQuery.loadAllData(); + + if (allData.isEmpty()) { + view.printInfo("暂无数据,请先使用 crawl 命令爬取数据"); + return; + } + + view.printInfo("共 " + allData.size() + " 条数据:"); + view.printInfo("====================================="); + for (int i = 0; i < allData.size(); i++) { + AdmissionInfo info = 
allData.get(i); + view.printInfo((i + 1) + ". " + info.getUniversityName() + " - " + info.getMajorName()); + view.printInfo(" 分数: " + info.getMinScore() + " - " + info.getMaxScore()); + view.printInfo(" 年份: " + info.getYear()); + } + view.printInfo("====================================="); + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/command/QueryCommand.java b/project/src/main/java/com/hnu/crawler/command/QueryCommand.java new file mode 100644 index 0000000..9d46d66 --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/command/QueryCommand.java @@ -0,0 +1,65 @@ +package com.hnu.crawler.command; + +import com.hnu.crawler.query.DataQuery; +import com.hnu.crawler.view.ConsoleView; +import com.hnu.crawler.model.AdmissionInfo; + +import java.util.List; + +public class QueryCommand implements Command { + private final ConsoleView view; + + public QueryCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "query"; + } + + @Override + public void execute(String[] args) { + if (args.length < 3) { + view.printError("用法: query <选项> <关键词>"); + view.printInfo("选项: university(院校), major(专业), year(年份)"); + return; + } + + String option = args[1].toLowerCase(); + String keyword = args[2]; + + List allData = DataQuery.loadAllData(); + + if (allData.isEmpty()) { + view.printInfo("暂无数据,请先爬取或使用 demo 命令生成演示数据"); + return; + } + + List results; + + switch (option) { + case "university": + results = DataQuery.queryByUniversity(allData, keyword); + break; + case "major": + results = DataQuery.queryByMajor(allData, keyword); + break; + case "year": + results = DataQuery.queryByYear(allData, keyword); + break; + default: + view.printError("未知选项: " + option); + return; + } + + if (results.isEmpty()) { + view.printInfo("未找到匹配的数据"); + } else { + view.printInfo("找到 " + results.size() + " 条匹配数据:"); + for (AdmissionInfo info : results) { + view.printInfo("- " + info.getUniversityName() + " - " + 
info.getMajorName()); + } + } + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/config/ConfigManager.java b/project/src/main/java/com/hnu/crawler/config/ConfigManager.java new file mode 100644 index 0000000..ed1e45f --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/config/ConfigManager.java @@ -0,0 +1,135 @@ +package com.hnu.crawler.config; + +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; +import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; +import com.hnu.crawler.model.UniversityConfig; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class ConfigManager { + private static final Logger logger = LoggerFactory.getLogger(ConfigManager.class); + private static final ObjectMapper objectMapper = new ObjectMapper(); + private static final String CONFIG_DIR = "config"; + private static final String UNIVERSITIES_CONFIG = CONFIG_DIR + "/universities.json"; + + static { + objectMapper.registerModule(new JavaTimeModule()); + objectMapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); + objectMapper.enable(SerializationFeature.INDENT_OUTPUT); + } + + public static void ensureConfigDir() { + File dir = new File(CONFIG_DIR); + if (!dir.exists()) { + dir.mkdirs(); + logger.info("创建配置目录: {}", CONFIG_DIR); + } + } + + public static List loadUniversities() { + ensureConfigDir(); + File configFile = new File(UNIVERSITIES_CONFIG); + + if (!configFile.exists()) { + logger.warn("配置文件不存在: {}", UNIVERSITIES_CONFIG); + return new ArrayList<>(); + } + + try { + return objectMapper.readValue(configFile, + new TypeReference>() {}); + } catch (IOException e) { + logger.error("加载高校配置失败", e); + return new ArrayList<>(); + } + } + + public static void saveUniversities(List universities) { + 
ensureConfigDir(); + try { + objectMapper.writeValue(new File(UNIVERSITIES_CONFIG), universities); + logger.info("保存高校配置成功,共 {} 所高校", universities.size()); + } catch (IOException e) { + logger.error("保存高校配置失败", e); + } + } + + public static UniversityConfig findUniversityById(String id) { + List universities = loadUniversities(); + return universities.stream() + .filter(u -> u.getId().equals(id)) + .findFirst() + .orElse(null); + } + + public static List findUniversityByName(String name) { + List universities = loadUniversities(); + List result = new ArrayList<>(); + for (UniversityConfig uni : universities) { + if (uni.getName().contains(name)) { + result.add(uni); + } + } + return result; + } + + public static void createSampleConfig() { + ensureConfigDir(); + File configFile = new File(UNIVERSITIES_CONFIG); + + if (configFile.exists()) { + logger.info("示例配置已存在,跳过创建"); + return; + } + + List sampleConfigs = new ArrayList<>(); + + UniversityConfig hnu = new UniversityConfig(); + hnu.setId("hnu"); + hnu.setName("湖南大学"); + hnu.setCode("10532"); + hnu.setProvince("湖南省"); + hnu.setBaseUrl("https://admission.hnu.edu.cn"); + + List hnuPages = new ArrayList<>(); + + UniversityConfig.AdmissionPageConfig hnu2024 = new UniversityConfig.AdmissionPageConfig(); + hnu2024.setYear("2024"); + hnu2024.setUrl("https://admission.hnu.edu.cn/info/1008/3001.htm"); + hnu2024.setDescription("2024年本科招生分数线"); + hnu2024.setTableSelector("table"); + hnuPages.add(hnu2024); + + hnu.setAdmissionPages(hnuPages); + sampleConfigs.add(hnu); + + UniversityConfig csu = new UniversityConfig(); + csu.setId("csu"); + csu.setName("中南大学"); + csu.setCode("10533"); + csu.setProvince("湖南省"); + csu.setBaseUrl("https://zhaosheng.csu.edu.cn"); + + List csuPages = new ArrayList<>(); + + UniversityConfig.AdmissionPageConfig csu2024 = new UniversityConfig.AdmissionPageConfig(); + csu2024.setYear("2024"); + csu2024.setUrl("https://zhaosheng.csu.edu.cn/xxfw/lnfs.htm"); + csu2024.setDescription("2024年本科招生分数线"); + 
csu2024.setTableSelector("table"); + csuPages.add(csu2024); + + csu.setAdmissionPages(csuPages); + sampleConfigs.add(csu); + + saveUniversities(sampleConfigs); + logger.info("创建示例配置文件成功: {}", UNIVERSITIES_CONFIG); + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/model/AdmissionInfo.java b/project/src/main/java/com/hnu/crawler/model/AdmissionInfo.java new file mode 100644 index 0000000..3deaa36 --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/model/AdmissionInfo.java @@ -0,0 +1,219 @@ +package com.hnu.crawler.model; + +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.List; + +public class AdmissionInfo { + private String universityName; + private String universityCode; + private String province; + private String category; + private String majorName; + private String majorCode; + private Integer planCount; + private Double minScore; + private Double maxScore; + private Double avgScore; + private Integer minRank; + private Integer maxRank; + private String year; + private String batch; + private String remarks; + private LocalDateTime crawlTime; + private String sourceUrl; + + public AdmissionInfo() { + this.crawlTime = LocalDateTime.now(); + } + + public String getUniversityName() { + return universityName; + } + + public void setUniversityName(String universityName) { + this.universityName = universityName; + } + + public String getUniversityCode() { + return universityCode; + } + + public void setUniversityCode(String universityCode) { + this.universityCode = universityCode; + } + + public String getProvince() { + return province; + } + + public void setProvince(String province) { + this.province = province; + } + + public String getCategory() { + return category; + } + + public void setCategory(String category) { + this.category = category; + } + + public String getMajorName() { + return majorName; + } + + public void setMajorName(String majorName) { + this.majorName = majorName; + } + + 
public String getMajorCode() { + return majorCode; + } + + public void setMajorCode(String majorCode) { + this.majorCode = majorCode; + } + + public Integer getPlanCount() { + return planCount; + } + + public void setPlanCount(Integer planCount) { + this.planCount = planCount; + } + + public Double getMinScore() { + return minScore; + } + + public void setMinScore(Double minScore) { + this.minScore = minScore; + } + + public Double getMaxScore() { + return maxScore; + } + + public void setMaxScore(Double maxScore) { + this.maxScore = maxScore; + } + + public Double getAvgScore() { + return avgScore; + } + + public void setAvgScore(Double avgScore) { + this.avgScore = avgScore; + } + + public Integer getMinRank() { + return minRank; + } + + public void setMinRank(Integer minRank) { + this.minRank = minRank; + } + + public Integer getMaxRank() { + return maxRank; + } + + public void setMaxRank(Integer maxRank) { + this.maxRank = maxRank; + } + + public String getYear() { + return year; + } + + public void setYear(String year) { + this.year = year; + } + + public String getBatch() { + return batch; + } + + public void setBatch(String batch) { + this.batch = batch; + } + + public String getRemarks() { + return remarks; + } + + public void setRemarks(String remarks) { + this.remarks = remarks; + } + + public LocalDateTime getCrawlTime() { + return crawlTime; + } + + public void setCrawlTime(LocalDateTime crawlTime) { + this.crawlTime = crawlTime; + } + + public String getSourceUrl() { + return sourceUrl; + } + + public void setSourceUrl(String sourceUrl) { + this.sourceUrl = sourceUrl; + } + + public static List getHeaders() { + List headers = new ArrayList<>(); + headers.add("院校名称"); + headers.add("院校代码"); + headers.add("省份"); + headers.add("科类"); + headers.add("专业名称"); + headers.add("专业代码"); + headers.add("计划数"); + headers.add("最低分"); + headers.add("最高分"); + headers.add("平均分"); + headers.add("最低位次"); + headers.add("最高位次"); + headers.add("年份"); + headers.add("批次"); + 
headers.add("备注"); + headers.add("爬取时间"); + headers.add("来源URL"); + return headers; + } + + public List toCsvRow() { + List row = new ArrayList<>(); + row.add(universityName != null ? universityName : ""); + row.add(universityCode != null ? universityCode : ""); + row.add(province != null ? province : ""); + row.add(category != null ? category : ""); + row.add(majorName != null ? majorName : ""); + row.add(majorCode != null ? majorCode : ""); + row.add(planCount != null ? planCount.toString() : ""); + row.add(minScore != null ? minScore.toString() : ""); + row.add(maxScore != null ? maxScore.toString() : ""); + row.add(avgScore != null ? avgScore.toString() : ""); + row.add(minRank != null ? minRank.toString() : ""); + row.add(maxRank != null ? maxRank.toString() : ""); + row.add(year != null ? year : ""); + row.add(batch != null ? batch : ""); + row.add(remarks != null ? remarks : ""); + row.add(crawlTime != null ? crawlTime.toString() : ""); + row.add(sourceUrl != null ? sourceUrl : ""); + return row; + } + + @Override + public String toString() { + return "AdmissionInfo{" + + "universityName='" + universityName + '\'' + + ", majorName='" + majorName + '\'' + + ", year='" + year + '\'' + + ", minScore=" + minScore + + '}'; + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/model/UniversityConfig.java b/project/src/main/java/com/hnu/crawler/model/UniversityConfig.java new file mode 100644 index 0000000..098adf9 --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/model/UniversityConfig.java @@ -0,0 +1,125 @@ +package com.hnu.crawler.model; + +import java.util.List; + +public class UniversityConfig { + private String id; + private String name; + private String code; + private String province; + private String baseUrl; + private List admissionPages; + private boolean enabled; + + public UniversityConfig() { + this.enabled = true; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = 
id; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getCode() { + return code; + } + + public void setCode(String code) { + this.code = code; + } + + public String getProvince() { + return province; + } + + public void setProvince(String province) { + this.province = province; + } + + public String getBaseUrl() { + return baseUrl; + } + + public void setBaseUrl(String baseUrl) { + this.baseUrl = baseUrl; + } + + public List getAdmissionPages() { + return admissionPages; + } + + public void setAdmissionPages(List admissionPages) { + this.admissionPages = admissionPages; + } + + public boolean isEnabled() { + return enabled; + } + + public void setEnabled(boolean enabled) { + this.enabled = enabled; + } + + public static class AdmissionPageConfig { + private String year; + private String url; + private String description; + private String tableSelector; + private boolean enabled; + + public AdmissionPageConfig() { + this.enabled = true; + } + + public String getYear() { + return year; + } + + public void setYear(String year) { + this.year = year; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getDescription() { + return description; + } + + public void setDescription(String description) { + this.description = description; + } + + public String getTableSelector() { + return tableSelector; + } + + public void setTableSelector(String tableSelector) { + this.tableSelector = tableSelector; + } + + public boolean isEnabled() { + return enabled; + } + + public void setEnabled(boolean enabled) { + this.enabled = enabled; + } + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/parser/AdmissionParser.java b/project/src/main/java/com/hnu/crawler/parser/AdmissionParser.java new file mode 100644 index 0000000..9c6668d --- /dev/null +++ 
b/project/src/main/java/com/hnu/crawler/parser/AdmissionParser.java @@ -0,0 +1,159 @@ +package com.hnu.crawler.parser; + +import com.hnu.crawler.model.AdmissionInfo; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +public class AdmissionParser { + private static final Logger logger = LoggerFactory.getLogger(AdmissionParser.class); + + public static List parseTable(String html, String sourceUrl, String universityName, String year) { + List infoList = new ArrayList<>(); + + try { + Document doc = Jsoup.parse(html); + Elements tables = doc.select("table"); + + if (tables.isEmpty()) { + logger.warn("未找到表格数据"); + return infoList; + } + + for (Element table : tables) { + Elements rows = table.select("tr"); + if (rows.size() <= 1) { + continue; + } + + Elements headerRow = rows.first().select("th, td"); + List headers = new ArrayList<>(); + for (Element header : headerRow) { + headers.add(header.text().trim()); + } + + for (int i = 1; i < rows.size(); i++) { + Element row = rows.get(i); + Elements cells = row.select("td"); + + if (cells.size() < headers.size()) { + continue; + } + + AdmissionInfo info = new AdmissionInfo(); + info.setUniversityName(universityName); + info.setYear(year); + info.setSourceUrl(sourceUrl); + + for (int j = 0; j < cells.size() && j < headers.size(); j++) { + String header = headers.get(j); + String value = cells.get(j).text().trim(); + + parseField(info, header, value); + } + + if (info.getMajorName() != null || info.getMinScore() != null) { + infoList.add(info); + } + } + } + + logger.info("解析到 {} 条招生信息", infoList.size()); + } catch (Exception e) { + logger.error("解析HTML失败", e); + } + + return infoList; + } + + private static void parseField(AdmissionInfo info, String header, String value) { + if (value == null || value.isEmpty()) { + return; + } + + 
header = header.toLowerCase(); + + if (header.contains("专业") && (header.contains("名称") || header.contains("专业名"))) { + info.setMajorName(value); + } else if (header.contains("专业") && header.contains("代码")) { + info.setMajorCode(value); + } else if (header.contains("院校") && header.contains("代码")) { + info.setUniversityCode(value); + } else if (header.contains("省份")) { + info.setProvince(value); + } else if (header.contains("科类") || header.contains("文理") || header.contains("科目")) { + info.setCategory(value); + } else if (header.contains("计划") || header.contains("人数")) { + try { + info.setPlanCount(Integer.parseInt(value)); + } catch (NumberFormatException e) { + } + } else if (header.contains("最低") && header.contains("分")) { + try { + info.setMinScore(Double.parseDouble(value)); + } catch (NumberFormatException e) { + } + } else if (header.contains("最高") && header.contains("分")) { + try { + info.setMaxScore(Double.parseDouble(value)); + } catch (NumberFormatException e) { + } + } else if (header.contains("平均") && header.contains("分")) { + try { + info.setAvgScore(Double.parseDouble(value)); + } catch (NumberFormatException e) { + } + } else if (header.contains("最低") && header.contains("位次")) { + try { + info.setMinRank(Integer.parseInt(value)); + } catch (NumberFormatException e) { + } + } else if (header.contains("最高") && header.contains("位次")) { + try { + info.setMaxRank(Integer.parseInt(value)); + } catch (NumberFormatException e) { + } + } else if (header.contains("批次")) { + info.setBatch(value); + } else if (header.contains("备注") || header.contains("说明")) { + info.setRemarks(value); + } + } + + public static List extractUrls(String html, String baseUrl) { + List urls = new ArrayList<>(); + try { + Document doc = Jsoup.parse(html); + Elements links = doc.select("a[href]"); + + for (Element link : links) { + String href = link.attr("abs:href"); + if (href.isEmpty()) { + href = link.attr("href"); + if (!href.startsWith("http") && baseUrl != null) { + if 
(href.startsWith("/")) { + href = baseUrl + href; + } else { + href = baseUrl + "/" + href; + } + } + } + + if (!href.isEmpty() && (href.contains("zhaosheng") || href.contains("zs") || + href.contains("admission") || href.contains("fenshu") || + href.contains("score") || href.contains("lishi"))) { + urls.add(href); + } + } + } catch (Exception e) { + logger.error("提取URL失败", e); + } + return urls; + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/query/DataQuery.java b/project/src/main/java/com/hnu/crawler/query/DataQuery.java new file mode 100644 index 0000000..f85f36e --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/query/DataQuery.java @@ -0,0 +1,216 @@ +package com.hnu.crawler.query; + +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; +import com.hnu.crawler.model.AdmissionInfo; +import com.opencsv.CSVReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +public class DataQuery { + private static final Logger logger = LoggerFactory.getLogger(DataQuery.class); + private static final ObjectMapper objectMapper = new ObjectMapper(); + private static final String DATA_DIR = "data"; + + static { + objectMapper.registerModule(new JavaTimeModule()); + } + + public static List loadAllData() { + List allData = new ArrayList<>(); + Path dataPath = Paths.get(DATA_DIR); + + if (!Files.exists(dataPath)) { + logger.warn("数据目录不存在: {}", DATA_DIR); + return allData; + } + + Path csvFile = dataPath.resolve("admission_info.csv"); + if (Files.exists(csvFile)) { + allData.addAll(loadFromCsv(csvFile.toString())); + } + + Path jsonFile = 
dataPath.resolve("admission_info.json"); + if (Files.exists(jsonFile)) { + allData.addAll(loadFromJson(jsonFile.toString())); + } + + return allData; + } + + public static List loadFromCsv(String filePath) { + List data = new ArrayList<>(); + try (CSVReader reader = new CSVReader(new FileReader(filePath))) { + List rows = reader.readAll(); + if (rows.size() <= 1) { + return data; + } + + String[] headers = rows.get(0); + for (int i = 1; i < rows.size(); i++) { + String[] row = rows.get(i); + AdmissionInfo info = new AdmissionInfo(); + for (int j = 0; j < headers.length && j < row.length; j++) { + String header = headers[j]; + String value = row[j]; + setField(info, header, value); + } + data.add(info); + } + } catch (Exception e) { + logger.error("从CSV加载数据失败: {}", filePath, e); + } + return data; + } + + public static List loadFromJson(String filePath) { + try { + return objectMapper.readValue(Paths.get(filePath).toFile(), + new TypeReference>() {}); + } catch (IOException e) { + logger.error("从JSON加载数据失败: {}", filePath, e); + return new ArrayList<>(); + } + } + + private static void setField(AdmissionInfo info, String header, String value) { + if (value == null || value.isEmpty()) { + return; + } + + try { + switch (header) { + case "院校名称": + info.setUniversityName(value); + break; + case "院校代码": + info.setUniversityCode(value); + break; + case "省份": + info.setProvince(value); + break; + case "科类": + info.setCategory(value); + break; + case "专业名称": + info.setMajorName(value); + break; + case "专业代码": + info.setMajorCode(value); + break; + case "计划数": + info.setPlanCount(Integer.parseInt(value)); + break; + case "最低分": + info.setMinScore(Double.parseDouble(value)); + break; + case "最高分": + info.setMaxScore(Double.parseDouble(value)); + break; + case "平均分": + info.setAvgScore(Double.parseDouble(value)); + break; + case "最低位次": + info.setMinRank(Integer.parseInt(value)); + break; + case "最高位次": + info.setMaxRank(Integer.parseInt(value)); + break; + case "年份": + 
info.setYear(value); + break; + case "批次": + info.setBatch(value); + break; + case "备注": + info.setRemarks(value); + break; + } + } catch (NumberFormatException e) { + } + } + + public static List queryByUniversity(List data, String universityName) { + return data.stream() + .filter(info -> info.getUniversityName() != null && + info.getUniversityName().contains(universityName)) + .collect(Collectors.toList()); + } + + public static List queryByMajor(List data, String majorName) { + return data.stream() + .filter(info -> info.getMajorName() != null && + info.getMajorName().contains(majorName)) + .collect(Collectors.toList()); + } + + public static List queryByYear(List data, String year) { + return data.stream() + .filter(info -> info.getYear() != null && info.getYear().equals(year)) + .collect(Collectors.toList()); + } + + public static List queryByScoreRange(List data, + double minScore, double maxScore) { + return data.stream() + .filter(info -> info.getMinScore() != null && + info.getMinScore() >= minScore && info.getMinScore() <= maxScore) + .collect(Collectors.toList()); + } + + public static List sortByScore(List data, boolean ascending) { + return data.stream() + .sorted((a, b) -> { + Double scoreA = a.getMinScore(); + Double scoreB = b.getMinScore(); + + if (scoreA == null && scoreB == null) return 0; + if (scoreA == null) return 1; + if (scoreB == null) return -1; + + return ascending ? 
scoreA.compareTo(scoreB) : scoreB.compareTo(scoreA); + }) + .collect(Collectors.toList()); + } + + public static void printResults(List results) { + if (results.isEmpty()) { + System.out.println("未找到匹配的结果"); + return; + } + + System.out.println("\n查询结果 (共 " + results.size() + " 条):"); + System.out.println("=".repeat(120)); + System.out.printf("%-15s %-10s %-20s %-8s %-8s %-8s %s%n", + "院校", "年份", "专业", "最低分", "最高分", "平均分", "批次"); + System.out.println("-".repeat(120)); + + for (AdmissionInfo info : results) { + System.out.printf("%-15s %-10s %-20s %-8.1f %-8.1f %-8.1f %s%n", + truncate(info.getUniversityName(), 15), + info.getYear() != null ? info.getYear() : "", + truncate(info.getMajorName(), 20), + info.getMinScore() != null ? info.getMinScore() : 0, + info.getMaxScore() != null ? info.getMaxScore() : 0, + info.getAvgScore() != null ? info.getAvgScore() : 0, + info.getBatch() != null ? info.getBatch() : ""); + } + System.out.println("=".repeat(120)); + } + + private static String truncate(String str, int maxLen) { + if (str == null) return ""; + return str.length() > maxLen ? str.substring(0, maxLen - 2) + ".." 
: str; + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/storage/DataStorage.java b/project/src/main/java/com/hnu/crawler/storage/DataStorage.java new file mode 100644 index 0000000..663fb34 --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/storage/DataStorage.java @@ -0,0 +1,114 @@ +package com.hnu.crawler.storage; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; +import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; +import com.hnu.crawler.model.AdmissionInfo; +import com.opencsv.CSVWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; + +public class DataStorage { + private static final Logger logger = LoggerFactory.getLogger(DataStorage.class); + private static final ObjectMapper objectMapper = new ObjectMapper(); + + static { + objectMapper.registerModule(new JavaTimeModule()); + objectMapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); + objectMapper.enable(SerializationFeature.INDENT_OUTPUT); + } + + public static void ensureDirectoryExists(String directoryPath) { + try { + Path path = Paths.get(directoryPath); + if (!Files.exists(path)) { + Files.createDirectories(path); + logger.info("创建目录: {}", directoryPath); + } + } catch (IOException e) { + logger.error("创建目录失败: {}", directoryPath, e); + } + } + + public static void saveToCsv(List infoList, String filePath) { + if (infoList == null || infoList.isEmpty()) { + logger.warn("数据为空,跳过CSV保存"); + return; + } + + ensureDirectoryExists(new File(filePath).getParent()); + + try (CSVWriter writer = new CSVWriter(new FileWriter(filePath, true))) { + File file = new File(filePath); + if (!file.exists() || file.length() == 0) { + writer.writeNext(AdmissionInfo.getHeaders().toArray(new 
String[0])); + } + + for (AdmissionInfo info : infoList) { + writer.writeNext(info.toCsvRow().toArray(new String[0])); + } + + logger.info("成功保存 {} 条数据到CSV: {}", infoList.size(), filePath); + } catch (IOException e) { + logger.error("保存CSV失败: {}", filePath, e); + } + } + + public static void saveToJson(List infoList, String filePath) { + if (infoList == null || infoList.isEmpty()) { + logger.warn("数据为空,跳过JSON保存"); + return; + } + + ensureDirectoryExists(new File(filePath).getParent()); + + try { + List existingData = null; + File file = new File(filePath); + if (file.exists()) { + existingData = objectMapper.readValue(file, + objectMapper.getTypeFactory().constructCollectionType(List.class, AdmissionInfo.class)); + } + + if (existingData != null) { + existingData.addAll(infoList); + objectMapper.writeValue(file, existingData); + } else { + objectMapper.writeValue(file, infoList); + } + + logger.info("成功保存 {} 条数据到JSON: {}", infoList.size(), filePath); + } catch (IOException e) { + logger.error("保存JSON失败: {}", filePath, e); + } + } + + public static void saveToCsvOverwrite(List infoList, String filePath) { + if (infoList == null || infoList.isEmpty()) { + logger.warn("数据为空,跳过CSV保存"); + return; + } + + ensureDirectoryExists(new File(filePath).getParent()); + + try (CSVWriter writer = new CSVWriter(new FileWriter(filePath))) { + writer.writeNext(AdmissionInfo.getHeaders().toArray(new String[0])); + + for (AdmissionInfo info : infoList) { + writer.writeNext(info.toCsvRow().toArray(new String[0])); + } + + logger.info("成功覆盖保存 {} 条数据到CSV: {}", infoList.size(), filePath); + } catch (IOException e) { + logger.error("保存CSV失败: {}", filePath, e); + } + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/strategy/BlogStrategy.java b/project/src/main/java/com/hnu/crawler/strategy/BlogStrategy.java new file mode 100644 index 0000000..2e9165f --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/strategy/BlogStrategy.java @@ -0,0 +1,75 @@ 
+package com.hnu.crawler.strategy; + +import com.hnu.crawler.model.AdmissionInfo; +import com.hnu.crawler.util.HttpClientUtil; +import com.hnu.crawler.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.util.ArrayList; +import java.util.List; + +public class BlogStrategy implements CrawlerStrategy { + private final ConsoleView view; + + public BlogStrategy(ConsoleView view) { + this.view = view; + } + + @Override + public List crawl(String url) { + List results = new ArrayList<>(); + + try { + String html = HttpClientUtil.fetchHtml(url); + if (html == null) { + return results; + } + + Document doc = Jsoup.parse(html); + + Elements blogItems = doc.select(".post, .blog-post, .entry, article"); + + for (Element item : blogItems) { + String title = item.select("h1, h2, .post-title, .entry-title, .title").first() != null ? + item.select("h1, h2, .post-title, .entry-title, .title").first().text() : ""; + String articleUrl = item.select("a").first() != null ? 
+ item.select("a").first().attr("abs:href") : url; + String author = item.select(".author, .post-author, .byline").text(); + + if (!title.isEmpty()) { + AdmissionInfo info = new AdmissionInfo(); + info.setUniversityName("博客文章"); + info.setMajorName(title); + info.setYear(java.time.LocalDate.now().getYear() + ""); + info.setSourceUrl(articleUrl); + if (!author.isEmpty()) { + info.setRemarks("作者: " + author); + } + results.add(info); + } + } + + view.printInfo("解析到 " + results.size() + " 篇博客文章"); + + } catch (Exception e) { + view.printError("爬取博客失败: " + e.getMessage()); + } + + return results; + } + + @Override + public List crawl(java.util.Scanner scanner) { + view.printInfo("请输入博客网站URL: "); + String url = scanner.nextLine().trim(); + return crawl(url); + } + + @Override + public String getStrategyName() { + return "博客网站爬取"; + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/strategy/ConfigBasedCrawler.java b/project/src/main/java/com/hnu/crawler/strategy/ConfigBasedCrawler.java new file mode 100644 index 0000000..2463c6d --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/strategy/ConfigBasedCrawler.java @@ -0,0 +1,117 @@ +package com.hnu.crawler.strategy; + +import com.hnu.crawler.config.ConfigManager; +import com.hnu.crawler.model.AdmissionInfo; +import com.hnu.crawler.model.UniversityConfig; +import com.hnu.crawler.parser.AdmissionParser; +import com.hnu.crawler.util.HttpClientUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.Scanner; + +public class ConfigBasedCrawler implements CrawlerStrategy { + private static final Logger logger = LoggerFactory.getLogger(ConfigBasedCrawler.class); + + @Override + public List crawl(Scanner scanner) { + System.out.println("\n=== 从配置文件批量爬取 ==="); + List universities = ConfigManager.loadUniversities(); + + if (universities.isEmpty()) { + System.out.println("未找到配置的高校,请先编辑 config/universities.json 
文件"); + return List.of(); + } + + System.out.println("已配置的高校列表:"); + for (int i = 0; i < universities.size(); i++) { + UniversityConfig uni = universities.get(i); + System.out.printf(" %d. %s (%s)%n", i + 1, uni.getName(), uni.getProvince()); + } + + System.out.print("\n请选择要爬取的高校编号(0表示全部): "); + int choice; + try { + choice = Integer.parseInt(scanner.nextLine().trim()); + } catch (NumberFormatException e) { + System.out.println("输入无效!"); + return List.of(); + } + + List toCrawl = new ArrayList<>(); + if (choice == 0) { + toCrawl = universities; + } else if (choice > 0 && choice <= universities.size()) { + toCrawl.add(universities.get(choice - 1)); + } else { + System.out.println("无效的选择!"); + return List.of(); + } + + System.out.print("请输入请求间隔(毫秒,默认2000): "); + String delayStr = scanner.nextLine().trim(); + long delay = delayStr.isEmpty() ? 2000 : Long.parseLong(delayStr); + + List allResults = new ArrayList<>(); + for (UniversityConfig uni : toCrawl) { + if (!uni.isEnabled()) { + System.out.printf("跳过已禁用的高校: %s%n", uni.getName()); + continue; + } + + System.out.printf("%n正在处理: %s%n", uni.getName()); + + for (UniversityConfig.AdmissionPageConfig page : uni.getAdmissionPages()) { + if (!page.isEnabled()) { + System.out.printf(" 跳过已禁用的页面: %s%n", page.getDescription()); + continue; + } + + System.out.printf(" 爬取 %s: %s%n", page.getYear(), page.getDescription()); + + String html = HttpClientUtil.fetchHtml(page.getUrl()); + if (html != null) { + List infoList = AdmissionParser.parseTable( + html, page.getUrl(), uni.getName(), page.getYear()); + + for (AdmissionInfo info : infoList) { + info.setUniversityCode(uni.getCode()); + info.setProvince(uni.getProvince()); + } + + if (!infoList.isEmpty()) { + allResults.addAll(infoList); + System.out.printf(" 成功获取 %d 条数据%n", infoList.size()); + } else { + System.out.println(" 未解析到数据"); + } + } else { + System.out.println(" 获取页面失败"); + } + + try { + Thread.sleep(delay); + } catch (InterruptedException e) { + 
Thread.currentThread().interrupt(); + } + } + } + + System.out.printf("%n批量爬取完成!共获取 %d 条数据%n", allResults.size()); + return allResults; + } + + @Override + public List crawl(String url) { + System.out.println("配置文件爬取策略需要从配置文件读取URL列表"); + System.out.println("请使用菜单模式或配置文件进行批量爬取"); + return List.of(); + } + + @Override + public String getStrategyName() { + return "配置文件批量爬取"; + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/strategy/CrawlerStrategy.java b/project/src/main/java/com/hnu/crawler/strategy/CrawlerStrategy.java new file mode 100644 index 0000000..271e043 --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/strategy/CrawlerStrategy.java @@ -0,0 +1,11 @@ +package com.hnu.crawler.strategy; + +import com.hnu.crawler.model.AdmissionInfo; +import java.util.List; +import java.util.Scanner; + +public interface CrawlerStrategy { + List crawl(Scanner scanner); + List crawl(String url); + String getStrategyName(); +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/strategy/NewsStrategy.java b/project/src/main/java/com/hnu/crawler/strategy/NewsStrategy.java new file mode 100644 index 0000000..0ae0ac2 --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/strategy/NewsStrategy.java @@ -0,0 +1,72 @@ +package com.hnu.crawler.strategy; + +import com.hnu.crawler.model.AdmissionInfo; +import com.hnu.crawler.util.HttpClientUtil; +import com.hnu.crawler.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.util.ArrayList; +import java.util.List; + +public class NewsStrategy implements CrawlerStrategy { + private final ConsoleView view; + + public NewsStrategy(ConsoleView view) { + this.view = view; + } + + @Override + public List crawl(String url) { + List results = new ArrayList<>(); + + try { + String html = HttpClientUtil.fetchHtml(url); + if (html == null) { + return results; + } + + Document 
doc = Jsoup.parse(html); + + Elements newsItems = doc.select("article, .news-item, .list-item, .item, .article-item"); + + for (Element item : newsItems) { + String title = item.select("h2, h3, .title, a").first() != null ? + item.select("h2, h3, .title, a").first().text() : ""; + String articleUrl = item.select("a").first() != null ? + item.select("a").first().attr("abs:href") : url; + String summary = item.select(".summary, .description, p").text(); + + if (!title.isEmpty()) { + AdmissionInfo info = new AdmissionInfo(); + info.setUniversityName("新闻网站"); + info.setMajorName(title); + info.setYear(java.time.LocalDate.now().getYear() + ""); + info.setSourceUrl(articleUrl); + results.add(info); + } + } + + view.printInfo("解析到 " + results.size() + " 条新闻"); + + } catch (Exception e) { + view.printError("爬取新闻失败: " + e.getMessage()); + } + + return results; + } + + @Override + public List crawl(java.util.Scanner scanner) { + view.printInfo("请输入新闻网站URL: "); + String url = scanner.nextLine().trim(); + return crawl(url); + } + + @Override + public String getStrategyName() { + return "新闻网站爬取"; + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/strategy/SinglePageCrawler.java b/project/src/main/java/com/hnu/crawler/strategy/SinglePageCrawler.java new file mode 100644 index 0000000..3b7a7a4 --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/strategy/SinglePageCrawler.java @@ -0,0 +1,67 @@ +package com.hnu.crawler.strategy; + +import com.hnu.crawler.model.AdmissionInfo; +import com.hnu.crawler.parser.AdmissionParser; +import com.hnu.crawler.util.HttpClientUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.Scanner; + +public class SinglePageCrawler implements CrawlerStrategy { + private static final Logger logger = LoggerFactory.getLogger(SinglePageCrawler.class); + + @Override + public List crawl(Scanner scanner) { + System.out.println("\n=== 单页面爬取 ==="); + 
System.out.print("请输入目标URL: "); + String url = scanner.nextLine().trim(); + + System.out.print("请输入院校名称: "); + String universityName = scanner.nextLine().trim(); + + System.out.print("请输入年份: "); + String year = scanner.nextLine().trim(); + + System.out.print("请输入请求间隔(毫秒,默认1000): "); + String delayStr = scanner.nextLine().trim(); +/* NOTE(review): delay is parsed here but never read in the rest of this method, and a non-numeric entry makes Long.parseLong throw NumberFormatException, which nothing below catches. */ long delay = delayStr.isEmpty() ? 1000 : Long.parseLong(delayStr); + + logger.info("开始爬取: {}", url); + + String html = HttpClientUtil.fetchHtml(url); + if (html != null) { + List infoList = AdmissionParser.parseTable(html, url, universityName, year); + + if (!infoList.isEmpty()) { + System.out.println("爬取完成!共获取 " + infoList.size() + " 条数据"); + return infoList; + } else { + System.out.println("未解析到数据,请检查页面结构"); + } + } else { + System.out.println("获取页面失败,请检查URL是否正确"); + } + return List.of(); + } + +/** Fetches url via HttpClientUtil.fetchHtml and parses it with AdmissionParser.parseTable, passing a fixed placeholder university name and an empty year. Returns the parsed list when it is non-empty; returns List.of() when the fetch yields null or nothing was parsed. */ @Override + public List crawl(String url) { + logger.info("开始爬取: {}", url); + String html = HttpClientUtil.fetchHtml(url); + if (html != null) { + List infoList = AdmissionParser.parseTable(html, url, "未知院校", ""); + if (!infoList.isEmpty()) { + System.out.println("爬取完成!共获取 " + infoList.size() + " 条数据"); + return infoList; + } + } + return List.of(); + } + +/** Returns the fixed label of this strategy. */ @Override + public String getStrategyName() { + return "单页面爬取"; + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/strategy/UniversityStrategy.java b/project/src/main/java/com/hnu/crawler/strategy/UniversityStrategy.java new file mode 100644 index 0000000..aba2483 --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/strategy/UniversityStrategy.java @@ -0,0 +1,108 @@ +package com.hnu.crawler.strategy; + +import com.hnu.crawler.model.AdmissionInfo; +import com.hnu.crawler.util.HttpClientUtil; +import com.hnu.crawler.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.util.ArrayList; +import java.util.List; +/** CrawlerStrategy implementation that downloads one page, turns the rows of every HTML table into AdmissionInfo records, and falls back to list-style entries when no table row produced a record. Progress and errors are reported through the injected ConsoleView. */ +public class UniversityStrategy
implements CrawlerStrategy { + private final ConsoleView view; + +/** Stores the view that crawl uses to report progress and errors. */ public UniversityStrategy(ConsoleView view) { + this.view = view; + } + +/** Downloads url and converts it into AdmissionInfo records. Returns an empty list when the fetch yields null. Otherwise the title text, cut to 50 characters, becomes the university name, and every table row after the first that has at least two td or th cells yields a record: cell 0 is the major name, the digits and dots of cell 1 become the minimum score, and the digits of cell 2 become the year. If no table row yielded a record, elements matching .list-item, .news-item or .notice-item are used instead. Any exception is reported through the view and the records collected so far are returned. */ @Override + public List crawl(String url) { + List results = new ArrayList<>(); + + try { + String html = HttpClientUtil.fetchHtml(url); + if (html == null) { + return results; + } + + Document doc = Jsoup.parse(html); + +/* the page title doubles as the university name, capped at 50 characters */ String universityName = doc.select("title").text(); + if (universityName.length() > 50) { + universityName = universityName.substring(0, 50); + } + + Elements tables = doc.select("table"); + + for (Element table : tables) { + Elements rows = table.select("tr"); + +/* index 0 is skipped, presumably because it is the header row */ for (int i = 1; i < rows.size(); i++) { + Element row = rows.get(i); + Elements cells = row.select("td, th"); + + if (cells.size() >= 2) { + AdmissionInfo info = new AdmissionInfo(); + info.setUniversityName(universityName); + info.setMajorName(cells.get(0).text()); + +/* always true at this point: the enclosing branch already requires at least two cells */ if (cells.size() > 1) { + try { + String scoreStr = cells.get(1).text().replaceAll("[^0-9.]", ""); + if (!scoreStr.isEmpty()) { + info.setMinScore(Double.parseDouble(scoreStr)); + } + } catch (Exception ignored) { +/* a score that does not parse as a double is dropped silently and setMinScore is not called */ } + } + + if (cells.size() > 2) { + info.setYear(cells.get(2).text().replaceAll("[^0-9]", "")); + } + + info.setSourceUrl(url); + results.add(info); + } + } + } + +/* fallback for pages where no table row produced a record */ if (results.isEmpty()) { + Elements listItems = doc.select(".list-item, .news-item, .notice-item"); + for (Element item : listItems) { + String title = item.select("a, .title").text(); +/* NOTE(review): the document was parsed without a base URI, so abs:href presumably comes back empty for relative links and the page url is stored instead; confirm against the Jsoup docs */ String link = item.select("a").attr("abs:href"); + + if (!title.isEmpty()) { + AdmissionInfo info = new AdmissionInfo(); + info.setUniversityName(universityName); + info.setMajorName(title); + info.setSourceUrl(link.isEmpty() ?
url : link); + results.add(info); + } + } + } + + view.printInfo("解析到 " + results.size() + " 条招生信息"); + + } catch (Exception e) { +/* best effort: only the exception message is shown, and the records gathered so far are still returned below */ view.printError("爬取高校网站失败: " + e.getMessage()); + } + + return results; + } + +/** Prints a prompt through the view, reads one trimmed line from scanner as the URL, and delegates to crawl(String). */ @Override + public List crawl(java.util.Scanner scanner) { + view.printInfo("请输入高校招生网站URL: "); + String url = scanner.nextLine().trim(); + return crawl(url); + } + +/** Returns the fixed label of this strategy. */ @Override + public String getStrategyName() { + return "高校网站爬取"; + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/util/HttpClientUtil.java b/project/src/main/java/com/hnu/crawler/util/HttpClientUtil.java new file mode 100644 index 0000000..1ce348d --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/util/HttpClientUtil.java @@ -0,0 +1,52 @@ +package com.hnu.crawler.util; + +import org.apache.hc.client5.http.classic.methods.HttpGet; +import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; +import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; +import org.apache.hc.client5.http.impl.classic.HttpClients; +import org.apache.hc.core5.http.ParseException; +import org.apache.hc.core5.http.io.entity.EntityUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +/** Static helpers used by the crawler strategies: fetchHtml performs one HTTP GET and returns the response body as text, and sleep pauses the calling thread. */ +public class HttpClientUtil { + private static final Logger logger = LoggerFactory.getLogger(HttpClientUtil.class); +/* NOTE(review): TIMEOUT is declared but never referenced in this class, so the default client is created without it */ private static final int TIMEOUT = 30000; + private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"; + +/** Sends a GET request for url with browser-like headers, using a client that is created and closed on every call. Returns the response body as a String when the status code is 200. Returns null, after logging an error, for any other status code and when an IOException or ParseException is thrown. */ public static String fetchHtml(String url) { + try (CloseableHttpClient httpClient = HttpClients.createDefault()) { + HttpGet request = new HttpGet(url); + request.setHeader("User-Agent", USER_AGENT); + request.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); + request.setHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8"); + request.setHeader("Connection",
"keep-alive"); + + try (CloseableHttpResponse response = httpClient.execute(request)) { + int statusCode = response.getCode(); +/* only an exact 200 counts as success; every other status is logged and mapped to null */ if (statusCode == 200) { +/* UTF-8 is passed as the charset argument; NOTE(review): confirm in the EntityUtils docs whether a charset declared by the response takes precedence */ String html = EntityUtils.toString(response.getEntity(), "UTF-8"); + logger.info("成功获取页面: {}", url); + return html; + } else { + logger.error("请求失败,状态码: {}, URL: {}", statusCode, url); + return null; + } + } + } catch (IOException | ParseException e) { + logger.error("获取页面失败: {}", url, e); + return null; + } + } + +/** Sleeps for the given number of milliseconds. If the sleep is interrupted, the interrupt flag of the current thread is set again and a warning is logged; no exception reaches the caller. */ public static void sleep(long milliseconds) { + try { + Thread.sleep(milliseconds); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + logger.warn("睡眠被中断", e); + } + } +} \ No newline at end of file diff --git a/project/src/main/java/com/hnu/crawler/view/ConsoleView.java b/project/src/main/java/com/hnu/crawler/view/ConsoleView.java new file mode 100644 index 0000000..58aa8f6 --- /dev/null +++ b/project/src/main/java/com/hnu/crawler/view/ConsoleView.java @@ -0,0 +1,33 @@ +package com.hnu.crawler.view; + +import java.util.Scanner; +/** Console helper that reads lines from System.in and prints messages to System.out, wrapping success, error and info messages in ANSI green, red and blue escape codes followed by a reset code. */ +public class ConsoleView { + private static final String ANSI_RESET = "\u001B[0m"; + private static final String ANSI_GREEN = "\u001B[32m"; + private static final String ANSI_RED = "\u001B[31m"; + private static final String ANSI_BLUE = "\u001B[34m"; + +/* one Scanner over System.in is created per ConsoleView instance */ private final Scanner scanner = new Scanner(System.in); + +/** Prints the prompt marker without a line break and returns the next line read from standard input. */ public String readLine() { + System.out.print("> "); + return scanner.nextLine(); + } + +/** Prints msg in green on its own line. */ public void printSuccess(String msg) { + System.out.println(ANSI_GREEN + msg + ANSI_RESET); + } + +/** Prints msg in red on its own line. NOTE(review): errors go to System.out, not System.err. */ public void printError(String msg) { + System.out.println(ANSI_RED + msg + ANSI_RESET); + } + +/** Prints msg in blue on its own line. */ public void printInfo(String msg) { + System.out.println(ANSI_BLUE + msg + ANSI_RESET); + } + +/** Prints msg on its own line without any color codes. */ public void print(String msg) { + System.out.println(msg); + } +} \ No newline at end of file diff --git a/project/src/main/resources/logback.xml b/project/src/main/resources/logback.xml new file mode 100644 index 0000000..cfcf5b6 --- /dev/null +++
b/project/src/main/resources/logback.xml @@ -0,0 +1,12 @@ + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + UTF-8 + + + + + + + \ No newline at end of file diff --git a/project/src/test/java/BankAccountTest.java b/project/src/test/java/BankAccountTest.java new file mode 100644 index 0000000..39c6e1b --- /dev/null +++ b/project/src/test/java/BankAccountTest.java @@ -0,0 +1,70 @@ +import static org.junit.jupiter.api.Assertions.*; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.DisplayName; +/** JUnit Jupiter tests for BankAccount covering construction, deposit, withdraw and setOwnerName. Balances are doubles, so every balance assertion uses a tolerance of 0.001. */ +class BankAccountTest { + + private BankAccount account; + +/** Creates a fresh BankAccount before each test so that tests do not share a balance. */ @BeforeEach + void setUp() { + account = new BankAccount("1234567890", "张三"); + } + +/** A new account exposes the constructor arguments and a zero balance. */ @Test + @DisplayName("测试账户初始化") + void testAccountInitialization() { + assertEquals("1234567890", account.getAccountNumber()); + assertEquals("张三", account.getOwnerName()); + assertEquals(0.0, account.getBalance(), 0.001); + } + +/** A positive deposit is added to the balance. */ @Test + @DisplayName("测试存款功能") + void testDeposit() { + account.deposit(1000.0); + assertEquals(1000.0, account.getBalance(), 0.001); + } + +/** A negative deposit is expected to leave the balance unchanged. */ @Test + @DisplayName("测试存款负数") + void testDepositNegativeAmount() { + double initialBalance = account.getBalance(); + account.deposit(-100.0); + assertEquals(initialBalance, account.getBalance(), 0.001); + } + +/** A withdrawal that the balance covers is subtracted from it. */ @Test + @DisplayName("测试取款功能") + void testWithdraw() { + account.deposit(1000.0); + account.withdraw(500.0); + assertEquals(500.0, account.getBalance(), 0.001); + } + +/** Withdrawing more than the balance is expected to leave the balance unchanged. */ @Test + @DisplayName("测试取款超过余额") + void testWithdrawInsufficientBalance() { + account.deposit(500.0); + double balanceBefore = account.getBalance(); + account.withdraw(1000.0); + assertEquals(balanceBefore, account.getBalance(), 0.001); + } + +/** A negative withdrawal is expected to leave the balance unchanged. */ @Test + @DisplayName("测试取款负数") + void testWithdrawNegativeAmount() { + account.deposit(1000.0); + double balanceBefore = account.getBalance(); + account.withdraw(-100.0); + assertEquals(balanceBefore, account.getBalance(), 0.001); + } + +/** setOwnerName replaces the name returned by getOwnerName. */ @Test + @DisplayName("测试设置户主姓名") + void
testSetOwnerName() { + account.setOwnerName("李四"); + assertEquals("李四", account.getOwnerName()); + } +} \ No newline at end of file