26 changed files with 2283 additions and 0 deletions
Binary file not shown.
@ -0,0 +1,58 @@ |
|||
/**
 * A minimal bank account holding an account number, an owner name and a balance.
 *
 * <p>Deposits and withdrawals report their outcome on standard output instead of
 * throwing; a rejected operation leaves the balance untouched.
 */
public class BankAccount {

    /** Assigned once in the constructor; the class exposes no setter for it. */
    private final String accountNumber;

    private String ownerName;

    /** Starts at 0.0 and is only modified by {@link #deposit} and {@link #withdraw}. */
    private double balance;

    /**
     * Creates an account with a zero balance.
     *
     * @param accountNumber identifier of the account
     * @param ownerName     name of the account holder
     */
    public BankAccount(String accountNumber, String ownerName) {
        this.accountNumber = accountNumber;
        this.ownerName = ownerName;
        this.balance = 0.0;
    }

    /** @return the account number given at construction */
    public String getAccountNumber() {
        return accountNumber;
    }

    /** @return the current owner name */
    public String getOwnerName() {
        return ownerName;
    }

    /** @param ownerName the new owner name */
    public void setOwnerName(String ownerName) {
        this.ownerName = ownerName;
    }

    /** @return the current balance */
    public double getBalance() {
        return balance;
    }

    /**
     * Adds {@code amount} to the balance and prints the new balance.
     *
     * <p>The amount must be a finite number greater than zero, and the resulting
     * balance must stay finite; otherwise a message is printed and nothing changes.
     *
     * @param amount the amount to deposit
     */
    public void deposit(double amount) {
        // Infinity satisfies "amount > 0" but would make the balance infinite, and a
        // later subtraction could then turn it into NaN. Reject it (and NaN) up front.
        if (!Double.isFinite(amount)) {
            System.out.println("存款金额无效");
            return;
        }
        if (amount > 0) {
            double updated = balance + amount;
            if (!Double.isFinite(updated)) {
                // Two huge finite values can still overflow to Infinity.
                System.out.println("存款金额无效");
                return;
            }
            balance = updated;
            System.out.println("存款成功,当前余额:" + balance);
        } else {
            System.out.println("存款金额必须大于0");
        }
    }

    /**
     * Subtracts {@code amount} from the balance and prints the new balance.
     *
     * <p>The amount must be greater than zero and not exceed the balance; otherwise
     * a message is printed and nothing changes. NaN fails both comparisons and is
     * therefore rejected as well.
     *
     * @param amount the amount to withdraw
     */
    public void withdraw(double amount) {
        if (amount > 0 && amount <= balance) {
            balance -= amount;
            System.out.println("取款成功,当前余额:" + balance);
        } else {
            System.out.println("余额不足或金额无效");
        }
    }

    /** Small demonstration: creates an account and runs a few valid and invalid operations. */
    public static void main(String[] args) {
        BankAccount account = new BankAccount("1234567890", "张三");
        System.out.println("账户创建成功!");
        System.out.println("账户号:" + account.getAccountNumber());
        System.out.println("户主:" + account.getOwnerName());
        System.out.println("初始余额:" + account.getBalance());

        account.deposit(1000);
        account.withdraw(500);
        account.withdraw(600);
        account.deposit(-100);
    }
}
|||
@ -0,0 +1,338 @@ |
|||
package com.hnu.crawler; |
|||
|
|||
import com.hnu.crawler.command.Command; |
|||
import com.hnu.crawler.command.CrawlCommand; |
|||
import com.hnu.crawler.command.DemoCommand; |
|||
import com.hnu.crawler.command.ExitCommand; |
|||
import com.hnu.crawler.command.HelpCommand; |
|||
import com.hnu.crawler.command.ListCommand; |
|||
import com.hnu.crawler.command.QueryCommand; |
|||
import com.hnu.crawler.config.ConfigManager; |
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
import com.hnu.crawler.model.UniversityConfig; |
|||
import com.hnu.crawler.query.DataQuery; |
|||
import com.hnu.crawler.storage.DataStorage; |
|||
import com.hnu.crawler.strategy.ConfigBasedCrawler; |
|||
import com.hnu.crawler.strategy.CrawlerStrategy; |
|||
import com.hnu.crawler.strategy.SinglePageCrawler; |
|||
import com.hnu.crawler.view.ConsoleView; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.HashMap; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
import java.util.Scanner; |
|||
|
|||
public class AdmissionCrawlerMain { |
|||
private static final Logger logger = LoggerFactory.getLogger(AdmissionCrawlerMain.class); |
|||
private static final String OUTPUT_DIR = "data"; |
|||
private static final String CSV_FILE = OUTPUT_DIR + "/admission_info.csv"; |
|||
private static final String JSON_FILE = OUTPUT_DIR + "/admission_info.json"; |
|||
|
|||
public static void main(String[] args) { |
|||
System.out.println("========================================"); |
|||
System.out.println(" 高校招生信息爬虫系统 v1.0"); |
|||
System.out.println("========================================"); |
|||
|
|||
try { |
|||
ConfigManager.createSampleConfig(); |
|||
System.out.println("[INFO] 配置文件加载完成"); |
|||
|
|||
ConsoleView view = new ConsoleView(); |
|||
Map<String, Command> commands = registerCommands(view); |
|||
System.out.println("[INFO] 命令注册完成"); |
|||
|
|||
if (args.length > 0) { |
|||
executeCommand(args, commands, view); |
|||
return; |
|||
} |
|||
|
|||
runInteractiveMode(commands, view); |
|||
} catch (Exception e) { |
|||
System.err.println("[ERROR] 程序启动失败: " + e.getMessage()); |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
|
|||
private static Map<String, Command> registerCommands(ConsoleView view) { |
|||
Map<String, Command> commands = new HashMap<>(); |
|||
commands.put("crawl", new CrawlCommand(view)); |
|||
commands.put("list", new ListCommand(view)); |
|||
commands.put("query", new QueryCommand(view)); |
|||
commands.put("demo", new DemoCommand(view)); |
|||
commands.put("help", new HelpCommand(view)); |
|||
commands.put("exit", new ExitCommand(view)); |
|||
return commands; |
|||
} |
|||
|
|||
private static void executeCommand(String[] args, Map<String, Command> commands, ConsoleView view) { |
|||
String cmdName = args[0].toLowerCase(); |
|||
Command command = commands.get(cmdName); |
|||
|
|||
if (command != null) { |
|||
command.execute(args); |
|||
} else { |
|||
view.printError("未知命令: " + cmdName); |
|||
view.printInfo("输入 help 查看可用命令"); |
|||
} |
|||
} |
|||
|
|||
private static void runInteractiveMode(Map<String, Command> commands, ConsoleView view) { |
|||
view.printSuccess("欢迎使用高校招生信息爬虫系统!"); |
|||
view.printInfo("输入 'help' 查看可用命令"); |
|||
|
|||
Scanner scanner = new Scanner(System.in); |
|||
|
|||
while (true) { |
|||
String input = view.readLine().trim(); |
|||
if (input.isEmpty()) continue; |
|||
|
|||
String[] parts = input.split("\\s+"); |
|||
String cmdName = parts[0].toLowerCase(); |
|||
|
|||
if (cmdName.equals("menu")) { |
|||
showMainMenu(view); |
|||
handleMenuSelection(scanner, view); |
|||
continue; |
|||
} |
|||
|
|||
Command command = commands.get(cmdName); |
|||
if (command != null) { |
|||
command.execute(parts); |
|||
} else { |
|||
view.printError("未知命令: " + cmdName); |
|||
view.printInfo("输入 'help' 查看可用命令,或输入 'menu' 返回菜单模式"); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private static void showMainMenu(ConsoleView view) { |
|||
view.print("\n"); |
|||
view.print("╔══════════════════════════════════════════╗"); |
|||
view.print("║ 高校本科招生信息爬虫系统 ║"); |
|||
view.print("╠══════════════════════════════════════════╣"); |
|||
view.print("║ 1. 单页面爬取 ║"); |
|||
view.print("║ 2. 从配置文件批量爬取 ║"); |
|||
view.print("║ 3. CLI命令模式 ║"); |
|||
view.print("║ 4. 数据查询 ║"); |
|||
view.print("║ 5. 演示模式(生成模拟数据) ║"); |
|||
view.print("║ 6. 查看配置高校列表 ║"); |
|||
view.print("║ 0. 退出程序 ║"); |
|||
view.print("╚══════════════════════════════════════════╝"); |
|||
view.print("请选择操作 (0-6): "); |
|||
} |
|||
|
|||
private static void handleMenuSelection(Scanner scanner, ConsoleView view) { |
|||
int choice; |
|||
try { |
|||
choice = Integer.parseInt(scanner.nextLine().trim()); |
|||
} catch (NumberFormatException e) { |
|||
view.printError("输入无效,请输入数字!"); |
|||
return; |
|||
} |
|||
|
|||
switch (choice) { |
|||
case 1: |
|||
crawlWithStrategy(new SinglePageCrawler(), scanner, view); |
|||
break; |
|||
case 2: |
|||
crawlWithStrategy(new ConfigBasedCrawler(), scanner, view); |
|||
break; |
|||
case 3: |
|||
view.printInfo("已切换到CLI命令模式"); |
|||
view.printInfo("输入 'menu' 返回菜单模式,输入 'help' 查看命令"); |
|||
break; |
|||
case 4: |
|||
queryData(scanner, view); |
|||
break; |
|||
case 5: |
|||
demoMode(view); |
|||
break; |
|||
case 6: |
|||
showConfig(view); |
|||
break; |
|||
case 0: |
|||
logger.info("程序退出"); |
|||
scanner.close(); |
|||
System.exit(0); |
|||
break; |
|||
default: |
|||
view.printError("无效选项,请重新选择!"); |
|||
} |
|||
} |
|||
|
|||
private static void crawlWithStrategy(CrawlerStrategy strategy, Scanner scanner, ConsoleView view) { |
|||
logger.info("使用{}策略进行爬取", strategy.getStrategyName()); |
|||
List<AdmissionInfo> results = strategy.crawl(scanner); |
|||
|
|||
if (!results.isEmpty()) { |
|||
DataStorage.saveToCsv(results, CSV_FILE); |
|||
DataStorage.saveToJson(results, JSON_FILE); |
|||
view.printSuccess("数据已保存到: " + CSV_FILE + " 和 " + JSON_FILE); |
|||
} |
|||
} |
|||
|
|||
private static void queryData(Scanner scanner, ConsoleView view) { |
|||
view.print("\n=== 数据查询 ==="); |
|||
List<AdmissionInfo> allData = DataQuery.loadAllData(); |
|||
|
|||
if (allData.isEmpty()) { |
|||
view.printInfo("暂无数据,请先进行爬取或使用演示模式!"); |
|||
return; |
|||
} |
|||
|
|||
view.print("当前共有 " + allData.size() + " 条数据"); |
|||
|
|||
while (true) { |
|||
view.print("\n查询选项:"); |
|||
view.print(" 1. 按院校查询"); |
|||
view.print(" 2. 按专业查询"); |
|||
view.print(" 3. 按年份查询"); |
|||
view.print(" 4. 按分数段查询"); |
|||
view.print(" 5. 查看所有数据(按分数排序)"); |
|||
view.print(" 0. 返回主菜单"); |
|||
view.print("请选择查询方式: "); |
|||
|
|||
int choice; |
|||
try { |
|||
choice = Integer.parseInt(scanner.nextLine().trim()); |
|||
} catch (NumberFormatException e) { |
|||
view.printError("输入无效!"); |
|||
continue; |
|||
} |
|||
|
|||
List<AdmissionInfo> results = new ArrayList<>(); |
|||
|
|||
switch (choice) { |
|||
case 1: |
|||
view.print("请输入院校名称(支持模糊匹配): "); |
|||
String uniName = scanner.nextLine().trim(); |
|||
results = DataQuery.queryByUniversity(allData, uniName); |
|||
break; |
|||
case 2: |
|||
view.print("请输入专业名称(支持模糊匹配): "); |
|||
String majorName = scanner.nextLine().trim(); |
|||
results = DataQuery.queryByMajor(allData, majorName); |
|||
break; |
|||
case 3: |
|||
view.print("请输入年份: "); |
|||
String year = scanner.nextLine().trim(); |
|||
results = DataQuery.queryByYear(allData, year); |
|||
break; |
|||
case 4: |
|||
view.print("请输入最低分数: "); |
|||
double minScore = Double.parseDouble(scanner.nextLine().trim()); |
|||
view.print("请输入最高分数: "); |
|||
double maxScore = Double.parseDouble(scanner.nextLine().trim()); |
|||
results = DataQuery.queryByScoreRange(allData, minScore, maxScore); |
|||
break; |
|||
case 5: |
|||
results = DataQuery.sortByScore(allData, false); |
|||
break; |
|||
case 0: |
|||
return; |
|||
default: |
|||
view.printError("无效选项!"); |
|||
continue; |
|||
} |
|||
|
|||
DataQuery.printResults(results); |
|||
} |
|||
} |
|||
|
|||
private static void showConfig(ConsoleView view) { |
|||
view.print("\n=== 已配置高校列表 ==="); |
|||
List<UniversityConfig> universities = ConfigManager.loadUniversities(); |
|||
|
|||
if (universities.isEmpty()) { |
|||
view.print("暂无配置,请编辑 config/universities.json 文件"); |
|||
return; |
|||
} |
|||
|
|||
for (UniversityConfig uni : universities) { |
|||
view.print(String.format("%n【%s】%s (%s)", |
|||
uni.isEnabled() ? "●" : "○", |
|||
uni.getName(), |
|||
uni.getProvince())); |
|||
view.print(String.format(" 院校代码: %s", uni.getCode())); |
|||
view.print(String.format(" 配置页面数: %d", |
|||
uni.getAdmissionPages() != null ? uni.getAdmissionPages().size() : 0)); |
|||
|
|||
if (uni.getAdmissionPages() != null) { |
|||
for (UniversityConfig.AdmissionPageConfig page : uni.getAdmissionPages()) { |
|||
view.print(String.format(" [%s] %s - %s", |
|||
page.isEnabled() ? "启用" : "禁用", |
|||
page.getYear(), |
|||
page.getDescription())); |
|||
} |
|||
} |
|||
} |
|||
view.print("\n提示: 编辑 config/universities.json 文件可添加更多高校配置"); |
|||
} |
|||
|
|||
private static void demoMode(ConsoleView view) { |
|||
view.print("\n=== 演示模式 ==="); |
|||
logger.info("进入演示模式"); |
|||
|
|||
List<AdmissionInfo> demoData = createDemoData(); |
|||
|
|||
view.print("生成演示数据..."); |
|||
view.print("共生成 " + demoData.size() + " 条演示数据"); |
|||
|
|||
DataStorage.saveToCsvOverwrite(demoData, CSV_FILE); |
|||
DataStorage.saveToJson(demoData, JSON_FILE); |
|||
|
|||
view.printSuccess("\n演示数据已保存到:"); |
|||
view.print(" - CSV: " + CSV_FILE); |
|||
view.print(" - JSON: " + JSON_FILE); |
|||
view.print("\n现在可以选择「数据查询」功能来查询演示数据!"); |
|||
} |
|||
|
|||
private static List<AdmissionInfo> createDemoData() { |
|||
List<AdmissionInfo> data = new ArrayList<>(); |
|||
|
|||
String[] universities = {"湖南大学", "中南大学", "湖南师范大学", "国防科技大学"}; |
|||
String[] majors = {"计算机科学与技术", "软件工程", "电子信息工程", "机械工程", "土木工程", "金融学", "临床医学"}; |
|||
String[] categories = {"物理类", "历史类"}; |
|||
String[] batches = {"本科批", "本科提前批"}; |
|||
String[] years = {"2022", "2023", "2024"}; |
|||
|
|||
int id = 1; |
|||
for (String year : years) { |
|||
for (String university : universities) { |
|||
for (String major : majors) { |
|||
for (String category : categories) { |
|||
AdmissionInfo info = new AdmissionInfo(); |
|||
info.setUniversityName(university); |
|||
info.setUniversityCode(String.format("%04d", 10530 + id++ % 10)); |
|||
info.setProvince("湖南省"); |
|||
info.setCategory(category); |
|||
info.setMajorName(major); |
|||
info.setMajorCode(String.format("%06d", 800000 + id * 10)); |
|||
info.setPlanCount((int) (Math.random() * 50 + 10)); |
|||
|
|||
double baseScore = 550; |
|||
if (university.equals("国防科技大学")) baseScore += 50; |
|||
if (university.equals("中南大学")) baseScore += 30; |
|||
if (major.equals("临床医学")) baseScore += 20; |
|||
if (major.equals("计算机科学与技术")) baseScore += 15; |
|||
|
|||
info.setMinScore(baseScore + Math.random() * 60); |
|||
info.setMaxScore(info.getMinScore() + Math.random() * 30); |
|||
info.setAvgScore((info.getMinScore() + info.getMaxScore()) / 2); |
|||
info.setMinRank((int) (Math.random() * 10000 + 1000)); |
|||
info.setMaxRank(info.getMinRank() + (int) (Math.random() * 500)); |
|||
info.setYear(year); |
|||
info.setBatch(batches[(int) (Math.random() * batches.length)]); |
|||
info.setSourceUrl("https://example.edu/admission/" + year); |
|||
data.add(info); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
return data; |
|||
} |
|||
} |
|||
@ -0,0 +1,6 @@ |
|||
package com.hnu.crawler.command; |
|||
|
|||
/**
 * A named console command that can be invoked with an argument array.
 */
public interface Command {

    /**
     * Runs the command.
     *
     * @param args the argument array handed over by the caller
     */
    void execute(String[] args);

    /**
     * @return the name of this command
     */
    String getName();
}
|||
@ -0,0 +1,73 @@ |
|||
package com.hnu.crawler.command; |
|||
|
|||
import com.hnu.crawler.strategy.CrawlerStrategy; |
|||
import com.hnu.crawler.strategy.NewsStrategy; |
|||
import com.hnu.crawler.strategy.BlogStrategy; |
|||
import com.hnu.crawler.strategy.UniversityStrategy; |
|||
import com.hnu.crawler.storage.DataStorage; |
|||
import com.hnu.crawler.view.ConsoleView; |
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
|
|||
import java.util.List; |
|||
import java.util.Scanner; |
|||
|
|||
public class CrawlCommand implements Command { |
|||
private final ConsoleView view; |
|||
|
|||
public CrawlCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "crawl"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args) { |
|||
if (args.length < 2) { |
|||
view.printError("用法: crawl <类型> <URL>"); |
|||
view.printInfo("支持的类型: news(新闻), blog(博客), university(高校)"); |
|||
return; |
|||
} |
|||
|
|||
String type = args[1].toLowerCase(); |
|||
String url = args.length > 2 ? args[2] : ""; |
|||
|
|||
CrawlerStrategy strategy = null; |
|||
|
|||
switch (type) { |
|||
case "news": |
|||
strategy = new NewsStrategy(view); |
|||
break; |
|||
case "blog": |
|||
strategy = new BlogStrategy(view); |
|||
break; |
|||
case "university": |
|||
strategy = new UniversityStrategy(view); |
|||
break; |
|||
default: |
|||
view.printError("未知类型: " + type); |
|||
view.printInfo("支持的类型: news(新闻), blog(博客), university(高校)"); |
|||
return; |
|||
} |
|||
|
|||
if (url.isEmpty()) { |
|||
Scanner scanner = new Scanner(System.in); |
|||
view.printInfo("请输入目标URL: "); |
|||
url = scanner.nextLine().trim(); |
|||
} |
|||
|
|||
view.printInfo("开始爬取 [" + type + "]: " + url); |
|||
List<AdmissionInfo> results = strategy.crawl(url); |
|||
|
|||
if (!results.isEmpty()) { |
|||
DataStorage.saveToCsv(results, "data/crawler_results.csv"); |
|||
DataStorage.saveToJson(results, "data/crawler_results.json"); |
|||
view.printSuccess("爬取完成!共获取 " + results.size() + " 条数据"); |
|||
view.printInfo("数据已保存到 data/crawler_results.csv 和 data/crawler_results.json"); |
|||
} else { |
|||
view.printError("未获取到数据,请检查URL或网站结构"); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,60 @@ |
|||
package com.hnu.crawler.command; |
|||
|
|||
import com.hnu.crawler.storage.DataStorage; |
|||
import com.hnu.crawler.view.ConsoleView; |
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class DemoCommand implements Command { |
|||
private final ConsoleView view; |
|||
|
|||
public DemoCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "demo"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args) { |
|||
view.printInfo("生成演示数据..."); |
|||
|
|||
List<AdmissionInfo> demoData = createDemoData(); |
|||
|
|||
DataStorage.saveToCsvOverwrite(demoData, "data/crawler_results.csv"); |
|||
DataStorage.saveToJson(demoData, "data/crawler_results.json"); |
|||
|
|||
view.printSuccess("演示数据生成完成!共 " + demoData.size() + " 条"); |
|||
view.printInfo("数据已保存到 data/crawler_results.csv 和 data/crawler_results.json"); |
|||
} |
|||
|
|||
private List<AdmissionInfo> createDemoData() { |
|||
List<AdmissionInfo> data = new ArrayList<>(); |
|||
|
|||
String[] universities = {"湖南大学", "中南大学", "湖南师范大学"}; |
|||
String[] majors = {"计算机科学与技术", "软件工程", "电子信息工程"}; |
|||
String[] years = {"2022", "2023", "2024"}; |
|||
|
|||
int id = 1; |
|||
for (String year : years) { |
|||
for (String university : universities) { |
|||
for (String major : majors) { |
|||
AdmissionInfo info = new AdmissionInfo(); |
|||
info.setUniversityName(university); |
|||
info.setMajorName(major); |
|||
info.setYear(year); |
|||
info.setMinScore(550 + Math.random() * 80); |
|||
info.setMaxScore(info.getMinScore() + Math.random() * 30); |
|||
info.setAvgScore((info.getMinScore() + info.getMaxScore()) / 2); |
|||
data.add(info); |
|||
} |
|||
} |
|||
} |
|||
|
|||
return data; |
|||
} |
|||
} |
|||
@ -0,0 +1,22 @@ |
|||
package com.hnu.crawler.command; |
|||
|
|||
import com.hnu.crawler.view.ConsoleView; |
|||
|
|||
public class ExitCommand implements Command { |
|||
private final ConsoleView view; |
|||
|
|||
public ExitCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "exit"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args) { |
|||
view.printSuccess("程序退出"); |
|||
System.exit(0); |
|||
} |
|||
} |
|||
@ -0,0 +1,36 @@ |
|||
package com.hnu.crawler.command; |
|||
|
|||
import com.hnu.crawler.view.ConsoleView; |
|||
|
|||
public class HelpCommand implements Command { |
|||
private final ConsoleView view; |
|||
|
|||
public HelpCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "help"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args) { |
|||
view.printInfo("可用命令:"); |
|||
view.printInfo(" crawl <类型> [URL] - 爬取网站数据"); |
|||
view.printInfo(" 类型: news(新闻), blog(博客), university(高校)"); |
|||
view.printInfo(" 示例: crawl news https://news.example.com"); |
|||
view.printInfo(" list - 列出所有已爬取的数据"); |
|||
view.printInfo(" query <选项> <关键词> - 查询数据"); |
|||
view.printInfo(" 选项: university(院校), major(专业), year(年份)"); |
|||
view.printInfo(" 示例: query university 湖南"); |
|||
view.printInfo(" demo - 生成演示数据"); |
|||
view.printInfo(" help - 显示此帮助信息"); |
|||
view.printInfo(" exit - 退出程序"); |
|||
view.printInfo(""); |
|||
view.printInfo("支持的网站类型:"); |
|||
view.printInfo(" 1. 新闻网站 (news) - 爬取新闻列表和内容"); |
|||
view.printInfo(" 2. 博客网站 (blog) - 爬取博客文章"); |
|||
view.printInfo(" 3. 高校网站 (university) - 爬取招生信息"); |
|||
} |
|||
} |
|||
@ -0,0 +1,40 @@ |
|||
package com.hnu.crawler.command; |
|||
|
|||
import com.hnu.crawler.query.DataQuery; |
|||
import com.hnu.crawler.view.ConsoleView; |
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
|
|||
import java.util.List; |
|||
|
|||
public class ListCommand implements Command { |
|||
private final ConsoleView view; |
|||
|
|||
public ListCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "list"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args) { |
|||
List<AdmissionInfo> allData = DataQuery.loadAllData(); |
|||
|
|||
if (allData.isEmpty()) { |
|||
view.printInfo("暂无数据,请先使用 crawl 命令爬取数据"); |
|||
return; |
|||
} |
|||
|
|||
view.printInfo("共 " + allData.size() + " 条数据:"); |
|||
view.printInfo("====================================="); |
|||
for (int i = 0; i < allData.size(); i++) { |
|||
AdmissionInfo info = allData.get(i); |
|||
view.printInfo((i + 1) + ". " + info.getUniversityName() + " - " + info.getMajorName()); |
|||
view.printInfo(" 分数: " + info.getMinScore() + " - " + info.getMaxScore()); |
|||
view.printInfo(" 年份: " + info.getYear()); |
|||
} |
|||
view.printInfo("====================================="); |
|||
} |
|||
} |
|||
@ -0,0 +1,65 @@ |
|||
package com.hnu.crawler.command; |
|||
|
|||
import com.hnu.crawler.query.DataQuery; |
|||
import com.hnu.crawler.view.ConsoleView; |
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
|
|||
import java.util.List; |
|||
|
|||
public class QueryCommand implements Command { |
|||
private final ConsoleView view; |
|||
|
|||
public QueryCommand(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public String getName() { |
|||
return "query"; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(String[] args) { |
|||
if (args.length < 3) { |
|||
view.printError("用法: query <选项> <关键词>"); |
|||
view.printInfo("选项: university(院校), major(专业), year(年份)"); |
|||
return; |
|||
} |
|||
|
|||
String option = args[1].toLowerCase(); |
|||
String keyword = args[2]; |
|||
|
|||
List<AdmissionInfo> allData = DataQuery.loadAllData(); |
|||
|
|||
if (allData.isEmpty()) { |
|||
view.printInfo("暂无数据,请先爬取或使用 demo 命令生成演示数据"); |
|||
return; |
|||
} |
|||
|
|||
List<AdmissionInfo> results; |
|||
|
|||
switch (option) { |
|||
case "university": |
|||
results = DataQuery.queryByUniversity(allData, keyword); |
|||
break; |
|||
case "major": |
|||
results = DataQuery.queryByMajor(allData, keyword); |
|||
break; |
|||
case "year": |
|||
results = DataQuery.queryByYear(allData, keyword); |
|||
break; |
|||
default: |
|||
view.printError("未知选项: " + option); |
|||
return; |
|||
} |
|||
|
|||
if (results.isEmpty()) { |
|||
view.printInfo("未找到匹配的数据"); |
|||
} else { |
|||
view.printInfo("找到 " + results.size() + " 条匹配数据:"); |
|||
for (AdmissionInfo info : results) { |
|||
view.printInfo("- " + info.getUniversityName() + " - " + info.getMajorName()); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,135 @@ |
|||
package com.hnu.crawler.config; |
|||
|
|||
import com.fasterxml.jackson.core.type.TypeReference; |
|||
import com.fasterxml.jackson.databind.ObjectMapper; |
|||
import com.fasterxml.jackson.databind.SerializationFeature; |
|||
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; |
|||
import com.hnu.crawler.model.UniversityConfig; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.io.File; |
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class ConfigManager { |
|||
private static final Logger logger = LoggerFactory.getLogger(ConfigManager.class); |
|||
private static final ObjectMapper objectMapper = new ObjectMapper(); |
|||
private static final String CONFIG_DIR = "config"; |
|||
private static final String UNIVERSITIES_CONFIG = CONFIG_DIR + "/universities.json"; |
|||
|
|||
static { |
|||
objectMapper.registerModule(new JavaTimeModule()); |
|||
objectMapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); |
|||
objectMapper.enable(SerializationFeature.INDENT_OUTPUT); |
|||
} |
|||
|
|||
public static void ensureConfigDir() { |
|||
File dir = new File(CONFIG_DIR); |
|||
if (!dir.exists()) { |
|||
dir.mkdirs(); |
|||
logger.info("创建配置目录: {}", CONFIG_DIR); |
|||
} |
|||
} |
|||
|
|||
public static List<UniversityConfig> loadUniversities() { |
|||
ensureConfigDir(); |
|||
File configFile = new File(UNIVERSITIES_CONFIG); |
|||
|
|||
if (!configFile.exists()) { |
|||
logger.warn("配置文件不存在: {}", UNIVERSITIES_CONFIG); |
|||
return new ArrayList<>(); |
|||
} |
|||
|
|||
try { |
|||
return objectMapper.readValue(configFile, |
|||
new TypeReference<List<UniversityConfig>>() {}); |
|||
} catch (IOException e) { |
|||
logger.error("加载高校配置失败", e); |
|||
return new ArrayList<>(); |
|||
} |
|||
} |
|||
|
|||
public static void saveUniversities(List<UniversityConfig> universities) { |
|||
ensureConfigDir(); |
|||
try { |
|||
objectMapper.writeValue(new File(UNIVERSITIES_CONFIG), universities); |
|||
logger.info("保存高校配置成功,共 {} 所高校", universities.size()); |
|||
} catch (IOException e) { |
|||
logger.error("保存高校配置失败", e); |
|||
} |
|||
} |
|||
|
|||
public static UniversityConfig findUniversityById(String id) { |
|||
List<UniversityConfig> universities = loadUniversities(); |
|||
return universities.stream() |
|||
.filter(u -> u.getId().equals(id)) |
|||
.findFirst() |
|||
.orElse(null); |
|||
} |
|||
|
|||
public static List<UniversityConfig> findUniversityByName(String name) { |
|||
List<UniversityConfig> universities = loadUniversities(); |
|||
List<UniversityConfig> result = new ArrayList<>(); |
|||
for (UniversityConfig uni : universities) { |
|||
if (uni.getName().contains(name)) { |
|||
result.add(uni); |
|||
} |
|||
} |
|||
return result; |
|||
} |
|||
|
|||
public static void createSampleConfig() { |
|||
ensureConfigDir(); |
|||
File configFile = new File(UNIVERSITIES_CONFIG); |
|||
|
|||
if (configFile.exists()) { |
|||
logger.info("示例配置已存在,跳过创建"); |
|||
return; |
|||
} |
|||
|
|||
List<UniversityConfig> sampleConfigs = new ArrayList<>(); |
|||
|
|||
UniversityConfig hnu = new UniversityConfig(); |
|||
hnu.setId("hnu"); |
|||
hnu.setName("湖南大学"); |
|||
hnu.setCode("10532"); |
|||
hnu.setProvince("湖南省"); |
|||
hnu.setBaseUrl("https://admission.hnu.edu.cn"); |
|||
|
|||
List<UniversityConfig.AdmissionPageConfig> hnuPages = new ArrayList<>(); |
|||
|
|||
UniversityConfig.AdmissionPageConfig hnu2024 = new UniversityConfig.AdmissionPageConfig(); |
|||
hnu2024.setYear("2024"); |
|||
hnu2024.setUrl("https://admission.hnu.edu.cn/info/1008/3001.htm"); |
|||
hnu2024.setDescription("2024年本科招生分数线"); |
|||
hnu2024.setTableSelector("table"); |
|||
hnuPages.add(hnu2024); |
|||
|
|||
hnu.setAdmissionPages(hnuPages); |
|||
sampleConfigs.add(hnu); |
|||
|
|||
UniversityConfig csu = new UniversityConfig(); |
|||
csu.setId("csu"); |
|||
csu.setName("中南大学"); |
|||
csu.setCode("10533"); |
|||
csu.setProvince("湖南省"); |
|||
csu.setBaseUrl("https://zhaosheng.csu.edu.cn"); |
|||
|
|||
List<UniversityConfig.AdmissionPageConfig> csuPages = new ArrayList<>(); |
|||
|
|||
UniversityConfig.AdmissionPageConfig csu2024 = new UniversityConfig.AdmissionPageConfig(); |
|||
csu2024.setYear("2024"); |
|||
csu2024.setUrl("https://zhaosheng.csu.edu.cn/xxfw/lnfs.htm"); |
|||
csu2024.setDescription("2024年本科招生分数线"); |
|||
csu2024.setTableSelector("table"); |
|||
csuPages.add(csu2024); |
|||
|
|||
csu.setAdmissionPages(csuPages); |
|||
sampleConfigs.add(csu); |
|||
|
|||
saveUniversities(sampleConfigs); |
|||
logger.info("创建示例配置文件成功: {}", UNIVERSITIES_CONFIG); |
|||
} |
|||
} |
|||
@ -0,0 +1,219 @@ |
|||
package com.hnu.crawler.model; |
|||
|
|||
import java.time.LocalDateTime; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
/**
 * One admission record: university, major, plan count, score and rank figures,
 * year, batch, remarks, crawl time and source URL.
 *
 * <p>A plain mutable bean. {@link #getHeaders()} and {@link #toCsvRow()} produce
 * the column titles and the values in the same order.
 */
public class AdmissionInfo {

    /** Column titles, in the order in which {@link #toCsvRow()} emits the values. */
    private static final String[] HEADER_NAMES = {
        "院校名称", "院校代码", "省份", "科类", "专业名称", "专业代码",
        "计划数", "最低分", "最高分", "平均分", "最低位次", "最高位次",
        "年份", "批次", "备注", "爬取时间", "来源URL",
    };

    private String universityName;
    private String universityCode;
    private String province;
    private String category;
    private String majorName;
    private String majorCode;
    private Integer planCount;
    private Double minScore;
    private Double maxScore;
    private Double avgScore;
    private Integer minRank;
    private Integer maxRank;
    private String year;
    private String batch;
    private String remarks;
    private LocalDateTime crawlTime;
    private String sourceUrl;

    /** Creates an empty record whose crawl time is the current local date-time. */
    public AdmissionInfo() {
        crawlTime = LocalDateTime.now();
    }

    public String getUniversityName() { return universityName; }

    public void setUniversityName(String universityName) { this.universityName = universityName; }

    public String getUniversityCode() { return universityCode; }

    public void setUniversityCode(String universityCode) { this.universityCode = universityCode; }

    public String getProvince() { return province; }

    public void setProvince(String province) { this.province = province; }

    public String getCategory() { return category; }

    public void setCategory(String category) { this.category = category; }

    public String getMajorName() { return majorName; }

    public void setMajorName(String majorName) { this.majorName = majorName; }

    public String getMajorCode() { return majorCode; }

    public void setMajorCode(String majorCode) { this.majorCode = majorCode; }

    public Integer getPlanCount() { return planCount; }

    public void setPlanCount(Integer planCount) { this.planCount = planCount; }

    public Double getMinScore() { return minScore; }

    public void setMinScore(Double minScore) { this.minScore = minScore; }

    public Double getMaxScore() { return maxScore; }

    public void setMaxScore(Double maxScore) { this.maxScore = maxScore; }

    public Double getAvgScore() { return avgScore; }

    public void setAvgScore(Double avgScore) { this.avgScore = avgScore; }

    public Integer getMinRank() { return minRank; }

    public void setMinRank(Integer minRank) { this.minRank = minRank; }

    public Integer getMaxRank() { return maxRank; }

    public void setMaxRank(Integer maxRank) { this.maxRank = maxRank; }

    public String getYear() { return year; }

    public void setYear(String year) { this.year = year; }

    public String getBatch() { return batch; }

    public void setBatch(String batch) { this.batch = batch; }

    public String getRemarks() { return remarks; }

    public void setRemarks(String remarks) { this.remarks = remarks; }

    public LocalDateTime getCrawlTime() { return crawlTime; }

    public void setCrawlTime(LocalDateTime crawlTime) { this.crawlTime = crawlTime; }

    public String getSourceUrl() { return sourceUrl; }

    public void setSourceUrl(String sourceUrl) { this.sourceUrl = sourceUrl; }

    /**
     * @return a new, modifiable list with the 17 column titles
     */
    public static List<String> getHeaders() {
        List<String> titles = new ArrayList<>();
        for (String title : HEADER_NAMES) {
            titles.add(title);
        }
        return titles;
    }

    /**
     * @return a new list with this record's values in header order; an unset
     *         ({@code null}) field becomes an empty string, every other value
     *         its {@code toString()} form
     */
    public List<String> toCsvRow() {
        Object[] values = {
            universityName, universityCode, province, category, majorName, majorCode,
            planCount, minScore, maxScore, avgScore, minRank, maxRank,
            year, batch, remarks, crawlTime, sourceUrl,
        };
        List<String> cells = new ArrayList<>();
        for (Object value : values) {
            cells.add(textOrEmpty(value));
        }
        return cells;
    }

    /** Returns {@code value.toString()}, or the empty string for {@code null}. */
    private static String textOrEmpty(Object value) {
        if (value == null) {
            return "";
        }
        return value.toString();
    }

    /** Short summary with university name, major name, year and minimum score. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("AdmissionInfo{");
        text.append("universityName='").append(universityName).append('\'');
        text.append(", majorName='").append(majorName).append('\'');
        text.append(", year='").append(year).append('\'');
        text.append(", minScore=").append(minScore);
        text.append('}');
        return text.toString();
    }
}
|||
@ -0,0 +1,125 @@ |
|||
package com.hnu.crawler.model; |
|||
|
|||
import java.util.List; |
|||
|
|||
/**
 * Plain mutable bean describing one university entry: identity fields, a base URL,
 * the list of admission pages and an {@code enabled} switch that defaults to {@code true}.
 */
public class UniversityConfig {
    private String id;
    private String name;
    private String code;
    private String province;
    private String baseUrl;
    private List<AdmissionPageConfig> admissionPages;
    /** New instances start enabled. */
    private boolean enabled = true;

    /** Creates an enabled configuration with every other field unset. */
    public UniversityConfig() {
    }

    /** @return the identifier, or {@code null} if unset */
    public String getId() {
        return this.id;
    }

    /** @param id the identifier to store */
    public void setId(final String id) {
        this.id = id;
    }

    /** @return the university name, or {@code null} if unset */
    public String getName() {
        return this.name;
    }

    /** @param name the university name to store */
    public void setName(final String name) {
        this.name = name;
    }

    /** @return the university code, or {@code null} if unset */
    public String getCode() {
        return this.code;
    }

    /** @param code the university code to store */
    public void setCode(final String code) {
        this.code = code;
    }

    /** @return the province, or {@code null} if unset */
    public String getProvince() {
        return this.province;
    }

    /** @param province the province to store */
    public void setProvince(final String province) {
        this.province = province;
    }

    /** @return the base URL, or {@code null} if unset */
    public String getBaseUrl() {
        return this.baseUrl;
    }

    /** @param baseUrl the base URL to store */
    public void setBaseUrl(final String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /** @return the stored page list itself (no copy), or {@code null} if unset */
    public List<AdmissionPageConfig> getAdmissionPages() {
        return this.admissionPages;
    }

    /** @param admissionPages the page list to store; kept by reference */
    public void setAdmissionPages(final List<AdmissionPageConfig> admissionPages) {
        this.admissionPages = admissionPages;
    }

    /** @return whether this entry is enabled */
    public boolean isEnabled() {
        return this.enabled;
    }

    /** @param enabled the new enabled state */
    public void setEnabled(final boolean enabled) {
        this.enabled = enabled;
    }

    /**
     * Plain mutable bean describing one admission page: year, URL, description,
     * table selector and an {@code enabled} switch that defaults to {@code true}.
     */
    public static class AdmissionPageConfig {
        private String year;
        private String url;
        private String description;
        private String tableSelector;
        /** New instances start enabled. */
        private boolean enabled = true;

        /** Creates an enabled page configuration with every other field unset. */
        public AdmissionPageConfig() {
        }

        /** @return the year, or {@code null} if unset */
        public String getYear() {
            return this.year;
        }

        /** @param year the year to store */
        public void setYear(final String year) {
            this.year = year;
        }

        /** @return the page URL, or {@code null} if unset */
        public String getUrl() {
            return this.url;
        }

        /** @param url the page URL to store */
        public void setUrl(final String url) {
            this.url = url;
        }

        /** @return the description, or {@code null} if unset */
        public String getDescription() {
            return this.description;
        }

        /** @param description the description to store */
        public void setDescription(final String description) {
            this.description = description;
        }

        /** @return the table selector, or {@code null} if unset */
        public String getTableSelector() {
            return this.tableSelector;
        }

        /** @param tableSelector the table selector to store */
        public void setTableSelector(final String tableSelector) {
            this.tableSelector = tableSelector;
        }

        /** @return whether this page is enabled */
        public boolean isEnabled() {
            return this.enabled;
        }

        /** @param enabled the new enabled state */
        public void setEnabled(final boolean enabled) {
            this.enabled = enabled;
        }
    }
}
|||
@ -0,0 +1,159 @@ |
|||
package com.hnu.crawler.parser; |
|||
|
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class AdmissionParser { |
|||
private static final Logger logger = LoggerFactory.getLogger(AdmissionParser.class); |
|||
|
|||
public static List<AdmissionInfo> parseTable(String html, String sourceUrl, String universityName, String year) { |
|||
List<AdmissionInfo> infoList = new ArrayList<>(); |
|||
|
|||
try { |
|||
Document doc = Jsoup.parse(html); |
|||
Elements tables = doc.select("table"); |
|||
|
|||
if (tables.isEmpty()) { |
|||
logger.warn("未找到表格数据"); |
|||
return infoList; |
|||
} |
|||
|
|||
for (Element table : tables) { |
|||
Elements rows = table.select("tr"); |
|||
if (rows.size() <= 1) { |
|||
continue; |
|||
} |
|||
|
|||
Elements headerRow = rows.first().select("th, td"); |
|||
List<String> headers = new ArrayList<>(); |
|||
for (Element header : headerRow) { |
|||
headers.add(header.text().trim()); |
|||
} |
|||
|
|||
for (int i = 1; i < rows.size(); i++) { |
|||
Element row = rows.get(i); |
|||
Elements cells = row.select("td"); |
|||
|
|||
if (cells.size() < headers.size()) { |
|||
continue; |
|||
} |
|||
|
|||
AdmissionInfo info = new AdmissionInfo(); |
|||
info.setUniversityName(universityName); |
|||
info.setYear(year); |
|||
info.setSourceUrl(sourceUrl); |
|||
|
|||
for (int j = 0; j < cells.size() && j < headers.size(); j++) { |
|||
String header = headers.get(j); |
|||
String value = cells.get(j).text().trim(); |
|||
|
|||
parseField(info, header, value); |
|||
} |
|||
|
|||
if (info.getMajorName() != null || info.getMinScore() != null) { |
|||
infoList.add(info); |
|||
} |
|||
} |
|||
} |
|||
|
|||
logger.info("解析到 {} 条招生信息", infoList.size()); |
|||
} catch (Exception e) { |
|||
logger.error("解析HTML失败", e); |
|||
} |
|||
|
|||
return infoList; |
|||
} |
|||
|
|||
private static void parseField(AdmissionInfo info, String header, String value) { |
|||
if (value == null || value.isEmpty()) { |
|||
return; |
|||
} |
|||
|
|||
header = header.toLowerCase(); |
|||
|
|||
if (header.contains("专业") && (header.contains("名称") || header.contains("专业名"))) { |
|||
info.setMajorName(value); |
|||
} else if (header.contains("专业") && header.contains("代码")) { |
|||
info.setMajorCode(value); |
|||
} else if (header.contains("院校") && header.contains("代码")) { |
|||
info.setUniversityCode(value); |
|||
} else if (header.contains("省份")) { |
|||
info.setProvince(value); |
|||
} else if (header.contains("科类") || header.contains("文理") || header.contains("科目")) { |
|||
info.setCategory(value); |
|||
} else if (header.contains("计划") || header.contains("人数")) { |
|||
try { |
|||
info.setPlanCount(Integer.parseInt(value)); |
|||
} catch (NumberFormatException e) { |
|||
} |
|||
} else if (header.contains("最低") && header.contains("分")) { |
|||
try { |
|||
info.setMinScore(Double.parseDouble(value)); |
|||
} catch (NumberFormatException e) { |
|||
} |
|||
} else if (header.contains("最高") && header.contains("分")) { |
|||
try { |
|||
info.setMaxScore(Double.parseDouble(value)); |
|||
} catch (NumberFormatException e) { |
|||
} |
|||
} else if (header.contains("平均") && header.contains("分")) { |
|||
try { |
|||
info.setAvgScore(Double.parseDouble(value)); |
|||
} catch (NumberFormatException e) { |
|||
} |
|||
} else if (header.contains("最低") && header.contains("位次")) { |
|||
try { |
|||
info.setMinRank(Integer.parseInt(value)); |
|||
} catch (NumberFormatException e) { |
|||
} |
|||
} else if (header.contains("最高") && header.contains("位次")) { |
|||
try { |
|||
info.setMaxRank(Integer.parseInt(value)); |
|||
} catch (NumberFormatException e) { |
|||
} |
|||
} else if (header.contains("批次")) { |
|||
info.setBatch(value); |
|||
} else if (header.contains("备注") || header.contains("说明")) { |
|||
info.setRemarks(value); |
|||
} |
|||
} |
|||
|
|||
public static List<String> extractUrls(String html, String baseUrl) { |
|||
List<String> urls = new ArrayList<>(); |
|||
try { |
|||
Document doc = Jsoup.parse(html); |
|||
Elements links = doc.select("a[href]"); |
|||
|
|||
for (Element link : links) { |
|||
String href = link.attr("abs:href"); |
|||
if (href.isEmpty()) { |
|||
href = link.attr("href"); |
|||
if (!href.startsWith("http") && baseUrl != null) { |
|||
if (href.startsWith("/")) { |
|||
href = baseUrl + href; |
|||
} else { |
|||
href = baseUrl + "/" + href; |
|||
} |
|||
} |
|||
} |
|||
|
|||
if (!href.isEmpty() && (href.contains("zhaosheng") || href.contains("zs") || |
|||
href.contains("admission") || href.contains("fenshu") || |
|||
href.contains("score") || href.contains("lishi"))) { |
|||
urls.add(href); |
|||
} |
|||
} |
|||
} catch (Exception e) { |
|||
logger.error("提取URL失败", e); |
|||
} |
|||
return urls; |
|||
} |
|||
} |
|||
@ -0,0 +1,216 @@ |
|||
package com.hnu.crawler.query; |
|||
|
|||
import com.fasterxml.jackson.core.type.TypeReference; |
|||
import com.fasterxml.jackson.databind.ObjectMapper; |
|||
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; |
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
import com.opencsv.CSVReader; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.io.FileReader; |
|||
import java.io.IOException; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Path; |
|||
import java.nio.file.Paths; |
|||
import java.util.ArrayList; |
|||
import java.util.Comparator; |
|||
import java.util.List; |
|||
import java.util.stream.Collectors; |
|||
|
|||
public class DataQuery { |
|||
private static final Logger logger = LoggerFactory.getLogger(DataQuery.class); |
|||
private static final ObjectMapper objectMapper = new ObjectMapper(); |
|||
private static final String DATA_DIR = "data"; |
|||
|
|||
static { |
|||
objectMapper.registerModule(new JavaTimeModule()); |
|||
} |
|||
|
|||
public static List<AdmissionInfo> loadAllData() { |
|||
List<AdmissionInfo> allData = new ArrayList<>(); |
|||
Path dataPath = Paths.get(DATA_DIR); |
|||
|
|||
if (!Files.exists(dataPath)) { |
|||
logger.warn("数据目录不存在: {}", DATA_DIR); |
|||
return allData; |
|||
} |
|||
|
|||
Path csvFile = dataPath.resolve("admission_info.csv"); |
|||
if (Files.exists(csvFile)) { |
|||
allData.addAll(loadFromCsv(csvFile.toString())); |
|||
} |
|||
|
|||
Path jsonFile = dataPath.resolve("admission_info.json"); |
|||
if (Files.exists(jsonFile)) { |
|||
allData.addAll(loadFromJson(jsonFile.toString())); |
|||
} |
|||
|
|||
return allData; |
|||
} |
|||
|
|||
public static List<AdmissionInfo> loadFromCsv(String filePath) { |
|||
List<AdmissionInfo> data = new ArrayList<>(); |
|||
try (CSVReader reader = new CSVReader(new FileReader(filePath))) { |
|||
List<String[]> rows = reader.readAll(); |
|||
if (rows.size() <= 1) { |
|||
return data; |
|||
} |
|||
|
|||
String[] headers = rows.get(0); |
|||
for (int i = 1; i < rows.size(); i++) { |
|||
String[] row = rows.get(i); |
|||
AdmissionInfo info = new AdmissionInfo(); |
|||
for (int j = 0; j < headers.length && j < row.length; j++) { |
|||
String header = headers[j]; |
|||
String value = row[j]; |
|||
setField(info, header, value); |
|||
} |
|||
data.add(info); |
|||
} |
|||
} catch (Exception e) { |
|||
logger.error("从CSV加载数据失败: {}", filePath, e); |
|||
} |
|||
return data; |
|||
} |
|||
|
|||
public static List<AdmissionInfo> loadFromJson(String filePath) { |
|||
try { |
|||
return objectMapper.readValue(Paths.get(filePath).toFile(), |
|||
new TypeReference<List<AdmissionInfo>>() {}); |
|||
} catch (IOException e) { |
|||
logger.error("从JSON加载数据失败: {}", filePath, e); |
|||
return new ArrayList<>(); |
|||
} |
|||
} |
|||
|
|||
private static void setField(AdmissionInfo info, String header, String value) { |
|||
if (value == null || value.isEmpty()) { |
|||
return; |
|||
} |
|||
|
|||
try { |
|||
switch (header) { |
|||
case "院校名称": |
|||
info.setUniversityName(value); |
|||
break; |
|||
case "院校代码": |
|||
info.setUniversityCode(value); |
|||
break; |
|||
case "省份": |
|||
info.setProvince(value); |
|||
break; |
|||
case "科类": |
|||
info.setCategory(value); |
|||
break; |
|||
case "专业名称": |
|||
info.setMajorName(value); |
|||
break; |
|||
case "专业代码": |
|||
info.setMajorCode(value); |
|||
break; |
|||
case "计划数": |
|||
info.setPlanCount(Integer.parseInt(value)); |
|||
break; |
|||
case "最低分": |
|||
info.setMinScore(Double.parseDouble(value)); |
|||
break; |
|||
case "最高分": |
|||
info.setMaxScore(Double.parseDouble(value)); |
|||
break; |
|||
case "平均分": |
|||
info.setAvgScore(Double.parseDouble(value)); |
|||
break; |
|||
case "最低位次": |
|||
info.setMinRank(Integer.parseInt(value)); |
|||
break; |
|||
case "最高位次": |
|||
info.setMaxRank(Integer.parseInt(value)); |
|||
break; |
|||
case "年份": |
|||
info.setYear(value); |
|||
break; |
|||
case "批次": |
|||
info.setBatch(value); |
|||
break; |
|||
case "备注": |
|||
info.setRemarks(value); |
|||
break; |
|||
} |
|||
} catch (NumberFormatException e) { |
|||
} |
|||
} |
|||
|
|||
public static List<AdmissionInfo> queryByUniversity(List<AdmissionInfo> data, String universityName) { |
|||
return data.stream() |
|||
.filter(info -> info.getUniversityName() != null && |
|||
info.getUniversityName().contains(universityName)) |
|||
.collect(Collectors.toList()); |
|||
} |
|||
|
|||
public static List<AdmissionInfo> queryByMajor(List<AdmissionInfo> data, String majorName) { |
|||
return data.stream() |
|||
.filter(info -> info.getMajorName() != null && |
|||
info.getMajorName().contains(majorName)) |
|||
.collect(Collectors.toList()); |
|||
} |
|||
|
|||
public static List<AdmissionInfo> queryByYear(List<AdmissionInfo> data, String year) { |
|||
return data.stream() |
|||
.filter(info -> info.getYear() != null && info.getYear().equals(year)) |
|||
.collect(Collectors.toList()); |
|||
} |
|||
|
|||
public static List<AdmissionInfo> queryByScoreRange(List<AdmissionInfo> data, |
|||
double minScore, double maxScore) { |
|||
return data.stream() |
|||
.filter(info -> info.getMinScore() != null && |
|||
info.getMinScore() >= minScore && info.getMinScore() <= maxScore) |
|||
.collect(Collectors.toList()); |
|||
} |
|||
|
|||
public static List<AdmissionInfo> sortByScore(List<AdmissionInfo> data, boolean ascending) { |
|||
return data.stream() |
|||
.sorted((a, b) -> { |
|||
Double scoreA = a.getMinScore(); |
|||
Double scoreB = b.getMinScore(); |
|||
|
|||
if (scoreA == null && scoreB == null) return 0; |
|||
if (scoreA == null) return 1; |
|||
if (scoreB == null) return -1; |
|||
|
|||
return ascending ? scoreA.compareTo(scoreB) : scoreB.compareTo(scoreA); |
|||
}) |
|||
.collect(Collectors.toList()); |
|||
} |
|||
|
|||
public static void printResults(List<AdmissionInfo> results) { |
|||
if (results.isEmpty()) { |
|||
System.out.println("未找到匹配的结果"); |
|||
return; |
|||
} |
|||
|
|||
System.out.println("\n查询结果 (共 " + results.size() + " 条):"); |
|||
System.out.println("=".repeat(120)); |
|||
System.out.printf("%-15s %-10s %-20s %-8s %-8s %-8s %s%n", |
|||
"院校", "年份", "专业", "最低分", "最高分", "平均分", "批次"); |
|||
System.out.println("-".repeat(120)); |
|||
|
|||
for (AdmissionInfo info : results) { |
|||
System.out.printf("%-15s %-10s %-20s %-8.1f %-8.1f %-8.1f %s%n", |
|||
truncate(info.getUniversityName(), 15), |
|||
info.getYear() != null ? info.getYear() : "", |
|||
truncate(info.getMajorName(), 20), |
|||
info.getMinScore() != null ? info.getMinScore() : 0, |
|||
info.getMaxScore() != null ? info.getMaxScore() : 0, |
|||
info.getAvgScore() != null ? info.getAvgScore() : 0, |
|||
info.getBatch() != null ? info.getBatch() : ""); |
|||
} |
|||
System.out.println("=".repeat(120)); |
|||
} |
|||
|
|||
private static String truncate(String str, int maxLen) { |
|||
if (str == null) return ""; |
|||
return str.length() > maxLen ? str.substring(0, maxLen - 2) + ".." : str; |
|||
} |
|||
} |
|||
@ -0,0 +1,114 @@ |
|||
package com.hnu.crawler.storage; |
|||
|
|||
import com.fasterxml.jackson.databind.ObjectMapper; |
|||
import com.fasterxml.jackson.databind.SerializationFeature; |
|||
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; |
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
import com.opencsv.CSVWriter; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.io.File; |
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Path; |
|||
import java.nio.file.Paths; |
|||
import java.util.List; |
|||
|
|||
public class DataStorage { |
|||
private static final Logger logger = LoggerFactory.getLogger(DataStorage.class); |
|||
private static final ObjectMapper objectMapper = new ObjectMapper(); |
|||
|
|||
static { |
|||
objectMapper.registerModule(new JavaTimeModule()); |
|||
objectMapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); |
|||
objectMapper.enable(SerializationFeature.INDENT_OUTPUT); |
|||
} |
|||
|
|||
public static void ensureDirectoryExists(String directoryPath) { |
|||
try { |
|||
Path path = Paths.get(directoryPath); |
|||
if (!Files.exists(path)) { |
|||
Files.createDirectories(path); |
|||
logger.info("创建目录: {}", directoryPath); |
|||
} |
|||
} catch (IOException e) { |
|||
logger.error("创建目录失败: {}", directoryPath, e); |
|||
} |
|||
} |
|||
|
|||
public static void saveToCsv(List<AdmissionInfo> infoList, String filePath) { |
|||
if (infoList == null || infoList.isEmpty()) { |
|||
logger.warn("数据为空,跳过CSV保存"); |
|||
return; |
|||
} |
|||
|
|||
ensureDirectoryExists(new File(filePath).getParent()); |
|||
|
|||
try (CSVWriter writer = new CSVWriter(new FileWriter(filePath, true))) { |
|||
File file = new File(filePath); |
|||
if (!file.exists() || file.length() == 0) { |
|||
writer.writeNext(AdmissionInfo.getHeaders().toArray(new String[0])); |
|||
} |
|||
|
|||
for (AdmissionInfo info : infoList) { |
|||
writer.writeNext(info.toCsvRow().toArray(new String[0])); |
|||
} |
|||
|
|||
logger.info("成功保存 {} 条数据到CSV: {}", infoList.size(), filePath); |
|||
} catch (IOException e) { |
|||
logger.error("保存CSV失败: {}", filePath, e); |
|||
} |
|||
} |
|||
|
|||
public static void saveToJson(List<AdmissionInfo> infoList, String filePath) { |
|||
if (infoList == null || infoList.isEmpty()) { |
|||
logger.warn("数据为空,跳过JSON保存"); |
|||
return; |
|||
} |
|||
|
|||
ensureDirectoryExists(new File(filePath).getParent()); |
|||
|
|||
try { |
|||
List<AdmissionInfo> existingData = null; |
|||
File file = new File(filePath); |
|||
if (file.exists()) { |
|||
existingData = objectMapper.readValue(file, |
|||
objectMapper.getTypeFactory().constructCollectionType(List.class, AdmissionInfo.class)); |
|||
} |
|||
|
|||
if (existingData != null) { |
|||
existingData.addAll(infoList); |
|||
objectMapper.writeValue(file, existingData); |
|||
} else { |
|||
objectMapper.writeValue(file, infoList); |
|||
} |
|||
|
|||
logger.info("成功保存 {} 条数据到JSON: {}", infoList.size(), filePath); |
|||
} catch (IOException e) { |
|||
logger.error("保存JSON失败: {}", filePath, e); |
|||
} |
|||
} |
|||
|
|||
public static void saveToCsvOverwrite(List<AdmissionInfo> infoList, String filePath) { |
|||
if (infoList == null || infoList.isEmpty()) { |
|||
logger.warn("数据为空,跳过CSV保存"); |
|||
return; |
|||
} |
|||
|
|||
ensureDirectoryExists(new File(filePath).getParent()); |
|||
|
|||
try (CSVWriter writer = new CSVWriter(new FileWriter(filePath))) { |
|||
writer.writeNext(AdmissionInfo.getHeaders().toArray(new String[0])); |
|||
|
|||
for (AdmissionInfo info : infoList) { |
|||
writer.writeNext(info.toCsvRow().toArray(new String[0])); |
|||
} |
|||
|
|||
logger.info("成功覆盖保存 {} 条数据到CSV: {}", infoList.size(), filePath); |
|||
} catch (IOException e) { |
|||
logger.error("保存CSV失败: {}", filePath, e); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,75 @@ |
|||
package com.hnu.crawler.strategy; |
|||
|
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
import com.hnu.crawler.util.HttpClientUtil; |
|||
import com.hnu.crawler.view.ConsoleView; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class BlogStrategy implements CrawlerStrategy { |
|||
private final ConsoleView view; |
|||
|
|||
public BlogStrategy(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public List<AdmissionInfo> crawl(String url) { |
|||
List<AdmissionInfo> results = new ArrayList<>(); |
|||
|
|||
try { |
|||
String html = HttpClientUtil.fetchHtml(url); |
|||
if (html == null) { |
|||
return results; |
|||
} |
|||
|
|||
Document doc = Jsoup.parse(html); |
|||
|
|||
Elements blogItems = doc.select(".post, .blog-post, .entry, article"); |
|||
|
|||
for (Element item : blogItems) { |
|||
String title = item.select("h1, h2, .post-title, .entry-title, .title").first() != null ? |
|||
item.select("h1, h2, .post-title, .entry-title, .title").first().text() : ""; |
|||
String articleUrl = item.select("a").first() != null ? |
|||
item.select("a").first().attr("abs:href") : url; |
|||
String author = item.select(".author, .post-author, .byline").text(); |
|||
|
|||
if (!title.isEmpty()) { |
|||
AdmissionInfo info = new AdmissionInfo(); |
|||
info.setUniversityName("博客文章"); |
|||
info.setMajorName(title); |
|||
info.setYear(java.time.LocalDate.now().getYear() + ""); |
|||
info.setSourceUrl(articleUrl); |
|||
if (!author.isEmpty()) { |
|||
info.setRemarks("作者: " + author); |
|||
} |
|||
results.add(info); |
|||
} |
|||
} |
|||
|
|||
view.printInfo("解析到 " + results.size() + " 篇博客文章"); |
|||
|
|||
} catch (Exception e) { |
|||
view.printError("爬取博客失败: " + e.getMessage()); |
|||
} |
|||
|
|||
return results; |
|||
} |
|||
|
|||
@Override |
|||
public List<AdmissionInfo> crawl(java.util.Scanner scanner) { |
|||
view.printInfo("请输入博客网站URL: "); |
|||
String url = scanner.nextLine().trim(); |
|||
return crawl(url); |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "博客网站爬取"; |
|||
} |
|||
} |
|||
@ -0,0 +1,117 @@ |
|||
package com.hnu.crawler.strategy; |
|||
|
|||
import com.hnu.crawler.config.ConfigManager; |
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
import com.hnu.crawler.model.UniversityConfig; |
|||
import com.hnu.crawler.parser.AdmissionParser; |
|||
import com.hnu.crawler.util.HttpClientUtil; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.Scanner; |
|||
|
|||
public class ConfigBasedCrawler implements CrawlerStrategy { |
|||
private static final Logger logger = LoggerFactory.getLogger(ConfigBasedCrawler.class); |
|||
|
|||
@Override |
|||
public List<AdmissionInfo> crawl(Scanner scanner) { |
|||
System.out.println("\n=== 从配置文件批量爬取 ==="); |
|||
List<UniversityConfig> universities = ConfigManager.loadUniversities(); |
|||
|
|||
if (universities.isEmpty()) { |
|||
System.out.println("未找到配置的高校,请先编辑 config/universities.json 文件"); |
|||
return List.of(); |
|||
} |
|||
|
|||
System.out.println("已配置的高校列表:"); |
|||
for (int i = 0; i < universities.size(); i++) { |
|||
UniversityConfig uni = universities.get(i); |
|||
System.out.printf(" %d. %s (%s)%n", i + 1, uni.getName(), uni.getProvince()); |
|||
} |
|||
|
|||
System.out.print("\n请选择要爬取的高校编号(0表示全部): "); |
|||
int choice; |
|||
try { |
|||
choice = Integer.parseInt(scanner.nextLine().trim()); |
|||
} catch (NumberFormatException e) { |
|||
System.out.println("输入无效!"); |
|||
return List.of(); |
|||
} |
|||
|
|||
List<UniversityConfig> toCrawl = new ArrayList<>(); |
|||
if (choice == 0) { |
|||
toCrawl = universities; |
|||
} else if (choice > 0 && choice <= universities.size()) { |
|||
toCrawl.add(universities.get(choice - 1)); |
|||
} else { |
|||
System.out.println("无效的选择!"); |
|||
return List.of(); |
|||
} |
|||
|
|||
System.out.print("请输入请求间隔(毫秒,默认2000): "); |
|||
String delayStr = scanner.nextLine().trim(); |
|||
long delay = delayStr.isEmpty() ? 2000 : Long.parseLong(delayStr); |
|||
|
|||
List<AdmissionInfo> allResults = new ArrayList<>(); |
|||
for (UniversityConfig uni : toCrawl) { |
|||
if (!uni.isEnabled()) { |
|||
System.out.printf("跳过已禁用的高校: %s%n", uni.getName()); |
|||
continue; |
|||
} |
|||
|
|||
System.out.printf("%n正在处理: %s%n", uni.getName()); |
|||
|
|||
for (UniversityConfig.AdmissionPageConfig page : uni.getAdmissionPages()) { |
|||
if (!page.isEnabled()) { |
|||
System.out.printf(" 跳过已禁用的页面: %s%n", page.getDescription()); |
|||
continue; |
|||
} |
|||
|
|||
System.out.printf(" 爬取 %s: %s%n", page.getYear(), page.getDescription()); |
|||
|
|||
String html = HttpClientUtil.fetchHtml(page.getUrl()); |
|||
if (html != null) { |
|||
List<AdmissionInfo> infoList = AdmissionParser.parseTable( |
|||
html, page.getUrl(), uni.getName(), page.getYear()); |
|||
|
|||
for (AdmissionInfo info : infoList) { |
|||
info.setUniversityCode(uni.getCode()); |
|||
info.setProvince(uni.getProvince()); |
|||
} |
|||
|
|||
if (!infoList.isEmpty()) { |
|||
allResults.addAll(infoList); |
|||
System.out.printf(" 成功获取 %d 条数据%n", infoList.size()); |
|||
} else { |
|||
System.out.println(" 未解析到数据"); |
|||
} |
|||
} else { |
|||
System.out.println(" 获取页面失败"); |
|||
} |
|||
|
|||
try { |
|||
Thread.sleep(delay); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
} |
|||
} |
|||
} |
|||
|
|||
System.out.printf("%n批量爬取完成!共获取 %d 条数据%n", allResults.size()); |
|||
return allResults; |
|||
} |
|||
|
|||
@Override |
|||
public List<AdmissionInfo> crawl(String url) { |
|||
System.out.println("配置文件爬取策略需要从配置文件读取URL列表"); |
|||
System.out.println("请使用菜单模式或配置文件进行批量爬取"); |
|||
return List.of(); |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "配置文件批量爬取"; |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package com.hnu.crawler.strategy; |
|||
|
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
import java.util.List; |
|||
import java.util.Scanner; |
|||
|
|||
public interface CrawlerStrategy { |
|||
List<AdmissionInfo> crawl(Scanner scanner); |
|||
List<AdmissionInfo> crawl(String url); |
|||
String getStrategyName(); |
|||
} |
|||
@ -0,0 +1,72 @@ |
|||
package com.hnu.crawler.strategy; |
|||
|
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
import com.hnu.crawler.util.HttpClientUtil; |
|||
import com.hnu.crawler.view.ConsoleView; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class NewsStrategy implements CrawlerStrategy { |
|||
private final ConsoleView view; |
|||
|
|||
public NewsStrategy(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public List<AdmissionInfo> crawl(String url) { |
|||
List<AdmissionInfo> results = new ArrayList<>(); |
|||
|
|||
try { |
|||
String html = HttpClientUtil.fetchHtml(url); |
|||
if (html == null) { |
|||
return results; |
|||
} |
|||
|
|||
Document doc = Jsoup.parse(html); |
|||
|
|||
Elements newsItems = doc.select("article, .news-item, .list-item, .item, .article-item"); |
|||
|
|||
for (Element item : newsItems) { |
|||
String title = item.select("h2, h3, .title, a").first() != null ? |
|||
item.select("h2, h3, .title, a").first().text() : ""; |
|||
String articleUrl = item.select("a").first() != null ? |
|||
item.select("a").first().attr("abs:href") : url; |
|||
String summary = item.select(".summary, .description, p").text(); |
|||
|
|||
if (!title.isEmpty()) { |
|||
AdmissionInfo info = new AdmissionInfo(); |
|||
info.setUniversityName("新闻网站"); |
|||
info.setMajorName(title); |
|||
info.setYear(java.time.LocalDate.now().getYear() + ""); |
|||
info.setSourceUrl(articleUrl); |
|||
results.add(info); |
|||
} |
|||
} |
|||
|
|||
view.printInfo("解析到 " + results.size() + " 条新闻"); |
|||
|
|||
} catch (Exception e) { |
|||
view.printError("爬取新闻失败: " + e.getMessage()); |
|||
} |
|||
|
|||
return results; |
|||
} |
|||
|
|||
@Override |
|||
public List<AdmissionInfo> crawl(java.util.Scanner scanner) { |
|||
view.printInfo("请输入新闻网站URL: "); |
|||
String url = scanner.nextLine().trim(); |
|||
return crawl(url); |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "新闻网站爬取"; |
|||
} |
|||
} |
|||
@ -0,0 +1,67 @@ |
|||
package com.hnu.crawler.strategy; |
|||
|
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
import com.hnu.crawler.parser.AdmissionParser; |
|||
import com.hnu.crawler.util.HttpClientUtil; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.util.List; |
|||
import java.util.Scanner; |
|||
|
|||
public class SinglePageCrawler implements CrawlerStrategy { |
|||
private static final Logger logger = LoggerFactory.getLogger(SinglePageCrawler.class); |
|||
|
|||
@Override |
|||
public List<AdmissionInfo> crawl(Scanner scanner) { |
|||
System.out.println("\n=== 单页面爬取 ==="); |
|||
System.out.print("请输入目标URL: "); |
|||
String url = scanner.nextLine().trim(); |
|||
|
|||
System.out.print("请输入院校名称: "); |
|||
String universityName = scanner.nextLine().trim(); |
|||
|
|||
System.out.print("请输入年份: "); |
|||
String year = scanner.nextLine().trim(); |
|||
|
|||
System.out.print("请输入请求间隔(毫秒,默认1000): "); |
|||
String delayStr = scanner.nextLine().trim(); |
|||
long delay = delayStr.isEmpty() ? 1000 : Long.parseLong(delayStr); |
|||
|
|||
logger.info("开始爬取: {}", url); |
|||
|
|||
String html = HttpClientUtil.fetchHtml(url); |
|||
if (html != null) { |
|||
List<AdmissionInfo> infoList = AdmissionParser.parseTable(html, url, universityName, year); |
|||
|
|||
if (!infoList.isEmpty()) { |
|||
System.out.println("爬取完成!共获取 " + infoList.size() + " 条数据"); |
|||
return infoList; |
|||
} else { |
|||
System.out.println("未解析到数据,请检查页面结构"); |
|||
} |
|||
} else { |
|||
System.out.println("获取页面失败,请检查URL是否正确"); |
|||
} |
|||
return List.of(); |
|||
} |
|||
|
|||
@Override |
|||
public List<AdmissionInfo> crawl(String url) { |
|||
logger.info("开始爬取: {}", url); |
|||
String html = HttpClientUtil.fetchHtml(url); |
|||
if (html != null) { |
|||
List<AdmissionInfo> infoList = AdmissionParser.parseTable(html, url, "未知院校", ""); |
|||
if (!infoList.isEmpty()) { |
|||
System.out.println("爬取完成!共获取 " + infoList.size() + " 条数据"); |
|||
return infoList; |
|||
} |
|||
} |
|||
return List.of(); |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "单页面爬取"; |
|||
} |
|||
} |
|||
@ -0,0 +1,108 @@ |
|||
package com.hnu.crawler.strategy; |
|||
|
|||
import com.hnu.crawler.model.AdmissionInfo; |
|||
import com.hnu.crawler.util.HttpClientUtil; |
|||
import com.hnu.crawler.view.ConsoleView; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class UniversityStrategy implements CrawlerStrategy { |
|||
private final ConsoleView view; |
|||
|
|||
public UniversityStrategy(ConsoleView view) { |
|||
this.view = view; |
|||
} |
|||
|
|||
@Override |
|||
public List<AdmissionInfo> crawl(String url) { |
|||
List<AdmissionInfo> results = new ArrayList<>(); |
|||
|
|||
try { |
|||
String html = HttpClientUtil.fetchHtml(url); |
|||
if (html == null) { |
|||
return results; |
|||
} |
|||
|
|||
Document doc = Jsoup.parse(html); |
|||
|
|||
String universityName = doc.select("title").text(); |
|||
if (universityName.length() > 50) { |
|||
universityName = universityName.substring(0, 50); |
|||
} |
|||
|
|||
Elements tables = doc.select("table"); |
|||
|
|||
for (Element table : tables) { |
|||
Elements rows = table.select("tr"); |
|||
|
|||
for (int i = 1; i < rows.size(); i++) { |
|||
Element row = rows.get(i); |
|||
Elements cells = row.select("td, th"); |
|||
|
|||
if (cells.size() >= 2) { |
|||
AdmissionInfo info = new AdmissionInfo(); |
|||
info.setUniversityName(universityName); |
|||
info.setMajorName(cells.get(0).text()); |
|||
|
|||
if (cells.size() > 1) { |
|||
try { |
|||
String scoreStr = cells.get(1).text().replaceAll("[^0-9.]", ""); |
|||
if (!scoreStr.isEmpty()) { |
|||
info.setMinScore(Double.parseDouble(scoreStr)); |
|||
} |
|||
} catch (Exception ignored) { |
|||
} |
|||
} |
|||
|
|||
if (cells.size() > 2) { |
|||
info.setYear(cells.get(2).text().replaceAll("[^0-9]", "")); |
|||
} |
|||
|
|||
info.setSourceUrl(url); |
|||
results.add(info); |
|||
} |
|||
} |
|||
} |
|||
|
|||
if (results.isEmpty()) { |
|||
Elements listItems = doc.select(".list-item, .news-item, .notice-item"); |
|||
for (Element item : listItems) { |
|||
String title = item.select("a, .title").text(); |
|||
String link = item.select("a").attr("abs:href"); |
|||
|
|||
if (!title.isEmpty()) { |
|||
AdmissionInfo info = new AdmissionInfo(); |
|||
info.setUniversityName(universityName); |
|||
info.setMajorName(title); |
|||
info.setSourceUrl(link.isEmpty() ? url : link); |
|||
results.add(info); |
|||
} |
|||
} |
|||
} |
|||
|
|||
view.printInfo("解析到 " + results.size() + " 条招生信息"); |
|||
|
|||
} catch (Exception e) { |
|||
view.printError("爬取高校网站失败: " + e.getMessage()); |
|||
} |
|||
|
|||
return results; |
|||
} |
|||
|
|||
@Override |
|||
public List<AdmissionInfo> crawl(java.util.Scanner scanner) { |
|||
view.printInfo("请输入高校招生网站URL: "); |
|||
String url = scanner.nextLine().trim(); |
|||
return crawl(url); |
|||
} |
|||
|
|||
@Override |
|||
public String getStrategyName() { |
|||
return "高校网站爬取"; |
|||
} |
|||
} |
|||
@ -0,0 +1,52 @@ |
|||
package com.hnu.crawler.util; |
|||
|
|||
import org.apache.hc.client5.http.classic.methods.HttpGet; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; |
|||
import org.apache.hc.client5.http.impl.classic.HttpClients; |
|||
import org.apache.hc.core5.http.ParseException; |
|||
import org.apache.hc.core5.http.io.entity.EntityUtils; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.io.IOException; |
|||
|
|||
public class HttpClientUtil { |
|||
private static final Logger logger = LoggerFactory.getLogger(HttpClientUtil.class); |
|||
private static final int TIMEOUT = 30000; |
|||
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"; |
|||
|
|||
public static String fetchHtml(String url) { |
|||
try (CloseableHttpClient httpClient = HttpClients.createDefault()) { |
|||
HttpGet request = new HttpGet(url); |
|||
request.setHeader("User-Agent", USER_AGENT); |
|||
request.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); |
|||
request.setHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8"); |
|||
request.setHeader("Connection", "keep-alive"); |
|||
|
|||
try (CloseableHttpResponse response = httpClient.execute(request)) { |
|||
int statusCode = response.getCode(); |
|||
if (statusCode == 200) { |
|||
String html = EntityUtils.toString(response.getEntity(), "UTF-8"); |
|||
logger.info("成功获取页面: {}", url); |
|||
return html; |
|||
} else { |
|||
logger.error("请求失败,状态码: {}, URL: {}", statusCode, url); |
|||
return null; |
|||
} |
|||
} |
|||
} catch (IOException | ParseException e) { |
|||
logger.error("获取页面失败: {}", url, e); |
|||
return null; |
|||
} |
|||
} |
|||
|
|||
public static void sleep(long milliseconds) { |
|||
try { |
|||
Thread.sleep(milliseconds); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
logger.warn("睡眠被中断", e); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,33 @@ |
|||
package com.hnu.crawler.view; |
|||
|
|||
import java.util.Scanner; |
|||
|
|||
/**
 * Console front end: reads lines from standard input and writes messages to
 * standard output, optionally wrapped in ANSI colour escape sequences.
 */
public class ConsoleView {
    /** Escape sequence that ends a coloured span. */
    private static final String RESET = "\u001B[0m";
    private static final String GREEN = "\u001B[32m";
    private static final String RED = "\u001B[31m";
    private static final String BLUE = "\u001B[34m";

    private final Scanner scanner = new Scanner(System.in);

    /**
     * Prints the {@code "> "} prompt without a line break and reads the next
     * input line.
     *
     * @return the line read from standard input
     */
    public String readLine() {
        System.out.print("> ");
        final String line = scanner.nextLine();
        return line;
    }

    /** Prints {@code msg} in green, followed by a line break. */
    public void printSuccess(String msg) {
        printColored(GREEN, msg);
    }

    /** Prints {@code msg} in red, followed by a line break. */
    public void printError(String msg) {
        printColored(RED, msg);
    }

    /** Prints {@code msg} in blue, followed by a line break. */
    public void printInfo(String msg) {
        printColored(BLUE, msg);
    }

    /** Prints {@code msg} unchanged, followed by a line break. */
    public void print(String msg) {
        System.out.println(msg);
    }

    /** Writes {@code msg} between the given colour code and the reset code. */
    private static void printColored(String colorCode, String msg) {
        System.out.println(colorCode + msg + RESET);
    }
}
|||
@ -0,0 +1,12 @@ |
|||
<!-- Logback configuration: a single console appender attached to the root logger. -->
<configuration> |
|||
<!-- Console appender registered under the name STDOUT. -->
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> |
|||
<encoder> |
|||
<!-- Line layout: time of day with milliseconds, thread name in brackets, level, logger name, message, line break. -->
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> |
|||
<!-- Output is encoded as UTF-8. -->
<charset>UTF-8</charset> |
|||
</encoder> |
|||
</appender> |
|||
|
|||
<!-- Root logger: level INFO, output routed to the STDOUT appender defined above. -->
<root level="INFO"> |
|||
<appender-ref ref="STDOUT" /> |
|||
</root> |
|||
</configuration> |
|||
@ -0,0 +1,70 @@ |
|||
import static org.junit.jupiter.api.Assertions.*; |
|||
import org.junit.jupiter.api.BeforeEach; |
|||
import org.junit.jupiter.api.Test; |
|||
import org.junit.jupiter.api.DisplayName; |
|||
|
|||
class BankAccountTest { |
|||
|
|||
private BankAccount account; |
|||
|
|||
@BeforeEach |
|||
void setUp() { |
|||
account = new BankAccount("1234567890", "张三"); |
|||
} |
|||
|
|||
@Test |
|||
@DisplayName("测试账户初始化") |
|||
void testAccountInitialization() { |
|||
assertEquals("1234567890", account.getAccountNumber()); |
|||
assertEquals("张三", account.getOwnerName()); |
|||
assertEquals(0.0, account.getBalance(), 0.001); |
|||
} |
|||
|
|||
@Test |
|||
@DisplayName("测试存款功能") |
|||
void testDeposit() { |
|||
account.deposit(1000.0); |
|||
assertEquals(1000.0, account.getBalance(), 0.001); |
|||
} |
|||
|
|||
@Test |
|||
@DisplayName("测试存款负数") |
|||
void testDepositNegativeAmount() { |
|||
double initialBalance = account.getBalance(); |
|||
account.deposit(-100.0); |
|||
assertEquals(initialBalance, account.getBalance(), 0.001); |
|||
} |
|||
|
|||
@Test |
|||
@DisplayName("测试取款功能") |
|||
void testWithdraw() { |
|||
account.deposit(1000.0); |
|||
account.withdraw(500.0); |
|||
assertEquals(500.0, account.getBalance(), 0.001); |
|||
} |
|||
|
|||
@Test |
|||
@DisplayName("测试取款超过余额") |
|||
void testWithdrawInsufficientBalance() { |
|||
account.deposit(500.0); |
|||
double balanceBefore = account.getBalance(); |
|||
account.withdraw(1000.0); |
|||
assertEquals(balanceBefore, account.getBalance(), 0.001); |
|||
} |
|||
|
|||
@Test |
|||
@DisplayName("测试取款负数") |
|||
void testWithdrawNegativeAmount() { |
|||
account.deposit(1000.0); |
|||
double balanceBefore = account.getBalance(); |
|||
account.withdraw(-100.0); |
|||
assertEquals(balanceBefore, account.getBalance(), 0.001); |
|||
} |
|||
|
|||
@Test |
|||
@DisplayName("测试设置户主姓名") |
|||
void testSetOwnerName() { |
|||
account.setOwnerName("李四"); |
|||
assertEquals("李四", account.getOwnerName()); |
|||
} |
|||
} |
|||
Loading…
Reference in new issue