You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
253 lines
10 KiB
253 lines
10 KiB
import java.util.*;
|
|
|
|
/**
 * Unchecked base type for errors raised by the crawler classes in this file.
 * {@code NetworkException} and {@code ParseException} extend it.
 */
class CrawlerException extends RuntimeException {

    /**
     * Creates an exception that wraps an underlying failure.
     *
     * @param message description of what went wrong
     * @param cause   the failure that triggered this one
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception that carries only a description.
     *
     * @param message description of what went wrong
     */
    public CrawlerException(String message) {
        super(message);
    }
}
|
|
|
|
class NetworkException extends CrawlerException {
|
|
public NetworkException(String message) { super("网络请求失败: " + message); }
|
|
}
|
|
|
|
class ParseException extends CrawlerException {
|
|
public ParseException(String message) { super("数据解析失败: " + message); }
|
|
}
|
|
|
|
class Article {
|
|
private String title, author, publishDate, content, url;
|
|
public Article(String t, String a, String pd, String c, String u) {
|
|
title = t; author = a; publishDate = pd; content = c; url = u;
|
|
}
|
|
public String getTitle() { return title; }
|
|
public String getAuthor() { return author; }
|
|
public String getPublishDate() { return publishDate; }
|
|
}
|
|
|
|
class ArticleRepository {
|
|
private List<Article> articles = new ArrayList<>();
|
|
public void add(Article a) { if (a == null) throw new CrawlerException("Article cannot be null"); articles.add(a); }
|
|
public void addAll(List<Article> list) { if (list == null) throw new CrawlerException("List cannot be null"); articles.addAll(list); }
|
|
public List<Article> getAll() { return new ArrayList<>(articles); }
|
|
public int size() { return articles.size(); }
|
|
public void clear() { articles.clear(); }
|
|
}
|
|
|
|
interface CrawlStrategy {
|
|
boolean supports(String url);
|
|
List<Article> parse(String html) throws ParseException;
|
|
String getName();
|
|
}
|
|
|
|
class BlogStrategy implements CrawlStrategy {
|
|
public boolean supports(String url) { return url.contains("blog") || url.contains("csdn"); }
|
|
public List<Article> parse(String html) throws ParseException {
|
|
List<Article> list = new ArrayList<>();
|
|
for (int i = 1; i <= 3; i++) {
|
|
list.add(new Article("博客文章"+i, "博主"+i,
|
|
java.time.LocalDate.now().minusDays(i).toString(), "内容"+i, "http://blog.com/"+i));
|
|
}
|
|
return list;
|
|
}
|
|
public String getName() { return "BlogStrategy"; }
|
|
}
|
|
|
|
class NewsStrategy implements CrawlStrategy {
|
|
public boolean supports(String url) { return url.contains("news") || url.contains("sina"); }
|
|
public List<Article> parse(String html) throws ParseException {
|
|
List<Article> list = new ArrayList<>();
|
|
for (int i = 1; i <= 3; i++) {
|
|
list.add(new Article("新闻标题"+i, "记者"+i,
|
|
java.time.LocalDateTime.now().toString().substring(0,19), "内容"+i, "http://news.com/"+i));
|
|
}
|
|
return list;
|
|
}
|
|
public String getName() { return "NewsStrategy"; }
|
|
}
|
|
|
|
class DefaultStrategy implements CrawlStrategy {
|
|
public boolean supports(String url) { return true; }
|
|
public List<Article> parse(String html) throws ParseException {
|
|
List<Article> list = new ArrayList<>();
|
|
list.add(new Article("默认文章", "未知作者",
|
|
java.time.LocalDate.now().toString(), "默认内容", "http://example.com"));
|
|
return list;
|
|
}
|
|
public String getName() { return "DefaultStrategy"; }
|
|
}
|
|
|
|
class ScraperService {
|
|
public String fetch(String url) throws NetworkException {
|
|
int retries = 3;
|
|
while (retries-- > 0) {
|
|
try {
|
|
System.out.println("[Scraper] 获取: " + url);
|
|
if (Math.random() > 0.3) return "<html>内容</html>";
|
|
throw new RuntimeException("超时");
|
|
} catch (Exception e) { if (retries == 0) throw new NetworkException("重试3次失败"); }
|
|
}
|
|
throw new NetworkException("失败");
|
|
}
|
|
}
|
|
|
|
/**
 * One user-invocable action of the command-line tool.
 */
interface Command {

    /**
     * @return the word the user types to invoke this command
     */
    String getName();

    /**
     * @return a short description shown in the help listing
     */
    String getDescription();

    /**
     * Runs the command.
     *
     * @param args the words that followed the command name; may be empty
     */
    void execute(String... args);
}
|
|
|
|
class CrawlCommand implements Command {
|
|
private ArticleRepository repo;
|
|
private List<CrawlStrategy> strategies;
|
|
private ScraperService scraper;
|
|
|
|
public CrawlCommand(ArticleRepository r, ScraperService s) {
|
|
repo = r; scraper = s;
|
|
strategies = List.of(new BlogStrategy(), new NewsStrategy(), new DefaultStrategy());
|
|
}
|
|
|
|
public void execute(String... args) {
|
|
if (args.length == 0) { System.out.println("请输入URL"); return; }
|
|
String url = args[0];
|
|
if (!url.startsWith("http")) { System.out.println("无效URL"); return; }
|
|
try {
|
|
String html = scraper.fetch(url);
|
|
CrawlStrategy strategy = strategies.stream().filter(s -> s.supports(url)).findFirst().orElse(new DefaultStrategy());
|
|
List<Article> articles = strategy.parse(html);
|
|
repo.addAll(articles);
|
|
System.out.println("抓取完成: " + articles.size() + "篇");
|
|
} catch (NetworkException | ParseException e) { System.out.println(e.getMessage()); }
|
|
}
|
|
public String getName() { return "crawl"; }
|
|
public String getDescription() { return "抓取URL,别名c"; }
|
|
}
|
|
|
|
class ListCommand implements Command {
|
|
private ArticleRepository repo;
|
|
public ListCommand(ArticleRepository r) { repo = r; }
|
|
public void execute(String... args) {
|
|
List<Article> list = repo.getAll();
|
|
if (list.isEmpty()) { System.out.println("暂无文章"); return; }
|
|
for (int i = 0; i < list.size(); i++) {
|
|
Article a = list.get(i);
|
|
System.out.printf("[%d] %s - %s (%s)%n", i+1, a.getTitle(), a.getAuthor(), a.getPublishDate());
|
|
}
|
|
}
|
|
public String getName() { return "list"; }
|
|
public String getDescription() { return "列出文章"; }
|
|
}
|
|
|
|
class AnalyzeCommand implements Command {
|
|
private List<CrawlStrategy> strategies;
|
|
private ScraperService scraper;
|
|
|
|
public AnalyzeCommand(ScraperService s) {
|
|
scraper = s;
|
|
strategies = List.of(new BlogStrategy(), new NewsStrategy(), new DefaultStrategy());
|
|
}
|
|
|
|
public void execute(String... args) {
|
|
if (args.length == 0) { System.out.println("请输入URL"); return; }
|
|
String url = args[0];
|
|
if (!url.startsWith("http")) { System.out.println("无效URL"); return; }
|
|
try {
|
|
String html = scraper.fetch(url);
|
|
CrawlStrategy strategy = strategies.stream().filter(s -> s.supports(url)).findFirst().orElse(new DefaultStrategy());
|
|
List<Article> articles = strategy.parse(html);
|
|
System.out.println("=== 分析结果(不存储)===");
|
|
System.out.println("URL: " + url + " | 策略: " + strategy.getName() + " | 文章数: " + articles.size());
|
|
Map<String, Integer> authors = new HashMap<>();
|
|
articles.forEach(a -> authors.merge(a.getAuthor(), 1, Integer::sum));
|
|
System.out.println("作者分布: " + authors);
|
|
} catch (NetworkException | ParseException e) { System.out.println(e.getMessage()); }
|
|
}
|
|
public String getName() { return "analyze"; }
|
|
public String getDescription() { return "分析URL但不存储"; }
|
|
}
|
|
|
|
class HistoryCommand implements Command {
|
|
private List<String> history;
|
|
public HistoryCommand(List<String> h) { history = h; }
|
|
public void execute(String... args) {
|
|
if (history.isEmpty()) { System.out.println("暂无历史"); return; }
|
|
System.out.println("命令历史:");
|
|
history.forEach(h -> System.out.println(" " + h));
|
|
System.out.println("\n=== AI审计 ===");
|
|
System.out.println("类名: Article, ArticleRepository, CrawlStrategy, CrawlCommand, Controller");
|
|
System.out.println("请检查MVC三层划分是否越权");
|
|
}
|
|
public String getName() { return "history"; }
|
|
public String getDescription() { return "查看命令历史"; }
|
|
}
|
|
|
|
class ClearCommand implements Command {
|
|
private ArticleRepository repo;
|
|
public ClearCommand(ArticleRepository r) { repo = r; }
|
|
public void execute(String... args) { repo.clear(); System.out.println("已清空"); }
|
|
public String getName() { return "clear"; }
|
|
public String getDescription() { return "清空文章"; }
|
|
}
|
|
|
|
class ExitCommand implements Command {
|
|
public void execute(String... args) { System.out.println("退出"); System.exit(0); }
|
|
public String getName() { return "exit"; }
|
|
public String getDescription() { return "退出程序"; }
|
|
}
|
|
|
|
class HelpCommand implements Command {
|
|
private Map<String, Command> commands;
|
|
public HelpCommand(Map<String, Command> c) { commands = c; }
|
|
public void execute(String... args) {
|
|
System.out.println("可用命令:");
|
|
commands.forEach((k, v) -> System.out.printf(" %s - %s%n", k, v.getDescription()));
|
|
}
|
|
public String getName() { return "help"; }
|
|
public String getDescription() { return "显示帮助"; }
|
|
}
|
|
|
|
class CommandManager {
|
|
private Map<String, Command> commands = new HashMap<>();
|
|
private Map<String, String> aliases = new HashMap<>();
|
|
public void register(Command c) { commands.put(c.getName(), c); }
|
|
public void alias(String a, String n) { aliases.put(a, n); }
|
|
public Command get(String n) { return commands.get(aliases.getOrDefault(n, n)); }
|
|
public boolean has(String n) { return commands.containsKey(aliases.getOrDefault(n, n)); }
|
|
public Map<String, Command> getAll() { return commands; }
|
|
}
|
|
|
|
public class App {
|
|
public static void main(String[] args) {
|
|
ArticleRepository repo = new ArticleRepository();
|
|
ScraperService scraper = new ScraperService();
|
|
CommandManager cmdMgr = new CommandManager();
|
|
List<String> history = new ArrayList<>();
|
|
|
|
cmdMgr.register(new CrawlCommand(repo, scraper));
|
|
cmdMgr.register(new ListCommand(repo));
|
|
cmdMgr.register(new AnalyzeCommand(scraper));
|
|
cmdMgr.register(new HistoryCommand(history));
|
|
cmdMgr.register(new ClearCommand(repo));
|
|
cmdMgr.register(new ExitCommand());
|
|
cmdMgr.register(new HelpCommand(cmdMgr.getAll()));
|
|
cmdMgr.alias("c", "crawl");
|
|
|
|
Scanner scanner = new Scanner(System.in);
|
|
System.out.println("========== 命令行工具 ==========");
|
|
|
|
while (true) {
|
|
System.out.print("> ");
|
|
String input = scanner.nextLine().trim();
|
|
if (input.isEmpty()) continue;
|
|
|
|
history.add(input);
|
|
String[] parts = input.split("\\s+");
|
|
String cmdName = parts[0];
|
|
String[] cmdArgs = parts.length > 1 ? Arrays.copyOfRange(parts, 1, parts.length) : new String[0];
|
|
|
|
if (cmdMgr.has(cmdName)) {
|
|
cmdMgr.get(cmdName).execute(cmdArgs);
|
|
} else {
|
|
System.out.println("未知命令: " + cmdName);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|