2 changed files with 286 additions and 0 deletions
@ -0,0 +1,253 @@ |
|||||
|
import java.util.*; |
||||
|
|
||||
|
/**
 * Root unchecked exception for failures raised by the crawler classes in this file.
 */
class CrawlerException extends RuntimeException {

    /**
     * Creates an exception that carries only a detail message.
     *
     * @param message description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }

    /**
     * Creates an exception that carries a detail message and the failure that triggered it.
     *
     * @param message description of the failure
     * @param cause   underlying failure, exposed through {@link #getCause()}
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }
}
||||
|
|
||||
|
class NetworkException extends CrawlerException { |
||||
|
public NetworkException(String message) { super("网络请求失败: " + message); } |
||||
|
} |
||||
|
|
||||
|
class ParseException extends CrawlerException { |
||||
|
public ParseException(String message) { super("数据解析失败: " + message); } |
||||
|
} |
||||
|
|
||||
|
class Article { |
||||
|
private String title, author, publishDate, content, url; |
||||
|
public Article(String t, String a, String pd, String c, String u) { |
||||
|
title = t; author = a; publishDate = pd; content = c; url = u; |
||||
|
} |
||||
|
public String getTitle() { return title; } |
||||
|
public String getAuthor() { return author; } |
||||
|
public String getPublishDate() { return publishDate; } |
||||
|
} |
||||
|
|
||||
|
class ArticleRepository { |
||||
|
private List<Article> articles = new ArrayList<>(); |
||||
|
public void add(Article a) { if (a == null) throw new CrawlerException("Article cannot be null"); articles.add(a); } |
||||
|
public void addAll(List<Article> list) { if (list == null) throw new CrawlerException("List cannot be null"); articles.addAll(list); } |
||||
|
public List<Article> getAll() { return new ArrayList<>(articles); } |
||||
|
public int size() { return articles.size(); } |
||||
|
public void clear() { articles.clear(); } |
||||
|
} |
||||
|
|
||||
|
interface CrawlStrategy { |
||||
|
boolean supports(String url); |
||||
|
List<Article> parse(String html) throws ParseException; |
||||
|
String getName(); |
||||
|
} |
||||
|
|
||||
|
class BlogStrategy implements CrawlStrategy { |
||||
|
public boolean supports(String url) { return url.contains("blog") || url.contains("csdn"); } |
||||
|
public List<Article> parse(String html) throws ParseException { |
||||
|
List<Article> list = new ArrayList<>(); |
||||
|
for (int i = 1; i <= 3; i++) { |
||||
|
list.add(new Article("博客文章"+i, "博主"+i, |
||||
|
java.time.LocalDate.now().minusDays(i).toString(), "内容"+i, "http://blog.com/"+i)); |
||||
|
} |
||||
|
return list; |
||||
|
} |
||||
|
public String getName() { return "BlogStrategy"; } |
||||
|
} |
||||
|
|
||||
|
class NewsStrategy implements CrawlStrategy { |
||||
|
public boolean supports(String url) { return url.contains("news") || url.contains("sina"); } |
||||
|
public List<Article> parse(String html) throws ParseException { |
||||
|
List<Article> list = new ArrayList<>(); |
||||
|
for (int i = 1; i <= 3; i++) { |
||||
|
list.add(new Article("新闻标题"+i, "记者"+i, |
||||
|
java.time.LocalDateTime.now().toString().substring(0,19), "内容"+i, "http://news.com/"+i)); |
||||
|
} |
||||
|
return list; |
||||
|
} |
||||
|
public String getName() { return "NewsStrategy"; } |
||||
|
} |
||||
|
|
||||
|
class DefaultStrategy implements CrawlStrategy { |
||||
|
public boolean supports(String url) { return true; } |
||||
|
public List<Article> parse(String html) throws ParseException { |
||||
|
List<Article> list = new ArrayList<>(); |
||||
|
list.add(new Article("默认文章", "未知作者", |
||||
|
java.time.LocalDate.now().toString(), "默认内容", "http://example.com")); |
||||
|
return list; |
||||
|
} |
||||
|
public String getName() { return "DefaultStrategy"; } |
||||
|
} |
||||
|
|
||||
|
class ScraperService { |
||||
|
public String fetch(String url) throws NetworkException { |
||||
|
int retries = 3; |
||||
|
while (retries-- > 0) { |
||||
|
try { |
||||
|
System.out.println("[Scraper] 获取: " + url); |
||||
|
if (Math.random() > 0.3) return "<html>内容</html>"; |
||||
|
throw new RuntimeException("超时"); |
||||
|
} catch (Exception e) { if (retries == 0) throw new NetworkException("重试3次失败"); } |
||||
|
} |
||||
|
throw new NetworkException("失败"); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/**
 * A named action that can be run from the command prompt.
 */
interface Command {

    /**
     * Runs the command.
     *
     * @param args words typed after the command name; may be empty
     */
    void execute(String... args);

    /** @return the name under which the command is registered */
    String getName();

    /** @return a one-line description shown in the help listing */
    String getDescription();
}
||||
|
|
||||
|
class CrawlCommand implements Command { |
||||
|
private ArticleRepository repo; |
||||
|
private List<CrawlStrategy> strategies; |
||||
|
private ScraperService scraper; |
||||
|
|
||||
|
public CrawlCommand(ArticleRepository r, ScraperService s) { |
||||
|
repo = r; scraper = s; |
||||
|
strategies = List.of(new BlogStrategy(), new NewsStrategy(), new DefaultStrategy()); |
||||
|
} |
||||
|
|
||||
|
public void execute(String... args) { |
||||
|
if (args.length == 0) { System.out.println("请输入URL"); return; } |
||||
|
String url = args[0]; |
||||
|
if (!url.startsWith("http")) { System.out.println("无效URL"); return; } |
||||
|
try { |
||||
|
String html = scraper.fetch(url); |
||||
|
CrawlStrategy strategy = strategies.stream().filter(s -> s.supports(url)).findFirst().orElse(new DefaultStrategy()); |
||||
|
List<Article> articles = strategy.parse(html); |
||||
|
repo.addAll(articles); |
||||
|
System.out.println("抓取完成: " + articles.size() + "篇"); |
||||
|
} catch (NetworkException | ParseException e) { System.out.println(e.getMessage()); } |
||||
|
} |
||||
|
public String getName() { return "crawl"; } |
||||
|
public String getDescription() { return "抓取URL,别名c"; } |
||||
|
} |
||||
|
|
||||
|
class ListCommand implements Command { |
||||
|
private ArticleRepository repo; |
||||
|
public ListCommand(ArticleRepository r) { repo = r; } |
||||
|
public void execute(String... args) { |
||||
|
List<Article> list = repo.getAll(); |
||||
|
if (list.isEmpty()) { System.out.println("暂无文章"); return; } |
||||
|
for (int i = 0; i < list.size(); i++) { |
||||
|
Article a = list.get(i); |
||||
|
System.out.printf("[%d] %s - %s (%s)%n", i+1, a.getTitle(), a.getAuthor(), a.getPublishDate()); |
||||
|
} |
||||
|
} |
||||
|
public String getName() { return "list"; } |
||||
|
public String getDescription() { return "列出文章"; } |
||||
|
} |
||||
|
|
||||
|
class AnalyzeCommand implements Command { |
||||
|
private List<CrawlStrategy> strategies; |
||||
|
private ScraperService scraper; |
||||
|
|
||||
|
public AnalyzeCommand(ScraperService s) { |
||||
|
scraper = s; |
||||
|
strategies = List.of(new BlogStrategy(), new NewsStrategy(), new DefaultStrategy()); |
||||
|
} |
||||
|
|
||||
|
public void execute(String... args) { |
||||
|
if (args.length == 0) { System.out.println("请输入URL"); return; } |
||||
|
String url = args[0]; |
||||
|
if (!url.startsWith("http")) { System.out.println("无效URL"); return; } |
||||
|
try { |
||||
|
String html = scraper.fetch(url); |
||||
|
CrawlStrategy strategy = strategies.stream().filter(s -> s.supports(url)).findFirst().orElse(new DefaultStrategy()); |
||||
|
List<Article> articles = strategy.parse(html); |
||||
|
System.out.println("=== 分析结果(不存储)==="); |
||||
|
System.out.println("URL: " + url + " | 策略: " + strategy.getName() + " | 文章数: " + articles.size()); |
||||
|
Map<String, Integer> authors = new HashMap<>(); |
||||
|
articles.forEach(a -> authors.merge(a.getAuthor(), 1, Integer::sum)); |
||||
|
System.out.println("作者分布: " + authors); |
||||
|
} catch (NetworkException | ParseException e) { System.out.println(e.getMessage()); } |
||||
|
} |
||||
|
public String getName() { return "analyze"; } |
||||
|
public String getDescription() { return "分析URL但不存储"; } |
||||
|
} |
||||
|
|
||||
|
class HistoryCommand implements Command { |
||||
|
private List<String> history; |
||||
|
public HistoryCommand(List<String> h) { history = h; } |
||||
|
public void execute(String... args) { |
||||
|
if (history.isEmpty()) { System.out.println("暂无历史"); return; } |
||||
|
System.out.println("命令历史:"); |
||||
|
history.forEach(h -> System.out.println(" " + h)); |
||||
|
System.out.println("\n=== AI审计 ==="); |
||||
|
System.out.println("类名: Article, ArticleRepository, CrawlStrategy, CrawlCommand, Controller"); |
||||
|
System.out.println("请检查MVC三层划分是否越权"); |
||||
|
} |
||||
|
public String getName() { return "history"; } |
||||
|
public String getDescription() { return "查看命令历史"; } |
||||
|
} |
||||
|
|
||||
|
class ClearCommand implements Command { |
||||
|
private ArticleRepository repo; |
||||
|
public ClearCommand(ArticleRepository r) { repo = r; } |
||||
|
public void execute(String... args) { repo.clear(); System.out.println("已清空"); } |
||||
|
public String getName() { return "clear"; } |
||||
|
public String getDescription() { return "清空文章"; } |
||||
|
} |
||||
|
|
||||
|
class ExitCommand implements Command { |
||||
|
public void execute(String... args) { System.out.println("退出"); System.exit(0); } |
||||
|
public String getName() { return "exit"; } |
||||
|
public String getDescription() { return "退出程序"; } |
||||
|
} |
||||
|
|
||||
|
class HelpCommand implements Command { |
||||
|
private Map<String, Command> commands; |
||||
|
public HelpCommand(Map<String, Command> c) { commands = c; } |
||||
|
public void execute(String... args) { |
||||
|
System.out.println("可用命令:"); |
||||
|
commands.forEach((k, v) -> System.out.printf(" %s - %s%n", k, v.getDescription())); |
||||
|
} |
||||
|
public String getName() { return "help"; } |
||||
|
public String getDescription() { return "显示帮助"; } |
||||
|
} |
||||
|
|
||||
|
class CommandManager { |
||||
|
private Map<String, Command> commands = new HashMap<>(); |
||||
|
private Map<String, String> aliases = new HashMap<>(); |
||||
|
public void register(Command c) { commands.put(c.getName(), c); } |
||||
|
public void alias(String a, String n) { aliases.put(a, n); } |
||||
|
public Command get(String n) { return commands.get(aliases.getOrDefault(n, n)); } |
||||
|
public boolean has(String n) { return commands.containsKey(aliases.getOrDefault(n, n)); } |
||||
|
public Map<String, Command> getAll() { return commands; } |
||||
|
} |
||||
|
|
||||
|
public class App { |
||||
|
public static void main(String[] args) { |
||||
|
ArticleRepository repo = new ArticleRepository(); |
||||
|
ScraperService scraper = new ScraperService(); |
||||
|
CommandManager cmdMgr = new CommandManager(); |
||||
|
List<String> history = new ArrayList<>(); |
||||
|
|
||||
|
cmdMgr.register(new CrawlCommand(repo, scraper)); |
||||
|
cmdMgr.register(new ListCommand(repo)); |
||||
|
cmdMgr.register(new AnalyzeCommand(scraper)); |
||||
|
cmdMgr.register(new HistoryCommand(history)); |
||||
|
cmdMgr.register(new ClearCommand(repo)); |
||||
|
cmdMgr.register(new ExitCommand()); |
||||
|
cmdMgr.register(new HelpCommand(cmdMgr.getAll())); |
||||
|
cmdMgr.alias("c", "crawl"); |
||||
|
|
||||
|
Scanner scanner = new Scanner(System.in); |
||||
|
System.out.println("========== 命令行工具 =========="); |
||||
|
|
||||
|
while (true) { |
||||
|
System.out.print("> "); |
||||
|
String input = scanner.nextLine().trim(); |
||||
|
if (input.isEmpty()) continue; |
||||
|
|
||||
|
history.add(input); |
||||
|
String[] parts = input.split("\\s+"); |
||||
|
String cmdName = parts[0]; |
||||
|
String[] cmdArgs = parts.length > 1 ? Arrays.copyOfRange(parts, 1, parts.length) : new String[0]; |
||||
|
|
||||
|
if (cmdMgr.has(cmdName)) { |
||||
|
cmdMgr.get(cmdName).execute(cmdArgs); |
||||
|
} else { |
||||
|
System.out.println("未知命令: " + cmdName); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,33 @@ |
|||||
|
import com.example.datacollect.*; |
||||
|
|
||||
|
public class Main { |
||||
|
public static void main(String[] args) { |
||||
|
CrawlStrategy strategyA = new ASiteCrawlStrategyImpl(); |
||||
|
CrawlStrategy strategyB = new BSiteCrawlStrategyImpl(); |
||||
|
CrawlStrategy strategyC = new CSiteCrawlStrategyImpl(); |
||||
|
CrawlStrategy strategyD = new DSiteCrawlStrategyImpl(); |
||||
|
|
||||
|
strategyA.crawl("http://www.example-a.com"); |
||||
|
strategyB.crawl("http://www.example-b.com"); |
||||
|
strategyC.crawl("http://www.example-c.com"); |
||||
|
strategyD.crawl("http://www.example-d.com"); |
||||
|
|
||||
|
try { |
||||
|
throw new NetworkException("连接超时"); |
||||
|
} catch (CrawlException e) { |
||||
|
System.out.println("捕获异常: " + e.getMessage()); |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
throw new ParseException("HTML格式错误"); |
||||
|
} catch (CrawlException e) { |
||||
|
System.out.println("捕获异常: " + e.getMessage()); |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
throw new UnsupportedSiteException("UNKNOWN"); |
||||
|
} catch (CrawlException e) { |
||||
|
System.out.println("捕获异常: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue