2 changed files with 286 additions and 0 deletions
@ -0,0 +1,253 @@ |
|||
import java.util.*; |
|||
|
|||
/**
 * Base unchecked exception for the crawler classes declared in this file.
 *
 * <p>{@code NetworkException} and {@code ParseException} both extend this type.
 */
class CrawlerException extends RuntimeException {

    /**
     * Creates an exception that carries a message and the underlying cause.
     *
     * @param message human-readable description of the failure
     * @param cause   the throwable that triggered this failure
     */
    public CrawlerException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception that carries only a message.
     *
     * @param message human-readable description of the failure
     */
    public CrawlerException(String message) {
        super(message);
    }
}
|||
|
|||
class NetworkException extends CrawlerException { |
|||
public NetworkException(String message) { super("网络请求失败: " + message); } |
|||
} |
|||
|
|||
class ParseException extends CrawlerException { |
|||
public ParseException(String message) { super("数据解析失败: " + message); } |
|||
} |
|||
|
|||
class Article { |
|||
private String title, author, publishDate, content, url; |
|||
public Article(String t, String a, String pd, String c, String u) { |
|||
title = t; author = a; publishDate = pd; content = c; url = u; |
|||
} |
|||
public String getTitle() { return title; } |
|||
public String getAuthor() { return author; } |
|||
public String getPublishDate() { return publishDate; } |
|||
} |
|||
|
|||
class ArticleRepository { |
|||
private List<Article> articles = new ArrayList<>(); |
|||
public void add(Article a) { if (a == null) throw new CrawlerException("Article cannot be null"); articles.add(a); } |
|||
public void addAll(List<Article> list) { if (list == null) throw new CrawlerException("List cannot be null"); articles.addAll(list); } |
|||
public List<Article> getAll() { return new ArrayList<>(articles); } |
|||
public int size() { return articles.size(); } |
|||
public void clear() { articles.clear(); } |
|||
} |
|||
|
|||
interface CrawlStrategy { |
|||
boolean supports(String url); |
|||
List<Article> parse(String html) throws ParseException; |
|||
String getName(); |
|||
} |
|||
|
|||
class BlogStrategy implements CrawlStrategy { |
|||
public boolean supports(String url) { return url.contains("blog") || url.contains("csdn"); } |
|||
public List<Article> parse(String html) throws ParseException { |
|||
List<Article> list = new ArrayList<>(); |
|||
for (int i = 1; i <= 3; i++) { |
|||
list.add(new Article("博客文章"+i, "博主"+i, |
|||
java.time.LocalDate.now().minusDays(i).toString(), "内容"+i, "http://blog.com/"+i)); |
|||
} |
|||
return list; |
|||
} |
|||
public String getName() { return "BlogStrategy"; } |
|||
} |
|||
|
|||
class NewsStrategy implements CrawlStrategy { |
|||
public boolean supports(String url) { return url.contains("news") || url.contains("sina"); } |
|||
public List<Article> parse(String html) throws ParseException { |
|||
List<Article> list = new ArrayList<>(); |
|||
for (int i = 1; i <= 3; i++) { |
|||
list.add(new Article("新闻标题"+i, "记者"+i, |
|||
java.time.LocalDateTime.now().toString().substring(0,19), "内容"+i, "http://news.com/"+i)); |
|||
} |
|||
return list; |
|||
} |
|||
public String getName() { return "NewsStrategy"; } |
|||
} |
|||
|
|||
class DefaultStrategy implements CrawlStrategy { |
|||
public boolean supports(String url) { return true; } |
|||
public List<Article> parse(String html) throws ParseException { |
|||
List<Article> list = new ArrayList<>(); |
|||
list.add(new Article("默认文章", "未知作者", |
|||
java.time.LocalDate.now().toString(), "默认内容", "http://example.com")); |
|||
return list; |
|||
} |
|||
public String getName() { return "DefaultStrategy"; } |
|||
} |
|||
|
|||
class ScraperService { |
|||
public String fetch(String url) throws NetworkException { |
|||
int retries = 3; |
|||
while (retries-- > 0) { |
|||
try { |
|||
System.out.println("[Scraper] 获取: " + url); |
|||
if (Math.random() > 0.3) return "<html>内容</html>"; |
|||
throw new RuntimeException("超时"); |
|||
} catch (Exception e) { if (retries == 0) throw new NetworkException("重试3次失败"); } |
|||
} |
|||
throw new NetworkException("失败"); |
|||
} |
|||
} |
|||
|
|||
/**
 * A named action that can be invoked from the command line loop.
 */
interface Command {

    /**
     * @return the name under which this command is registered
     */
    String getName();

    /**
     * @return a short description of the command
     */
    String getDescription();

    /**
     * Runs the command.
     *
     * @param args the arguments that followed the command name; may be empty
     */
    void execute(String... args);
}
|||
|
|||
class CrawlCommand implements Command { |
|||
private ArticleRepository repo; |
|||
private List<CrawlStrategy> strategies; |
|||
private ScraperService scraper; |
|||
|
|||
public CrawlCommand(ArticleRepository r, ScraperService s) { |
|||
repo = r; scraper = s; |
|||
strategies = List.of(new BlogStrategy(), new NewsStrategy(), new DefaultStrategy()); |
|||
} |
|||
|
|||
public void execute(String... args) { |
|||
if (args.length == 0) { System.out.println("请输入URL"); return; } |
|||
String url = args[0]; |
|||
if (!url.startsWith("http")) { System.out.println("无效URL"); return; } |
|||
try { |
|||
String html = scraper.fetch(url); |
|||
CrawlStrategy strategy = strategies.stream().filter(s -> s.supports(url)).findFirst().orElse(new DefaultStrategy()); |
|||
List<Article> articles = strategy.parse(html); |
|||
repo.addAll(articles); |
|||
System.out.println("抓取完成: " + articles.size() + "篇"); |
|||
} catch (NetworkException | ParseException e) { System.out.println(e.getMessage()); } |
|||
} |
|||
public String getName() { return "crawl"; } |
|||
public String getDescription() { return "抓取URL,别名c"; } |
|||
} |
|||
|
|||
class ListCommand implements Command { |
|||
private ArticleRepository repo; |
|||
public ListCommand(ArticleRepository r) { repo = r; } |
|||
public void execute(String... args) { |
|||
List<Article> list = repo.getAll(); |
|||
if (list.isEmpty()) { System.out.println("暂无文章"); return; } |
|||
for (int i = 0; i < list.size(); i++) { |
|||
Article a = list.get(i); |
|||
System.out.printf("[%d] %s - %s (%s)%n", i+1, a.getTitle(), a.getAuthor(), a.getPublishDate()); |
|||
} |
|||
} |
|||
public String getName() { return "list"; } |
|||
public String getDescription() { return "列出文章"; } |
|||
} |
|||
|
|||
class AnalyzeCommand implements Command { |
|||
private List<CrawlStrategy> strategies; |
|||
private ScraperService scraper; |
|||
|
|||
public AnalyzeCommand(ScraperService s) { |
|||
scraper = s; |
|||
strategies = List.of(new BlogStrategy(), new NewsStrategy(), new DefaultStrategy()); |
|||
} |
|||
|
|||
public void execute(String... args) { |
|||
if (args.length == 0) { System.out.println("请输入URL"); return; } |
|||
String url = args[0]; |
|||
if (!url.startsWith("http")) { System.out.println("无效URL"); return; } |
|||
try { |
|||
String html = scraper.fetch(url); |
|||
CrawlStrategy strategy = strategies.stream().filter(s -> s.supports(url)).findFirst().orElse(new DefaultStrategy()); |
|||
List<Article> articles = strategy.parse(html); |
|||
System.out.println("=== 分析结果(不存储)==="); |
|||
System.out.println("URL: " + url + " | 策略: " + strategy.getName() + " | 文章数: " + articles.size()); |
|||
Map<String, Integer> authors = new HashMap<>(); |
|||
articles.forEach(a -> authors.merge(a.getAuthor(), 1, Integer::sum)); |
|||
System.out.println("作者分布: " + authors); |
|||
} catch (NetworkException | ParseException e) { System.out.println(e.getMessage()); } |
|||
} |
|||
public String getName() { return "analyze"; } |
|||
public String getDescription() { return "分析URL但不存储"; } |
|||
} |
|||
|
|||
class HistoryCommand implements Command { |
|||
private List<String> history; |
|||
public HistoryCommand(List<String> h) { history = h; } |
|||
public void execute(String... args) { |
|||
if (history.isEmpty()) { System.out.println("暂无历史"); return; } |
|||
System.out.println("命令历史:"); |
|||
history.forEach(h -> System.out.println(" " + h)); |
|||
System.out.println("\n=== AI审计 ==="); |
|||
System.out.println("类名: Article, ArticleRepository, CrawlStrategy, CrawlCommand, Controller"); |
|||
System.out.println("请检查MVC三层划分是否越权"); |
|||
} |
|||
public String getName() { return "history"; } |
|||
public String getDescription() { return "查看命令历史"; } |
|||
} |
|||
|
|||
class ClearCommand implements Command { |
|||
private ArticleRepository repo; |
|||
public ClearCommand(ArticleRepository r) { repo = r; } |
|||
public void execute(String... args) { repo.clear(); System.out.println("已清空"); } |
|||
public String getName() { return "clear"; } |
|||
public String getDescription() { return "清空文章"; } |
|||
} |
|||
|
|||
class ExitCommand implements Command { |
|||
public void execute(String... args) { System.out.println("退出"); System.exit(0); } |
|||
public String getName() { return "exit"; } |
|||
public String getDescription() { return "退出程序"; } |
|||
} |
|||
|
|||
class HelpCommand implements Command { |
|||
private Map<String, Command> commands; |
|||
public HelpCommand(Map<String, Command> c) { commands = c; } |
|||
public void execute(String... args) { |
|||
System.out.println("可用命令:"); |
|||
commands.forEach((k, v) -> System.out.printf(" %s - %s%n", k, v.getDescription())); |
|||
} |
|||
public String getName() { return "help"; } |
|||
public String getDescription() { return "显示帮助"; } |
|||
} |
|||
|
|||
class CommandManager { |
|||
private Map<String, Command> commands = new HashMap<>(); |
|||
private Map<String, String> aliases = new HashMap<>(); |
|||
public void register(Command c) { commands.put(c.getName(), c); } |
|||
public void alias(String a, String n) { aliases.put(a, n); } |
|||
public Command get(String n) { return commands.get(aliases.getOrDefault(n, n)); } |
|||
public boolean has(String n) { return commands.containsKey(aliases.getOrDefault(n, n)); } |
|||
public Map<String, Command> getAll() { return commands; } |
|||
} |
|||
|
|||
public class App { |
|||
public static void main(String[] args) { |
|||
ArticleRepository repo = new ArticleRepository(); |
|||
ScraperService scraper = new ScraperService(); |
|||
CommandManager cmdMgr = new CommandManager(); |
|||
List<String> history = new ArrayList<>(); |
|||
|
|||
cmdMgr.register(new CrawlCommand(repo, scraper)); |
|||
cmdMgr.register(new ListCommand(repo)); |
|||
cmdMgr.register(new AnalyzeCommand(scraper)); |
|||
cmdMgr.register(new HistoryCommand(history)); |
|||
cmdMgr.register(new ClearCommand(repo)); |
|||
cmdMgr.register(new ExitCommand()); |
|||
cmdMgr.register(new HelpCommand(cmdMgr.getAll())); |
|||
cmdMgr.alias("c", "crawl"); |
|||
|
|||
Scanner scanner = new Scanner(System.in); |
|||
System.out.println("========== 命令行工具 =========="); |
|||
|
|||
while (true) { |
|||
System.out.print("> "); |
|||
String input = scanner.nextLine().trim(); |
|||
if (input.isEmpty()) continue; |
|||
|
|||
history.add(input); |
|||
String[] parts = input.split("\\s+"); |
|||
String cmdName = parts[0]; |
|||
String[] cmdArgs = parts.length > 1 ? Arrays.copyOfRange(parts, 1, parts.length) : new String[0]; |
|||
|
|||
if (cmdMgr.has(cmdName)) { |
|||
cmdMgr.get(cmdName).execute(cmdArgs); |
|||
} else { |
|||
System.out.println("未知命令: " + cmdName); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,33 @@ |
|||
import com.example.datacollect.*; |
|||
|
|||
public class Main { |
|||
public static void main(String[] args) { |
|||
CrawlStrategy strategyA = new ASiteCrawlStrategyImpl(); |
|||
CrawlStrategy strategyB = new BSiteCrawlStrategyImpl(); |
|||
CrawlStrategy strategyC = new CSiteCrawlStrategyImpl(); |
|||
CrawlStrategy strategyD = new DSiteCrawlStrategyImpl(); |
|||
|
|||
strategyA.crawl("http://www.example-a.com"); |
|||
strategyB.crawl("http://www.example-b.com"); |
|||
strategyC.crawl("http://www.example-c.com"); |
|||
strategyD.crawl("http://www.example-d.com"); |
|||
|
|||
try { |
|||
throw new NetworkException("连接超时"); |
|||
} catch (CrawlException e) { |
|||
System.out.println("捕获异常: " + e.getMessage()); |
|||
} |
|||
|
|||
try { |
|||
throw new ParseException("HTML格式错误"); |
|||
} catch (CrawlException e) { |
|||
System.out.println("捕获异常: " + e.getMessage()); |
|||
} |
|||
|
|||
try { |
|||
throw new UnsupportedSiteException("UNKNOWN"); |
|||
} catch (CrawlException e) { |
|||
System.out.println("捕获异常: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
Loading…
Reference in new issue