diff --git a/project/Article.java b/project/Article.java
new file mode 100644
index 0000000..961428c
--- /dev/null
+++ b/project/Article.java
@@ -0,0 +1,85 @@
+package model;
+
+/**
+ * A crawled article: title, body text, URL and a source label.
+ *
+ * <p>Plain mutable bean; nothing here validates the fields, so any of them may be
+ * {@code null}.
+ */
+public class Article {
+    /** Maximum number of content characters included by {@link #toString()}. */
+    private static final int PREVIEW_LENGTH = 100;
+
+    private String title;
+    private String content;
+    private String url;
+    private String source;
+
+    /** Creates an article with every field left {@code null}. */
+    public Article() {
+    }
+
+    /**
+     * Creates an article with every field populated.
+     *
+     * @param title   the article title
+     * @param content the article body text
+     * @param url     the article URL
+     * @param source  the source label printed in brackets by {@link #toString()}
+     */
+    public Article(String title, String content, String url, String source) {
+        this.title = title;
+        this.content = content;
+        this.url = url;
+        this.source = source;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public void setTitle(String title) {
+        this.title = title;
+    }
+
+    public String getContent() {
+        return content;
+    }
+
+    public void setContent(String content) {
+        this.content = content;
+    }
+
+    public String getUrl() {
+        return url;
+    }
+
+    public void setUrl(String url) {
+        this.url = url;
+    }
+
+    public String getSource() {
+        return source;
+    }
+
+    public void setSource(String source) {
+        this.source = source;
+    }
+
+    /**
+     * Returns a three-line summary: bracketed source plus title, the URL, and the
+     * content cut to its first {@value #PREVIEW_LENGTH} characters (with a trailing
+     * {@code "..."}) when it is longer than that. A {@code null} content is
+     * rendered as the text {@code "null"}.
+     */
+    @Override
+    public String toString() {
+        String preview = content;
+        if (content != null && content.length() > PREVIEW_LENGTH) {
+            preview = content.substring(0, PREVIEW_LENGTH) + "...";
+        }
+        return "【" + source + "】" + title + "\n"
+                + "链接:" + url + "\n"
+                + "内容:" + preview;
+    }
+}
diff --git a/project/CrawlerController.class b/project/CrawlerController.class
new file mode 100644
index 0000000..3682698
Binary files /dev/null and b/project/CrawlerController.class differ
diff --git a/project/CrawlerController.java b/project/CrawlerController.java
new file mode 100644
index 0000000..51701e0
--- /dev/null
+++ b/project/CrawlerController.java
@@ -0,0 +1,140 @@
+package controller;
+
+import view.ConsoleView;
+import model.Article;
+import strategy.*;
+import command.*;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Console command loop for the crawler: reads input from a {@link ConsoleView},
+ * turns it into {@link Command} objects and holds the collected {@link Article}s.
+ */
+public class CrawlerController {
+    private final ConsoleView view;
+    private final List<Article> articles;
+    private final List<CrawlStrategy> strategies;
+
+    /** Creates the view, an empty article list and the four built-in strategies. */
+    public CrawlerController() {
+        this.view = new ConsoleView();
+        this.articles = new ArrayList<>();
+        this.strategies = new ArrayList<>();
+
+        // Order matters: run() addresses these by index (menu entries 1-4).
+        strategies.add(new JjwxcStrategy());
+        strategies.add(new BaiduStrategy());
+        strategies.add(new HttpBinStrategy());
+        strategies.add(new BingStrategy());
+    }
+
+    public ConsoleView getView() {
+        return view;
+    }
+
+    /**
+     * Returns the live internal article list (not a copy), so changes made through
+     * it are visible to this controller.
+     */
+    public List<Article> getArticles() {
+        return articles;
+    }
+
+    public void addArticle(Article article) {
+        articles.add(article);
+    }
+
+    public void clearArticles() {
+        articles.clear();
+    }
+
+    /** Returns the names of the registered strategies, in registration order. */
+    public String[] getStrategyNames() {
+        String[] names = new String[strategies.size()];
+        for (int i = 0; i < strategies.size(); i++) {
+            names[i] = strategies.get(i).getName();
+        }
+        return names;
+    }
+
+    /**
+     * Shows the welcome and help screens, then handles one command per input read
+     * from the view until {@code exit} or {@code quit} is entered. Empty input is
+     * skipped and unrecognised input is reported through the view.
+     */
+    public void run() {
+        view.showWelcome();
+        view.showHelp();
+
+        boolean running = true;
+        while (running) {
+            String input = view.getInput();
+
+            if (input.isEmpty()) {
+                continue;
+            }
+
+            switch (input) {
+                case "1":
+                case "jjwxc":
+                    executeCommand(new CrawlCommand(strategies.get(0), this));
+                    break;
+
+                case "2":
+                case "baidu":
+                    executeCommand(new CrawlCommand(strategies.get(1), this));
+                    break;
+
+                case "3":
+                case "httpbin":
+                    executeCommand(new CrawlCommand(strategies.get(2), this));
+                    break;
+
+                case "4":
+                case "bing":
+                    executeCommand(new CrawlCommand(strategies.get(3), this));
+                    break;
+
+                case "all":
+                    crawlAll();
+                    break;
+
+                case "list":
+                    executeCommand(new ListCommand(this));
+                    break;
+
+                case "save":
+                    executeCommand(new SaveCommand(this));
+                    break;
+
+                case "help":
+                    executeCommand(new HelpCommand(this));
+                    break;
+
+                case "exit":
+                case "quit":
+                    running = false;
+                    view.showGoodbye();
+                    break;
+
+                default:
+                    view.showError("未知命令: " + input + ",输入 help 查看帮助");
+            }
+        }
+    }
+
+    /** Single dispatch point for every command the loop creates. */
+    private void executeCommand(Command command) {
+        command.execute();
+    }
+
+    /** Runs a crawl command for every registered strategy, then reports the article total. */
+    private void crawlAll() {
+        view.showMessage("\n开始爬取所有网站...\n");
+        for (CrawlStrategy strategy : strategies) {
+            executeCommand(new CrawlCommand(strategy, this));
+        }
+        view.showMessage("\n全部爬取完成!共 " + articles.size() + " 条数据");
+    }
+}
diff --git a/project/NetworkException.java b/project/NetworkException.java
new file mode 100644
index 0000000..00887af
--- /dev/null
+++ b/project/NetworkException.java
@@ -0,0 +1,37 @@
+package exception;
+
+/** A {@link SpiderException} for network-level failures, tagged with an {@link ErrorType}. */
+public class NetworkException extends SpiderException {
+    /** Category of network failure. */
+    public enum ErrorType {
+        CONNECTION_TIMEOUT,
+        CONNECTION_REFUSED,
+        HOST_NOT_FOUND,
+        RESPONSE_ERROR
+    }
+
+    private final ErrorType errorType;
+
+    /**
+     * @param message   description of the failure
+     * @param errorType category of the failure
+     */
+    public NetworkException(String message, ErrorType errorType) {
+        super(message);
+        this.errorType = errorType;
+    }
+
+    /**
+     * @param message   description of the failure
+     * @param errorType category of the failure
+     * @param cause     the underlying exception, kept as this exception's cause
+     */
+    public NetworkException(String message, ErrorType errorType, Throwable cause) {
+        super(message, cause);
+        this.errorType = errorType;
+    }
+
+    public ErrorType getErrorType() {
+        return errorType;
+    }
+}
diff --git a/project/ParseException.java b/project/ParseException.java
new file mode 100644
index 0000000..28f6391
--- /dev/null
+++ b/project/ParseException.java
@@ -0,0 +1,39 @@
+package exception;
+
+/** A {@link SpiderException} for parsing failures, tagged with an {@link ErrorType}. */
+public class ParseException extends SpiderException {
+    /** Category of parse failure. */
+    public enum ErrorType {
+        INVALID_HTML,
+        TAG_NOT_FOUND,
+        REGEX_ERROR
+    }
+
+    private final ErrorType errorType;
+
+    /**
+     * @param message   description of the failure
+     * @param errorType category of the failure
+     */
+    public ParseException(String message, ErrorType errorType) {
+        super(message);
+        this.errorType = errorType;
+    }
+
+    /**
+     * Mirrors the cause-preserving constructor of {@code NetworkException}, so a
+     * wrapped low-level exception is not lost.
+     *
+     * @param message   description of the failure
+     * @param errorType category of the failure
+     * @param cause     the underlying exception, kept as this exception's cause
+     */
+    public ParseException(String message, ErrorType errorType, Throwable cause) {
+        super(message, cause);
+        this.errorType = errorType;
+    }
+
+    public ErrorType getErrorType() {
+        return errorType;
+    }
+}
diff --git a/project/SpiderException.java b/project/SpiderException.java
new file mode 100644
index 0000000..9dac41a
--- /dev/null
+++ b/project/SpiderException.java
@@ -0,0 +1,17 @@
+package exception;
+
+/** Base checked exception for the crawler's {@code exception} package. */
+public class SpiderException extends Exception {
+    /** @param message description of the failure */
+    public SpiderException(String message) {
+        super(message);
+    }
+
+    /**
+     * @param message description of the failure
+     * @param cause   the underlying exception, kept as this exception's cause
+     */
+    public SpiderException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}