From 03f92a0475ea31735617d73517a257ed43ed024f Mon Sep 17 00:00:00 2001 From: Hanminxi <1772454398@qq.com> Date: Sun, 31 May 2026 00:03:14 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?= =?UTF-8?q?=20'project'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- project/Article.java | 58 +++++++++++++++ project/CrawlerController.class | Bin 0 -> 4076 bytes project/CrawlerController.java | 122 ++++++++++++++++++++++++++++++++ project/NetworkException.java | 26 +++++++ project/ParseException.java | 20 ++++++ project/SpiderException.java | 11 +++ 6 files changed, 237 insertions(+) create mode 100644 project/Article.java create mode 100644 project/CrawlerController.class create mode 100644 project/CrawlerController.java create mode 100644 project/NetworkException.java create mode 100644 project/ParseException.java create mode 100644 project/SpiderException.java diff --git a/project/Article.java b/project/Article.java new file mode 100644 index 0000000..961428c --- /dev/null +++ b/project/Article.java @@ -0,0 +1,58 @@ +package model; + +public class Article { + private String title; + private String content; + private String url; + private String source; + + public Article() { + } + + public Article(String title, String content, String url, String source) { + this.title = title; + this.content = content; + this.url = url; + this.source = source; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public String getSource() { + return source; + } + + public void setSource(String source) { + this.source = source; + } + + @Override + public String toString() { + return "【" + source + "】" + title + "\n" + + "链接:" + url + "\n" + + "内容:" + (content != null && content.length() > 100 ? + content.substring(0, 100) + "..." : content); + } +} diff --git a/project/CrawlerController.class b/project/CrawlerController.class new file mode 100644 index 0000000000000000000000000000000000000000..368269819c0d94246108e4427d1c8a50bd0f3bb9 GIT binary patch literal 4076 zcmaJ^YjjlA75?s>BzKrx$S~pkG7ugKWC#%6K?PC>feB!P;i0tLOs>g<$xN8JLlRmF zsAv!@qD7=46bc4KDO3rBhgEA^t8Hypf3`o`RTJ9P)#Z==XqSc3Z{Ita3=DPFI`^Et z&p!LT_c`;)f3Lm=UoEcroe5)tWYp)#;H=E|A5ez zi#8*4j1)Rg+=>~N*$^`08b&J^mW10&S2ne_1#6t$b`M6um5XtBKtq9!@tB}s(A{k5 z1f#xQ!O$W{UYeuIi#KVQn9h8u5vou0GcOcBPf;);C4GfuwJZ%q`bj1W$yBmWk#M`A zf4|t#&#z;;1S^}i8!^#%hK`w-MX$y~J4`!LRT^gB6;5&?FCm_*V;<&{s=;J8 zXBwknVY(@H0Q>aF#R4qS@gSHJ8hNZXnkjQgQ92t(TDc^~Vx4(LZFA#|(bn~5I2dg< z1z(}#Yl3$ReuWus5#(|mmBQCezEzQy1cP3NDjh4aio!ziWz8*CyKUem*{RmCTI_Z= z8u7-8XuaviS_R5X+55VV8VO;Jxh-LY<8C}kk2Oia-B?eeS#E43rcR>d#$&{kxv`m2 z8X1Be>n4sq2;Gu+t9mDv{JZZlWg z5+O@Nlpdnx%|@i&DTfLN@F0S1QhYH53yK2qWG*^1eI73ld{r@(7uR~2?Us%NwlihT zHZz#8%%lK`438%zJ9F|xn~run0zs+ICE>7!Z?W|Hh^3BFTJFG3(egVA#tXZ?Le4C0 z^473BjZF=`lVY=%Iot_Qf+2T(&KKttYECC=lbrR&iyw#-s(B{a__#~y~p<5ym9E}xueskPxpv< zujn``lI?=1Hsf)l!IZjuRmU;B#$*mzCi|5rG_0?v>`qvf#Fyvd1b(35q>fWk7ry=` z)kh*}Y{XZaZI;lU)^P@}GZ!O7+*_(npD*`8!C7(e8yP(z=?VI(Ce#oytVE3YKmMMM zyC~4i`jd8T_vG@2$`wq_po!lX+^fgd*nZM01@PTW8dTiIE;7w~V8B#(hCY zk{~ITIpDj?XZB^FpfCe@KYcxH?ny?%=N^lSYZVN%TW-HFFGbTma=WxkAEf-DMa7S$ zHPn>z2G&?caBH>EVwkhp!dX?o5%L|TshzK z+#7EERKX-q&)&0td+Mx{vpwez^me@P#q~WsdoLC!1--AG>^*#;_xbaAIry0eKgTaL zd@PCbO9d0{%(HL7P-J^_t6A!p>lEj5BgiafyM86TER%$ezAM-dd4W$1M59|1E#;Yh zOT*lCa9w+gc^~q0hb9v{z5bIdD%iwpSY^yQy($vn%Oq^X_JnlCbb-zLQ=o>VwDO}z|d z>NRK^FJr(Zc)0X(>AQsd{^jtg?_&5hjN-E?75I)CFTI5 z`MroA(_}F%oizVm(g%|Md51^d#U*~_ND+^UH+6E)&(K$1v_K&}K9TfYDIZQIB;q}C zsv9MuM4ye$nJKiih%}dRg2bQ7yXy^TB#SCua@g`%(bDeVgI z6wj&qDA3b|hn;C@)}qlz)m8^sR$bDC6-SZNdFLM`tJS$#{w@U0(XzGVJcwf^M?1$X zjw2jp9G5s|bNn7k`$^9K#Po@aIsXNv-Q#$+3^P7k!nqK$E*$6V!kh_fI9FiK2Rk|E zXTNxUH0fW@(XP~S?pQgvau?^1{Ju3^c;qO4=bD?J8$7HDc+l^A+y-U9iSo02?n>92` z3KjU5{RN{4CciU+oMnjpGG0xN%bew3)~`AK$nkd>EYrwF$sSiC|3U z#$HEiFFRu|oZ8FD*jqcb3$2}~BFT=8l=N*|(n5l}1l(hT(+EB(;L|oZ#LNU z9h8#3;NJO@oZ?{TVHdfIPI2gwes7MV9Kd1+O3-uL>DN~Jv<<7MfwTzTAGeUN;l!7i jKs9a articles; + private List strategies; + + public CrawlerController() { + this.view = new ConsoleView(); + this.articles = new ArrayList<>(); + this.strategies = new ArrayList<>(); + + strategies.add(new JjwxcStrategy()); + strategies.add(new BaiduStrategy()); + strategies.add(new HttpBinStrategy()); + strategies.add(new BingStrategy()); + } + + public ConsoleView getView() { + return view; + } + + public List
getArticles() { + return articles; + } + + public void addArticle(Article article) { + articles.add(article); + } + + public void clearArticles() { + articles.clear(); + } + + public String[] getStrategyNames() { + String[] names = new String[strategies.size()]; + for (int i = 0; i < strategies.size(); i++) { + names[i] = strategies.get(i).getName(); + } + return names; + } + + public void run() { + view.showWelcome(); + view.showHelp(); + + boolean running = true; + while (running) { + String input = view.getInput(); + + if (input.isEmpty()) { + continue; + } + + switch (input) { + case "1": + case "jjwxc": + executeCommand(new CrawlCommand(strategies.get(0), this)); + break; + + case "2": + case "baidu": + executeCommand(new CrawlCommand(strategies.get(1), this)); + break; + + case "3": + case "httpbin": + executeCommand(new CrawlCommand(strategies.get(2), this)); + break; + + case "4": + case "bing": + executeCommand(new CrawlCommand(strategies.get(3), this)); + break; + + case "all": + crawlAll(); + break; + + case "list": + executeCommand(new ListCommand(this)); + break; + + case "save": + executeCommand(new SaveCommand(this)); + break; + + case "help": + executeCommand(new HelpCommand(this)); + break; + + case "exit": + case "quit": + running = false; + view.showGoodbye(); + break; + + default: + view.showError("未知命令: " + input + ",输入 help 查看帮助"); + } + } + } + + private void executeCommand(Command command) { + command.execute(); + } + + private void crawlAll() { + view.showMessage("\n开始爬取所有网站...\n"); + for (CrawlStrategy strategy : strategies) { + executeCommand(new CrawlCommand(strategy, this)); + } + view.showMessage("\n全部爬取完成!共 " + articles.size() + " 条数据"); + } +} diff --git a/project/NetworkException.java b/project/NetworkException.java new file mode 100644 index 0000000..00887af --- /dev/null +++ b/project/NetworkException.java @@ -0,0 +1,26 @@ +package exception; + +public class NetworkException extends SpiderException { + public enum ErrorType { + CONNECTION_TIMEOUT, + CONNECTION_REFUSED, + HOST_NOT_FOUND, + RESPONSE_ERROR + } + + private final ErrorType errorType; + + public NetworkException(String message, ErrorType errorType) { + super(message); + this.errorType = errorType; + } + + public NetworkException(String message, ErrorType errorType, Throwable cause) { + super(message, cause); + this.errorType = errorType; + } + + public ErrorType getErrorType() { + return errorType; + } +} diff --git a/project/ParseException.java b/project/ParseException.java new file mode 100644 index 0000000..28f6391 --- /dev/null +++ b/project/ParseException.java @@ -0,0 +1,20 @@ +package exception; + +public class ParseException extends SpiderException { + public enum ErrorType { + INVALID_HTML, + TAG_NOT_FOUND, + REGEX_ERROR + } + + private final ErrorType errorType; + + public ParseException(String message, ErrorType errorType) { + super(message); + this.errorType = errorType; + } + + public ErrorType getErrorType() { + return errorType; + } +} diff --git a/project/SpiderException.java b/project/SpiderException.java new file mode 100644 index 0000000..9dac41a --- /dev/null +++ b/project/SpiderException.java @@ -0,0 +1,11 @@ +package exception; + +public class SpiderException extends Exception { + public SpiderException(String message) { + super(message); + } + + public SpiderException(String message, Throwable cause) { + super(message, cause); + } +}