diff --git a/BaiduStrategy.java b/BaiduStrategy.java
new file mode 100644
index 0000000..71273ab
--- /dev/null
+++ b/BaiduStrategy.java
@@ -0,0 +1,48 @@
+package strategy;
+
+import model.Article;
+import util.HttpUtil;
+import exception.SpiderException;
+
+/**
+ * Crawl strategy for the Baidu home page.
+ */
+public class BaiduStrategy implements CrawlStrategy {
+    /** Returns the label that {@link #crawl()} records as the article source. */
+    @Override
+    public String getName() {
+        return "百度";
+    }
+
+    /** Returns the address that {@link #crawl()} requests and records on the article. */
+    @Override
+    public String getUrl() {
+        return "https://www.baidu.com/";
+    }
+
+    /**
+     * Requests {@link #getUrl()} through {@code HttpUtil.get}, passing {@code "UTF-8"}, and builds
+     * an {@link Article} from the response.
+     *
+     * The title is whatever {@code HttpUtil.extractTagSafe} returns for the title element; the
+     * content is a fixed description rather than text taken from the page.
+     *
+     * @return a new article with title, content, URL and source set
+     * @throws SpiderException presumably when the request fails; confirm against {@code HttpUtil}
+     */
+    @Override
+    public Article crawl() throws SpiderException {
+        String html = HttpUtil.get(getUrl(), "UTF-8");
+
+        // The delimiters name the element whose text becomes the article title.
+        String title = HttpUtil.extractTagSafe(html, "<title>", "</title>");
+
+        Article article = new Article();
+        article.setTitle(title);
+        article.setContent("百度首页");
+        article.setUrl(getUrl());
+        article.setSource(getName());
+
+        return article;
+    }
+}
diff --git a/BingStrategy.java b/BingStrategy.java
new file mode 100644
index 0000000..1029b7a
--- /dev/null
+++ b/BingStrategy.java
@@ -0,0 +1,48 @@
+package strategy;
+
+import model.Article;
+import util.HttpUtil;
+import exception.SpiderException;
+
+/**
+ * Crawl strategy for the Bing home page served at cn.bing.com.
+ */
+public class BingStrategy implements CrawlStrategy {
+    /** Returns the label that {@link #crawl()} records as the article source. */
+    @Override
+    public String getName() {
+        return "必应搜索";
+    }
+
+    /** Returns the address that {@link #crawl()} requests and records on the article. */
+    @Override
+    public String getUrl() {
+        return "https://cn.bing.com/";
+    }
+
+    /**
+     * Requests {@link #getUrl()} through {@code HttpUtil.get}, passing {@code "UTF-8"}, and builds
+     * an {@link Article} from the response.
+     *
+     * The title is whatever {@code HttpUtil.extractTagSafe} returns for the title element; the
+     * content is a fixed description rather than text taken from the page.
+     *
+     * @return a new article with title, content, URL and source set
+     * @throws SpiderException presumably when the request fails; confirm against {@code HttpUtil}
+     */
+    @Override
+    public Article crawl() throws SpiderException {
+        String html = HttpUtil.get(getUrl(), "UTF-8");
+
+        // The delimiters name the element whose text becomes the article title.
+        String title = HttpUtil.extractTagSafe(html, "<title>", "</title>");
+
+        Article article = new Article();
+        article.setTitle(title);
+        article.setContent("微软必应搜索引擎首页");
+        article.setUrl(getUrl());
+        article.setSource(getName());
+
+        return article;
+    }
+}
diff --git a/CrawlStrategy.java b/CrawlStrategy.java
new file mode 100644
index 0000000..78e1325
--- /dev/null
+++ b/CrawlStrategy.java
@@ -0,0 +1,23 @@
+package strategy;
+
+import model.Article;
+import exception.SpiderException;
+
+/**
+ * Contract for one crawl target: a name, an address, and the crawl that yields an article.
+ */
+public interface CrawlStrategy {
+    /** Returns the name of the crawl target. */
+    String getName();
+
+    /** Returns the address of the crawl target. */
+    String getUrl();
+
+    /**
+     * Produces the article for this target.
+     *
+     * @return the article built for this target
+     * @throws SpiderException if the crawl cannot be completed
+     */
+    Article crawl() throws SpiderException;
+}
diff --git a/HttpBinStrategy.java
b/HttpBinStrategy.java
new file mode 100644
index 0000000..8683ac9
--- /dev/null
+++ b/HttpBinStrategy.java
@@ -0,0 +1,50 @@
+package strategy;
+
+import model.Article;
+import util.HttpUtil;
+import exception.SpiderException;
+
+/**
+ * Crawl strategy for the sample HTML page served by httpbin.org.
+ */
+public class HttpBinStrategy implements CrawlStrategy {
+    /** Returns the label that {@link #crawl()} records as the article source. */
+    @Override
+    public String getName() {
+        return "HttpBin";
+    }
+
+    /** Returns the address that {@link #crawl()} requests and records on the article. */
+    @Override
+    public String getUrl() {
+        return "https://httpbin.org/html";
+    }
+
+    /**
+     * Requests {@link #getUrl()} through {@code HttpUtil.get}, passing {@code "UTF-8"}, and builds
+     * an {@link Article} from the response.
+     *
+     * Title and content are both whatever {@code HttpUtil.extractTagSafe} returns: the title for
+     * the first-level heading, the content for the paragraph element.
+     *
+     * @return a new article with title, content, URL and source set
+     * @throws SpiderException presumably when the request fails; confirm against {@code HttpUtil}
+     */
+    @Override
+    public Article crawl() throws SpiderException {
+        String html = HttpUtil.get(getUrl(), "UTF-8");
+
+        // NOTE(review): heading/paragraph delimiters are assumed from the markup httpbin serves
+        // at this address - confirm against the live page.
+        String title = HttpUtil.extractTagSafe(html, "<h1>", "</h1>");
+        String content = HttpUtil.extractTagSafe(html, "<p>", "</p>");
+
+        Article article = new Article();
+        article.setTitle(title);
+        article.setContent(content);
+        article.setUrl(getUrl());
+        article.setSource(getName());
+
+        return article;
+    }
+}
diff --git a/JjwxcStrategy.java b/JjwxcStrategy.java
new file mode 100644
index 0000000..e6fe34f
--- /dev/null
+++ b/JjwxcStrategy.java
@@ -0,0 +1,49 @@
+package strategy;
+
+import model.Article;
+import util.HttpUtil;
+import exception.SpiderException;
+
+/**
+ * Crawl strategy for the jjwxc.net home page.
+ */
+public class JjwxcStrategy implements CrawlStrategy {
+    /** Returns the label that {@link #crawl()} records as the article source. */
+    @Override
+    public String getName() {
+        return "晋江文学城";
+    }
+
+    /** Returns the address that {@link #crawl()} requests and records on the article. */
+    @Override
+    public String getUrl() {
+        return "https://www.jjwxc.net/";
+    }
+
+    /**
+     * Requests {@link #getUrl()} through {@code HttpUtil.get}, passing {@code "GB18030"}, and
+     * builds an {@link Article} from the response.
+     *
+     * The title is whatever {@code HttpUtil.extractTagSafe} returns for the title element; the
+     * content is a fixed description rather than text taken from the page.
+     *
+     * @return a new article with title, content, URL and source set
+     * @throws SpiderException presumably when the request fails; confirm against {@code HttpUtil}
+     */
+    @Override
+    public Article crawl() throws SpiderException {
+        String html = HttpUtil.get(getUrl(), "GB18030");
+
+        // The delimiters name the element whose text becomes the article title.
+        String title = HttpUtil.extractTagSafe(html, "<title>", "</title>");
+        String description = "晋江文学城(www.jjwxc.net)创立于2003年8月,是具备相当规模女性网络文学原创基地";
+
+        Article article = new Article();
+        article.setTitle(title);
+        article.setContent(description);
+        article.setUrl(getUrl());
+        article.setSource(getName());
+
+        return article;
+    }
+}