Browse Source

上传文件至 ''

main
Hanminxi 3 weeks ago
parent
commit
1066f75e91
  1. 32
      BaiduStrategy.java
  2. 32
      BingStrategy.java
  3. 10
      CrawlStrategy.java
  4. 33
      HttpBinStrategy.java
  5. 33
      JjwxcStrategy.java

32
BaiduStrategy.java

@ -0,0 +1,32 @@
package strategy;
import model.Article;
import util.HttpUtil;
import exception.SpiderException;
/**
 * {@link CrawlStrategy} for the Baidu home page.
 *
 * <p>{@link #crawl()} fetches {@link #getUrl()} through {@code HttpUtil.get}, pulls the text
 * between the {@code <title>} tags with {@code HttpUtil.extractTagSafe}, and returns it in an
 * {@link Article} whose content is a fixed label.
 */
public class BaiduStrategy implements CrawlStrategy {

    /** Value returned by {@link #getName()} and stored as the article source. */
    private static final String SITE_NAME = "百度";

    /** Address returned by {@link #getUrl()}. */
    private static final String HOME_URL = "https://www.baidu.com/";

    /** Second argument handed to {@code HttpUtil.get}; presumably the response charset. */
    private static final String PAGE_ENCODING = "UTF-8";

    /** Fixed content text; it is not taken from the fetched page. */
    private static final String FIXED_CONTENT = "百度首页";

    /** Returns the display name of this source. */
    @Override
    public String getName() {
        return SITE_NAME;
    }

    /** Returns the address that {@link #crawl()} fetches. */
    @Override
    public String getUrl() {
        return HOME_URL;
    }

    /**
     * Fetches the page and builds an {@link Article} from it.
     *
     * @return a new article with title, content, URL and source set
     * @throws SpiderException propagated unchanged from the calls made here
     */
    @Override
    public Article crawl() throws SpiderException {
        final String page = HttpUtil.get(getUrl(), PAGE_ENCODING);
        final String pageTitle = HttpUtil.extractTagSafe(page, "<title>", "</title>");

        final Article result = new Article();
        result.setTitle(pageTitle);
        result.setContent(FIXED_CONTENT);
        result.setUrl(getUrl());
        result.setSource(getName());
        return result;
    }
}

32
BingStrategy.java

@ -0,0 +1,32 @@
package strategy;
import model.Article;
import util.HttpUtil;
import exception.SpiderException;
/**
 * {@link CrawlStrategy} for the Bing (cn.bing.com) home page.
 *
 * <p>{@link #crawl()} fetches {@link #getUrl()} through {@code HttpUtil.get}, pulls the text
 * between the {@code <title>} tags with {@code HttpUtil.extractTagSafe}, and returns it in an
 * {@link Article} whose content is a fixed label.
 */
public class BingStrategy implements CrawlStrategy {

    /** Value returned by {@link #getName()} and stored as the article source. */
    private static final String SITE_NAME = "必应搜索";

    /** Address returned by {@link #getUrl()}. */
    private static final String HOME_URL = "https://cn.bing.com/";

    /** Second argument handed to {@code HttpUtil.get}; presumably the response charset. */
    private static final String PAGE_ENCODING = "UTF-8";

    /** Fixed content text; it is not taken from the fetched page. */
    private static final String FIXED_CONTENT = "微软必应搜索引擎首页";

    /** Returns the display name of this source. */
    @Override
    public String getName() {
        return SITE_NAME;
    }

    /** Returns the address that {@link #crawl()} fetches. */
    @Override
    public String getUrl() {
        return HOME_URL;
    }

    /**
     * Fetches the page and builds an {@link Article} from it.
     *
     * @return a new article with title, content, URL and source set
     * @throws SpiderException propagated unchanged from the calls made here
     */
    @Override
    public Article crawl() throws SpiderException {
        final String page = HttpUtil.get(getUrl(), PAGE_ENCODING);
        final String pageTitle = HttpUtil.extractTagSafe(page, "<title>", "</title>");

        final Article result = new Article();
        result.setTitle(pageTitle);
        result.setContent(FIXED_CONTENT);
        result.setUrl(getUrl());
        result.setSource(getName());
        return result;
    }
}

10
CrawlStrategy.java

@ -0,0 +1,10 @@
package strategy;
import model.Article;
import exception.SpiderException;
/**
 * Contract for one site-specific crawler: it names a source, names the address to fetch, and
 * produces an {@link Article} from that source.
 */
public interface CrawlStrategy {
/** Returns the display name of the source this strategy crawls. */
String getName();
/** Returns the address of the page this strategy crawls. */
String getUrl();
/**
 * Runs the crawl and returns the resulting article.
 *
 * @return the article produced by this strategy
 * @throws SpiderException if the crawl fails; the exact conditions are defined by the implementation
 */
Article crawl() throws SpiderException;
}

33
HttpBinStrategy.java

@ -0,0 +1,33 @@
package strategy;
import model.Article;
import util.HttpUtil;
import exception.SpiderException;
/**
 * {@link CrawlStrategy} for the httpbin.org sample HTML page.
 *
 * <p>{@link #crawl()} fetches {@link #getUrl()} through {@code HttpUtil.get}, then uses
 * {@code HttpUtil.extractTagSafe} twice: the {@code <h1>} text becomes the article title and
 * the {@code <p>} text becomes the article content.
 */
public class HttpBinStrategy implements CrawlStrategy {

    /** Value returned by {@link #getName()} and stored as the article source. */
    private static final String SITE_NAME = "HttpBin";

    /** Address returned by {@link #getUrl()}. */
    private static final String PAGE_URL = "https://httpbin.org/html";

    /** Second argument handed to {@code HttpUtil.get}; presumably the response charset. */
    private static final String PAGE_ENCODING = "UTF-8";

    /** Delimiters around the text used as the title. */
    private static final String HEADING_OPEN = "<h1>";
    private static final String HEADING_CLOSE = "</h1>";

    /** Delimiters around the text used as the content. */
    private static final String PARAGRAPH_OPEN = "<p>";
    private static final String PARAGRAPH_CLOSE = "</p>";

    /** Returns the display name of this source. */
    @Override
    public String getName() {
        return SITE_NAME;
    }

    /** Returns the address that {@link #crawl()} fetches. */
    @Override
    public String getUrl() {
        return PAGE_URL;
    }

    /**
     * Fetches the page and builds an {@link Article} from its heading and paragraph text.
     *
     * @return a new article with title, content, URL and source set
     * @throws SpiderException propagated unchanged from the calls made here
     */
    @Override
    public Article crawl() throws SpiderException {
        final String page = HttpUtil.get(getUrl(), PAGE_ENCODING);
        final String heading = HttpUtil.extractTagSafe(page, HEADING_OPEN, HEADING_CLOSE);
        final String paragraph = HttpUtil.extractTagSafe(page, PARAGRAPH_OPEN, PARAGRAPH_CLOSE);

        final Article result = new Article();
        result.setTitle(heading);
        result.setContent(paragraph);
        result.setUrl(getUrl());
        result.setSource(getName());
        return result;
    }
}

33
JjwxcStrategy.java

@ -0,0 +1,33 @@
package strategy;
import model.Article;
import util.HttpUtil;
import exception.SpiderException;
/**
 * {@link CrawlStrategy} for the Jinjiang Literature City (jjwxc.net) home page.
 *
 * <p>{@link #crawl()} fetches {@link #getUrl()} through {@code HttpUtil.get} with
 * {@code "GB18030"} as the second argument, pulls the text between the {@code <title>} tags
 * with {@code HttpUtil.extractTagSafe}, and returns it in an {@link Article} whose content is
 * a fixed site description.
 */
public class JjwxcStrategy implements CrawlStrategy {

    /** Value returned by {@link #getName()} and stored as the article source. */
    private static final String SITE_NAME = "晋江文学城";

    /** Address returned by {@link #getUrl()}. */
    private static final String HOME_URL = "https://www.jjwxc.net/";

    /** Second argument handed to {@code HttpUtil.get}; presumably the response charset. */
    private static final String PAGE_ENCODING = "GB18030";

    /** Fixed site description used as the content; it is not taken from the fetched page. */
    private static final String SITE_DESCRIPTION =
            "晋江文学城(www.jjwxc.net)创立于2003年8月,是具备相当规模女性网络文学原创基地";

    /** Returns the display name of this source. */
    @Override
    public String getName() {
        return SITE_NAME;
    }

    /** Returns the address that {@link #crawl()} fetches. */
    @Override
    public String getUrl() {
        return HOME_URL;
    }

    /**
     * Fetches the page and builds an {@link Article} from it.
     *
     * @return a new article with title, content, URL and source set
     * @throws SpiderException propagated unchanged from the calls made here
     */
    @Override
    public Article crawl() throws SpiderException {
        final String page = HttpUtil.get(getUrl(), PAGE_ENCODING);
        final String pageTitle = HttpUtil.extractTagSafe(page, "<title>", "</title>");

        final Article result = new Article();
        result.setTitle(pageTitle);
        result.setContent(SITE_DESCRIPTION);
        result.setUrl(getUrl());
        result.setSource(getName());
        return result;
    }
}
Loading…
Cancel
Save