5 changed files with 140 additions and 0 deletions
@ -0,0 +1,32 @@ |
|||
package strategy; |
|||
|
|||
import model.Article; |
|||
import util.HttpUtil; |
|||
import exception.SpiderException; |
|||
|
|||
public class BaiduStrategy implements CrawlStrategy { |
|||
@Override |
|||
public String getName() { |
|||
return "百度"; |
|||
} |
|||
|
|||
@Override |
|||
public String getUrl() { |
|||
return "https://www.baidu.com/"; |
|||
} |
|||
|
|||
@Override |
|||
public Article crawl() throws SpiderException { |
|||
String html = HttpUtil.get(getUrl(), "UTF-8"); |
|||
|
|||
String title = HttpUtil.extractTagSafe(html, "<title>", "</title>"); |
|||
|
|||
Article article = new Article(); |
|||
article.setTitle(title); |
|||
article.setContent("百度首页"); |
|||
article.setUrl(getUrl()); |
|||
article.setSource(getName()); |
|||
|
|||
return article; |
|||
} |
|||
} |
|||
@ -0,0 +1,32 @@ |
|||
package strategy; |
|||
|
|||
import model.Article; |
|||
import util.HttpUtil; |
|||
import exception.SpiderException; |
|||
|
|||
public class BingStrategy implements CrawlStrategy { |
|||
@Override |
|||
public String getName() { |
|||
return "必应搜索"; |
|||
} |
|||
|
|||
@Override |
|||
public String getUrl() { |
|||
return "https://cn.bing.com/"; |
|||
} |
|||
|
|||
@Override |
|||
public Article crawl() throws SpiderException { |
|||
String html = HttpUtil.get(getUrl(), "UTF-8"); |
|||
|
|||
String title = HttpUtil.extractTagSafe(html, "<title>", "</title>"); |
|||
|
|||
Article article = new Article(); |
|||
article.setTitle(title); |
|||
article.setContent("微软必应搜索引擎首页"); |
|||
article.setUrl(getUrl()); |
|||
article.setSource(getName()); |
|||
|
|||
return article; |
|||
} |
|||
} |
|||
@ -0,0 +1,10 @@ |
|||
package strategy; |
|||
|
|||
import model.Article; |
|||
import exception.SpiderException; |
|||
|
|||
public interface CrawlStrategy { |
|||
String getName(); |
|||
String getUrl(); |
|||
Article crawl() throws SpiderException; |
|||
} |
|||
@ -0,0 +1,33 @@ |
|||
package strategy; |
|||
|
|||
import model.Article; |
|||
import util.HttpUtil; |
|||
import exception.SpiderException; |
|||
|
|||
public class HttpBinStrategy implements CrawlStrategy { |
|||
@Override |
|||
public String getName() { |
|||
return "HttpBin"; |
|||
} |
|||
|
|||
@Override |
|||
public String getUrl() { |
|||
return "https://httpbin.org/html"; |
|||
} |
|||
|
|||
@Override |
|||
public Article crawl() throws SpiderException { |
|||
String html = HttpUtil.get(getUrl(), "UTF-8"); |
|||
|
|||
String title = HttpUtil.extractTagSafe(html, "<h1>", "</h1>"); |
|||
String content = HttpUtil.extractTagSafe(html, "<p>", "</p>"); |
|||
|
|||
Article article = new Article(); |
|||
article.setTitle(title); |
|||
article.setContent(content); |
|||
article.setUrl(getUrl()); |
|||
article.setSource(getName()); |
|||
|
|||
return article; |
|||
} |
|||
} |
|||
@ -0,0 +1,33 @@ |
|||
package strategy; |
|||
|
|||
import model.Article; |
|||
import util.HttpUtil; |
|||
import exception.SpiderException; |
|||
|
|||
public class JjwxcStrategy implements CrawlStrategy { |
|||
@Override |
|||
public String getName() { |
|||
return "晋江文学城"; |
|||
} |
|||
|
|||
@Override |
|||
public String getUrl() { |
|||
return "https://www.jjwxc.net/"; |
|||
} |
|||
|
|||
@Override |
|||
public Article crawl() throws SpiderException { |
|||
String html = HttpUtil.get(getUrl(), "GB18030"); |
|||
|
|||
String title = HttpUtil.extractTagSafe(html, "<title>", "</title>"); |
|||
String description = "晋江文学城(www.jjwxc.net)创立于2003年8月,是具备相当规模女性网络文学原创基地"; |
|||
|
|||
Article article = new Article(); |
|||
article.setTitle(title); |
|||
article.setContent(description); |
|||
article.setUrl(getUrl()); |
|||
article.setSource(getName()); |
|||
|
|||
return article; |
|||
} |
|||
} |
|||
Loading…
Reference in new issue