You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
103 lines
3.2 KiB
103 lines
3.2 KiB
package com.crawler.crawler;
|
|
|
|
import com.crawler.crawler.impl.*;
|
|
import com.crawler.exception.InvalidUrlException;
|
|
import java.util.LinkedHashMap;
|
|
import java.util.Map;
|
|
import java.util.regex.Pattern;
|
|
|
|
public class CrawlerFactory {
|
|
private static CrawlerFactory instance;
|
|
private Map<String, Pattern> crawlerPatterns;
|
|
|
|
private CrawlerFactory() {
|
|
crawlerPatterns = new LinkedHashMap<>();
|
|
initPatterns();
|
|
}
|
|
|
|
public static CrawlerFactory getInstance() {
|
|
if (instance == null) {
|
|
instance = new CrawlerFactory();
|
|
}
|
|
return instance;
|
|
}
|
|
|
|
private void initPatterns() {
|
|
crawlerPatterns.put("MountBladeCrawler",
|
|
Pattern.compile(".*mountblade\\.com\\.cn.*", Pattern.CASE_INSENSITIVE));
|
|
crawlerPatterns.put("HunanUniversityNewsCrawler",
|
|
Pattern.compile(".*news\\.hnu\\.edu\\.cn.*", Pattern.CASE_INSENSITIVE));
|
|
crawlerPatterns.put("HunanUniversityCrawler",
|
|
Pattern.compile(".*hnu\\.edu\\.cn.*", Pattern.CASE_INSENSITIVE));
|
|
crawlerPatterns.put("ChinaWeatherCrawler",
|
|
Pattern.compile(".*weather\\.com\\.cn.*", Pattern.CASE_INSENSITIVE));
|
|
crawlerPatterns.put("ExampleCrawler",
|
|
Pattern.compile(".*", Pattern.CASE_INSENSITIVE));
|
|
}
|
|
|
|
public Crawler createCrawler(String url) {
|
|
validateUrl(url);
|
|
|
|
for (Map.Entry<String, Pattern> entry : crawlerPatterns.entrySet()) {
|
|
if (entry.getValue().matcher(url).matches()) {
|
|
return createCrawlerByName(entry.getKey());
|
|
}
|
|
}
|
|
|
|
return new ExampleCrawler();
|
|
}
|
|
|
|
private void validateUrl(String url) {
|
|
if (url == null || url.isEmpty()) {
|
|
throw new InvalidUrlException("URL不能为空", url);
|
|
}
|
|
|
|
if (!url.startsWith("http://") && !url.startsWith("https://")) {
|
|
throw new InvalidUrlException("URL格式无效,必须以http://或https://开头", url);
|
|
}
|
|
}
|
|
|
|
private Crawler createCrawlerByName(String crawlerName) {
|
|
switch (crawlerName) {
|
|
case "MountBladeCrawler":
|
|
return new MountBladeCrawler();
|
|
case "HunanUniversityNewsCrawler":
|
|
return new HunanUniversityNewsCrawler();
|
|
case "HunanUniversityCrawler":
|
|
return new HunanUniversityCrawler();
|
|
case "ChinaWeatherCrawler":
|
|
return new ChinaWeatherCrawler();
|
|
case "ExampleCrawler":
|
|
default:
|
|
return new ExampleCrawler();
|
|
}
|
|
}
|
|
|
|
public String getCrawlerName(String url) {
|
|
if (url == null || url.isEmpty()) {
|
|
return "ExampleCrawler";
|
|
}
|
|
|
|
for (Map.Entry<String, Pattern> entry : crawlerPatterns.entrySet()) {
|
|
if (entry.getValue().matcher(url).matches()) {
|
|
return entry.getKey();
|
|
}
|
|
}
|
|
|
|
return "ExampleCrawler";
|
|
}
|
|
|
|
public boolean isUrlSupported(String url) {
|
|
if (url == null || url.isEmpty()) {
|
|
return false;
|
|
}
|
|
|
|
for (Pattern pattern : crawlerPatterns.values()) {
|
|
if (pattern.matcher(url).matches()) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
}
|