diff --git a/project/crawler/BaseCrawler.java b/project/crawler/BaseCrawler.java
new file mode 100644
index 0000000..e624746
--- /dev/null
+++ b/project/crawler/BaseCrawler.java
@@ -0,0 +1,142 @@
+package com.crawler.crawler;
+
+import com.crawler.exception.HttpRequestException;
+import com.crawler.exception.TimeoutException;
+import com.crawler.model.CrawlerConfig;
+import com.crawler.model.CrawlerData;
+
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.net.http.HttpTimeoutException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Base class for crawlers: stores the configuration, downloads the target page
+ * over HTTP and hands the HTML to {@link #parseHtml(String)}.
+ */
+public abstract class BaseCrawler implements Crawler {
+    /** Value of the {@code Accept} header sent with page requests. */
+    protected static final String ACCEPT_HTML =
+            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
+
+    protected CrawlerConfig config;
+
+    @Override
+    public void setConfig(CrawlerConfig config) {
+        this.config = config;
+    }
+
+    @Override
+    public CrawlerConfig getConfig() {
+        return config;
+    }
+
+    /**
+     * Downloads {@code urlStr} with a GET request, using the timeout and
+     * User-Agent from the current configuration.
+     *
+     * @param urlStr absolute URL of the page to download
+     * @return the response body as text
+     * @throws HttpRequestException if the status is not 2xx or sending fails
+     * @throws TimeoutException if the HTTP client reports a timeout
+     */
+    protected String fetchHtml(String urlStr) throws HttpRequestException, TimeoutException {
+        Duration timeout = Duration.ofMillis(config.getTimeout());
+
+        HttpClient client = HttpClient.newBuilder()
+                .connectTimeout(timeout)
+                .followRedirects(HttpClient.Redirect.NORMAL)
+                .build();
+
+        HttpRequest request = HttpRequest.newBuilder()
+                .uri(URI.create(urlStr))
+                // connectTimeout only limits connection set-up; the request
+                // timeout also bounds the wait for the response.
+                .timeout(timeout)
+                .header("User-Agent", config.getUserAgent())
+                .header("Accept", ACCEPT_HTML)
+                .GET()
+                .build();
+
+        return sendAndReadBody(client, request);
+    }
+
+    /**
+     * Sends {@code request} and returns the body of a 2xx response. Status
+     * checking and exception translation live here so that overrides of
+     * {@link #fetchHtml(String)} can reuse them.
+     *
+     * @param client the client to send with
+     * @param request the request to send
+     * @return the response body as text
+     * @throws HttpRequestException if the status is not 2xx, the thread is
+     *         interrupted, or sending fails for any other reason
+     * @throws TimeoutException if the HTTP client reports a timeout
+     */
+    protected String sendAndReadBody(HttpClient client, HttpRequest request)
+            throws HttpRequestException, TimeoutException {
+        HttpResponse<String> response;
+        try {
+            response = client.send(request, HttpResponse.BodyHandlers.ofString());
+        } catch (HttpTimeoutException e) {
+            throw new TimeoutException("连接超时", e);
+        } catch (InterruptedException e) {
+            // Restore the flag so code further up can still see the interrupt.
+            Thread.currentThread().interrupt();
+            throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
+        } catch (Exception e) {
+            throw new HttpRequestException("HTTP请求异常: " + e.getMessage(), 0, e);
+        }
+
+        int status = response.statusCode();
+        if (status < 200 || status >= 300) {
+            throw new HttpRequestException("HTTP请求失败", status);
+        }
+        return response.body();
+    }
+
+    /**
+     * Extracts crawl records from a downloaded page.
+     *
+     * @param html the page body returned by {@link #fetchHtml(String)}
+     * @return the extracted records
+     */
+    protected abstract List<CrawlerData> parseHtml(String html);
+
+    /**
+     * Fetches the configured target URL and parses it. Records that come back
+     * without a source are labelled with {@link #getCrawlerName()}.
+     *
+     * @return the parsed records; an empty list when no configuration or
+     *         target URL is set, or when the parser returns {@code null}
+     * @throws RuntimeException wrapping any exception raised while fetching,
+     *         parsing or labelling
+     */
+    @Override
+    public List<CrawlerData> crawl() {
+        if (config == null || config.getTargetUrl() == null) {
+            return new ArrayList<>();
+        }
+
+        try {
+            List<CrawlerData> results = parseHtml(fetchHtml(config.getTargetUrl()));
+            if (results == null) {
+                // Treat "parser returned null" as "nothing found" rather than
+                // failing with a NullPointerException in the loop below.
+                return new ArrayList<>();
+            }
+            for (CrawlerData data : results) {
+                if (data != null && data.getSource() == null) {
+                    data.setSource(getCrawlerName());
+                }
+            }
+            return results;
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+}
\ No newline at end of file
diff --git a/project/crawler/ChinaWeatherCrawler.java b/project/crawler/ChinaWeatherCrawler.java
new file mode 100644
index 0000000..36a6cbe
--- /dev/null
+++ b/project/crawler/ChinaWeatherCrawler.java
@@ -0,0 +1,175 @@
+package com.crawler.crawler.impl;
+
+import com.crawler.crawler.BaseCrawler;
+import com.crawler.exception.HttpRequestException;
+import com.crawler.exception.TimeoutException;
+import com.crawler.model.CrawlerData;
+
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Crawler for weather.com.cn that collects the links of a page together with
+ * their link text.
+ */
+public class ChinaWeatherCrawler extends BaseCrawler {
+    private static final String BASE_URL = "https://www.weather.com.cn";
+    private static final String HTTP_PREFIX = "http://";
+    private static final String HTTPS_PREFIX = "https://";
+
+    /** Fixed timeout applied to both connecting and waiting for the response. */
+    private static final Duration TIMEOUT = Duration.ofMillis(30000);
+    private static final String USER_AGENT =
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+                    + "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
+
+    /** Maximum number of links returned by one call to {@link #parseHtml(String)}. */
+    private static final int MAX_RESULTS = 30;
+    private static final int MIN_TITLE_LENGTH = 2;
+    private static final int MAX_TITLE_LENGTH = 80;
+
+    /** Anchor href (group 1) followed by 2-80 characters of link text (group 2). */
+    private static final Pattern LINK_PATTERN = Pattern.compile(
+            "<a\\b[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]{2,80})",
+            Pattern.CASE_INSENSITIVE);
+    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
+    private static final Pattern TAG = Pattern.compile("<[^>]+>");
+    private static final Pattern NUMERIC_ENTITY = Pattern.compile("&#[0-9]+;");
+    private static final Pattern NAMED_ENTITY = Pattern.compile("&[a-zA-Z]+;");
+
+    @Override
+    public String getCrawlerName() {
+        return "ChinaWeatherCrawler";
+    }
+
+    /**
+     * Downloads the page over HTTPS with browser-like request headers. Unlike
+     * the base implementation, this uses a fixed timeout and User-Agent instead
+     * of the values from the crawler configuration.
+     *
+     * @param urlStr absolute URL; a leading {@code http://} is upgraded to HTTPS
+     * @return the response body as text
+     * @throws HttpRequestException if the status is not 2xx or sending fails
+     * @throws TimeoutException if the HTTP client reports a timeout
+     */
+    @Override
+    protected String fetchHtml(String urlStr) throws HttpRequestException, TimeoutException {
+        HttpClient client = HttpClient.newBuilder()
+                .connectTimeout(TIMEOUT)
+                .followRedirects(HttpClient.Redirect.NORMAL)
+                .build();
+
+        HttpRequest request = HttpRequest.newBuilder()
+                .uri(URI.create(upgradeToHttps(urlStr)))
+                .timeout(TIMEOUT)
+                .header("User-Agent", USER_AGENT)
+                .header("Accept", ACCEPT_HTML)
+                .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
+                .GET()
+                .build();
+
+        return sendAndReadBody(client, request);
+    }
+
+    /**
+     * Collects up to {@value #MAX_RESULTS} links whose URL and title pass
+     * validation, in document order.
+     *
+     * @param html the page body; {@code null} or empty yields an empty list
+     * @return one record per accepted link, with title, normalised URL and source set
+     */
+    @Override
+    protected List<CrawlerData> parseHtml(String html) {
+        List<CrawlerData> results = new ArrayList<>();
+        if (html == null || html.isEmpty()) {
+            return results;
+        }
+
+        Matcher matcher = LINK_PATTERN.matcher(WHITESPACE.matcher(html).replaceAll(" "));
+        while (results.size() < MAX_RESULTS && matcher.find()) {
+            String url = matcher.group(1);
+            String title = cleanText(matcher.group(2).trim());
+            if (!isValidUrl(url) || !isValidTitle(title)) {
+                continue;
+            }
+
+            CrawlerData data = new CrawlerData();
+            data.setTitle(title);
+            data.setUrl(normalizeUrl(url));
+            data.setSource(getCrawlerName());
+            results.add(data);
+        }
+        return results;
+    }
+
+    /** Upgrades a leading {@code http://} to {@code https://}; other URLs are returned as is. */
+    private static String upgradeToHttps(String url) {
+        // Only the scheme is rewritten: a blanket replace would also alter any
+        // "http://" that appears later, e.g. inside a query parameter.
+        if (url.startsWith(HTTP_PREFIX)) {
+            return HTTPS_PREFIX + url.substring(HTTP_PREFIX.length());
+        }
+        return url;
+    }
+
+    /** Turns protocol-relative, root-relative and relative links into absolute HTTPS URLs. */
+    private String normalizeUrl(String url) {
+        if (url == null) {
+            return null;
+        }
+        if (url.startsWith("//")) {
+            return "https:" + url;
+        }
+        if (url.startsWith("/")) {
+            return BASE_URL + url;
+        }
+        if (!url.startsWith("http")) {
+            return BASE_URL + "/" + url;
+        }
+        return upgradeToHttps(url);
+    }
+
+    /**
+     * Accepts links that mention weather.com.cn or start with {@code /}, and
+     * rejects empty, {@code mailto:} and {@code javascript:} links.
+     */
+    private boolean isValidUrl(String url) {
+        if (url == null || url.isEmpty()) {
+            return false;
+        }
+        if (url.contains("mailto:") || url.contains("javascript:")) {
+            return false;
+        }
+        // NOTE(review): these are substring/prefix checks, so "//other-host/..."
+        // and foreign URLs that merely contain the domain also pass - confirm
+        // whether a host comparison is wanted here.
+        return url.contains("weather.com.cn") || url.startsWith("/");
+    }
+
+    /** Checks that an already cleaned title is between 2 and 80 characters long. */
+    private boolean isValidTitle(String title) {
+        return title != null
+                && title.length() >= MIN_TITLE_LENGTH
+                && title.length() <= MAX_TITLE_LENGTH;
+    }
+
+    /** Strips tags and HTML entities and collapses whitespace; {@code null} stays {@code null}. */
+    private String cleanText(String text) {
+        if (text == null) {
+            return null;
+        }
+        String cleaned = TAG.matcher(text).replaceAll("");
+        // \s does not match U+00A0 and trim() does not strip it, so map
+        // non-breaking spaces to plain spaces before collapsing whitespace.
+        cleaned = cleaned.replace('\u00A0', ' ');
+        cleaned = NUMERIC_ENTITY.matcher(cleaned).replaceAll("");
+        cleaned = NAMED_ENTITY.matcher(cleaned).replaceAll(" ");
+        return WHITESPACE.matcher(cleaned).replaceAll(" ").trim();
+    }
+}
\ No newline at end of file
diff --git a/project/crawler/Crawler.java b/project/crawler/Crawler.java
new file mode 100644
index 0000000..27eb523
--- /dev/null
+++ b/project/crawler/Crawler.java
@@ -0,0 +1,21 @@
+package com.crawler.crawler;
+
+import com.crawler.model.CrawlerConfig;
+import com.crawler.model.CrawlerData;
+
+import java.util.List;
+
+/** A configurable source of crawl records. */
+public interface Crawler {
+    /** Sets the configuration used by this crawler. */
+    void setConfig(CrawlerConfig config);
+
+    /** Returns the current configuration. */
+    CrawlerConfig getConfig();
+
+    /** Returns the name that identifies this crawler implementation. */
+    String getCrawlerName();
+
+    /** Runs the crawl and returns the records it produced. */
+    List<CrawlerData> crawl();
+}
\ No newline at end of file
diff --git a/project/crawler/CrawlerFactory.java b/project/crawler/CrawlerFactory.java
new file mode 100644
index 0000000..8965140
--- /dev/null
+++ b/project/crawler/CrawlerFactory.java
@@ -0,0 +1,128 @@
+package com.crawler.crawler;
+
+import com.crawler.crawler.impl.*;
+import com.crawler.exception.InvalidUrlException;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+/**
+ * Singleton that chooses a crawler implementation for a URL by matching the
+ * URL against an ordered set of site patterns.
+ */
+public class CrawlerFactory {
+    private static final String DEFAULT_CRAWLER = "ExampleCrawler";
+
+    // Built during class initialisation, so concurrent first calls to
+    // getInstance() cannot end up with two factories.
+    private static final CrawlerFactory INSTANCE = new CrawlerFactory();
+
+    /** Crawler name mapped to its URL pattern, iterated in insertion order. */
+    private final Map<String, Pattern> crawlerPatterns = new LinkedHashMap<>();
+
+    private CrawlerFactory() {
+        initPatterns();
+    }
+
+    /** Returns the shared factory instance. */
+    public static CrawlerFactory getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Registers the site patterns. Order matters because the first match wins:
+     * the news.hnu.edu.cn pattern must precede the broader hnu.edu.cn one, and
+     * the catch-all pattern must stay last.
+     */
+    private void initPatterns() {
+        crawlerPatterns.put("MountBladeCrawler",
+                Pattern.compile(".*mountblade\\.com\\.cn.*", Pattern.CASE_INSENSITIVE));
+        crawlerPatterns.put("HunanUniversityNewsCrawler",
+                Pattern.compile(".*news\\.hnu\\.edu\\.cn.*", Pattern.CASE_INSENSITIVE));
+        crawlerPatterns.put("HunanUniversityCrawler",
+                Pattern.compile(".*hnu\\.edu\\.cn.*", Pattern.CASE_INSENSITIVE));
+        crawlerPatterns.put("ChinaWeatherCrawler",
+                Pattern.compile(".*weather\\.com\\.cn.*", Pattern.CASE_INSENSITIVE));
+        crawlerPatterns.put(DEFAULT_CRAWLER,
+                Pattern.compile(".*", Pattern.CASE_INSENSITIVE));
+    }
+
+    /**
+     * Creates the crawler responsible for {@code url}. This method does not
+     * call {@code setConfig} on the returned crawler.
+     *
+     * @param url absolute http(s) URL
+     * @return a new crawler; {@code ExampleCrawler} when no site pattern matches
+     * @throws InvalidUrlException if {@code url} is null, empty, or does not
+     *         start with {@code http://} or {@code https://}
+     */
+    public Crawler createCrawler(String url) {
+        validateUrl(url);
+        return createCrawlerByName(findCrawlerName(url));
+    }
+
+    private void validateUrl(String url) {
+        if (url == null || url.isEmpty()) {
+            throw new InvalidUrlException("URL不能为空", url);
+        }
+        if (!hasHttpScheme(url)) {
+            throw new InvalidUrlException("URL格式无效,必须以http://或https://开头", url);
+        }
+    }
+
+    private static boolean hasHttpScheme(String url) {
+        return url.startsWith("http://") || url.startsWith("https://");
+    }
+
+    /** Returns the name of the first pattern matching {@code url}, or the default name. */
+    private String findCrawlerName(String url) {
+        for (Map.Entry<String, Pattern> entry : crawlerPatterns.entrySet()) {
+            if (entry.getValue().matcher(url).matches()) {
+                return entry.getKey();
+            }
+        }
+        return DEFAULT_CRAWLER;
+    }
+
+    private Crawler createCrawlerByName(String crawlerName) {
+        switch (crawlerName) {
+            case "MountBladeCrawler":
+                return new MountBladeCrawler();
+            case "HunanUniversityNewsCrawler":
+                return new HunanUniversityNewsCrawler();
+            case "HunanUniversityCrawler":
+                return new HunanUniversityCrawler();
+            case "ChinaWeatherCrawler":
+                return new ChinaWeatherCrawler();
+            case DEFAULT_CRAWLER:
+            default:
+                return new ExampleCrawler();
+        }
+    }
+
+    /**
+     * Returns the name of the crawler whose pattern matches {@code url} first,
+     * without validating the URL.
+     *
+     * @param url the URL to look up; null or empty yields {@code "ExampleCrawler"}
+     */
+    public String getCrawlerName(String url) {
+        if (url == null || url.isEmpty()) {
+            return DEFAULT_CRAWLER;
+        }
+        return findCrawlerName(url);
+    }
+
+    /**
+     * Reports whether {@link #createCrawler(String)} accepts {@code url}. Every
+     * http(s) URL is supported because unmatched URLs fall back to
+     * {@code ExampleCrawler}.
+     *
+     * @param url the URL to check; may be null
+     * @return {@code true} if the URL is non-empty and starts with
+     *         {@code http://} or {@code https://}
+     */
+    public boolean isUrlSupported(String url) {
+        return url != null && !url.isEmpty() && hasHttpScheme(url);
+    }
+}
\ No newline at end of file
diff --git a/project/crawler/ExampleCrawler.java b/project/crawler/ExampleCrawler.java
new file mode 100644
index 0000000..647fcb0
--- /dev/null
+++ b/project/crawler/ExampleCrawler.java
@@ -0,0 +1,63 @@
+package com.crawler.crawler.impl;
+
+import com.crawler.crawler.BaseCrawler;
+import com.crawler.model.CrawlerData;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/** Generic crawler that returns the first links found on a page. */
+public class ExampleCrawler extends BaseCrawler {
+    /** Maximum number of links returned by one call to {@link #parseHtml(String)}. */
+    private static final int MAX_RESULTS = 10;
+
+    /** Anchor href (group 1) followed by its link text up to the next tag (group 2). */
+    private static final Pattern LINK_PATTERN = Pattern.compile(
+            "<a\\b[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)",
+            Pattern.CASE_INSENSITIVE);
+    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
+    private static final Pattern GAP_BETWEEN_TAGS = Pattern.compile(">\\s*<");
+    private static final Pattern TAG = Pattern.compile("<[^>]+>");
+
+    @Override
+    public String getCrawlerName() {
+        return "ExampleCrawler";
+    }
+
+    /**
+     * Collects up to {@value #MAX_RESULTS} links in document order. The source
+     * of each record is left unset here.
+     *
+     * @param html the page body; {@code null} or empty yields an empty list
+     * @return one record per link, with title and URL set
+     */
+    @Override
+    protected List<CrawlerData> parseHtml(String html) {
+        List<CrawlerData> results = new ArrayList<>();
+        if (html == null || html.isEmpty()) {
+            return results;
+        }
+
+        String compact = WHITESPACE.matcher(html).replaceAll(" ");
+        compact = GAP_BETWEEN_TAGS.matcher(compact).replaceAll("><");
+
+        Matcher matcher = LINK_PATTERN.matcher(compact);
+        while (results.size() < MAX_RESULTS && matcher.find()) {
+            CrawlerData data = new CrawlerData();
+            data.setTitle(cleanText(matcher.group(2)));
+            data.setUrl(matcher.group(1));
+            results.add(data);
+        }
+        return results;
+    }
+
+    /** Removes tags and surrounding whitespace; {@code null} stays {@code null}. */
+    private String cleanText(String text) {
+        if (text == null) {
+            return null;
+        }
+        return TAG.matcher(text).replaceAll("").trim();
+    }
+}
\ No newline at end of file