diff --git a/project/crawler/HunanUniversityCrawler.java b/project/crawler/HunanUniversityCrawler.java
new file mode 100644
index 0000000..20f9a5f
--- /dev/null
+++ b/project/crawler/HunanUniversityCrawler.java
@@ -0,0 +1,140 @@
+package com.crawler.crawler.impl;
+
+import com.crawler.crawler.BaseCrawler;
+import com.crawler.model.CrawlerData;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Collects up to 30 distinct headline links from the Hunan University home page.
+ *
+ * <p>Links are located with a regular expression, so only anchors whose label is plain
+ * text (no nested tags) are picked up.
+ */
+public class HunanUniversityCrawler extends BaseCrawler {
+    private static final String BASE_URL = "https://www.hnu.edu.cn";
+    private static final int MAX_RESULTS = 30;
+
+    /**
+     * Anchor tag with a quoted href (group 1) followed by 4-80 characters of plain text
+     * (group 2). The lookahead makes the text run up to the next tag, so an over-long label
+     * is rejected instead of being silently cut off at 80 characters.
+     */
+    private static final Pattern NEWS_LINK = Pattern.compile(
+        "<a\\s[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>\\s*([^<]{4,80})(?=<)",
+        Pattern.CASE_INSENSITIVE
+    );
+
+    @Override
+    public String getCrawlerName() {
+        return "HunanUniversityCrawler";
+    }
+
+    /**
+     * Parses the page and returns one entry per distinct, valid link.
+     *
+     * @param html page markup; may be {@code null} or empty
+     * @return the links found, in document order; never {@code null}
+     */
+    @Override
+    protected List<CrawlerData> parseHtml(String html) {
+        List<CrawlerData> results = new ArrayList<>();
+
+        if (html == null || html.isEmpty()) {
+            System.out.println("警告: HTML内容为空");
+            return results;
+        }
+
+        // Collapse whitespace so a tag or label broken across lines still matches.
+        String cleanHtml = html.replaceAll("\\s+", " ");
+        Set<String> seenUrls = new HashSet<>();
+        Matcher matcher = NEWS_LINK.matcher(cleanHtml);
+
+        while (results.size() < MAX_RESULTS && matcher.find()) {
+            String rawUrl = matcher.group(1).trim();
+            String title = matcher.group(2).trim();
+            if (!isValidUrl(rawUrl) || !isValidTitle(title)) {
+                continue;
+            }
+
+            // De-duplicate on the absolute form so "/a" and BASE_URL + "/a" count as one link.
+            String url = normalizeUrl(rawUrl);
+            if (!seenUrls.add(url)) {
+                continue;
+            }
+
+            CrawlerData data = new CrawlerData();
+            data.setTitle(cleanText(title));
+            data.setUrl(url);
+            data.setSource(getCrawlerName());
+            results.add(data);
+        }
+
+        return results;
+    }
+
+    /** Turns a protocol-relative, root-relative or bare relative link into an absolute URL. */
+    private String normalizeUrl(String url) {
+        if (url == null) {
+            return null;
+        }
+        url = url.trim();
+        if (url.startsWith("//")) {
+            return "https:" + url;
+        }
+        if (url.startsWith("/")) {
+            return BASE_URL + url;
+        }
+        if (!url.startsWith("http")) {
+            return BASE_URL + "/" + url;
+        }
+        return url;
+    }
+
+    /**
+     * Accepts root-relative links and links mentioning hnu.edu.cn; drops webscan.360.cn,
+     * mailto: and javascript: links. The domain test is a substring match, not a host check.
+     */
+    private boolean isValidUrl(String url) {
+        if (url == null || url.isEmpty()) {
+            return false;
+        }
+
+        if (url.contains("webscan.360.cn") || url.contains("mailto:") || url.contains("javascript:")) {
+            return false;
+        }
+
+        return url.contains("hnu.edu.cn") || url.startsWith("/");
+    }
+
+    /** Requires 4-80 characters after cleaning and none of the excluded phrases. */
+    private boolean isValidTitle(String title) {
+        if (title == null || title.isEmpty()) {
+            return false;
+        }
+        String cleaned = cleanText(title);
+        return cleaned.length() >= 4
+            && cleaned.length() <= 80
+            && !cleaned.contains("360")
+            && !cleaned.contains("网站安全")
+            && !cleaned.contains("网站测");
+    }
+
+    /** Strips tags and HTML entities from a fragment and collapses runs of whitespace. */
+    private String cleanText(String text) {
+        if (text == null) {
+            return null;
+        }
+        return text.replaceAll("<[^>]+>", "")
+            .replace('\u00A0', ' ')
+            .replaceAll("&#[0-9]+;", "")
+            .replaceAll("&[a-zA-Z]+;", " ")
+            .replaceAll("\\s+", " ")
+            .trim();
+    }
+}
\ No newline at end of file
diff --git a/project/crawler/HunanUniversityNewsCrawler.java b/project/crawler/HunanUniversityNewsCrawler.java
new file mode 100644
index 0000000..6fa1fdb
--- /dev/null
+++ b/project/crawler/HunanUniversityNewsCrawler.java
@@ -0,0 +1,124 @@
+package com.crawler.crawler.impl;
+
+import com.crawler.crawler.BaseCrawler;
+import com.crawler.model.CrawlerData;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/** Collects up to 30 distinct links from the Hunan University news site. */
+public class HunanUniversityNewsCrawler extends BaseCrawler {
+    private static final String BASE_URL = "https://news.hnu.edu.cn";
+    private static final int MAX_RESULTS = 30;
+
+    /** Anchor tag with a quoted href (group 1) followed by its plain-text label (group 2). */
+    private static final Pattern LINK = Pattern.compile(
+        "<a\\s[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)",
+        Pattern.CASE_INSENSITIVE
+    );
+
+    @Override
+    public String getCrawlerName() {
+        return "HunanUniversityNewsCrawler";
+    }
+
+    /**
+     * Parses the page and returns one entry per distinct, valid link.
+     *
+     * @param html page markup; may be {@code null} or empty
+     * @return the links found, in document order; never {@code null}
+     */
+    @Override
+    protected List<CrawlerData> parseHtml(String html) {
+        List<CrawlerData> results = new ArrayList<>();
+        if (html == null || html.isEmpty()) {
+            return results;
+        }
+
+        Set<String> seenUrls = new HashSet<>();
+        Matcher matcher = LINK.matcher(html);
+
+        while (results.size() < MAX_RESULTS && matcher.find()) {
+            String rawUrl = matcher.group(1).trim();
+            String title = matcher.group(2).trim();
+            if (!isValidUrl(rawUrl) || !isValidTitle(title)) {
+                continue;
+            }
+
+            // Check and record the same absolute form so that relative and absolute
+            // spellings of one link are treated as duplicates.
+            String url = normalizeUrl(rawUrl);
+            if (!seenUrls.add(url)) {
+                continue;
+            }
+
+            CrawlerData data = new CrawlerData();
+            data.setTitle(cleanText(title));
+            data.setUrl(url);
+            data.setSource(getCrawlerName());
+            results.add(data);
+        }
+
+        return results;
+    }
+
+    /** Turns a protocol-relative, root-relative or bare relative link into an absolute URL. */
+    private String normalizeUrl(String url) {
+        if (url == null) {
+            return null;
+        }
+        url = url.trim();
+        if (url.startsWith("//")) {
+            return "https:" + url;
+        }
+        if (url.startsWith("/")) {
+            return BASE_URL + url;
+        }
+        if (!url.startsWith("http")) {
+            return BASE_URL + "/" + url;
+        }
+        return url;
+    }
+
+    /** Drops empty, mailto:, javascript: and webscan.360.cn links; accepts everything else. */
+    private boolean isValidUrl(String url) {
+        if (url == null || url.isEmpty()) {
+            return false;
+        }
+        if (url.contains("mailto:") || url.contains("javascript:")) {
+            return false;
+        }
+        return !url.contains("webscan.360.cn");
+    }
+
+    /** Requires 2-100 characters after cleaning and no "more"/"view" navigation wording. */
+    private boolean isValidTitle(String title) {
+        if (title == null || title.isEmpty()) {
+            return false;
+        }
+        String cleaned = cleanText(title);
+        if (cleaned.length() < 2 || cleaned.length() > 100) {
+            return false;
+        }
+        // The needles contain no cased letters, so no case folding is needed.
+        return !cleaned.contains("更多") && !cleaned.contains("查看");
+    }
+
+    /** Strips tags, HTML entities and stray angle brackets, then collapses whitespace. */
+    private String cleanText(String text) {
+        if (text == null) {
+            return null;
+        }
+        return text.replaceAll("<[^>]+>", "")
+            .replace('\u00A0', ' ')
+            .replaceAll("&#[0-9]+;", "")
+            .replaceAll("&[a-zA-Z]+;", " ")
+            .replaceAll("\\s+", " ")
+            .replaceAll("[<>]", "")
+            .trim();
+    }
+}
\ No newline at end of file
diff --git a/project/crawler/MountBladeCrawler.java b/project/crawler/MountBladeCrawler.java
new file mode 100644
index 0000000..c1cf1cd
--- /dev/null
+++ b/project/crawler/MountBladeCrawler.java
@@ -0,0 +1,144 @@
+package com.crawler.crawler.impl;
+
+import com.crawler.crawler.BaseCrawler;
+import com.crawler.model.CrawlerData;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/** Collects up to 30 distinct on-site links from www.mountblade.com.cn. */
+public class MountBladeCrawler extends BaseCrawler {
+    private static final String BASE_URL = "https://www.mountblade.com.cn";
+    private static final int MAX_RESULTS = 30;
+
+    /** Anchor tag with a quoted href (group 1) followed by its plain-text label (group 2). */
+    private static final Pattern LINK = Pattern.compile(
+        "<a\\s[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)",
+        Pattern.CASE_INSENSITIVE
+    );
+
+    /** A {@code /yyyy-MM-dd/} path segment (group 1 is the date). */
+    private static final Pattern DATE_IN_URL = Pattern.compile("/(\\d{4}-\\d{2}-\\d{2})/");
+
+    @Override
+    public String getCrawlerName() {
+        return "MountBladeCrawler";
+    }
+
+    /**
+     * Parses the page and returns one entry per distinct, valid link.
+     *
+     * @param html page markup; may be {@code null} or empty
+     * @return the links found, in document order; never {@code null}
+     */
+    @Override
+    protected List<CrawlerData> parseHtml(String html) {
+        List<CrawlerData> results = new ArrayList<>();
+        if (html == null || html.isEmpty()) {
+            return results;
+        }
+
+        // Collapse whitespace so a tag or label broken across lines still matches.
+        String cleanHtml = html.replaceAll("\\s+", " ");
+        Set<String> seenUrls = new HashSet<>();
+        Matcher matcher = LINK.matcher(cleanHtml);
+
+        while (results.size() < MAX_RESULTS && matcher.find()) {
+            String rawUrl = matcher.group(1).trim();
+            String title = matcher.group(2).trim();
+            if (!isValidUrl(rawUrl) || !isValidTitle(title)) {
+                continue;
+            }
+
+            // Check and record the same absolute form so that relative and absolute
+            // spellings of one link are treated as duplicates.
+            String url = normalizeUrl(rawUrl);
+            if (!seenUrls.add(url)) {
+                continue;
+            }
+
+            CrawlerData data = new CrawlerData();
+            data.setTitle(cleanText(title));
+            data.setUrl(url);
+            data.setSource(getCrawlerName());
+            data.setPublishDate(extractDateFromUrl(url));
+            results.add(data);
+        }
+
+        return results;
+    }
+
+    /** Returns the first {@code yyyy-MM-dd} path segment of the URL, or {@code null} if none. */
+    private String extractDateFromUrl(String url) {
+        if (url == null) {
+            return null;
+        }
+        Matcher matcher = DATE_IN_URL.matcher(url);
+        return matcher.find() ? matcher.group(1) : null;
+    }
+
+    /** Turns a protocol-relative, root-relative or bare relative link into an absolute URL. */
+    private String normalizeUrl(String url) {
+        if (url == null) {
+            return null;
+        }
+        url = url.trim();
+        if (url.startsWith("//")) {
+            return "https:" + url;
+        }
+        if (url.startsWith("/")) {
+            return BASE_URL + url;
+        }
+        if (!url.startsWith("http")) {
+            return BASE_URL + "/" + url;
+        }
+        return url;
+    }
+
+    /**
+     * Accepts root-relative links and links mentioning "mountblade"; drops mailto:,
+     * javascript: and webscan.360.cn links. The site test is a substring match.
+     */
+    private boolean isValidUrl(String url) {
+        if (url == null || url.isEmpty()) {
+            return false;
+        }
+        if (url.contains("mailto:") || url.contains("javascript:")) {
+            return false;
+        }
+        if (url.contains("webscan.360.cn")) {
+            return false;
+        }
+        return url.contains("mountblade") || url.startsWith("/");
+    }
+
+    /** Requires 2-100 characters after cleaning and no "more"/"view"/">>" navigation wording. */
+    private boolean isValidTitle(String title) {
+        if (title == null || title.isEmpty()) {
+            return false;
+        }
+        String cleaned = cleanText(title);
+        if (cleaned.length() < 2 || cleaned.length() > 100) {
+            return false;
+        }
+        // The needles contain no cased letters, so no case folding is needed.
+        return !cleaned.contains("更多") && !cleaned.contains("查看") && !cleaned.contains(">>");
+    }
+
+    /** Strips tags and HTML entities from a fragment and collapses runs of whitespace. */
+    private String cleanText(String text) {
+        if (text == null) {
+            return null;
+        }
+        return text.replaceAll("<[^>]+>", "")
+            .replace('\u00A0', ' ')
+            .replaceAll("&#[0-9]+;", "")
+            .replaceAll("&[a-zA-Z]+;", " ")
+            .replaceAll("\\s+", " ")
+            .trim();
+    }
+}
\ No newline at end of file
diff --git a/project/crawler/TestCrawler.java b/project/crawler/TestCrawler.java
new file mode 100644
index 0000000..9cf4218
--- /dev/null
+++ b/project/crawler/TestCrawler.java
@@ -0,0 +1,61 @@
+package com.crawler.crawler.impl;
+
+import com.crawler.crawler.BaseCrawler;
+import com.crawler.model.CrawlerData;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/** Minimal crawler that returns the first 10 anchors of a page without any filtering. */
+public class TestCrawler extends BaseCrawler {
+    private static final int MAX_RESULTS = 10;
+
+    /** Anchor tag with a quoted href (group 1) followed by its plain-text label (group 2). */
+    private static final Pattern LINK = Pattern.compile(
+        "<a\\s[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)",
+        Pattern.CASE_INSENSITIVE
+    );
+
+    @Override
+    public String getCrawlerName() {
+        return "TestCrawler";
+    }
+
+    /**
+     * Parses the page and returns the first anchors found: href as-is, label trimmed.
+     *
+     * @param html page markup; may be {@code null} or empty
+     * @return the links found, in document order; never {@code null}
+     */
+    @Override
+    protected List<CrawlerData> parseHtml(String html) {
+        List<CrawlerData> results = new ArrayList<>();
+        // A missing page yields no results rather than a NullPointerException.
+        if (html == null || html.isEmpty()) {
+            return results;
+        }
+
+        // Collapse whitespace and close up gaps between adjacent tags before matching.
+        String cleanHtml = html.replaceAll("\\s+", " ").replaceAll(">\\s*<", "><");
+        Matcher matcher = LINK.matcher(cleanHtml);
+
+        while (results.size() < MAX_RESULTS && matcher.find()) {
+            CrawlerData data = new CrawlerData();
+            data.setTitle(cleanText(matcher.group(2)));
+            data.setUrl(matcher.group(1));
+            results.add(data);
+        }
+
+        return results;
+    }
+
+    /** Removes any tags from a fragment and trims it. */
+    private String cleanText(String text) {
+        if (text == null) {
+            return null;
+        }
+        return text.replaceAll("<[^>]+>", "").trim();
+    }
+}
\ No newline at end of file