package com.crawler.crawler.impl;

import com.crawler.crawler.BaseCrawler;
import com.crawler.model.CrawlerData;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts (title, URL) pairs from anchor tags found in an HTML page and
 * resolves relative links against {@code https://news.hnu.edu.cn}.
 *
 * <p>Parsing is regex-based: only anchors whose link text follows the opening
 * tag directly (no nested element in between) are picked up.
 */
public class HunanUniversityNewsCrawler extends BaseCrawler {

    /** Origin used to turn relative hrefs into absolute URLs. */
    private static final String BASE_URL = "https://news.hnu.edu.cn";

    /** Upper bound on the number of entries returned by one parse. */
    private static final int MAX_RESULTS = 30;

    /** Inclusive bounds on the length of a cleaned title. */
    private static final int MIN_TITLE_LENGTH = 2;
    private static final int MAX_TITLE_LENGTH = 100;

    /**
     * Matches an opening {@code <a ... href="...">} tag followed by plain text.
     * Group 1 is the href value, group 2 the text up to the next {@code <}.
     * {@code \b} after {@code <a} keeps tags such as {@code <area>} or
     * {@code <abbr>} out; {@code \s} before {@code href} keeps attributes such
     * as {@code data-href} out.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\b[^>]*\\shref\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)",
            Pattern.CASE_INSENSITIVE);

    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");
    private static final Pattern NUMERIC_ENTITY_PATTERN = Pattern.compile("&#[0-9]+;");
    private static final Pattern NAMED_ENTITY_PATTERN = Pattern.compile("&[a-zA-Z]+;");
    private static final Pattern ANGLE_BRACKET_PATTERN = Pattern.compile("[<>]");
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    @Override
    public String getCrawlerName() {
        return "HunanUniversityNewsCrawler";
    }

    /**
     * Scans {@code html} for anchor tags and converts each acceptable one into
     * a {@link CrawlerData} carrying the cleaned title, the absolute URL and
     * this crawler's name as source.
     *
     * @param html page markup; {@code null} or empty yields an empty list
     * @return at most {@value #MAX_RESULTS} entries in document order, with no
     *         two entries sharing the same absolute URL; never {@code null}
     */
    @Override
    protected List<CrawlerData> parseHtml(String html) {
        List<CrawlerData> results = new ArrayList<>();
        if (html == null || html.isEmpty()) {
            return results;
        }

        Set<String> seenUrls = new HashSet<>();
        Matcher matcher = LINK_PATTERN.matcher(html);
        while (results.size() < MAX_RESULTS && matcher.find()) {
            String rawUrl = matcher.group(1);
            String title = cleanText(matcher.group(2));
            if (!isValidUrl(rawUrl) || !isValidTitle(title)) {
                continue;
            }

            // Deduplicate on the normalized form: the same page is often linked
            // both relatively and absolutely, and the set stores absolute URLs.
            String url = normalizeUrl(rawUrl);
            if (!seenUrls.add(url)) {
                continue;
            }

            CrawlerData data = new CrawlerData();
            data.setTitle(title);
            data.setUrl(url);
            data.setSource(getCrawlerName());
            results.add(data);
        }
        return results;
    }

    /**
     * Converts an href value into an absolute URL.
     *
     * <ul>
     *   <li>{@code //host/path} becomes {@code https://host/path};</li>
     *   <li>{@code /path} is appended to {@link #BASE_URL};</li>
     *   <li>anything not starting with {@code http} is treated as a path
     *       relative to the site root;</li>
     *   <li>everything else is returned trimmed but otherwise unchanged.</li>
     * </ul>
     *
     * @param url raw href value, may be {@code null}
     * @return the absolute URL, or {@code null} when {@code url} is {@code null}
     */
    private String normalizeUrl(String url) {
        if (url == null) {
            return null;
        }
        String trimmed = url.trim();
        if (trimmed.startsWith("//")) {
            return "https:" + trimmed;
        }
        if (trimmed.startsWith("/")) {
            return BASE_URL + trimmed;
        }
        if (!trimmed.startsWith("http")) {
            return BASE_URL + "/" + trimmed;
        }
        return trimmed;
    }

    /**
     * Decides whether a raw href is worth keeping.
     *
     * @param url raw href value, may be {@code null}
     * @return {@code false} for {@code null}/blank values, {@code mailto:} and
     *         {@code javascript:} links, and links containing
     *         {@code webscan.360.cn}; {@code true} otherwise
     */
    private boolean isValidUrl(String url) {
        // A whitespace-only href would otherwise normalize to the bare site root.
        if (url == null || url.trim().isEmpty()) {
            return false;
        }
        if (url.contains("mailto:") || url.contains("javascript:")) {
            return false;
        }
        return !url.contains("webscan.360.cn");
    }

    /**
     * Decides whether an already-cleaned title is acceptable.
     *
     * @param title text as returned by {@link #cleanText(String)}, may be {@code null}
     * @return {@code true} when the title is {@value #MIN_TITLE_LENGTH} to
     *         {@value #MAX_TITLE_LENGTH} characters long and contains neither
     *         "更多" ("more") nor "查看" ("view")
     */
    private boolean isValidTitle(String title) {
        if (title == null) {
            return false;
        }
        int length = title.length();
        if (length < MIN_TITLE_LENGTH || length > MAX_TITLE_LENGTH) {
            return false;
        }
        // The keywords are CJK text, so no case folding is needed before matching.
        return !title.contains("更多") && !title.contains("查看");
    }

    /**
     * Reduces an HTML fragment to single-spaced plain text.
     *
     * <p>Complete tags are removed, non-breaking spaces become ordinary spaces,
     * numeric character references are dropped, named entities become a space,
     * leftover angle brackets are removed, and runs of whitespace collapse to
     * one space.
     *
     * @param text fragment to clean, may be {@code null}
     * @return the cleaned, trimmed text, or {@code null} when {@code text} is {@code null}
     */
    private String cleanText(String text) {
        if (text == null) {
            return null;
        }
        String cleaned = TAG_PATTERN.matcher(text).replaceAll("");
        // U+00A0 is matched by neither \s nor String.trim(), so map it explicitly.
        cleaned = cleaned.replace('\u00A0', ' ');
        cleaned = NUMERIC_ENTITY_PATTERN.matcher(cleaned).replaceAll("");
        cleaned = NAMED_ENTITY_PATTERN.matcher(cleaned).replaceAll(" ");
        // Drop stray brackets before collapsing whitespace so that removing one
        // between two spaces cannot leave a double space behind.
        cleaned = ANGLE_BRACKET_PATTERN.matcher(cleaned).replaceAll("");
        cleaned = WHITESPACE_PATTERN.matcher(cleaned).replaceAll(" ");
        return cleaned.trim();
    }
}