package com.crawler.crawler.impl;

import com.crawler.crawler.BaseCrawler;
import com.crawler.model.CrawlerData;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawler that extracts link entries (title + URL) from HTML of the
 * {@code www.hnu.edu.cn} site.
 *
 * <p>Parsing is regex based: every {@code <a ... href="...">text} occurrence whose
 * URL belongs to the site and whose text looks like a headline becomes one
 * {@link CrawlerData} item. At most {@value #MAX_RESULTS} items are returned per page.
 */
public class HunanUniversityCrawler extends BaseCrawler {

    /** Scheme and host used to turn site-relative links into absolute URLs. */
    private static final String BASE_URL = "https://www.hnu.edu.cn";

    /** Domain fragment an absolute or protocol-relative URL must contain to be accepted. */
    private static final String SITE_DOMAIN = "hnu.edu.cn";

    /** Upper bound on the number of items returned by one {@link #parseHtml(String)} call. */
    private static final int MAX_RESULTS = 30;

    /** Inclusive bounds on the length of an accepted title. */
    private static final int MIN_TITLE_LENGTH = 4;
    private static final int MAX_TITLE_LENGTH = 80;

    /** Runs of whitespace; collapsed to a single space before matching and in cleaned text. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Matches an anchor start tag with a quoted {@code href} attribute followed by its
     * leading text. Group 1 is the raw href value, group 2 the text up to the next
     * {@code <}. The {@code (?:[^>]*?\s)?} part requires {@code href} to be a
     * standalone attribute, so {@code data-href="..."} is not mistaken for it.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s+(?:[^>]*?\\s)?href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>\\s*"
                    + "([^<]{" + MIN_TITLE_LENGTH + "," + MAX_TITLE_LENGTH + "})\\s*",
            Pattern.CASE_INSENSITIVE);

    private static final Pattern HTML_TAG = Pattern.compile("<[^>]+>");
    private static final Pattern NUMERIC_ENTITY = Pattern.compile("&#[0-9]+;");
    private static final Pattern NAMED_ENTITY = Pattern.compile("&[a-zA-Z]+;");

    /**
     * Returns the identifier of this crawler; it is also stored as the source of every
     * item produced by {@link #parseHtml(String)}.
     *
     * @return the constant name {@code "HunanUniversityCrawler"}
     */
    @Override
    public String getCrawlerName() {
        return "HunanUniversityCrawler";
    }

    /**
     * Extracts up to {@value #MAX_RESULTS} link entries from the given HTML.
     *
     * @param html page markup; may be {@code null} or empty, in which case a warning is
     *             printed to standard output and an empty list is returned
     * @return a new mutable list of extracted items, never {@code null}
     */
    @Override
    protected List<CrawlerData> parseHtml(String html) {
        List<CrawlerData> results = new ArrayList<>();
        if (html == null || html.isEmpty()) {
            System.out.println("警告: HTML内容为空");
            return results;
        }

        // Collapse whitespace first so tags and text split across lines match one pattern.
        String cleanHtml = WHITESPACE.matcher(html).replaceAll(" ");
        Matcher matcher = LINK_PATTERN.matcher(cleanHtml);

        while (results.size() < MAX_RESULTS && matcher.find()) {
            String url = matcher.group(1);
            // Clean once and validate the cleaned value that will actually be stored.
            String title = cleanText(matcher.group(2));
            if (!isValidUrl(url) || !isValidTitle(title)) {
                continue;
            }

            CrawlerData data = new CrawlerData();
            data.setTitle(title);
            data.setUrl(normalizeUrl(url));
            data.setSource(getCrawlerName());
            results.add(data);
        }
        return results;
    }

    /**
     * Converts a link as written in the page into an absolute URL.
     *
     * <ul>
     *   <li>{@code //host/path} gets the {@code https:} scheme prepended;</li>
     *   <li>{@code /path} is resolved against {@link #BASE_URL};</li>
     *   <li>anything else not starting with {@code http} is appended to
     *       {@link #BASE_URL} after a {@code /};</li>
     *   <li>URLs starting with {@code http} are returned unchanged.</li>
     * </ul>
     *
     * @param url raw href value, may be {@code null}
     * @return the absolute URL, or {@code null} if {@code url} is {@code null}
     */
    private String normalizeUrl(String url) {
        if (url == null) {
            return null;
        }
        if (url.startsWith("//")) {
            return "https:" + url;
        }
        if (url.startsWith("/")) {
            return BASE_URL + url;
        }
        if (!url.startsWith("http")) {
            return BASE_URL + "/" + url;
        }
        return url;
    }

    /**
     * Decides whether a raw href value should be kept.
     *
     * <p>Rejected: {@code null}/empty values, links containing {@code webscan.360.cn},
     * {@code mailto:} or {@code javascript:}. Accepted: URLs containing
     * {@code hnu.edu.cn} and site-relative paths starting with a single {@code /}.
     * Protocol-relative URLs ({@code //host/...}) are accepted only when they contain
     * the site domain, because {@link #normalizeUrl(String)} would otherwise turn them
     * into links to a foreign host.
     *
     * @param url raw href value, may be {@code null}
     * @return {@code true} if the link should be kept
     */
    private boolean isValidUrl(String url) {
        if (url == null || url.isEmpty()) {
            return false;
        }
        if (url.contains("webscan.360.cn")
                || url.contains("mailto:")
                || url.contains("javascript:")) {
            return false;
        }
        if (url.contains(SITE_DOMAIN)) {
            return true;
        }
        return url.startsWith("/") && !url.startsWith("//");
    }

    /**
     * Decides whether an already cleaned title (see {@link #cleanText(String)}) looks
     * like a real headline.
     *
     * <p>The title must be {@value #MIN_TITLE_LENGTH} to {@value #MAX_TITLE_LENGTH}
     * characters long and must not contain any of the blacklisted fragments
     * ({@code "360"}, {@code "网站安全"}, {@code "网站测"}).
     *
     * @param title cleaned title text, may be {@code null}
     * @return {@code true} if the title is acceptable
     */
    private boolean isValidTitle(String title) {
        if (title == null) {
            return false;
        }
        int length = title.length();
        if (length < MIN_TITLE_LENGTH || length > MAX_TITLE_LENGTH) {
            return false;
        }
        return !title.contains("360")
                && !title.contains("网站安全")
                && !title.contains("网站测");
    }

    /**
     * Strips markup remnants from a text fragment.
     *
     * <p>Steps, in order: remove tags, turn no-break spaces (U+00A0) into regular
     * spaces, delete decimal numeric entities, replace named entities with a space,
     * collapse whitespace runs and trim.
     *
     * @param text raw text, may be {@code null}
     * @return the cleaned text, or {@code null} if {@code text} is {@code null}
     */
    private String cleanText(String text) {
        if (text == null) {
            return null;
        }
        String cleaned = HTML_TAG.matcher(text).replaceAll("");
        // \s and trim() do not cover U+00A0, so map it to a plain space explicitly.
        cleaned = cleaned.replace('\u00A0', ' ');
        cleaned = NUMERIC_ENTITY.matcher(cleaned).replaceAll("");
        cleaned = NAMED_ENTITY.matcher(cleaned).replaceAll(" ");
        return WHITESPACE.matcher(cleaned).replaceAll(" ").trim();
    }
}