You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

112 lines
3.4 KiB

package com.crawler.crawler.impl;
import com.crawler.crawler.BaseCrawler;
import com.crawler.model.CrawlerData;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawler that extracts news links (title + URL) from HTML using a regular
 * expression over anchor tags.
 *
 * <p>Relative links are resolved against {@link #BASE_URL}. Only anchors whose
 * content is plain text (no nested tags) are matched by the link pattern.
 */
public class HunanUniversityNewsCrawler extends BaseCrawler {

    /** Site root used to resolve relative and root-relative links. */
    private static final String BASE_URL = "https://news.hnu.edu.cn";

    /** Upper bound on the number of entries returned by a single parse. */
    private static final int MAX_RESULTS = 30;

    /**
     * Matches {@code <a ... href="URL" ...>TEXT</a>}; group 1 is the href value,
     * group 2 the anchor text. Compiled once because {@link Pattern} is immutable
     * and compilation is comparatively expensive.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)</a>",
            Pattern.CASE_INSENSITIVE
    );

    /** Titles containing any of these navigation keywords are skipped. */
    private static final String[] NAVIGATION_KEYWORDS = {"更多", "查看"};

    @Override
    public String getCrawlerName() {
        return "HunanUniversityNewsCrawler";
    }

    /**
     * Extracts up to {@link #MAX_RESULTS} unique links from the given HTML.
     *
     * <p>URLs are normalized to absolute form <em>before</em> de-duplication, so
     * the same target written as a relative and as an absolute link is reported
     * only once.
     *
     * @param html raw HTML; {@code null} or empty yields an empty list
     * @return a mutable list of extracted entries, never {@code null}
     */
    @Override
    protected List<CrawlerData> parseHtml(String html) {
        List<CrawlerData> results = new ArrayList<>();
        if (html == null || html.isEmpty()) {
            return results;
        }

        Set<String> seenUrls = new HashSet<>();
        Matcher matcher = LINK_PATTERN.matcher(html);
        while (results.size() < MAX_RESULTS && matcher.find()) {
            // Trim first so a whitespace-only href is rejected instead of being
            // turned into the bare site root.
            String rawUrl = matcher.group(1).trim();
            String title = matcher.group(2).trim();
            if (!isValidUrl(rawUrl) || !isValidTitle(title)) {
                continue;
            }

            String url = normalizeUrl(rawUrl);
            // Set.add returns false when the normalized URL was already recorded.
            if (!seenUrls.add(url)) {
                continue;
            }

            CrawlerData data = new CrawlerData();
            data.setTitle(cleanText(title));
            data.setUrl(url);
            data.setSource(getCrawlerName());
            results.add(data);
        }
        return results;
    }

    /**
     * Converts a link to absolute form.
     *
     * <ul>
     *   <li>{@code //host/path} gets the {@code https:} scheme prepended;</li>
     *   <li>{@code /path} is prefixed with {@link #BASE_URL};</li>
     *   <li>anything not starting with {@code http} is treated as relative to the site root;</li>
     *   <li>everything else is returned trimmed but otherwise unchanged.</li>
     * </ul>
     *
     * @param url link as found in the markup, may be {@code null}
     * @return the absolute URL, or {@code null} if {@code url} was {@code null}
     */
    private String normalizeUrl(String url) {
        if (url == null) {
            return null;
        }
        String trimmed = url.trim();
        if (trimmed.startsWith("//")) {
            return "https:" + trimmed;
        }
        if (trimmed.startsWith("/")) {
            return BASE_URL + trimmed;
        }
        if (!trimmed.startsWith("http")) {
            return BASE_URL + "/" + trimmed;
        }
        return trimmed;
    }

    /**
     * Rejects empty links, {@code mailto:} / {@code javascript:} pseudo-links and
     * links pointing at {@code webscan.360.cn}.
     *
     * @param url link as found in the markup, may be {@code null}
     * @return {@code true} if the link should be considered for output
     */
    private boolean isValidUrl(String url) {
        if (url == null || url.isEmpty()) {
            return false;
        }
        if (url.contains("mailto:") || url.contains("javascript:")) {
            return false;
        }
        return !url.contains("webscan.360.cn");
    }

    /**
     * Accepts titles whose cleaned form is 2 to 100 characters long and does not
     * contain one of the {@link #NAVIGATION_KEYWORDS}.
     *
     * @param title anchor text, may be {@code null}
     * @return {@code true} if the title is acceptable
     */
    private boolean isValidTitle(String title) {
        if (title == null || title.isEmpty()) {
            return false;
        }
        String cleaned = cleanText(title);
        if (cleaned == null || cleaned.length() < 2 || cleaned.length() > 100) {
            return false;
        }
        // The keywords have no letter case, so no case folding is needed here.
        for (String keyword : NAVIGATION_KEYWORDS) {
            if (cleaned.contains(keyword)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Strips markup remnants from a text fragment: tags are removed, numeric
     * character references are dropped, {@code &nbsp;} and other named entities
     * become a space, whitespace runs collapse to a single space, stray angle
     * brackets are removed, and the result is trimmed.
     *
     * @param text text to clean, may be {@code null}
     * @return the cleaned text, or {@code null} if {@code text} was {@code null}
     */
    private String cleanText(String text) {
        if (text == null) {
            return null;
        }
        return text.replaceAll("<[^>]+>", "")
                .replaceAll("&nbsp;", " ")
                .replaceAll("&#[0-9]+;", "")
                .replaceAll("&[a-zA-Z]+;", " ")
                .replaceAll("\\s+", " ")
                .replaceAll("[<>]", "")
                .trim();
    }
}