package com.crawler.crawler.impl;

import com.crawler.crawler.BaseCrawler;
import com.crawler.model.CrawlerData;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawler that pulls anchor links (URL + link text) out of an HTML page and
 * turns each acceptable link into a {@link CrawlerData} record.
 *
 * <p>Relative links are resolved against {@link #BASE_URL}. At most
 * {@link #MAX_RESULTS} distinct links are returned per page.
 */
public class MountBladeCrawler extends BaseCrawler {

    /** Site root used to resolve relative links. */
    private static final String BASE_URL = "https://www.mountblade.com.cn";

    /** Upper bound on the number of entries returned by one {@link #parseHtml} call. */
    private static final int MAX_RESULTS = 30;

    /**
     * Matches {@code <a ... href="URL" ...>TEXT}; group 1 is the href value and
     * group 2 the text that immediately follows the opening tag. The
     * {@code \shref} part requires whitespace before the attribute name so that
     * attributes such as {@code data-href} are not mistaken for {@code href}.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a(?:\\s[^>]*?)?\\shref\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)",
            Pattern.CASE_INSENSITIVE);

    /** Matches a {@code /yyyy-MM-dd/} path segment; group 1 is the date text. */
    private static final Pattern DATE_PATTERN = Pattern.compile("/(\\d{4}-\\d{2}-\\d{2})/");

    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]+>");
    private static final Pattern NUMERIC_ENTITY = Pattern.compile("&#[0-9]+;");
    private static final Pattern NAMED_ENTITY = Pattern.compile("&[a-zA-Z]+;");

    @Override
    public String getCrawlerName() {
        return "MountBladeCrawler";
    }

    /**
     * Extracts up to {@link #MAX_RESULTS} distinct links from the given HTML.
     *
     * @param html raw page markup; {@code null} or empty yields an empty list
     * @return a mutable list of entries in document order, never {@code null}
     */
    @Override
    protected List<CrawlerData> parseHtml(String html) {
        List<CrawlerData> results = new ArrayList<>();
        if (html == null || html.isEmpty()) {
            return results;
        }

        // Collapse whitespace runs so tags broken across lines still match.
        String cleanHtml = WHITESPACE.matcher(html).replaceAll(" ");
        Matcher matcher = LINK_PATTERN.matcher(cleanHtml);
        Set<String> seenUrls = new HashSet<>();

        while (results.size() < MAX_RESULTS && matcher.find()) {
            // Trim before validating so hrefs with stray padding are judged
            // on their actual content.
            String rawUrl = matcher.group(1).trim();
            String title = matcher.group(2).trim();
            if (!isValidUrl(rawUrl) || !isValidTitle(title)) {
                continue;
            }

            // De-duplicate on the normalized form: the same target may appear
            // as a relative and as an absolute link, and the set must hold the
            // same representation that is being tested.
            String url = normalizeUrl(rawUrl);
            if (!seenUrls.add(url)) {
                continue;
            }

            CrawlerData data = new CrawlerData();
            data.setTitle(cleanText(title));
            data.setUrl(url);
            data.setSource(getCrawlerName());
            data.setPublishDate(extractDateFromUrl(url));
            results.add(data);
        }
        return results;
    }

    /**
     * Returns the first {@code yyyy-MM-dd} path segment found in the URL.
     *
     * @param url URL to inspect, may be {@code null}
     * @return the date text, or {@code null} when the URL is {@code null} or
     *         contains no such segment
     */
    private String extractDateFromUrl(String url) {
        if (url == null) {
            return null;
        }
        Matcher matcher = DATE_PATTERN.matcher(url);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Converts a link into an absolute URL.
     *
     * <p>Protocol-relative links get an {@code https:} scheme, root-relative
     * and bare relative links are prefixed with {@link #BASE_URL}, and links
     * already starting with {@code http} are returned trimmed but otherwise
     * unchanged.
     *
     * @param url link as found in the page, may be {@code null}
     * @return the absolute URL, or {@code null} for {@code null} input
     */
    private String normalizeUrl(String url) {
        if (url == null) {
            return null;
        }
        String trimmed = url.trim();
        if (trimmed.startsWith("//")) {
            return "https:" + trimmed;
        }
        if (trimmed.startsWith("/")) {
            return BASE_URL + trimmed;
        }
        if (!trimmed.startsWith("http")) {
            return BASE_URL + "/" + trimmed;
        }
        return trimmed;
    }

    /**
     * Decides whether a link should be kept: it must be non-empty, must not be
     * a {@code mailto:} or {@code javascript:} link, must not point at
     * {@code webscan.360.cn}, and must either contain {@code mountblade} or be
     * root-relative.
     */
    private boolean isValidUrl(String url) {
        if (url == null || url.isEmpty()) {
            return false;
        }
        if (url.contains("mailto:") || url.contains("javascript:")) {
            return false;
        }
        if (url.contains("webscan.360.cn")) {
            return false;
        }
        return url.contains("mountblade") || url.startsWith("/");
    }

    /**
     * Decides whether link text is usable as a title: after cleaning it must be
     * 2 to 100 characters long and must not contain the navigation markers
     * {@code 更多}, {@code 查看} or {@code >>}.
     */
    private boolean isValidTitle(String title) {
        if (title == null || title.isEmpty()) {
            return false;
        }
        String cleaned = cleanText(title);
        if (cleaned == null || cleaned.length() < 2 || cleaned.length() > 100) {
            return false;
        }
        // None of the markers contain cased letters, so no case folding is needed.
        return !(cleaned.contains("更多") || cleaned.contains("查看") || cleaned.contains(">>"));
    }

    /**
     * Strips markup remnants from text: removes tags, turns non-breaking spaces
     * into ordinary spaces, drops numeric character references, replaces named
     * entities with a space, then collapses whitespace and trims.
     *
     * @param text text to clean, may be {@code null}
     * @return the cleaned text, or {@code null} for {@code null} input
     */
    private String cleanText(String text) {
        if (text == null) {
            return null;
        }
        String cleaned = HTML_TAG.matcher(text).replaceAll("");
        // U+00A0 is matched neither by \s nor by trim(), so map it to a plain
        // space before the whitespace collapse below.
        cleaned = cleaned.replace('\u00A0', ' ');
        cleaned = NUMERIC_ENTITY.matcher(cleaned).replaceAll("");
        cleaned = NAMED_ENTITY.matcher(cleaned).replaceAll(" ");
        cleaned = WHITESPACE.matcher(cleaned).replaceAll(" ");
        return cleaned.trim();
    }
}