上传文件至 'project/crawler'

3 weeks ago · 828c1bd9ff
4 changed files with 389 additions and 0 deletions
--- a/project/crawler/HunanUniversityCrawler.java
+++ b/project/crawler/HunanUniversityCrawler.java
@ -0,0 +1,103 @@
+package com.crawler.crawler.impl;
+
+import com.crawler.crawler.BaseCrawler;
+import com.crawler.model.CrawlerData;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Crawler that pulls link titles and URLs out of a Hunan University page
+ * ({@code hnu.edu.cn}) by running a regular expression over the HTML text.
+ *
+ * <p>Only anchors whose body is plain text (no nested tags) are matched. At most
+ * {@code MAX_RESULTS} distinct links are returned per page.
+ */
+public class HunanUniversityCrawler extends BaseCrawler {
+    private static final String BASE_URL = "https://www.hnu.edu.cn";
+
+    /** Domain fragment an absolute or protocol-relative link must contain to be kept. */
+    private static final String SITE_DOMAIN = "hnu.edu.cn";
+
+    /** Upper bound on the number of entries returned by one {@link #parseHtml(String)} call. */
+    private static final int MAX_RESULTS = 30;
+
+    /**
+     * Matches {@code <a ... href="URL" ...>TEXT</a>} where TEXT contains no tags and is
+     * 4 to 80 characters long. Group 1 is the href, group 2 the anchor text.
+     * Compiled once because {@link Pattern} instances are immutable and reusable.
+     */
+    private static final Pattern NEWS_LINK_PATTERN = Pattern.compile(
+        "<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>\\s*([^<]{4,80})\\s*</a>",
+        Pattern.CASE_INSENSITIVE
+    );
+
+    @Override
+    public String getCrawlerName() {
+        return "HunanUniversityCrawler";
+    }
+
+    /**
+     * Extracts up to {@code MAX_RESULTS} same-site links from the given HTML.
+     *
+     * @param html page markup; {@code null} or empty yields an empty list (a warning is printed)
+     * @return a mutable list of entries with title, absolute URL and source set; never {@code null}
+     */
+    @Override
+    protected List<CrawlerData> parseHtml(String html) {
+        List<CrawlerData> results = new ArrayList<>();
+
+        if (html == null || html.isEmpty()) {
+            System.out.println("警告: HTML内容为空");
+            return results;
+        }
+
+        // Collapse line breaks and runs of blanks so multi-line anchors look uniform.
+        String cleanHtml = html.replaceAll("\\s+", " ");
+        // Keyed by the normalised URL so relative and absolute spellings of one link collapse.
+        Set<String> seenUrls = new HashSet<>();
+        Matcher matcher = NEWS_LINK_PATTERN.matcher(cleanHtml);
+
+        while (results.size() < MAX_RESULTS && matcher.find()) {
+            String rawUrl = matcher.group(1).trim();
+            String title = cleanText(matcher.group(2));
+
+            if (!isValidUrl(rawUrl) || !isValidTitle(title)) {
+                continue;
+            }
+
+            String url = normalizeUrl(rawUrl);
+            if (!seenUrls.add(url)) {
+                continue; // the same target was already emitted
+            }
+
+            CrawlerData data = new CrawlerData();
+            data.setTitle(title);
+            data.setUrl(url);
+            data.setSource(getCrawlerName());
+            results.add(data);
+        }
+
+        return results;
+    }
+
+    /**
+     * Turns a protocol-relative, root-relative or document-relative href into an
+     * absolute URL; hrefs already starting with {@code http} are returned unchanged.
+     */
+    private String normalizeUrl(String url) {
+        if (url == null) {
+            return null;
+        }
+        if (url.startsWith("//")) {
+            return "https:" + url;
+        }
+        if (url.startsWith("/")) {
+            return BASE_URL + url;
+        }
+        if (!url.startsWith("http")) {
+            return BASE_URL + "/" + url;
+        }
+        return url;
+    }
+
+    /**
+     * Accepts links that mention the site domain, plus root-relative paths.
+     * Mail, script and 360 web-scan links are always rejected.
+     */
+    private boolean isValidUrl(String url) {
+        if (url == null || url.isEmpty()) {
+            return false;
+        }
+
+        if (url.contains("webscan.360.cn") || url.contains("mailto:") || url.contains("javascript:")) {
+            return false;
+        }
+
+        // A "//host/path" href names another host, so it must not be treated as a
+        // root-relative path; it is only kept when it mentions the site domain.
+        boolean rootRelative = url.startsWith("/") && !url.startsWith("//");
+        return rootRelative || url.contains(SITE_DOMAIN);
+    }
+
+    /**
+     * Checks an already-cleaned title: 4 to 80 characters and free of the
+     * security-badge marker strings.
+     */
+    private boolean isValidTitle(String cleaned) {
+        if (cleaned == null || cleaned.length() < 4 || cleaned.length() > 80) {
+            return false;
+        }
+        return !cleaned.contains("360")
+            && !cleaned.contains("网站安全")
+            && !cleaned.contains("网站测");
+    }
+
+    /**
+     * Strips tags and HTML entities from the text, then collapses whitespace and trims.
+     * Numeric entities are dropped; {@code &nbsp;} and named entities become a blank.
+     */
+    private String cleanText(String text) {
+        if (text == null) {
+            return null;
+        }
+        return text.replaceAll("<[^>]+>", "")
+                   .replaceAll("&nbsp;", " ")
+                   .replaceAll("&#[0-9]+;", "")
+                   .replaceAll("&[a-zA-Z]+;", " ")
+                   .replaceAll("\\s+", " ")
+                   .trim();
+    }
+}
--- a/project/crawler/HunanUniversityNewsCrawler.java
+++ b/project/crawler/HunanUniversityNewsCrawler.java
@ -0,0 +1,112 @@
+package com.crawler.crawler.impl;
+
+import com.crawler.crawler.BaseCrawler;
+import com.crawler.model.CrawlerData;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Crawler that pulls link titles and URLs out of a Hunan University news page
+ * ({@code news.hnu.edu.cn}) by running a regular expression over the HTML text.
+ *
+ * <p>Only anchors whose body is plain text (no nested tags) are matched. At most
+ * {@code MAX_RESULTS} distinct links are returned per page.
+ */
+public class HunanUniversityNewsCrawler extends BaseCrawler {
+    private static final String BASE_URL = "https://news.hnu.edu.cn";
+
+    /** Upper bound on the number of entries returned by one {@link #parseHtml(String)} call. */
+    private static final int MAX_RESULTS = 30;
+
+    /**
+     * Matches {@code <a ... href="URL" ...>TEXT</a>} where TEXT contains no tags.
+     * Group 1 is the href, group 2 the anchor text. Compiled once for reuse.
+     */
+    private static final Pattern LINK_PATTERN = Pattern.compile(
+        "<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)</a>",
+        Pattern.CASE_INSENSITIVE
+    );
+
+    @Override
+    public String getCrawlerName() {
+        return "HunanUniversityNewsCrawler";
+    }
+
+    /**
+     * Extracts up to {@code MAX_RESULTS} distinct links from the given HTML.
+     *
+     * @param html page markup; {@code null} or empty yields an empty list
+     * @return a mutable list of entries with title, absolute URL and source set; never {@code null}
+     */
+    @Override
+    protected List<CrawlerData> parseHtml(String html) {
+        List<CrawlerData> results = new ArrayList<>();
+
+        if (html == null || html.isEmpty()) {
+            return results;
+        }
+
+        // Keyed by the NORMALISED URL. Checking the raw href against a set of
+        // normalised URLs would never detect a repeated relative link.
+        Set<String> seenUrls = new HashSet<>();
+        Matcher matcher = LINK_PATTERN.matcher(html);
+
+        while (results.size() < MAX_RESULTS && matcher.find()) {
+            // Trim before validating so a whitespace-only href is rejected as empty.
+            String rawUrl = matcher.group(1).trim();
+            String title = cleanText(matcher.group(2));
+
+            if (!isValidUrl(rawUrl) || !isValidTitle(title)) {
+                continue;
+            }
+
+            String url = normalizeUrl(rawUrl);
+            if (!seenUrls.add(url)) {
+                continue; // the same target was already emitted
+            }
+
+            CrawlerData data = new CrawlerData();
+            data.setTitle(title);
+            data.setUrl(url);
+            data.setSource(getCrawlerName());
+            results.add(data);
+        }
+
+        return results;
+    }
+
+    /**
+     * Turns a protocol-relative, root-relative or document-relative href into an
+     * absolute URL; hrefs already starting with {@code http} are returned unchanged
+     * (apart from trimming).
+     */
+    private String normalizeUrl(String url) {
+        if (url == null) {
+            return null;
+        }
+        String trimmed = url.trim();
+        if (trimmed.startsWith("//")) {
+            return "https:" + trimmed;
+        }
+        if (trimmed.startsWith("/")) {
+            return BASE_URL + trimmed;
+        }
+        if (!trimmed.startsWith("http")) {
+            return BASE_URL + "/" + trimmed;
+        }
+        return trimmed;
+    }
+
+    /** Rejects empty hrefs as well as mail, script and 360 web-scan links. */
+    private boolean isValidUrl(String url) {
+        if (url == null || url.isEmpty()) {
+            return false;
+        }
+        boolean excluded = url.contains("mailto:")
+            || url.contains("javascript:")
+            || url.contains("webscan.360.cn");
+        return !excluded;
+    }
+
+    /**
+     * Checks an already-cleaned title: 2 to 100 characters and not containing the
+     * strings "更多" or "查看".
+     */
+    private boolean isValidTitle(String cleaned) {
+        if (cleaned == null || cleaned.length() < 2 || cleaned.length() > 100) {
+            return false;
+        }
+        // The markers contain no cased letters, so no case folding is needed.
+        return !cleaned.contains("更多") && !cleaned.contains("查看");
+    }
+
+    /**
+     * Strips tags, HTML entities and stray angle brackets from the text, then
+     * collapses whitespace and trims.
+     */
+    private String cleanText(String text) {
+        if (text == null) {
+            return null;
+        }
+        return text.replaceAll("<[^>]+>", "")
+                   .replaceAll("&nbsp;", " ")
+                   .replaceAll("&#[0-9]+;", "")
+                   .replaceAll("&[a-zA-Z]+;", " ")
+                   .replaceAll("\\s+", " ")
+                   .replaceAll("[<>]", "")
+                   .trim();
+    }
+}
--- a/project/crawler/MountBladeCrawler.java
+++ b/project/crawler/MountBladeCrawler.java
@ -0,0 +1,132 @@
+package com.crawler.crawler.impl;
+
+import com.crawler.crawler.BaseCrawler;
+import com.crawler.model.CrawlerData;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Crawler that pulls link titles, URLs and (when present in the URL) a publish
+ * date out of a {@code mountblade.com.cn} page by running a regular expression
+ * over the HTML text.
+ *
+ * <p>Only anchors whose body is plain text (no nested tags) are matched. At most
+ * {@code MAX_RESULTS} distinct links are returned per page.
+ */
+public class MountBladeCrawler extends BaseCrawler {
+    private static final String BASE_URL = "https://www.mountblade.com.cn";
+
+    /** Fragment an absolute or protocol-relative link must contain to be kept. */
+    private static final String SITE_MARKER = "mountblade";
+
+    /** Upper bound on the number of entries returned by one {@link #parseHtml(String)} call. */
+    private static final int MAX_RESULTS = 30;
+
+    /**
+     * Matches {@code <a ... href="URL" ...>TEXT</a>} where TEXT contains no tags.
+     * Group 1 is the href, group 2 the anchor text. Compiled once for reuse.
+     */
+    private static final Pattern LINK_PATTERN = Pattern.compile(
+        "<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)</a>",
+        Pattern.CASE_INSENSITIVE
+    );
+
+    /** Matches a {@code /yyyy-MM-dd/} path segment; group 1 is the date text. */
+    private static final Pattern DATE_PATTERN = Pattern.compile("/(\\d{4}-\\d{2}-\\d{2})/");
+
+    @Override
+    public String getCrawlerName() {
+        return "MountBladeCrawler";
+    }
+
+    /**
+     * Extracts up to {@code MAX_RESULTS} distinct same-site links from the given HTML.
+     *
+     * @param html page markup; {@code null} or empty yields an empty list
+     * @return a mutable list of entries with title, absolute URL, source and publish
+     *         date set (the date is {@code null} when the URL carries none); never {@code null}
+     */
+    @Override
+    protected List<CrawlerData> parseHtml(String html) {
+        List<CrawlerData> results = new ArrayList<>();
+
+        if (html == null || html.isEmpty()) {
+            return results;
+        }
+
+        // Collapse line breaks and runs of blanks so multi-line anchors look uniform.
+        String cleanHtml = html.replaceAll("\\s+", " ");
+        // Keyed by the NORMALISED URL. Checking the raw href against a set of
+        // normalised URLs would never detect a repeated relative link.
+        Set<String> seenUrls = new HashSet<>();
+        Matcher matcher = LINK_PATTERN.matcher(cleanHtml);
+
+        while (results.size() < MAX_RESULTS && matcher.find()) {
+            // Trim before validating so a whitespace-only href is rejected as empty.
+            String rawUrl = matcher.group(1).trim();
+            String title = cleanText(matcher.group(2));
+
+            if (!isValidUrl(rawUrl) || !isValidTitle(title)) {
+                continue;
+            }
+
+            String url = normalizeUrl(rawUrl);
+            if (!seenUrls.add(url)) {
+                continue; // the same target was already emitted
+            }
+
+            CrawlerData data = new CrawlerData();
+            data.setTitle(title);
+            data.setUrl(url);
+            data.setSource(getCrawlerName());
+            data.setPublishDate(extractDateFromUrl(url));
+            results.add(data);
+        }
+
+        return results;
+    }
+
+    /**
+     * Returns the first {@code yyyy-MM-dd} path segment found in the URL, or
+     * {@code null} when the URL is {@code null} or contains no such segment.
+     * The digits are not validated as a real calendar date.
+     */
+    private String extractDateFromUrl(String url) {
+        if (url == null) {
+            return null;
+        }
+        Matcher matcher = DATE_PATTERN.matcher(url);
+        return matcher.find() ? matcher.group(1) : null;
+    }
+
+    /**
+     * Turns a protocol-relative, root-relative or document-relative href into an
+     * absolute URL; hrefs already starting with {@code http} are returned unchanged
+     * (apart from trimming).
+     */
+    private String normalizeUrl(String url) {
+        if (url == null) {
+            return null;
+        }
+        String trimmed = url.trim();
+        if (trimmed.startsWith("//")) {
+            return "https:" + trimmed;
+        }
+        if (trimmed.startsWith("/")) {
+            return BASE_URL + trimmed;
+        }
+        if (!trimmed.startsWith("http")) {
+            return BASE_URL + "/" + trimmed;
+        }
+        return trimmed;
+    }
+
+    /**
+     * Accepts links that mention the site marker, plus root-relative paths.
+     * Mail, script and 360 web-scan links are always rejected.
+     */
+    private boolean isValidUrl(String url) {
+        if (url == null || url.isEmpty()) {
+            return false;
+        }
+
+        if (url.contains("mailto:") || url.contains("javascript:") || url.contains("webscan.360.cn")) {
+            return false;
+        }
+
+        // A "//host/path" href names another host, so it must not be treated as a
+        // root-relative path; it is only kept when it mentions the site marker.
+        boolean rootRelative = url.startsWith("/") && !url.startsWith("//");
+        return rootRelative || url.contains(SITE_MARKER);
+    }
+
+    /**
+     * Checks an already-cleaned title: 2 to 100 characters and not containing the
+     * strings "更多", "查看" or "&gt;&gt;".
+     */
+    private boolean isValidTitle(String cleaned) {
+        if (cleaned == null || cleaned.length() < 2 || cleaned.length() > 100) {
+            return false;
+        }
+        // The markers contain no cased letters, so no case folding is needed.
+        return !cleaned.contains("更多")
+            && !cleaned.contains("查看")
+            && !cleaned.contains(">>");
+    }
+
+    /**
+     * Strips tags and HTML entities from the text, then collapses whitespace and trims.
+     * Numeric entities are dropped; {@code &nbsp;} and named entities become a blank.
+     */
+    private String cleanText(String text) {
+        if (text == null) {
+            return null;
+        }
+        return text.replaceAll("<[^>]+>", "")
+                   .replaceAll("&nbsp;", " ")
+                   .replaceAll("&#[0-9]+;", "")
+                   .replaceAll("&[a-zA-Z]+;", " ")
+                   .replaceAll("\\s+", " ")
+                   .trim();
+    }
+}
--- a/project/crawler/TestCrawler.java
+++ b/project/crawler/TestCrawler.java
@ -0,0 +1,42 @@
+package com.crawler.crawler.impl;
+
+import com.crawler.crawler.BaseCrawler;
+import com.crawler.model.CrawlerData;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Minimal crawler that returns the first {@code MAX_RESULTS} plain-text anchors
+ * of a page without any URL or title filtering. Hrefs are reported exactly as
+ * they appear in the markup (no normalisation).
+ */
+public class TestCrawler extends BaseCrawler {
+    /** Upper bound on the number of entries returned by one {@link #parseHtml(String)} call. */
+    private static final int MAX_RESULTS = 10;
+
+    /**
+     * Matches {@code <a ... href="URL" ...>TEXT</a>} where TEXT contains no tags.
+     * Group 1 is the href, group 2 the anchor text. Compiled once for reuse.
+     */
+    private static final Pattern LINK_PATTERN = Pattern.compile(
+        "<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)</a>",
+        Pattern.CASE_INSENSITIVE
+    );
+
+    @Override
+    public String getCrawlerName() {
+        return "TestCrawler";
+    }
+
+    /**
+     * Extracts up to {@code MAX_RESULTS} links from the given HTML.
+     *
+     * @param html page markup; {@code null} or empty yields an empty list
+     * @return a mutable list of entries with title, URL and source set; never {@code null}
+     */
+    @Override
+    protected List<CrawlerData> parseHtml(String html) {
+        List<CrawlerData> results = new ArrayList<>();
+
+        // Guard first: calling replaceAll on a null page would throw.
+        if (html == null || html.isEmpty()) {
+            return results;
+        }
+
+        // Collapse whitespace, then drop blanks between adjacent tags.
+        String cleanHtml = html.replaceAll("\\s+", " ").replaceAll(">\\s*<", "><");
+        Matcher matcher = LINK_PATTERN.matcher(cleanHtml);
+
+        while (results.size() < MAX_RESULTS && matcher.find()) {
+            CrawlerData data = new CrawlerData();
+            data.setTitle(cleanText(matcher.group(2)));
+            data.setUrl(matcher.group(1));
+            // Tag the record with its origin, as the other crawlers in this package do.
+            data.setSource(getCrawlerName());
+            results.add(data);
+        }
+
+        return results;
+    }
+
+    /** Removes anything that looks like an HTML tag and trims the result. */
+    private String cleanText(String text) {
+        if (text == null) {
+            return null;
+        }
+        return text.replaceAll("<[^>]+>", "").trim();
+    }
+}