4 changed files with 389 additions and 0 deletions
@ -0,0 +1,103 @@ |
|||||
|
package com.crawler.crawler.impl; |
||||
|
|
||||
|
import com.crawler.crawler.BaseCrawler;
import com.crawler.model.CrawlerData;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
||||
|
|
||||
|
public class HunanUniversityCrawler extends BaseCrawler { |
||||
|
private static final String BASE_URL = "https://www.hnu.edu.cn"; |
||||
|
|
||||
|
@Override |
||||
|
public String getCrawlerName() { |
||||
|
return "HunanUniversityCrawler"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<CrawlerData> parseHtml(String html) { |
||||
|
List<CrawlerData> results = new ArrayList<>(); |
||||
|
|
||||
|
if (html == null || html.isEmpty()) { |
||||
|
System.out.println("警告: HTML内容为空"); |
||||
|
return results; |
||||
|
} |
||||
|
|
||||
|
String cleanHtml = html.replaceAll("\\s+", " "); |
||||
|
|
||||
|
Pattern newsPattern = Pattern.compile( |
||||
|
"<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>\\s*([^<]{4,80})\\s*</a>", |
||||
|
Pattern.CASE_INSENSITIVE |
||||
|
); |
||||
|
|
||||
|
Matcher matcher = newsPattern.matcher(cleanHtml); |
||||
|
|
||||
|
while (matcher.find() && results.size() < 30) { |
||||
|
String url = matcher.group(1); |
||||
|
String title = matcher.group(2).trim(); |
||||
|
|
||||
|
if (isValidUrl(url) && isValidTitle(title)) { |
||||
|
url = normalizeUrl(url); |
||||
|
|
||||
|
CrawlerData data = new CrawlerData(); |
||||
|
data.setTitle(cleanText(title)); |
||||
|
data.setUrl(url); |
||||
|
data.setSource(getCrawlerName()); |
||||
|
results.add(data); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return results; |
||||
|
} |
||||
|
|
||||
|
private String normalizeUrl(String url) { |
||||
|
if (url == null) return null; |
||||
|
if (url.startsWith("//")) { |
||||
|
return "https:" + url; |
||||
|
} |
||||
|
if (url.startsWith("/")) { |
||||
|
return BASE_URL + url; |
||||
|
} |
||||
|
if (!url.startsWith("http")) { |
||||
|
return BASE_URL + "/" + url; |
||||
|
} |
||||
|
return url; |
||||
|
} |
||||
|
|
||||
|
private boolean isValidUrl(String url) { |
||||
|
if (url == null || url.isEmpty()) { |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
if (url.contains("webscan.360.cn") || url.contains("mailto:") || url.contains("javascript:")) { |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
return url.contains("hnu.edu.cn") || url.startsWith("/"); |
||||
|
} |
||||
|
|
||||
|
private boolean isValidTitle(String title) { |
||||
|
if (title == null || title.isEmpty()) { |
||||
|
return false; |
||||
|
} |
||||
|
String cleaned = cleanText(title); |
||||
|
return cleaned != null && |
||||
|
cleaned.length() >= 4 && |
||||
|
cleaned.length() <= 80 && |
||||
|
!cleaned.contains("360") && |
||||
|
!cleaned.contains("网站安全") && |
||||
|
!cleaned.contains("网站测"); |
||||
|
} |
||||
|
|
||||
|
private String cleanText(String text) { |
||||
|
if (text == null) return null; |
||||
|
return text.replaceAll("<[^>]+>", "") |
||||
|
.replaceAll(" ", " ") |
||||
|
.replaceAll("&#[0-9]+;", "") |
||||
|
.replaceAll("&[a-zA-Z]+;", " ") |
||||
|
.replaceAll("\\s+", " ") |
||||
|
.trim(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,112 @@ |
|||||
|
package com.crawler.crawler.impl; |
||||
|
|
||||
|
import com.crawler.crawler.BaseCrawler; |
||||
|
import com.crawler.model.CrawlerData; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.HashSet; |
||||
|
import java.util.List; |
||||
|
import java.util.Set; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class HunanUniversityNewsCrawler extends BaseCrawler { |
||||
|
private static final String BASE_URL = "https://news.hnu.edu.cn"; |
||||
|
|
||||
|
@Override |
||||
|
public String getCrawlerName() { |
||||
|
return "HunanUniversityNewsCrawler"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<CrawlerData> parseHtml(String html) { |
||||
|
List<CrawlerData> results = new ArrayList<>(); |
||||
|
Set<String> seenUrls = new HashSet<>(); |
||||
|
|
||||
|
if (html == null || html.isEmpty()) { |
||||
|
return results; |
||||
|
} |
||||
|
|
||||
|
Pattern linkPattern = Pattern.compile( |
||||
|
"<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)</a>", |
||||
|
Pattern.CASE_INSENSITIVE |
||||
|
); |
||||
|
|
||||
|
Matcher matcher = linkPattern.matcher(html); |
||||
|
|
||||
|
while (matcher.find() && results.size() < 30) { |
||||
|
String url = matcher.group(1); |
||||
|
String title = matcher.group(2).trim(); |
||||
|
|
||||
|
if (isValidUrl(url) && isValidTitle(title) && !seenUrls.contains(url)) { |
||||
|
url = normalizeUrl(url); |
||||
|
|
||||
|
CrawlerData data = new CrawlerData(); |
||||
|
data.setTitle(cleanText(title)); |
||||
|
data.setUrl(url); |
||||
|
data.setSource(getCrawlerName()); |
||||
|
results.add(data); |
||||
|
seenUrls.add(url); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return results; |
||||
|
} |
||||
|
|
||||
|
private String normalizeUrl(String url) { |
||||
|
if (url == null) return null; |
||||
|
url = url.trim(); |
||||
|
if (url.startsWith("//")) { |
||||
|
return "https:" + url; |
||||
|
} |
||||
|
if (url.startsWith("/")) { |
||||
|
return BASE_URL + url; |
||||
|
} |
||||
|
if (!url.startsWith("http")) { |
||||
|
return BASE_URL + "/" + url; |
||||
|
} |
||||
|
return url; |
||||
|
} |
||||
|
|
||||
|
private boolean isValidUrl(String url) { |
||||
|
if (url == null || url.isEmpty()) { |
||||
|
return false; |
||||
|
} |
||||
|
if (url.contains("mailto:") || url.contains("javascript:")) { |
||||
|
return false; |
||||
|
} |
||||
|
if (url.contains("webscan.360.cn")) { |
||||
|
return false; |
||||
|
} |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
private boolean isValidTitle(String title) { |
||||
|
if (title == null || title.isEmpty()) { |
||||
|
return false; |
||||
|
} |
||||
|
String cleaned = cleanText(title); |
||||
|
if (cleaned == null || cleaned.length() < 2) { |
||||
|
return false; |
||||
|
} |
||||
|
if (cleaned.length() > 100) { |
||||
|
return false; |
||||
|
} |
||||
|
String lower = cleaned.toLowerCase(); |
||||
|
if (lower.contains("更多") || lower.contains("查看")) { |
||||
|
return false; |
||||
|
} |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
private String cleanText(String text) { |
||||
|
if (text == null) return null; |
||||
|
return text.replaceAll("<[^>]+>", "") |
||||
|
.replaceAll(" ", " ") |
||||
|
.replaceAll("&#[0-9]+;", "") |
||||
|
.replaceAll("&[a-zA-Z]+;", " ") |
||||
|
.replaceAll("\\s+", " ") |
||||
|
.replaceAll("[<>]", "") |
||||
|
.trim(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,132 @@ |
|||||
|
package com.crawler.crawler.impl; |
||||
|
|
||||
|
import com.crawler.crawler.BaseCrawler; |
||||
|
import com.crawler.model.CrawlerData; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.HashSet; |
||||
|
import java.util.List; |
||||
|
import java.util.Set; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class MountBladeCrawler extends BaseCrawler { |
||||
|
private static final String BASE_URL = "https://www.mountblade.com.cn"; |
||||
|
|
||||
|
@Override |
||||
|
public String getCrawlerName() { |
||||
|
return "MountBladeCrawler"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<CrawlerData> parseHtml(String html) { |
||||
|
List<CrawlerData> results = new ArrayList<>(); |
||||
|
Set<String> seenUrls = new HashSet<>(); |
||||
|
|
||||
|
if (html == null || html.isEmpty()) { |
||||
|
return results; |
||||
|
} |
||||
|
|
||||
|
String cleanHtml = html.replaceAll("\\s+", " "); |
||||
|
|
||||
|
Pattern linkPattern = Pattern.compile( |
||||
|
"<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)</a>", |
||||
|
Pattern.CASE_INSENSITIVE |
||||
|
); |
||||
|
|
||||
|
Matcher matcher = linkPattern.matcher(cleanHtml); |
||||
|
|
||||
|
int count = 0; |
||||
|
while (matcher.find() && count < 30) { |
||||
|
String url = matcher.group(1); |
||||
|
String title = matcher.group(2).trim(); |
||||
|
|
||||
|
if (isValidUrl(url) && isValidTitle(title) && !seenUrls.contains(url)) { |
||||
|
url = normalizeUrl(url); |
||||
|
|
||||
|
CrawlerData data = new CrawlerData(); |
||||
|
data.setTitle(cleanText(title)); |
||||
|
data.setUrl(url); |
||||
|
data.setSource(getCrawlerName()); |
||||
|
data.setPublishDate(extractDateFromUrl(url)); |
||||
|
results.add(data); |
||||
|
seenUrls.add(url); |
||||
|
count++; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return results; |
||||
|
} |
||||
|
|
||||
|
private String extractDateFromUrl(String url) { |
||||
|
if (url == null) return null; |
||||
|
|
||||
|
Pattern datePattern = Pattern.compile("/(\\d{4}-\\d{2}-\\d{2})/"); |
||||
|
Matcher matcher = datePattern.matcher(url); |
||||
|
|
||||
|
if (matcher.find()) { |
||||
|
return matcher.group(1); |
||||
|
} |
||||
|
|
||||
|
return null; |
||||
|
} |
||||
|
|
||||
|
private String normalizeUrl(String url) { |
||||
|
if (url == null) return null; |
||||
|
url = url.trim(); |
||||
|
if (url.startsWith("//")) { |
||||
|
return "https:" + url; |
||||
|
} |
||||
|
if (url.startsWith("/")) { |
||||
|
return BASE_URL + url; |
||||
|
} |
||||
|
if (!url.startsWith("http")) { |
||||
|
return BASE_URL + "/" + url; |
||||
|
} |
||||
|
return url; |
||||
|
} |
||||
|
|
||||
|
private boolean isValidUrl(String url) { |
||||
|
if (url == null || url.isEmpty()) { |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
if (url.contains("mailto:") || url.contains("javascript:")) { |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
if (url.contains("webscan.360.cn")) { |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
return url.contains("mountblade") || url.startsWith("/"); |
||||
|
} |
||||
|
|
||||
|
private boolean isValidTitle(String title) { |
||||
|
if (title == null || title.isEmpty()) { |
||||
|
return false; |
||||
|
} |
||||
|
String cleaned = cleanText(title); |
||||
|
if (cleaned == null || cleaned.length() < 2) { |
||||
|
return false; |
||||
|
} |
||||
|
if (cleaned.length() > 100) { |
||||
|
return false; |
||||
|
} |
||||
|
String lower = cleaned.toLowerCase(); |
||||
|
if (lower.contains("更多") || lower.contains("查看") || lower.contains(">>")) { |
||||
|
return false; |
||||
|
} |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
private String cleanText(String text) { |
||||
|
if (text == null) return null; |
||||
|
return text.replaceAll("<[^>]+>", "") |
||||
|
.replaceAll(" ", " ") |
||||
|
.replaceAll("&#[0-9]+;", "") |
||||
|
.replaceAll("&[a-zA-Z]+;", " ") |
||||
|
.replaceAll("\\s+", " ") |
||||
|
.trim(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,42 @@ |
|||||
|
package com.crawler.crawler.impl; |
||||
|
|
||||
|
import com.crawler.crawler.BaseCrawler; |
||||
|
import com.crawler.model.CrawlerData; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class TestCrawler extends BaseCrawler { |
||||
|
@Override |
||||
|
public String getCrawlerName() { |
||||
|
return "TestCrawler"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<CrawlerData> parseHtml(String html) { |
||||
|
List<CrawlerData> results = new ArrayList<>(); |
||||
|
|
||||
|
String cleanHtml = html.replaceAll("\\s+", " ").replaceAll(">\\s*<", "><"); |
||||
|
|
||||
|
Pattern linkPattern = Pattern.compile("<a\\s+[^>]*href\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>([^<]+)</a>", Pattern.CASE_INSENSITIVE); |
||||
|
Matcher matcher = linkPattern.matcher(cleanHtml); |
||||
|
|
||||
|
int count = 0; |
||||
|
while (matcher.find() && count < 10) { |
||||
|
CrawlerData data = new CrawlerData(); |
||||
|
data.setTitle(cleanText(matcher.group(2))); |
||||
|
data.setUrl(matcher.group(1)); |
||||
|
results.add(data); |
||||
|
count++; |
||||
|
} |
||||
|
|
||||
|
return results; |
||||
|
} |
||||
|
|
||||
|
private String cleanText(String text) { |
||||
|
if (text == null) return null; |
||||
|
return text.replaceAll("<[^>]+>", "").trim(); |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue