You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

271 lines
10 KiB

package strategy;
import model.CrawlResult;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import exception.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawl strategy for the CSDN blog category listing pages.
 *
 * <p>Global page numbers are mapped onto the categories in {@link #CATEGORY_URLS}:
 * every {@value #PAGES_PER_CATEGORY} consecutive pages belong to one category, so
 * page 1 is the first page of the first category and page 6 is the first page of
 * the second category. Pages outside the covered range yield an empty result.
 *
 * <p>The listing markup is probed with several alternative CSS selectors because
 * the class names differ between layouts; see {@link #parseCsdnArticles(Document)}.
 */
public class CsdnBlogStrategy extends AbstractCrawlStrategy {
    private static final Logger logger = LoggerFactory.getLogger(CsdnBlogStrategy.class);
    private static final String SITE_NAME = "CSDN博客";

    /** Author label used when no author element is found. */
    private static final String DEFAULT_AUTHOR = "CSDN用户";

    /** Separator between the parts of the extra-info string. */
    private static final String INFO_SEPARATOR = " | ";

    /** Number of consecutive global page numbers mapped onto each category. */
    private static final int PAGES_PER_CATEGORY = 5;

    /** Category listing URL templates; {@code %d} is the page number within the category. */
    private static final String[] CATEGORY_URLS = {
        "https://blog.csdn.net/nav/python?page=%d",
        "https://blog.csdn.net/nav/java?page=%d",
        "https://blog.csdn.net/nav/web?page=%d",
        "https://blog.csdn.net/nav/ai?page=%d",
        "https://blog.csdn.net/nav/database?page=%d",
        "https://blog.csdn.net/nav/ops?page=%d",
        "https://blog.csdn.net/nav/security?page=%d",
        "https://blog.csdn.net/nav/mobile?page=%d",
        "https://blog.csdn.net/nav/game?page=%d",
        "https://blog.csdn.net/nav/arch?page=%d"
    };

    /** Display names used in log output; index-aligned with {@link #CATEGORY_URLS}. */
    private static final String[] CATEGORY_NAMES = {
        "Python", "Java", "Web", "人工智能", "数据库",
        "运维", "安全", "移动开发", "游戏", "架构"
    };

    /** Entry selectors tried in order; the first one that matches anything wins. */
    private static final String[] ITEM_SELECTORS = {
        ".article-item", ".feed_article", ".list_article"
    };

    /** Title selectors that identify a loosely matched element as an article entry. */
    private static final String[] LINKED_TITLE_SELECTORS = {
        "h4 a", "h3 a", "a.article-title", "a.title"
    };

    /** Selectors for entries found through {@link #ITEM_SELECTORS} and the loose fallback. */
    private static final SelectorSet ITEM_LAYOUT = new SelectorSet(
        new String[] {"h4 a", "h3 a", "a.article-title", "a.title", "a"},
        new String[] {".article-description", ".description", ".content", "p"},
        new String[] {".author", ".nick-name", ".user-name", "[class*=author]"},
        new String[] {".date", ".time", "[class*=date]"});

    /** Selectors for {@code .article-list li} entries. */
    private static final SelectorSet LIST_LAYOUT = new SelectorSet(
        new String[] {"h2 a", "h3 a", ".title a", "a"},
        new String[] {".description", ".article-description", ".content"},
        new String[] {".author", ".nick-name", "[class*=author]"},
        new String[0]);

    /** Selectors for {@code .article} blocks; the title may be a heading without a link. */
    private static final SelectorSet BLOCK_LAYOUT = new SelectorSet(
        new String[] {"h4", "h3", ".article-title", "a"},
        new String[] {"p", ".description"},
        new String[] {".author", ".nick-name"},
        new String[0]);

    /** Ordered selector alternatives for the fields of one article entry. */
    private static final class SelectorSet {
        final String[] title;
        final String[] description;
        final String[] author;
        final String[] date;

        SelectorSet(String[] title, String[] description, String[] author, String[] date) {
            this.title = title;
            this.description = description;
            this.author = author;
            this.date = date;
        }
    }

    @Override
    public String getBaseUrl() {
        return "https://blog.csdn.net/nav/python";
    }

    @Override
    public String getSiteName() {
        return SITE_NAME;
    }

    /**
     * Fetches and parses one listing page.
     *
     * @param page 1-based global page number
     * @return the parsed articles; empty when {@code page} is below 1, beyond the
     *         last category, or when no document was obtained
     * @throws IOException    declared for the inherited fetch call
     * @throws ParseException declared by the overridden method
     */
    @Override
    public List<CrawlResult> crawlPage(int page) throws IOException, ParseException {
        List<CrawlResult> results = new ArrayList<>();
        // Without this guard a page <= -4 produces a negative array index and
        // pages -3..0 request a non-positive page number.
        if (page < 1) {
            logger.warn("CSDN博客: 无效页码 {}", page);
            return results;
        }
        int categoryIndex = (page - 1) / PAGES_PER_CATEGORY;
        int pageInCategory = (page - 1) % PAGES_PER_CATEGORY + 1;
        if (categoryIndex >= CATEGORY_URLS.length) {
            logger.info("CSDN博客: 页码 {} 超出分类范围", page);
            return results;
        }
        String url = String.format(CATEGORY_URLS[categoryIndex], pageInCategory);
        String categoryName = CATEGORY_NAMES[categoryIndex];
        logger.info("正在爬取 CSDN {} 分类第 {} 页: {}", categoryName, pageInCategory, url);
        // fetchDocument is inherited; the null check is kept from the original code.
        Document doc = fetchDocument(url);
        if (doc != null) {
            List<CrawlResult> parsed = parseCsdnArticles(doc);
            results.addAll(parsed);
            logger.info("CSDN {} 分类第 {} 页解析完成,获取 {} 条数据", categoryName, pageInCategory, parsed.size());
        }
        return results;
    }

    /**
     * Extracts all article entries from a listing document.
     *
     * <p>Order of attempts:
     * <ol>
     *   <li>the first of {@link #ITEM_SELECTORS} that matches anything;</li>
     *   <li>if none matched and the page has no {@code .article-list li} entries:
     *       the {@code .article-list} container itself, then any element whose class
     *       contains "article" and that holds a linked title;</li>
     *   <li>if nothing was parsed so far: the {@code .article-list li} entries;</li>
     *   <li>always, in addition: every {@code .article} block.</li>
     * </ol>
     * Entries that fail to parse are skipped and logged at debug level.
     */
    private List<CrawlResult> parseCsdnArticles(Document doc) {
        List<CrawlResult> results = new ArrayList<>();

        Elements items = selectFirstNonEmpty(doc, ITEM_SELECTORS);
        Elements listEntries = doc.select(".article-list li");
        if (items.isEmpty() && listEntries.isEmpty()) {
            // Only treat the list container as a single entry when it has no <li>
            // children; otherwise it would yield one result and hide the others.
            items = doc.select(".article-list");
            if (items.isEmpty()) {
                items = selectLooseArticleElements(doc);
            }
        }
        collect(items, ITEM_LAYOUT, "解析文章条目失败: {}", results);

        if (results.isEmpty()) {
            collect(listEntries, LIST_LAYOUT, "解析 li 文章失败: {}", results);
        }

        // NOTE(review): this pass is unconditional, as in the original code, so its
        // results are appended to whatever the earlier passes found.
        collect(doc.select(".article"), BLOCK_LAYOUT, "解析 article 失败: {}", results);
        return results;
    }

    /** Returns the matches of the first selector that matches anything, else an empty set. */
    private static Elements selectFirstNonEmpty(Document doc, String[] selectors) {
        for (String selector : selectors) {
            Elements found = doc.select(selector);
            if (!found.isEmpty()) {
                return found;
            }
        }
        return new Elements();
    }

    /** Collects elements whose class contains "article" and that hold a linked title. */
    private static Elements selectLooseArticleElements(Document doc) {
        Elements matches = new Elements();
        for (Element candidate : doc.select("[class*=article]")) {
            if (firstMatch(candidate, LINKED_TITLE_SELECTORS) != null) {
                matches.add(candidate);
            }
        }
        return matches;
    }

    /** Parses every entry with the given selectors and appends the successful ones to {@code sink}. */
    private static void collect(Elements entries, SelectorSet selectors, String failureMessage,
                                List<CrawlResult> sink) {
        for (Element entry : entries) {
            try {
                CrawlResult parsed = buildResult(entry, selectors);
                if (parsed != null) {
                    sink.add(parsed);
                }
            } catch (Exception e) {
                // Best effort: one malformed entry must not abort the page. The
                // throwable is passed last so the stack trace is logged as well.
                logger.debug(failureMessage, e.getMessage(), e);
            }
        }
    }

    /**
     * Builds a result from one entry element.
     *
     * @return the result, or {@code null} when no non-empty title is found
     */
    private static CrawlResult buildResult(Element entry, SelectorSet selectors) {
        Element titleElem = firstMatch(entry, selectors.title);
        if (titleElem == null) {
            return null;
        }
        String title = titleElem.text().trim();
        if (title.isEmpty()) {
            return null;
        }
        String articleUrl = resolveUrl(titleElem, entry);
        String description = textOrDefault(firstMatch(entry, selectors.description), "");
        String author = textOrDefault(firstMatch(entry, selectors.author), DEFAULT_AUTHOR);
        String date = textOrDefault(firstMatch(entry, selectors.date), "");

        // Empty parts are omitted so the string never ends with a dangling separator.
        StringBuilder extraInfo = new StringBuilder(author);
        if (!date.isEmpty()) {
            extraInfo.append(INFO_SEPARATOR).append(date);
        }
        if (!description.isEmpty()) {
            extraInfo.append(INFO_SEPARATOR).append(description);
        }
        // The numeric arguments (0, 0, 10.0) are carried over unchanged; their
        // meaning is defined by CrawlResult — TODO confirm.
        return new CrawlResult(title, 0, 0, 10.0, articleUrl, extraInfo.toString());
    }

    /** Returns the first element matched by any of the selectors, tried in order, or {@code null}. */
    private static Element firstMatch(Element scope, String[] selectors) {
        for (String selector : selectors) {
            Element found = scope.selectFirst(selector);
            if (found != null) {
                return found;
            }
        }
        return null;
    }

    /** Returns the trimmed text of {@code element}, or {@code fallback} when it is {@code null}. */
    private static String textOrDefault(Element element, String fallback) {
        return element != null ? element.text().trim() : fallback;
    }

    /**
     * Resolves the article link for a title element.
     *
     * <p>When the title element is not itself an anchor (for example an {@code h4}),
     * the link is taken from an anchor inside it or, failing that, from an anchor
     * enclosing it within {@code scope}. Relative links are made absolute against
     * the document base URI when possible; otherwise the raw attribute is returned.
     *
     * @return the link, or an empty string when none is found
     */
    private static String resolveUrl(Element titleElem, Element scope) {
        Element anchor = titleElem;
        if (!"a".equalsIgnoreCase(titleElem.tagName())) {
            anchor = titleElem.selectFirst("a[href]");
            if (anchor == null) {
                anchor = enclosingAnchor(titleElem, scope);
            }
            if (anchor == null) {
                return "";
            }
        }
        String absolute = anchor.absUrl("href");
        return absolute.isEmpty() ? anchor.attr("href") : absolute;
    }

    /** Walks up from {@code start} to {@code scope} (inclusive) looking for an anchor with an href. */
    private static Element enclosingAnchor(Element start, Element scope) {
        Element current = start;
        while (current != scope && current.parent() != null) {
            current = current.parent();
            if ("a".equalsIgnoreCase(current.tagName()) && current.hasAttr("href")) {
                return current;
            }
        }
        return null;
    }

    /**
     * Parses a single element into a result carrying only title, link and author.
     *
     * @param element the element to parse
     * @return the result, or {@code null} when no non-empty title is found
     * @throws ParseException declared by the overridden method; not thrown here
     */
    @Override
    public CrawlResult parseItem(Element element) throws ParseException {
        Element titleElem = firstMatch(element, new String[] {"h4 a", "h3 a", "a"});
        if (titleElem == null) {
            return null;
        }
        String title = titleElem.text().trim();
        if (title.isEmpty()) {
            return null;
        }
        String url = resolveUrl(titleElem, element);
        String author = textOrDefault(element.selectFirst(".author"), DEFAULT_AUTHOR);
        return new CrawlResult(title, 0, 0, 10.0, url, author);
    }

    @Override
    public int getPageSize() {
        return 15;
    }

    @Override
    protected String getReferer(String url) {
        return "https://blog.csdn.net/";
    }
}