You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
271 lines
10 KiB
271 lines
10 KiB
package strategy;
|
|
|
|
import model.CrawlResult;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import exception.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
|
|
|
|
public class CsdnBlogStrategy extends AbstractCrawlStrategy {
|
|
private static final Logger logger = LoggerFactory.getLogger(CsdnBlogStrategy.class);
|
|
|
|
private static final String SITE_NAME = "CSDN博客";
|
|
|
|
private static final String[] CATEGORY_URLS = {
|
|
"https://blog.csdn.net/nav/python?page=%d",
|
|
"https://blog.csdn.net/nav/java?page=%d",
|
|
"https://blog.csdn.net/nav/web?page=%d",
|
|
"https://blog.csdn.net/nav/ai?page=%d",
|
|
"https://blog.csdn.net/nav/database?page=%d",
|
|
"https://blog.csdn.net/nav/ops?page=%d",
|
|
"https://blog.csdn.net/nav/security?page=%d",
|
|
"https://blog.csdn.net/nav/mobile?page=%d",
|
|
"https://blog.csdn.net/nav/game?page=%d",
|
|
"https://blog.csdn.net/nav/arch?page=%d"
|
|
};
|
|
|
|
private static final String[] CATEGORY_NAMES = {
|
|
"Python", "Java", "Web", "人工智能", "数据库",
|
|
"运维", "安全", "移动开发", "游戏", "架构"
|
|
};
|
|
|
|
@Override
|
|
public String getBaseUrl() {
|
|
return "https://blog.csdn.net/nav/python";
|
|
}
|
|
|
|
@Override
|
|
public String getSiteName() {
|
|
return SITE_NAME;
|
|
}
|
|
|
|
@Override
|
|
public List<CrawlResult> crawlPage(int page) throws IOException, ParseException {
|
|
List<CrawlResult> results = new ArrayList<>();
|
|
|
|
int categoryIndex = (page - 1) / 5;
|
|
int pageInCategory = (page - 1) % 5 + 1;
|
|
|
|
if (categoryIndex >= CATEGORY_URLS.length) {
|
|
logger.info("CSDN博客: 页码 {} 超出分类范围", page);
|
|
return results;
|
|
}
|
|
|
|
String url = String.format(CATEGORY_URLS[categoryIndex], pageInCategory);
|
|
String categoryName = CATEGORY_NAMES[categoryIndex];
|
|
logger.info("正在爬取 CSDN {} 分类第 {} 页: {}", categoryName, pageInCategory, url);
|
|
|
|
Document doc = fetchDocument(url);
|
|
|
|
if (doc != null) {
|
|
List<CrawlResult> parsed = parseCsdnArticles(doc, url);
|
|
results.addAll(parsed);
|
|
logger.info("CSDN {} 分类第 {} 页解析完成,获取 {} 条数据", categoryName, pageInCategory, parsed.size());
|
|
}
|
|
|
|
return results;
|
|
}
|
|
|
|
private List<CrawlResult> parseCsdnArticles(Document doc, String url) {
|
|
List<CrawlResult> results = new ArrayList<>();
|
|
|
|
Elements articleItems = doc.select(".article-item");
|
|
if (articleItems.isEmpty()) {
|
|
articleItems = doc.select(".feed_article");
|
|
}
|
|
if (articleItems.isEmpty()) {
|
|
articleItems = doc.select(".list_article");
|
|
}
|
|
if (articleItems.isEmpty()) {
|
|
articleItems = doc.select("div.article-item");
|
|
}
|
|
if (articleItems.isEmpty()) {
|
|
articleItems = doc.select(".article-list");
|
|
}
|
|
if (articleItems.isEmpty()) {
|
|
articleItems = doc.select("div.feed_article");
|
|
}
|
|
if (articleItems.isEmpty()) {
|
|
Elements articleItems2 = doc.select("[class*=article]");
|
|
for (Element item : articleItems2) {
|
|
Element titleElem = item.selectFirst("h4 a");
|
|
if (titleElem == null) titleElem = item.selectFirst("h3 a");
|
|
if (titleElem == null) titleElem = item.selectFirst("a.article-title");
|
|
if (titleElem == null) titleElem = item.selectFirst("a.title");
|
|
if (titleElem != null) {
|
|
articleItems.add(item);
|
|
}
|
|
}
|
|
}
|
|
|
|
for (Element item : articleItems) {
|
|
try {
|
|
CrawlResult result = parseArticleItem(item, url);
|
|
if (result != null && result.getTitle() != null && !result.getTitle().isEmpty()) {
|
|
results.add(result);
|
|
}
|
|
} catch (Exception e) {
|
|
logger.debug("解析文章条目失败: {}", e.getMessage());
|
|
}
|
|
}
|
|
|
|
if (results.isEmpty()) {
|
|
Elements articleList = doc.select(".article-list li");
|
|
for (Element li : articleList) {
|
|
try {
|
|
Element titleElem = li.selectFirst("h2 a");
|
|
if (titleElem == null) titleElem = li.selectFirst("h3 a");
|
|
if (titleElem == null) titleElem = li.selectFirst(".title a");
|
|
if (titleElem == null) titleElem = li.selectFirst("a");
|
|
|
|
if (titleElem != null) {
|
|
String title = titleElem.text().trim();
|
|
String articleUrl = titleElem.attr("href");
|
|
|
|
Element descElem = li.selectFirst(".description");
|
|
if (descElem == null) descElem = li.selectFirst(".article-description");
|
|
if (descElem == null) descElem = li.selectFirst(".content");
|
|
String description = descElem != null ? descElem.text().trim() : "";
|
|
|
|
Element authorElem = li.selectFirst(".author");
|
|
if (authorElem == null) authorElem = li.selectFirst(".nick-name");
|
|
if (authorElem == null) authorElem = li.selectFirst("[class*=author]");
|
|
String author = authorElem != null ? authorElem.text().trim() : "CSDN用户";
|
|
|
|
if (!title.isEmpty()) {
|
|
CrawlResult result = new CrawlResult(
|
|
title,
|
|
0,
|
|
0,
|
|
10.0,
|
|
articleUrl,
|
|
author + " | " + description
|
|
);
|
|
results.add(result);
|
|
}
|
|
}
|
|
} catch (Exception e) {
|
|
logger.debug("解析 li 文章失败: {}", e.getMessage());
|
|
}
|
|
}
|
|
}
|
|
|
|
Elements articles = doc.select(".article");
|
|
for (Element article : articles) {
|
|
try {
|
|
Element titleElem = article.selectFirst("h4");
|
|
if (titleElem == null) titleElem = article.selectFirst("h3");
|
|
if (titleElem == null) titleElem = article.selectFirst(".article-title");
|
|
if (titleElem == null) titleElem = article.selectFirst("a");
|
|
|
|
String title = titleElem != null ? titleElem.text().trim() : "";
|
|
String articleUrl = titleElem != null ? titleElem.attr("href") : "";
|
|
|
|
Element descElem = article.selectFirst("p");
|
|
if (descElem == null) descElem = article.selectFirst(".description");
|
|
String description = descElem != null ? descElem.text().trim() : "";
|
|
|
|
Element authorElem = article.selectFirst(".author");
|
|
if (authorElem == null) authorElem = article.selectFirst(".nick-name");
|
|
String author = authorElem != null ? authorElem.text().trim() : "CSDN用户";
|
|
|
|
if (!title.isEmpty()) {
|
|
CrawlResult result = new CrawlResult(
|
|
title,
|
|
0,
|
|
0,
|
|
10.0,
|
|
articleUrl,
|
|
author + " | " + description
|
|
);
|
|
results.add(result);
|
|
}
|
|
} catch (Exception e) {
|
|
logger.debug("解析 article 失败: {}", e.getMessage());
|
|
}
|
|
}
|
|
|
|
return results;
|
|
}
|
|
|
|
private CrawlResult parseArticleItem(Element item, String url) {
|
|
Element titleElem = item.selectFirst("h4 a");
|
|
if (titleElem == null) titleElem = item.selectFirst("h3 a");
|
|
if (titleElem == null) titleElem = item.selectFirst("a.article-title");
|
|
if (titleElem == null) titleElem = item.selectFirst("a.title");
|
|
if (titleElem == null) titleElem = item.selectFirst("a");
|
|
|
|
if (titleElem == null) {
|
|
return null;
|
|
}
|
|
|
|
String title = titleElem.text().trim();
|
|
String articleUrl = titleElem.attr("href");
|
|
|
|
if (title.isEmpty()) {
|
|
return null;
|
|
}
|
|
|
|
Element descElem = item.selectFirst(".article-description");
|
|
if (descElem == null) descElem = item.selectFirst(".description");
|
|
if (descElem == null) descElem = item.selectFirst(".content");
|
|
if (descElem == null) descElem = item.selectFirst("p");
|
|
String description = descElem != null ? descElem.text().trim() : "";
|
|
|
|
Element authorElem = item.selectFirst(".author");
|
|
if (authorElem == null) authorElem = item.selectFirst(".nick-name");
|
|
if (authorElem == null) authorElem = item.selectFirst(".user-name");
|
|
if (authorElem == null) authorElem = item.selectFirst("[class*=author]");
|
|
String author = authorElem != null ? authorElem.text().trim() : "CSDN用户";
|
|
|
|
Element dateElem = item.selectFirst(".date");
|
|
if (dateElem == null) dateElem = item.selectFirst(".time");
|
|
if (dateElem == null) dateElem = item.selectFirst("[class*=date]");
|
|
String date = dateElem != null ? dateElem.text().trim() : "";
|
|
|
|
String extraInfo = author;
|
|
if (!date.isEmpty()) {
|
|
extraInfo += " | " + date;
|
|
}
|
|
if (!description.isEmpty()) {
|
|
extraInfo += " | " + description;
|
|
}
|
|
|
|
return new CrawlResult(title, 0, 0, 10.0, articleUrl, extraInfo);
|
|
}
|
|
|
|
@Override
|
|
public CrawlResult parseItem(Element element) throws ParseException {
|
|
Element titleElem = element.selectFirst("h4 a");
|
|
if (titleElem == null) titleElem = element.selectFirst("h3 a");
|
|
if (titleElem == null) titleElem = element.selectFirst("a");
|
|
|
|
String title = titleElem != null ? titleElem.text().trim() : "";
|
|
String url = titleElem != null ? titleElem.attr("href") : "";
|
|
|
|
if (title.isEmpty()) {
|
|
return null;
|
|
}
|
|
|
|
Element authorElem = element.selectFirst(".author");
|
|
String author = authorElem != null ? authorElem.text().trim() : "CSDN用户";
|
|
|
|
return new CrawlResult(title, 0, 0, 10.0, url, author);
|
|
}
|
|
|
|
@Override
|
|
public int getPageSize() {
|
|
return 15;
|
|
}
|
|
|
|
@Override
|
|
protected String getReferer(String url) {
|
|
return "https://blog.csdn.net/";
|
|
}
|
|
}
|