package strategy; import model.CrawlResult; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import exception.ParseException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class CsdnBlogStrategy extends AbstractCrawlStrategy { private static final Logger logger = LoggerFactory.getLogger(CsdnBlogStrategy.class); private static final String SITE_NAME = "CSDN博客"; private static final String[] CATEGORY_URLS = { "https://blog.csdn.net/nav/python?page=%d", "https://blog.csdn.net/nav/java?page=%d", "https://blog.csdn.net/nav/web?page=%d", "https://blog.csdn.net/nav/ai?page=%d", "https://blog.csdn.net/nav/database?page=%d", "https://blog.csdn.net/nav/ops?page=%d", "https://blog.csdn.net/nav/security?page=%d", "https://blog.csdn.net/nav/mobile?page=%d", "https://blog.csdn.net/nav/game?page=%d", "https://blog.csdn.net/nav/arch?page=%d" }; private static final String[] CATEGORY_NAMES = { "Python", "Java", "Web", "人工智能", "数据库", "运维", "安全", "移动开发", "游戏", "架构" }; @Override public String getBaseUrl() { return "https://blog.csdn.net/nav/python"; } @Override public String getSiteName() { return SITE_NAME; } @Override public List crawlPage(int page) throws IOException, ParseException { List results = new ArrayList<>(); int categoryIndex = (page - 1) / 5; int pageInCategory = (page - 1) % 5 + 1; if (categoryIndex >= CATEGORY_URLS.length) { logger.info("CSDN博客: 页码 {} 超出分类范围", page); return results; } String url = String.format(CATEGORY_URLS[categoryIndex], pageInCategory); String categoryName = CATEGORY_NAMES[categoryIndex]; logger.info("正在爬取 CSDN {} 分类第 {} 页: {}", categoryName, pageInCategory, url); Document doc = fetchDocument(url); if (doc != null) { List parsed = parseCsdnArticles(doc, url); results.addAll(parsed); logger.info("CSDN {} 分类第 {} 页解析完成,获取 {} 条数据", categoryName, pageInCategory, parsed.size()); } return results; } private List parseCsdnArticles(Document doc, String url) { List results = new ArrayList<>(); Elements articleItems = doc.select(".article-item"); if (articleItems.isEmpty()) { articleItems = doc.select(".feed_article"); } if (articleItems.isEmpty()) { articleItems = doc.select(".list_article"); } if (articleItems.isEmpty()) { articleItems = doc.select("div.article-item"); } if (articleItems.isEmpty()) { articleItems = doc.select(".article-list"); } if (articleItems.isEmpty()) { articleItems = doc.select("div.feed_article"); } if (articleItems.isEmpty()) { Elements articleItems2 = doc.select("[class*=article]"); for (Element item : articleItems2) { Element titleElem = item.selectFirst("h4 a"); if (titleElem == null) titleElem = item.selectFirst("h3 a"); if (titleElem == null) titleElem = item.selectFirst("a.article-title"); if (titleElem == null) titleElem = item.selectFirst("a.title"); if (titleElem != null) { articleItems.add(item); } } } for (Element item : articleItems) { try { CrawlResult result = parseArticleItem(item, url); if (result != null && result.getTitle() != null && !result.getTitle().isEmpty()) { results.add(result); } } catch (Exception e) { logger.debug("解析文章条目失败: {}", e.getMessage()); } } if (results.isEmpty()) { Elements articleList = doc.select(".article-list li"); for (Element li : articleList) { try { Element titleElem = li.selectFirst("h2 a"); if (titleElem == null) titleElem = li.selectFirst("h3 a"); if (titleElem == null) titleElem = li.selectFirst(".title a"); if (titleElem == null) titleElem = li.selectFirst("a"); if (titleElem != null) { String title = titleElem.text().trim(); String articleUrl = titleElem.attr("href"); Element descElem = li.selectFirst(".description"); if (descElem == null) descElem = li.selectFirst(".article-description"); if (descElem == null) descElem = li.selectFirst(".content"); String description = descElem != null ? descElem.text().trim() : ""; Element authorElem = li.selectFirst(".author"); if (authorElem == null) authorElem = li.selectFirst(".nick-name"); if (authorElem == null) authorElem = li.selectFirst("[class*=author]"); String author = authorElem != null ? authorElem.text().trim() : "CSDN用户"; if (!title.isEmpty()) { CrawlResult result = new CrawlResult( title, 0, 0, 10.0, articleUrl, author + " | " + description ); results.add(result); } } } catch (Exception e) { logger.debug("解析 li 文章失败: {}", e.getMessage()); } } } Elements articles = doc.select(".article"); for (Element article : articles) { try { Element titleElem = article.selectFirst("h4"); if (titleElem == null) titleElem = article.selectFirst("h3"); if (titleElem == null) titleElem = article.selectFirst(".article-title"); if (titleElem == null) titleElem = article.selectFirst("a"); String title = titleElem != null ? titleElem.text().trim() : ""; String articleUrl = titleElem != null ? titleElem.attr("href") : ""; Element descElem = article.selectFirst("p"); if (descElem == null) descElem = article.selectFirst(".description"); String description = descElem != null ? descElem.text().trim() : ""; Element authorElem = article.selectFirst(".author"); if (authorElem == null) authorElem = article.selectFirst(".nick-name"); String author = authorElem != null ? authorElem.text().trim() : "CSDN用户"; if (!title.isEmpty()) { CrawlResult result = new CrawlResult( title, 0, 0, 10.0, articleUrl, author + " | " + description ); results.add(result); } } catch (Exception e) { logger.debug("解析 article 失败: {}", e.getMessage()); } } return results; } private CrawlResult parseArticleItem(Element item, String url) { Element titleElem = item.selectFirst("h4 a"); if (titleElem == null) titleElem = item.selectFirst("h3 a"); if (titleElem == null) titleElem = item.selectFirst("a.article-title"); if (titleElem == null) titleElem = item.selectFirst("a.title"); if (titleElem == null) titleElem = item.selectFirst("a"); if (titleElem == null) { return null; } String title = titleElem.text().trim(); String articleUrl = titleElem.attr("href"); if (title.isEmpty()) { return null; } Element descElem = item.selectFirst(".article-description"); if (descElem == null) descElem = item.selectFirst(".description"); if (descElem == null) descElem = item.selectFirst(".content"); if (descElem == null) descElem = item.selectFirst("p"); String description = descElem != null ? descElem.text().trim() : ""; Element authorElem = item.selectFirst(".author"); if (authorElem == null) authorElem = item.selectFirst(".nick-name"); if (authorElem == null) authorElem = item.selectFirst(".user-name"); if (authorElem == null) authorElem = item.selectFirst("[class*=author]"); String author = authorElem != null ? authorElem.text().trim() : "CSDN用户"; Element dateElem = item.selectFirst(".date"); if (dateElem == null) dateElem = item.selectFirst(".time"); if (dateElem == null) dateElem = item.selectFirst("[class*=date]"); String date = dateElem != null ? dateElem.text().trim() : ""; String extraInfo = author; if (!date.isEmpty()) { extraInfo += " | " + date; } if (!description.isEmpty()) { extraInfo += " | " + description; } return new CrawlResult(title, 0, 0, 10.0, articleUrl, extraInfo); } @Override public CrawlResult parseItem(Element element) throws ParseException { Element titleElem = element.selectFirst("h4 a"); if (titleElem == null) titleElem = element.selectFirst("h3 a"); if (titleElem == null) titleElem = element.selectFirst("a"); String title = titleElem != null ? titleElem.text().trim() : ""; String url = titleElem != null ? titleElem.attr("href") : ""; if (title.isEmpty()) { return null; } Element authorElem = element.selectFirst(".author"); String author = authorElem != null ? authorElem.text().trim() : "CSDN用户"; return new CrawlResult(title, 0, 0, 10.0, url, author); } @Override public int getPageSize() { return 15; } @Override protected String getReferer(String url) { return "https://blog.csdn.net/"; } }