From 13f7033c614ae0541b6457269ff8d060a8ae3de2 Mon Sep 17 00:00:00 2001 From: zhangsiyuan <3837703520@qq.com> Date: Thu, 7 May 2026 19:58:26 +0800 Subject: [PATCH] =?UTF-8?q?w10-=E5=BC=A0=E6=80=9D=E6=B8=8A-202401070104?= =?UTF-8?q?=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- w10/DefaultStrategy.java | 47 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 w10/DefaultStrategy.java diff --git a/w10/DefaultStrategy.java b/w10/DefaultStrategy.java new file mode 100644 index 0000000..4e7d0e6 --- /dev/null +++ b/w10/DefaultStrategy.java @@ -0,0 +1,47 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.util.ArrayList; +import java.util.List; + +public class DefaultStrategy implements CrawlStrategy { + @Override + public boolean supports(String url) { + return true; + } + + @Override + public int getPriority() { + return 0; + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + + Elements titles = doc.select("h1, h2, h3, .title, .article-title, [class*=title], [id*=title]"); + for (Element e : titles) { + String title = e.text().trim(); + if (!title.isEmpty()) { + articles.add(new Article(title, url, "")); + } + } + + if (articles.isEmpty()) { + Elements links = doc.select("a[href]"); + for (Element link : links) { + String linkText = link.text().trim(); + String linkUrl = link.attr("abs:href"); + if (!linkText.isEmpty()) { + articles.add(new Article(linkText, linkUrl, "")); + } + } + } + + return articles; + } +} \ No newline at end of file