diff --git a/w10/DefaultStrategy.java b/w10/DefaultStrategy.java new file mode 100644 index 0000000..4e7d0e6 --- /dev/null +++ b/w10/DefaultStrategy.java @@ -0,0 +1,47 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.util.ArrayList; +import java.util.List; + +public class DefaultStrategy implements CrawlStrategy { + @Override + public boolean supports(String url) { + return true; + } + + @Override + public int getPriority() { + return 0; + } + + @Override + public List
parse(String url, Document doc) { + List
articles = new ArrayList<>(); + + Elements titles = doc.select("h1, h2, h3, .title, .article-title, [class*=title], [id*=title]"); + for (Element e : titles) { + String title = e.text().trim(); + if (!title.isEmpty()) { + articles.add(new Article(title, url, "")); + } + } + + if (articles.isEmpty()) { + Elements links = doc.select("a[href]"); + for (Element link : links) { + String linkText = link.text().trim(); + String linkUrl = link.attr("abs:href"); + if (!linkText.isEmpty()) { + articles.add(new Article(linkText, linkUrl, "")); + } + } + } + + return articles; + } +} \ No newline at end of file