diff --git a/w12/BlogStrategy.java b/w12/BlogStrategy.java
new file mode 100644
index 0000000..b690f2a
--- /dev/null
+++ b/w12/BlogStrategy.java
@@ -0,0 +1,59 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+/**
+ * Crawl strategy for pages whose URL matches {@code http(s)://blog.example.com/...}.
+ *
+ * <p>Each element selected by {@code .post-title} becomes one {@link Article}.
+ */
+public class BlogStrategy implements CrawlStrategy {
+    /** Full-match pattern for the URLs this strategy accepts. */
+    private static final Pattern URL_PATTERN = Pattern.compile("https?://blog\\.example\\.com/.*");
+
+    /**
+     * Reports whether this strategy accepts the given URL.
+     *
+     * @param url candidate URL; {@code null} is reported as unsupported
+     * @return {@code true} when the whole URL matches {@code URL_PATTERN}
+     */
+    @Override
+    public boolean supports(String url) {
+        // Pattern.matcher(null) throws NullPointerException, so reject null up front.
+        return url != null && URL_PATTERN.matcher(url).matches();
+    }
+
+    /**
+     * Builds one article per {@code .post-title} element of the page.
+     *
+     * @param url page URL, passed as the second argument of every created article
+     * @param doc parsed page to search
+     * @return the created articles in selection order; empty when nothing matches
+     * @throws ParseException declared by {@link CrawlStrategy}; never thrown here
+     */
+    @Override
+    public List<Article> parse(String url, Document doc) throws ParseException {
+        Elements titles = doc.select(".post-title");
+        List<Article> articles = new ArrayList<>(titles.size());
+        for (Element title : titles) {
+            // Only the title text is extracted; the third argument stays empty.
+            articles.add(new Article(title.text(), url, ""));
+        }
+        return articles;
+    }
+
+    /**
+     * @return the fixed priority {@code 2} of this strategy
+     */
+    @Override
+    public int getPriority() {
+        return 2;
+    }
+}
diff --git a/w12/CrawlStrategy.java b/w12/CrawlStrategy.java
new file mode 100644
index 0000000..c4afe0a
--- /dev/null
+++ b/w12/CrawlStrategy.java
@@ -0,0 +1,38 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import java.util.List;
+
+/**
+ * Contract for turning a parsed page into {@link Article} objects.
+ */
+public interface CrawlStrategy {
+    /**
+     * Extracts articles from a parsed page.
+     *
+     * @param url URL associated with {@code doc}
+     * @param doc parsed page
+     * @return the extracted articles
+     * @throws ParseException when the implementation cannot parse the page
+     */
+    List<Article> parse(String url, Document doc) throws ParseException;
+
+    /**
+     * Reports whether this strategy can handle the given URL.
+     *
+     * @param url candidate URL
+     * @return {@code true} when the strategy accepts {@code url}
+     */
+    boolean supports(String url);
+
+    /**
+     * Priority of this strategy; the default implementation returns {@code 0}.
+     *
+     * @return the priority value
+     */
+    default int getPriority() {
+        return 0;
+    }
+}
diff --git a/w12/DefaultStrategy.java b/w12/DefaultStrategy.java
new file mode 100644
index 0000000..1cdaadc
--- /dev/null
+++ b/w12/DefaultStrategy.java
@@ -0,0 +1,88 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+/**
+ * Catch-all crawl strategy: accepts every URL and reports the lowest possible priority.
+ *
+ * <p>Produces one article describing the page itself, followed by one article per
+ * labelled http(s) link found on the page.
+ */
+public class DefaultStrategy implements CrawlStrategy {
+    /** Full-match pattern for absolute http(s) links; compiled once instead of per link. */
+    private static final Pattern HTTP_LINK_PATTERN = Pattern.compile("https?://.*");
+
+    /** Maximum number of characters of page text kept before {@code "..."} is appended. */
+    private static final int MAX_CONTENT_LENGTH = 200;
+
+    /** CSS selector whose first match is used as the main content of the page. */
+    private static final String CONTENT_SELECTOR = "article, .article, #content, .content, main";
+
+    /**
+     * Accepts every URL.
+     *
+     * @param url candidate URL (ignored)
+     * @return always {@code true}
+     */
+    @Override
+    public boolean supports(String url) {
+        return true;
+    }
+
+    /**
+     * Extracts a page summary plus the labelled http(s) links of the page.
+     *
+     * @param url page URL, used for the summary article
+     * @param doc parsed page
+     * @return the summary article first, then one article per qualifying link
+     * @throws ParseException declared by {@link CrawlStrategy}; never thrown here
+     */
+    @Override
+    public List<Article> parse(String url, Document doc) throws ParseException {
+        List<Article> articles = new ArrayList<>();
+        articles.add(new Article(doc.title(), url, summarize(doc)));
+
+        Elements links = doc.select("a[href]");
+        for (Element link : links) {
+            // "abs:href" asks jsoup to resolve the link against the document base URI.
+            String href = link.attr("abs:href");
+            String linkText = link.text().trim();
+            if (!linkText.isEmpty() && HTTP_LINK_PATTERN.matcher(href).matches()) {
+                articles.add(new Article(linkText, href, ""));
+            }
+        }
+
+        return articles;
+    }
+
+    /** Returns the trimmed page text, truncated with a trailing {@code "..."} when too long. */
+    private static String summarize(Document doc) {
+        Element source = doc.selectFirst(CONTENT_SELECTOR);
+        if (source == null) {
+            // No recognisable content container: fall back to the whole body.
+            source = doc.body();
+        }
+        // Defensive: tolerate a document that exposes no body element.
+        String content = source == null ? "" : source.text().trim();
+        if (content.length() > MAX_CONTENT_LENGTH) {
+            content = content.substring(0, MAX_CONTENT_LENGTH) + "...";
+        }
+        return content;
+    }
+
+    /**
+     * @return {@link Integer#MIN_VALUE}, the smallest priority value
+     */
+    @Override
+    public int getPriority() {
+        return Integer.MIN_VALUE;
+    }
+}
\ No newline at end of file
diff --git a/w12/HnuNewsStrategy.java b/w12/HnuNewsStrategy.java
new file mode 100644
index 0000000..94aa88a
--- /dev/null
+++ b/w12/HnuNewsStrategy.java
@@ -0,0 +1,101 @@
+package com.example.datacollect.strategy;
+
+import com.example.datacollect.exception.ParseException;
+import com.example.datacollect.model.Article;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+/**
+ * Crawl strategy for list pages whose URL matches {@code http(s)://news.hnu.edu.cn/...}.
+ *
+ * <p>Each {@code ul.list11 li} entry that contains a link with a non-empty title becomes
+ * one {@link Article}.
+ */
+public class HnuNewsStrategy implements CrawlStrategy {
+    /** Full-match pattern for the URLs this strategy accepts. */
+    private static final Pattern URL_PATTERN = Pattern.compile("https?://news\\.hnu\\.edu\\.cn/.*");
+
+    /** Site root used to complete relative links when jsoup cannot resolve them. */
+    private static final String SITE_ROOT = "https://news.hnu.edu.cn";
+
+    /**
+     * Reports whether this strategy accepts the given URL.
+     *
+     * @param url candidate URL; {@code null} is reported as unsupported
+     * @return {@code true} when the whole URL matches {@code URL_PATTERN}
+     */
+    @Override
+    public boolean supports(String url) {
+        // Pattern.matcher(null) throws NullPointerException, so reject null up front.
+        return url != null && URL_PATTERN.matcher(url).matches();
+    }
+
+    /**
+     * Extracts the news entries listed on the page.
+     *
+     * @param url page URL (not used by this implementation)
+     * @param doc parsed list page
+     * @return one article per list entry that has a link and a non-empty title
+     * @throws ParseException declared by {@link CrawlStrategy}; never thrown here
+     */
+    @Override
+    public List<Article> parse(String url, Document doc) throws ParseException {
+        List<Article> articles = new ArrayList<>();
+        Elements listItems = doc.select("ul.list11 li");
+
+        for (Element li : listItems) {
+            Element link = li.selectFirst("a");
+            if (link == null) {
+                continue;
+            }
+
+            String title = textOf(link, "h4.l2.h4s2");
+            if (title.isEmpty()) {
+                // Entries without a title are skipped.
+                continue;
+            }
+
+            articles.add(new Article(title, resolveUrl(link), textOf(link, "p.l3.ps3")));
+        }
+
+        return articles;
+    }
+
+    /**
+     * Returns the trimmed text of the first {@code selector} match under {@code parent},
+     * or an empty string when nothing matches.
+     */
+    private static String textOf(Element parent, String selector) {
+        Element match = parent.selectFirst(selector);
+        return match == null ? "" : match.text().trim();
+    }
+
+    /** Returns an absolute URL for the {@code href} attribute of {@code link}. */
+    private static String resolveUrl(Element link) {
+        String href = link.attr("href");
+        if (href.startsWith("http")) {
+            return href;
+        }
+        // Let jsoup resolve "../", "./" and path-relative links against the document base
+        // URI; plain concatenation yields "//" for "../../x" and a broken host for "x.htm".
+        String resolved = link.absUrl("href");
+        if (!resolved.isEmpty()) {
+            return resolved;
+        }
+        // No usable base URI: keep the site-root join, adding the separator when missing.
+        String path = href.replace("..", "");
+        return SITE_ROOT + (path.startsWith("/") ? path : "/" + path);
+    }
+
+    /**
+     * @return the fixed priority {@code 3} of this strategy
+     */
+    @Override
+    public int getPriority() {
+        return 3;
+    }
+}
diff --git a/w12/Main.java b/w12/Main.java
new file mode 100644
index 0000000..6bf358a
--- /dev/null
+++ b/w12/Main.java
@@ -0,0 +1,45 @@
+package com.example.datacollect;
+
+import com.example.datacollect.controller.CrawlerController;
+import com.example.datacollect.repository.ArticleRepository;
+import com.example.datacollect.strategy.StrategyFactory;
+import com.example.datacollect.view.ConsoleView;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Entry point of the CLI crawler: wires view, repository, strategy factory and controller,
+ * then passes every value read from the view to the controller.
+ */
+public class Main {
+    private static final Logger logger = LoggerFactory.getLogger(Main.class);
+
+    /**
+     * Starts the interactive loop. The loop has no exit condition of its own; any
+     * termination has to come from the controller or view (not visible here).
+     *
+     * @param args command-line arguments (unused)
+     */
+    public static void main(String[] args) {
+        logger.info("Starting CLI Crawler application");
+
+        ConsoleView view = new ConsoleView();
+        ArticleRepository repository = new ArticleRepository();
+        StrategyFactory strategyFactory = new StrategyFactory();
+        CrawlerController controller = new CrawlerController(view, repository, strategyFactory);
+
+        // NOTE(review): banner text was re-joined from a line-broken source; confirm wording.
+        view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands.");
+        logger.debug("Application started successfully");
+
+        while (true) {
+            try {
+                controller.handle(view.readLine());
+            } catch (Exception e) {
+                // Boundary catch: one failing command must not terminate the session.
+                logger.error("Unexpected error in main loop", e);
+                view.printError("Unexpected error: " + e.getMessage());
+            }
+        }
+    }
+}