From 2d4cccf88d507961cb609650e4449a8626cba07c Mon Sep 17 00:00:00 2001
From: LiuZihan <1353843969@qq.com>
Date: Sun, 24 May 2026 17:47:23 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?=
 =?UTF-8?q?=20'project/strategy'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review note: file contents were amended during review (generic type arguments
restored, director extraction fixed, null-safe supports(), Javadoc added), so
the blob ids on the "index" lines below are those of the original upload and
no longer match the post-image.

 project/strategy/DoubanBookStrategy.java   | 44 +++++++++++
 project/strategy/DoubanTop250Strategy.java | 90 ++++++++++++++++++++++
 project/strategy/MovieCrawlStrategy.java   | 21 +++++
 project/strategy/MovieStrategyFactory.java | 36 +++++++++
 project/strategy/SinaNewsStrategy.java     | 80 +++++++++++++++++++
 5 files changed, 271 insertions(+)
 create mode 100644 project/strategy/DoubanBookStrategy.java
 create mode 100644 project/strategy/DoubanTop250Strategy.java
 create mode 100644 project/strategy/MovieCrawlStrategy.java
 create mode 100644 project/strategy/MovieStrategyFactory.java
 create mode 100644 project/strategy/SinaNewsStrategy.java

diff --git a/project/strategy/DoubanBookStrategy.java b/project/strategy/DoubanBookStrategy.java
new file mode 100644
index 0000000..696df3f
--- /dev/null
+++ b/project/strategy/DoubanBookStrategy.java
@@ -0,0 +1,44 @@
+package com.example.moviecli.strategy;
+
+import com.example.moviecli.model.Movie;
+import com.example.moviecli.exception.ParseFailedException;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawl strategy for the Douban book Top 250 list.
+ * Each {@code .item} element on the page becomes one {@link Movie} record.
+ */
+public class DoubanBookStrategy implements MovieCrawlStrategy {
+    /** Supports URLs containing the Douban book Top 250 path; null is rejected. */
+    @Override
+    public boolean supports(String url) {
+        return url != null && url.contains("book.douban.com/top250");
+    }
+
+    /**
+     * Reads the title and rating text from every {@code .item} element.
+     * Ranks are assigned sequentially from 1 in document order.
+     *
+     * @throws ParseFailedException wrapping any exception raised while reading the document
+     */
+    @Override
+    public List<Movie> parse(Document doc) throws ParseFailedException {
+        try {
+            List<Movie> books = new ArrayList<>();
+            Elements items = doc.select(".item");
+            int rank = 1;
+            for (Element item : items) {
+                String title = item.select(".pl2 a").text().trim();
+                String score = item.select(".rating_nums").text();
+                books.add(new Movie(rank++, title, "", score, "图书", "豆瓣图书"));
+            }
+            return books;
+        } catch (Exception e) {
+            throw new ParseFailedException("豆瓣图书解析失败", e);
+        }
+    }
+}
\ No newline at end of file
diff --git a/project/strategy/DoubanTop250Strategy.java b/project/strategy/DoubanTop250Strategy.java
new file mode 100644
index 0000000..0e64538
--- /dev/null
+++ b/project/strategy/DoubanTop250Strategy.java
@@ -0,0 +1,90 @@
+package com.example.moviecli.strategy;
+
+import com.example.moviecli.model.Movie;
+import com.example.moviecli.exception.ParseFailedException;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Crawl strategy for the Douban movie Top 250 list.
+ */
+public class DoubanTop250Strategy implements MovieCrawlStrategy {
+    private static final String DIRECTOR_LABEL = "导演:";
+    private static final String UNKNOWN = "未知";
+
+    /** Supports URLs containing the Douban movie Top 250 path; null is rejected. */
+    @Override
+    public boolean supports(String url) {
+        return url != null && url.contains("movie.douban.com/top250");
+    }
+
+    /**
+     * Builds one {@link Movie} per {@code .item} element: rank, title, optional second
+     * title, rating, and the year and director pulled out of the first info paragraph.
+     *
+     * @throws ParseFailedException wrapping any exception raised while reading the document,
+     *         including a non-numeric rank or a missing title/info element
+     */
+    @Override
+    public List<Movie> parse(Document doc) throws ParseFailedException {
+        try {
+            List<Movie> movies = new ArrayList<>();
+            Elements items = doc.select(".item");
+            for (Element item : items) {
+                String rankText = item.select(".pic em").text();
+                int rank = Integer.parseInt(rankText);
+                Elements titles = item.select(".title");
+                String title = titles.first().text();
+                String originalTitle = "";
+                if (titles.size() > 1) {
+                    originalTitle = titles.get(1).text().replace("/", "").trim();
+                }
+                String score = item.select(".rating_num").text();
+                String info = item.select(".bd p").first().text();
+                String year = extractYear(info);
+                String director = extractDirector(info);
+                movies.add(new Movie(rank, title, originalTitle, score, year, director));
+            }
+            return movies;
+        } catch (Exception e) {
+            throw new ParseFailedException("豆瓣电影解析失败", e);
+        }
+    }
+
+    /** Returns the first space-separated token that is exactly four digits, or "未知". */
+    private String extractYear(String info) {
+        for (String part : info.split(" ")) {
+            if (part.matches("\\d{4}")) {
+                return part;
+            }
+        }
+        return UNKNOWN;
+    }
+
+    /**
+     * Returns the text after the "导演:" label up to the next space, or "未知" when the
+     * label is missing or nothing follows it.
+     */
+    private String extractDirector(String info) {
+        int label = info.indexOf(DIRECTOR_LABEL);
+        if (label < 0) {
+            return UNKNOWN;
+        }
+        int start = label + DIRECTOR_LABEL.length();
+        // Skip the whitespace that separates the label from the name; searching for the
+        // terminating space from the label's end would otherwise stop immediately and
+        // yield an empty name.
+        while (start < info.length() && Character.isWhitespace(info.charAt(start))) {
+            start++;
+        }
+        int end = info.indexOf(" ", start);
+        if (end == -1) {
+            end = info.length();
+        }
+        String director = info.substring(start, end).trim();
+        return director.isEmpty() ? UNKNOWN : director;
+    }
+}
\ No newline at end of file
diff --git a/project/strategy/MovieCrawlStrategy.java b/project/strategy/MovieCrawlStrategy.java
new file mode 100644
index 0000000..aad12c2
--- /dev/null
+++ b/project/strategy/MovieCrawlStrategy.java
@@ -0,0 +1,21 @@
+package com.example.moviecli.strategy;
+
+import com.example.moviecli.model.Movie;
+import com.example.moviecli.exception.ParseFailedException;
+import org.jsoup.nodes.Document;
+import java.util.List;
+
+/**
+ * A site-specific parser that turns a fetched page into a list of {@link Movie} records.
+ */
+public interface MovieCrawlStrategy {
+    /** Returns true if this strategy can parse pages fetched from the given URL. */
+    boolean supports(String url);
+
+    /**
+     * Parses the document into records.
+     *
+     * @throws ParseFailedException if the document cannot be parsed
+     */
+    List<Movie> parse(Document doc) throws ParseFailedException;
+}
\ No newline at end of file
diff --git a/project/strategy/MovieStrategyFactory.java b/project/strategy/MovieStrategyFactory.java
new file mode 100644
index 0000000..3a30e22
--- /dev/null
+++ b/project/strategy/MovieStrategyFactory.java
@@ -0,0 +1,36 @@
+package com.example.moviecli.strategy;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Holds the known crawl strategies and picks the first one that supports a URL.
+ */
+public class MovieStrategyFactory {
+    private final List<MovieCrawlStrategy> strategies = new ArrayList<>();
+
+    /** Registers the built-in strategies; lookup order is registration order. */
+    public MovieStrategyFactory() {
+        strategies.add(new DoubanTop250Strategy());
+        strategies.add(new SinaNewsStrategy()); // newly added
+        strategies.add(new DoubanBookStrategy());
+    }
+
+    /**
+     * Returns the first registered strategy whose {@code supports(url)} is true,
+     * or {@code null} when none matches; callers must handle the {@code null} case.
+     */
+    public MovieCrawlStrategy getStrategy(String url) {
+        for (MovieCrawlStrategy s : strategies) {
+            if (s.supports(url)) {
+                return s;
+            }
+        }
+        return null;
+    }
+
+    /** Appends a strategy; it is consulted after all previously registered ones. */
+    public void register(MovieCrawlStrategy strategy) {
+        strategies.add(strategy);
+    }
+}
\ No newline at end of file
diff --git a/project/strategy/SinaNewsStrategy.java b/project/strategy/SinaNewsStrategy.java
new file mode 100644
index 0000000..30930e4
--- /dev/null
+++ b/project/strategy/SinaNewsStrategy.java
@@ -0,0 +1,80 @@
+package com.example.moviecli.strategy;
+
+import com.example.moviecli.model.Movie;
+import com.example.moviecli.exception.ParseFailedException;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Crawl strategy for Sina news pages: collects distinct headline-like link texts.
+ */
+public class SinaNewsStrategy implements MovieCrawlStrategy {
+    /** Upper bound on the number of headlines collected. */
+    private static final int MAX_TITLES = 120;
+    /** Below this count the fallback selectors are also scanned. */
+    private static final int MIN_TITLES = 30;
+    /** Texts of this length or shorter are not treated as headlines. */
+    private static final int MIN_TITLE_LENGTH = 8;
+
+    /** Supports URLs containing the Sina news host; null is rejected. */
+    @Override
+    public boolean supports(String url) {
+        return url != null && url.contains("news.sina.com.cn");
+    }
+
+    /**
+     * Collects up to 120 distinct headlines in document order and wraps each in a
+     * {@link Movie} with a sequential rank starting at 1.
+     *
+     * @throws ParseFailedException wrapping any exception raised while reading the document
+     */
+    @Override
+    public List<Movie> parse(Document doc) throws ParseFailedException {
+        try {
+            List<Movie> newsList = new ArrayList<>();
+            // LinkedHashSet de-duplicates while keeping first-seen order.
+            Set<String> titleSet = new LinkedHashSet<>();
+
+            // First pass: every anchor; keep texts longer than the minimum that are not
+            // purely digits/whitespace and do not contain the "评论"/"举报" UI labels.
+            Elements allLinks = doc.select("a");
+            for (Element link : allLinks) {
+                String title = link.text().trim();
+                if (title.length() > MIN_TITLE_LENGTH && !title.matches("^[\\d\\s]+$")
+                        && !title.contains("评论") && !title.contains("举报")) {
+                    titleSet.add(title);
+                }
+                if (titleSet.size() >= MAX_TITLES) {
+                    break;
+                }
+            }
+
+            // Second pass: if too few headlines were found, also scan specific regions.
+            if (titleSet.size() < MIN_TITLES) {
+                Elements newsItems = doc.select(".news-item, .blk, .main-content a");
+                for (Element item : newsItems) {
+                    String title = item.text().trim();
+                    if (title.length() > MIN_TITLE_LENGTH) {
+                        titleSet.add(title); // Set.add ignores duplicates
+                    }
+                    if (titleSet.size() >= MAX_TITLES) {
+                        break;
+                    }
+                }
+            }
+
+            int rank = 1;
+            for (String title : titleSet) {
+                newsList.add(new Movie(rank++, title, "", "新闻", "新浪", ""));
+            }
+            return newsList;
+        } catch (Exception e) {
+            throw new ParseFailedException("新浪新闻解析失败", e);
+        }
+    }
+}
\ No newline at end of file