Browse Source

上传文件至 'project/strategy'

main
LiuZihan 4 weeks ago
parent
commit
2d4cccf88d
  1. 33
      project/strategy/DoubanBookStrategy.java
  2. 59
      project/strategy/DoubanTop250Strategy.java
  3. 11
      project/strategy/MovieCrawlStrategy.java
  4. 25
      project/strategy/MovieStrategyFactory.java
  5. 59
      project/strategy/SinaNewsStrategy.java

33
project/strategy/DoubanBookStrategy.java

@ -0,0 +1,33 @@
package com.example.moviecli.strategy;
import com.example.moviecli.model.Movie;
import com.example.moviecli.exception.ParseFailedException;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawl strategy for the Douban book Top 250 listing.
 *
 * <p>Each {@code .item} element on the page becomes one {@link Movie} record;
 * ranks are assigned by document order, starting at 1.
 */
public class DoubanBookStrategy implements MovieCrawlStrategy {

    /** URL fragment that identifies pages this strategy can parse. */
    private static final String URL_MARKER = "book.douban.com/top250";

    /**
     * Reports whether the URL points at the Douban book Top 250 listing.
     *
     * @param url the page address to test
     * @return {@code true} when the URL contains the listing's host and path
     */
    @Override
    public boolean supports(String url) {
        return url.contains(URL_MARKER);
    }

    /**
     * Converts every {@code .item} entry of the document into a {@link Movie}.
     *
     * @param doc the parsed listing page
     * @return one record per entry, in page order
     * @throws ParseFailedException if anything goes wrong while reading the document
     */
    @Override
    public List<Movie> parse(Document doc) throws ParseFailedException {
        try {
            Elements rows = doc.select(".item");
            List<Movie> result = new ArrayList<>();
            for (int index = 0; index < rows.size(); index++) {
                Element row = rows.get(index);
                String name = row.select(".pl2 a").text().trim();
                String rating = row.select(".rating_nums").text();
                // Rank is the 1-based position on the page; the page itself is not consulted for it.
                result.add(new Movie(index + 1, name, "", rating, "图书", "豆瓣图书"));
            }
            return result;
        } catch (Exception e) {
            // Any failure (including a null document) is reported uniformly to the caller.
            throw new ParseFailedException("豆瓣图书解析失败", e);
        }
    }
}

59
project/strategy/DoubanTop250Strategy.java

@ -0,0 +1,59 @@
package com.example.moviecli.strategy;
import com.example.moviecli.model.Movie;
import com.example.moviecli.exception.ParseFailedException;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawl strategy for the Douban movie Top 250 listing.
 *
 * <p>Each {@code .item} element yields one {@link Movie} built from the rank badge,
 * the title spans, the rating and the free-text info paragraph.
 */
public class DoubanTop250Strategy implements MovieCrawlStrategy {

    /** Label that precedes the director name inside the info paragraph. */
    private static final String DIRECTOR_LABEL = "导演:";

    /** Placeholder used when a field cannot be extracted from the info paragraph. */
    private static final String UNKNOWN = "未知";

    /**
     * Reports whether the URL points at the Douban movie Top 250 listing.
     *
     * @param url the page address to test
     * @return {@code true} when the URL contains the listing's host and path
     */
    @Override
    public boolean supports(String url) {
        return url.contains("movie.douban.com/top250");
    }

    /**
     * Converts every {@code .item} entry of the document into a {@link Movie}.
     *
     * @param doc the parsed listing page
     * @return one record per entry, in page order
     * @throws ParseFailedException if any entry is missing an expected element or
     *         carries a non-numeric rank; the original exception is kept as the cause
     */
    @Override
    public List<Movie> parse(Document doc) throws ParseFailedException {
        try {
            List<Movie> movies = new ArrayList<>();
            for (Element item : doc.select(".item")) {
                int rank = Integer.parseInt(item.select(".pic em").text());

                // Query the title spans once: the first is the main title, an optional
                // second one is the original-language title prefixed with "/".
                Elements titles = item.select(".title");
                String title = titles.first().text();
                String originalTitle = "";
                if (titles.size() > 1) {
                    originalTitle = titles.get(1).text().replace("/", "").trim();
                }

                String score = item.select(".rating_num").text();
                String info = item.select(".bd p").first().text();
                movies.add(new Movie(rank, title, originalTitle, score,
                        extractYear(info), extractDirector(info)));
            }
            return movies;
        } catch (Exception e) {
            throw new ParseFailedException("豆瓣电影解析失败", e);
        }
    }

    /**
     * Finds the first space-separated token that is exactly four digits.
     *
     * @param info text of the entry's info paragraph
     * @return the four-digit token, or {@code "未知"} when none is present
     */
    private String extractYear(String info) {
        for (String part : info.split(" ")) {
            if (part.matches("\\d{4}")) {
                return part;
            }
        }
        return UNKNOWN;
    }

    /**
     * Extracts the first token that follows the {@code 导演:} label.
     *
     * <p>The label is normally followed by separator whitespace, so that whitespace
     * is skipped first; otherwise the token would always be empty.
     *
     * @param info text of the entry's info paragraph
     * @return the first name token after the label, or {@code "未知"} when the label
     *         is absent or nothing follows it
     */
    private String extractDirector(String info) {
        int labelAt = info.indexOf(DIRECTOR_LABEL);
        if (labelAt < 0) {
            return UNKNOWN;
        }

        int start = labelAt + DIRECTOR_LABEL.length();
        while (start < info.length() && isSeparator(info.charAt(start))) {
            start++;
        }

        int end = start;
        while (end < info.length() && !isSeparator(info.charAt(end))) {
            end++;
        }

        String name = info.substring(start, end);
        return name.isEmpty() ? UNKNOWN : name;
    }

    /** Treats regular whitespace and the non-breaking space (U+00A0) as token separators. */
    private static boolean isSeparator(char c) {
        return Character.isWhitespace(c) || c == '\u00A0';
    }
}

11
project/strategy/MovieCrawlStrategy.java

@ -0,0 +1,11 @@
package com.example.moviecli.strategy;
import com.example.moviecli.model.Movie;
import com.example.moviecli.exception.ParseFailedException;
import org.jsoup.nodes.Document;
import java.util.List;
/**
 * Contract for a site-specific page parser that produces {@link Movie} records.
 * MovieStrategyFactory keeps a list of implementations and returns the first one
 * whose {@link #supports(String)} accepts a URL.
 */
public interface MovieCrawlStrategy {
/**
 * Reports whether this strategy can handle the page at the given URL.
 *
 * @param url the page address to test
 * @return {@code true} if this strategy should be used for the URL
 */
boolean supports(String url);
/**
 * Extracts records from an already-loaded document.
 *
 * @param doc the parsed page
 * @return the records found on the page
 * @throws ParseFailedException if the document cannot be parsed
 */
List<Movie> parse(Document doc) throws ParseFailedException;
}

25
project/strategy/MovieStrategyFactory.java

@ -0,0 +1,25 @@
package com.example.moviecli.strategy;
import java.util.ArrayList;
import java.util.List;
/**
 * Registry of {@link MovieCrawlStrategy} implementations, consulted in registration order.
 *
 * <p>The built-in strategies are registered by the constructor; further ones can be
 * appended with {@link #register(MovieCrawlStrategy)}.
 */
public class MovieStrategyFactory {

    private final List<MovieCrawlStrategy> strategies = new ArrayList<>();

    /** Creates a factory pre-loaded with the built-in strategies. */
    public MovieStrategyFactory() {
        strategies.add(new DoubanTop250Strategy());
        strategies.add(new SinaNewsStrategy()); // newly added
        strategies.add(new DoubanBookStrategy());
    }

    /**
     * Looks up the first registered strategy that supports the URL.
     *
     * @param url the page address; may be {@code null}
     * @return the matching strategy, or {@code null} when the URL is {@code null}
     *         or no registered strategy supports it
     */
    public MovieCrawlStrategy getStrategy(String url) {
        if (url == null) {
            // No strategy can match a missing URL; report "not found" instead of
            // letting a strategy's supports() fail on it.
            return null;
        }
        for (MovieCrawlStrategy candidate : strategies) {
            if (candidate.supports(url)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Appends a strategy after the ones already registered.
     *
     * @param strategy the strategy to add; must not be {@code null}
     * @throws IllegalArgumentException if {@code strategy} is {@code null}
     */
    public void register(MovieCrawlStrategy strategy) {
        if (strategy == null) {
            // A stored null would make every later lookup fail; reject it here instead.
            throw new IllegalArgumentException("strategy must not be null");
        }
        strategies.add(strategy);
    }
}

59
project/strategy/SinaNewsStrategy.java

@ -0,0 +1,59 @@
package com.example.moviecli.strategy;
import com.example.moviecli.model.Movie;
import com.example.moviecli.exception.ParseFailedException;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
/**
 * Crawl strategy for Sina news pages.
 *
 * <p>Headlines are gathered heuristically from link texts, de-duplicated in
 * first-seen order, and wrapped in {@link Movie} records ranked by that order.
 */
public class SinaNewsStrategy implements MovieCrawlStrategy {

    /** Texts of this length or shorter are not considered headlines. */
    private static final int MIN_TITLE_LENGTH = 8;

    /** Collection stops once this many distinct headlines have been gathered. */
    private static final int MAX_TITLES = 120;

    /** Below this count the fallback selectors are tried as well. */
    private static final int FALLBACK_THRESHOLD = 30;

    /**
     * Reports whether the URL belongs to the Sina news site.
     *
     * @param url the page address to test
     * @return {@code true} when the URL contains the Sina news host
     */
    @Override
    public boolean supports(String url) {
        return url.contains("news.sina.com.cn");
    }

    /**
     * Collects headline-like texts from the document and turns them into records.
     *
     * @param doc the parsed news page
     * @return one record per distinct headline, in first-seen order
     * @throws ParseFailedException if anything goes wrong while reading the document
     */
    @Override
    public List<Movie> parse(Document doc) throws ParseFailedException {
        try {
            Set<String> headlines = new LinkedHashSet<>();

            // First pass: every anchor on the page, with the noise filter applied.
            gather(doc.select("a"), headlines, true);

            // Second pass: only when the first pass found too few, look at specific regions.
            if (headlines.size() < FALLBACK_THRESHOLD) {
                gather(doc.select(".news-item, .blk, .main-content a"), headlines, false);
            }

            List<Movie> result = new ArrayList<>();
            int position = 0;
            for (String headline : headlines) {
                position++;
                // NOTE(review): DoubanBookStrategy passes its category and source as the last
                // two arguments, while here they are the 4th and 5th - confirm this is intended.
                result.add(new Movie(position, headline, "", "新闻", "新浪", ""));
            }
            return result;
        } catch (Exception e) {
            throw new ParseFailedException("新浪新闻解析失败", e);
        }
    }

    /**
     * Adds the trimmed text of each element to {@code sink} when it is long enough
     * (and, if requested, not noise), stopping once the size cap is reached.
     */
    private static void gather(Elements candidates, Set<String> sink, boolean rejectNoise) {
        for (Element candidate : candidates) {
            String text = candidate.text().trim();
            boolean rejected = rejectNoise && isNoise(text);
            if (text.length() > MIN_TITLE_LENGTH && !rejected) {
                // The set ignores texts it already holds, so no explicit duplicate check is needed.
                sink.add(text);
            }
            if (sink.size() >= MAX_TITLES) {
                return;
            }
        }
    }

    /** Matches digit/whitespace-only texts and texts mentioning "评论" or "举报". */
    private static boolean isNoise(String text) {
        return text.matches("^[\\d\\s]+$") || text.contains("评论") || text.contains("举报");
    }
}
Loading…
Cancel
Save