5 changed files with 187 additions and 0 deletions
@ -0,0 +1,33 @@ |
|||||
|
package com.example.moviecli.strategy; |
||||
|
|
||||
|
import com.example.moviecli.model.Movie; |
||||
|
import com.example.moviecli.exception.ParseFailedException; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class DoubanBookStrategy implements MovieCrawlStrategy { |
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url.contains("book.douban.com/top250"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> parse(Document doc) throws ParseFailedException { |
||||
|
try { |
||||
|
List<Movie> books = new ArrayList<>(); |
||||
|
Elements items = doc.select(".item"); |
||||
|
int rank = 1; |
||||
|
for (Element item : items) { |
||||
|
String title = item.select(".pl2 a").text().trim(); |
||||
|
String score = item.select(".rating_nums").text(); |
||||
|
books.add(new Movie(rank++, title, "", score, "图书", "豆瓣图书")); |
||||
|
} |
||||
|
return books; |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseFailedException("豆瓣图书解析失败", e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,59 @@ |
|||||
|
package com.example.moviecli.strategy; |
||||
|
|
||||
|
import com.example.moviecli.model.Movie; |
||||
|
import com.example.moviecli.exception.ParseFailedException; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class DoubanTop250Strategy implements MovieCrawlStrategy { |
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url.contains("movie.douban.com/top250"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> parse(Document doc) throws ParseFailedException { |
||||
|
try { |
||||
|
List<Movie> movies = new ArrayList<>(); |
||||
|
Elements items = doc.select(".item"); |
||||
|
for (Element item : items) { |
||||
|
String rankText = item.select(".pic em").text(); |
||||
|
int rank = Integer.parseInt(rankText); |
||||
|
String title = item.select(".title").first().text(); |
||||
|
String originalTitle = ""; |
||||
|
Elements titles = item.select(".title"); |
||||
|
if (titles.size() > 1) { |
||||
|
originalTitle = titles.get(1).text().replace("/", "").trim(); |
||||
|
} |
||||
|
String score = item.select(".rating_num").text(); |
||||
|
String info = item.select(".bd p").first().text(); |
||||
|
String year = extractYear(info); |
||||
|
String director = extractDirector(info); |
||||
|
movies.add(new Movie(rank, title, originalTitle, score, year, director)); |
||||
|
} |
||||
|
return movies; |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseFailedException("豆瓣电影解析失败", e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private String extractYear(String info) { |
||||
|
for (String part : info.split(" ")) { |
||||
|
if (part.matches("\\d{4}")) return part; |
||||
|
} |
||||
|
return "未知"; |
||||
|
} |
||||
|
|
||||
|
private String extractDirector(String info) { |
||||
|
if (info.contains("导演:")) { |
||||
|
int start = info.indexOf("导演:") + 3; |
||||
|
int end = info.indexOf(" ", start); |
||||
|
if (end == -1) end = info.length(); |
||||
|
return info.substring(start, end).trim(); |
||||
|
} |
||||
|
return "未知"; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package com.example.moviecli.strategy; |
||||
|
|
||||
|
import com.example.moviecli.model.Movie; |
||||
|
import com.example.moviecli.exception.ParseFailedException; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface MovieCrawlStrategy { |
||||
|
boolean supports(String url); |
||||
|
List<Movie> parse(Document doc) throws ParseFailedException; |
||||
|
} |
||||
@ -0,0 +1,25 @@ |
|||||
|
package com.example.moviecli.strategy; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class MovieStrategyFactory { |
||||
|
private final List<MovieCrawlStrategy> strategies = new ArrayList<>(); |
||||
|
|
||||
|
public MovieStrategyFactory() { |
||||
|
strategies.add(new DoubanTop250Strategy()); |
||||
|
strategies.add(new SinaNewsStrategy()); // 新增
|
||||
|
strategies.add(new DoubanBookStrategy()); |
||||
|
} |
||||
|
|
||||
|
public MovieCrawlStrategy getStrategy(String url) { |
||||
|
for (MovieCrawlStrategy s : strategies) { |
||||
|
if (s.supports(url)) return s; |
||||
|
} |
||||
|
return null; |
||||
|
} |
||||
|
|
||||
|
public void register(MovieCrawlStrategy strategy) { |
||||
|
strategies.add(strategy); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,59 @@ |
|||||
|
package com.example.moviecli.strategy; |
||||
|
|
||||
|
import com.example.moviecli.model.Movie; |
||||
|
import com.example.moviecli.exception.ParseFailedException; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.LinkedHashSet; |
||||
|
import java.util.List; |
||||
|
import java.util.Set; |
||||
|
|
||||
|
public class SinaNewsStrategy implements MovieCrawlStrategy { |
||||
|
@Override |
||||
|
public boolean supports(String url) { |
||||
|
return url.contains("news.sina.com.cn"); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public List<Movie> parse(Document doc) throws ParseFailedException { |
||||
|
try { |
||||
|
List<Movie> newsList = new ArrayList<>(); |
||||
|
Set<String> titleSet = new LinkedHashSet<>(); |
||||
|
|
||||
|
// 抓取所有 a 标签,过滤出标题较长的(通常新闻标题长度 > 8)
|
||||
|
Elements allLinks = doc.select("a"); |
||||
|
for (Element link : allLinks) { |
||||
|
String title = link.text().trim(); |
||||
|
// 过滤短文本、纯数字、纯符号、常见非标题文字
|
||||
|
if (title.length() > 8 && !title.matches("^[\\d\\s]+$") |
||||
|
&& !title.contains("评论") && !title.contains("举报")) { |
||||
|
// 进一步过滤:通常新闻标题不会太短且不会包含过多标点
|
||||
|
titleSet.add(title); |
||||
|
} |
||||
|
if (titleSet.size() >= 120) break; // 最多抓取120条
|
||||
|
} |
||||
|
|
||||
|
// 如果数量不够 30,再尝试抓取特定区域
|
||||
|
if (titleSet.size() < 30) { |
||||
|
Elements newsItems = doc.select(".news-item, .blk, .main-content a"); |
||||
|
for (Element item : newsItems) { |
||||
|
String title = item.text().trim(); |
||||
|
if (title.length() > 8 && !titleSet.contains(title)) { |
||||
|
titleSet.add(title); |
||||
|
} |
||||
|
if (titleSet.size() >= 120) break; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
int rank = 1; |
||||
|
for (String title : titleSet) { |
||||
|
newsList.add(new Movie(rank++, title, "", "新闻", "新浪", "")); |
||||
|
} |
||||
|
return newsList; |
||||
|
} catch (Exception e) { |
||||
|
throw new ParseFailedException("新浪新闻解析失败", e); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue