You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
78 lines
2.5 KiB
78 lines
2.5 KiB
package com.movieratings.crawler.strategy;
|
|
|
|
import com.movieratings.exception.CrawlerException;
|
|
import com.movieratings.model.Movie;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
public class BoxOfficeMojoCrawlerStrategy extends AbstractCrawlerStrategy {
|
|
private static final String URL = "https://www.boxofficemojo.com/chart/top_lifetime_gross/";
|
|
|
|
@Override
|
|
public String getSiteName() {
|
|
return "Box Office Mojo";
|
|
}
|
|
|
|
@Override
|
|
public List<Movie> crawl(int limit) {
|
|
try {
|
|
Document doc = connection(URL).get();
|
|
List<Movie> movies = new ArrayList<>();
|
|
for (Element row : doc.select("tr")) {
|
|
if (movies.size() >= limit) {
|
|
break;
|
|
}
|
|
Elements cols = row.select("td");
|
|
if (cols.size() < 4) {
|
|
continue;
|
|
}
|
|
Movie movie = parseRow(cols);
|
|
if (movie.getTitle() != null && !movie.getTitle().isBlank()) {
|
|
movies.add(movie);
|
|
}
|
|
}
|
|
return movies;
|
|
} catch (IOException e) {
|
|
throw new CrawlerException("Failed to crawl " + getSiteName(), e);
|
|
}
|
|
}
|
|
|
|
private Movie parseRow(Elements cols) {
|
|
int rank = parseCount(cols.get(0).text());
|
|
Movie movie = new Movie();
|
|
movie.setRank(rank);
|
|
movie.setTitle(cols.get(1).text().trim());
|
|
movie.setBoxOffice(parseMoney(cols.get(2).text()));
|
|
movie.setReleaseYear(parseYear(cols.get(3).text()));
|
|
movie.setRating(estimateRating(rank));
|
|
movie.setDirector("Unknown");
|
|
movie.setCountry("United States");
|
|
movie.setReviewCount(0);
|
|
movie.setPosterUrl("");
|
|
movie.setQuote("Box Office Mojo lifetime gross chart entry");
|
|
movie.setType("Movie");
|
|
movie.setSourceSite(getSiteName());
|
|
return movie;
|
|
}
|
|
|
|
private double parseMoney(String value) {
|
|
if (value == null || value.isBlank()) {
|
|
return 0.0;
|
|
}
|
|
String normalized = value.replace("$", "").replace(",", "").trim();
|
|
try {
|
|
return Double.parseDouble(normalized);
|
|
} catch (NumberFormatException e) {
|
|
return 0.0;
|
|
}
|
|
}
|
|
|
|
private double estimateRating(int rank) {
|
|
return Math.max(7.0, 8.8 - rank * 0.01);
|
|
}
|
|
}
|
|
|