You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
210 lines
8.7 KiB
210 lines
8.7 KiB
package com.crawler;
|
|
|
|
import com.crawler.model.Movie;
import com.crawler.strategy.impl.DoubanCrawler;
import com.crawler.strategy.impl.QuotesCrawler;
import com.crawler.strategy.impl.TencentCrawler;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.Callable;
|
|
|
|
public class TestCrawler {
|
|
public static void main(String[] args) {
|
|
System.out.println("\n" + "=".repeat(80));
|
|
System.out.println(" Movie Data Crawler - 3 Sites");
|
|
System.out.println("=".repeat(80) + "\n");
|
|
|
|
List<Movie> allMovies = new ArrayList<>();
|
|
List<Movie> doubanMovies = new ArrayList<>();
|
|
List<Movie> quotesMovies = new ArrayList<>();
|
|
List<Movie> tencentMovies = new ArrayList<>();
|
|
|
|
// Crawl Douban
|
|
System.out.println("\n" + "-".repeat(80));
|
|
System.out.println("Crawling Douban Movies...");
|
|
System.out.println("-".repeat(80));
|
|
try {
|
|
DoubanCrawler doubanCrawler = new DoubanCrawler();
|
|
doubanMovies = doubanCrawler.crawl();
|
|
allMovies.addAll(doubanMovies);
|
|
System.out.println("\nDouban crawling completed! Got " + doubanMovies.size() + " movies");
|
|
printMovieList("Douban", doubanMovies);
|
|
} catch (Exception e) {
|
|
System.err.println("Douban crawling error: " + e.getMessage());
|
|
}
|
|
|
|
// Crawl Quotes
|
|
System.out.println("\n" + "-".repeat(80));
|
|
System.out.println("Crawling Quotes to Movies...");
|
|
System.out.println("-".repeat(80));
|
|
try {
|
|
QuotesCrawler quotesCrawler = new QuotesCrawler();
|
|
quotesMovies = quotesCrawler.crawl();
|
|
allMovies.addAll(quotesMovies);
|
|
System.out.println("\nQuotes crawling completed! Got " + quotesMovies.size() + " movies");
|
|
printMovieList("Quotes", quotesMovies);
|
|
} catch (Exception e) {
|
|
System.err.println("Quotes crawling error: " + e.getMessage());
|
|
}
|
|
|
|
// Crawl Tencent
|
|
System.out.println("\n" + "-".repeat(80));
|
|
System.out.println("Crawling Tencent Movies...");
|
|
System.out.println("-".repeat(80));
|
|
try {
|
|
TencentCrawler tencentCrawler = new TencentCrawler();
|
|
tencentMovies = tencentCrawler.crawl();
|
|
allMovies.addAll(tencentMovies);
|
|
System.out.println("\nTencent crawling completed! Got " + tencentMovies.size() + " movies");
|
|
printMovieList("Tencent", tencentMovies);
|
|
} catch (Exception e) {
|
|
System.err.println("Tencent crawling error: " + e.getMessage());
|
|
}
|
|
|
|
// Sort by rating descending
|
|
allMovies.sort(Comparator.comparing(Movie::getRating).reversed());
|
|
|
|
// Show summary
|
|
System.out.println("\n\n" + "=".repeat(80));
|
|
System.out.println(" Crawling Summary");
|
|
System.out.println("=".repeat(80));
|
|
System.out.println("\nData statistics:");
|
|
System.out.println(" - Douban: " + doubanMovies.size() + " movies");
|
|
System.out.println(" - Quotes: " + quotesMovies.size() + " movies");
|
|
System.out.println(" - Tencent: " + tencentMovies.size() + " movies");
|
|
System.out.println(" - Total: " + allMovies.size() + " movies");
|
|
|
|
System.out.println("\nTop 50 Movies:");
|
|
System.out.println("-".repeat(80));
|
|
System.out.printf("%-6s %-35s %-10s %-15s %-12s %-15s %-10s\n",
|
|
"Rank", "Title", "Rating", "Source", "Year", "Director", "Actors");
|
|
System.out.println("-".repeat(80));
|
|
|
|
int count = 0;
|
|
for (Movie movie : allMovies) {
|
|
if (count >= 50) break;
|
|
System.out.printf("%-6d %-35s %-10.1f %-15s %-12s %-15s %-10s\n",
|
|
count + 1,
|
|
truncate(movie.getTitle(), 35),
|
|
movie.getRating() != null ? movie.getRating() : 0.0,
|
|
movie.getSource(),
|
|
movie.getReleaseDate() != null && !movie.getReleaseDate().isEmpty() ? movie.getReleaseDate() : "Unknown",
|
|
truncate(movie.getDirector(), 15),
|
|
truncate(movie.getActors(), 10));
|
|
count++;
|
|
}
|
|
|
|
System.out.println("-".repeat(80) + "\n");
|
|
|
|
// Save to CSV file
|
|
String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss"));
|
|
String csvFilename = "movies_" + timestamp + ".csv";
|
|
try {
|
|
saveToCSV(allMovies, doubanMovies, quotesMovies, tencentMovies, csvFilename);
|
|
System.out.println("Data saved to file: " + csvFilename);
|
|
System.out.println(" - Contains complete data, can be opened directly in Excel");
|
|
} catch (IOException e) {
|
|
System.err.println("Save CSV error: " + e.getMessage());
|
|
}
|
|
|
|
System.out.println("\n" + "=".repeat(80));
|
|
System.out.println(" Program completed!");
|
|
System.out.println("=".repeat(80) + "\n");
|
|
}
|
|
|
|
private static void printMovieList(String source, List<Movie> movies) {
|
|
System.out.println("\n" + source + " Movie List (First 30):");
|
|
System.out.println("-".repeat(80));
|
|
System.out.printf("%-6s %-35s %-10s %-12s %-15s %-10s\n",
|
|
"No.", "Title", "Rating", "Year", "Director", "Source");
|
|
System.out.println("-".repeat(80));
|
|
|
|
int index = 0;
|
|
for (Movie movie : movies) {
|
|
if (index >= 30) break;
|
|
System.out.printf("%-6d %-35s %-10.1f %-12s %-15s %-10s\n",
|
|
index + 1,
|
|
truncate(movie.getTitle(), 35),
|
|
movie.getRating() != null ? movie.getRating() : 0.0,
|
|
movie.getReleaseDate() != null && !movie.getReleaseDate().isEmpty() ? movie.getReleaseDate() : "Unknown",
|
|
truncate(movie.getDirector(), 15),
|
|
movie.getSource());
|
|
index++;
|
|
}
|
|
System.out.println("-".repeat(80));
|
|
}
|
|
|
|
private static void saveToCSV(List<Movie> allMovies, List<Movie> doubanMovies,
|
|
List<Movie> quotesMovies, List<Movie> tencentMovies,
|
|
String filename) throws IOException {
|
|
try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) {
|
|
// Write header
|
|
writer.write("No.,Title,Rating,Director,Actors,Year,Source");
|
|
writer.newLine();
|
|
|
|
// Write all movies
|
|
writer.newLine();
|
|
writer.write("All Movies (Sorted by Rating)");
|
|
writer.newLine();
|
|
writeMoviesToCSV(writer, allMovies);
|
|
|
|
// Write Douban movies
|
|
writer.newLine();
|
|
writer.write("Douban Movies");
|
|
writer.newLine();
|
|
writeMoviesToCSV(writer, doubanMovies);
|
|
|
|
// Write Quotes movies
|
|
writer.newLine();
|
|
writer.write("Quotes Movies");
|
|
writer.newLine();
|
|
writeMoviesToCSV(writer, quotesMovies);
|
|
|
|
// Write Tencent movies
|
|
writer.newLine();
|
|
writer.write("Tencent Movies");
|
|
writer.newLine();
|
|
writeMoviesToCSV(writer, tencentMovies);
|
|
}
|
|
}
|
|
|
|
private static void writeMoviesToCSV(BufferedWriter writer, List<Movie> movies) throws IOException {
|
|
int index = 0;
|
|
for (Movie movie : movies) {
|
|
String line = String.format("%d,\"%s\",%.1f,\"%s\",\"%s\",\"%s\",\"%s\"",
|
|
index + 1,
|
|
escapeCSV(movie.getTitle()),
|
|
movie.getRating() != null ? movie.getRating() : 0.0,
|
|
escapeCSV(movie.getDirector()),
|
|
escapeCSV(movie.getActors()),
|
|
escapeCSV(movie.getReleaseDate()),
|
|
escapeCSV(movie.getSource()));
|
|
writer.write(line);
|
|
writer.newLine();
|
|
index++;
|
|
}
|
|
}
|
|
|
|
private static String escapeCSV(String str) {
|
|
if (str == null || str.isEmpty()) {
|
|
return "";
|
|
}
|
|
if (str.contains(",") || str.contains("\"") || str.contains("\n")) {
|
|
return "\"" + str.replace("\"", "\"\"") + "\"";
|
|
}
|
|
return str;
|
|
}
|
|
|
|
private static String truncate(String str, int maxLength) {
|
|
if (str == null || str.isEmpty()) {
|
|
return "";
|
|
}
|
|
return str.length() > maxLength ? str.substring(0, maxLength - 3) + "..." : str;
|
|
}
|
|
}
|
|
|