10 changed files with 0 additions and 810 deletions
|
@ -1,30 +0,0 @@ |
|||
import project.bean.Movie; |
|||
import project.crawler.MovieCrawler; |
|||
import project.utils.DataStorage; |
|||
import project.display.ResultDisplay; |
|||
|
|||
import java.util.List; |
|||
|
|||
public class Main { |
|||
public static void main(String[] args) { |
|||
try { |
|||
System.out.println("Starting to crawl movie data..."); |
|||
List<Movie> movies = MovieCrawler.crawlMovies(10); // Crawl 10 pages of data
|
|||
System.out.println("Crawling completed, obtained " + movies.size() + " movies data"); |
|||
|
|||
System.out.println("Saving data to CSV file..."); |
|||
DataStorage.saveToCsv(movies, "project/movies.csv"); |
|||
System.out.println("Data saved successfully"); |
|||
|
|||
System.out.println("Analyzing data..."); |
|||
ResultDisplay.displayResults(movies); |
|||
|
|||
System.out.println("Generating charts..."); |
|||
ResultDisplay.generateCharts(movies); |
|||
System.out.println("Chart generation completed, saved to project directory"); |
|||
|
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
} |
|||
@ -1,101 +0,0 @@ |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.Connection; |
|||
import org.json.JSONArray; |
|||
import org.json.JSONObject; |
|||
import org.apache.commons.csv.*; |
|||
|
|||
import java.io.FileWriter; |
|||
import java.io.IOException; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class JobSpider { |
|||
|
|||
// ⚠️ 注意:这个 URL 可能会随时间变化,请务必按上面的步骤在 F12 中确认最新的 URL
|
|||
// 这里的参数 keyword=Java, page=1 是示例,实际需要根据网站调整
|
|||
private static final String API_URL = "https://www.iguopin.com/api/job/search?keyword=&page=1&pageSize=20"; |
|||
|
|||
public static void main(String[] args) { |
|||
List<String[]> jobList = new ArrayList<>(); |
|||
|
|||
try { |
|||
System.out.println("🚀 开始连接国聘网数据接口..."); |
|||
|
|||
// 1. 构造请求,必须伪装 Header,否则会被拒绝
|
|||
String jsonResponse = Jsoup.connect(API_URL) |
|||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36") |
|||
.header("Accept", "application/json, text/plain, */*") |
|||
.header("Referer", "https://www.iguopin.com/") // 假装是从首页跳过来的
|
|||
.timeout(5000) |
|||
.ignoreContentType(true) // 重要!允许接收非 HTML 内容 (即 JSON)
|
|||
.execute() |
|||
.body(); |
|||
|
|||
// 2. 解析 JSON 数据
|
|||
JSONObject root = new JSONObject(jsonResponse); |
|||
|
|||
// ⚠️ 关键:你需要根据 F12 看到的实际 JSON 结构调整这里的键名 (key)
|
|||
// 假设数据结构是 { "data": { "list": [...] } } 或者 { "result": [...] }
|
|||
// 下面是一个通用的猜测逻辑,请根据实际打印结果修改!
|
|||
|
|||
JSONArray jobsArray = null; |
|||
|
|||
// 尝试几种常见的结构 (你需要打印 root.toString() 来确认到底是哪一层)
|
|||
if (root.has("data")) { |
|||
JSONObject dataObj = root.getJSONObject("data"); |
|||
if (dataObj.has("list")) jobsArray = dataObj.getJSONArray("list"); |
|||
else if (dataObj.has("jobs")) jobsArray = dataObj.getJSONArray("jobs"); |
|||
} else if (root.has("result")) { |
|||
jobsArray = root.getJSONArray("result"); |
|||
} else if (root.has("jobs")) { |
|||
jobsArray = root.getJSONArray("jobs"); |
|||
} |
|||
|
|||
if (jobsArray == null) { |
|||
System.err.println("❌ 未找到职位列表数据。JSON 结构可能已变更,请打印查看:\n" + jsonResponse); |
|||
return; |
|||
} |
|||
|
|||
System.out.println("✅ 解析成功,共发现 " + jobsArray.length() + " 个职位。"); |
|||
|
|||
// 3. 提取具体字段
|
|||
for (int i = 0; i < jobsArray.length(); i++) { |
|||
JSONObject job = jobsArray.getJSONObject(i); |
|||
|
|||
// ⚠️ 再次强调:这里的 "jobName", "companyName" 必须和你 F12 里看到的一模一样!
|
|||
String title = job.optString("jobName", "未知职位"); |
|||
String company = job.optString("companyName", "未知公司"); |
|||
String salary = job.optString("salary", "面议"); |
|||
String location = job.optString("workLocation", "未知地点"); |
|||
String link = "https://www.iguopin.com/job/detail/" + job.optString("id"); // 拼接详情页链接
|
|||
|
|||
jobList.add(new String[]{title, company, salary, location, link}); |
|||
System.out.println("[" + (i+1) + "] " + title + " | " + company); |
|||
} |
|||
|
|||
// 4. 保存到 CSV
|
|||
saveToCsv(jobList, "guopin_jobs.csv"); |
|||
System.out.println("💾 数据已保存至 guopin_jobs.csv"); |
|||
|
|||
} catch (IOException e) { |
|||
e.printStackTrace(); |
|||
System.err.println("❌ 网络请求失败:可能是接口地址变了,或者被反爬拦截。"); |
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
System.err.println("❌ JSON 解析失败:请检查代码中的 key 名称是否与网页返回的一致。"); |
|||
} |
|||
} |
|||
|
|||
private static void saveToCsv(List<String[]> data, String fileName) throws IOException { |
|||
FileWriter out = new FileWriter(fileName); |
|||
// 定义表头
|
|||
CSVFormat format = CSVFormat.DEFAULT.withHeader("职位名称", "公司名称", "薪资", "地点", "链接"); |
|||
CSVPrinter printer = new CSVPrinter(out, format); |
|||
|
|||
for (String[] row : data) { |
|||
printer.printRecord(row); |
|||
} |
|||
printer.close(); |
|||
out.close(); |
|||
} |
|||
} |
|||
@ -1,42 +0,0 @@ |
|||
package project.analysis; |
|||
|
|||
import project.bean.Movie; |
|||
|
|||
import java.util.*; |
|||
import java.util.stream.Collectors; |
|||
|
|||
public class MovieAnalyzer { |
|||
public static Map<Double, Long> getRatingDistribution(List<Movie> movies) { |
|||
return movies.stream() |
|||
.collect(Collectors.groupingBy(Movie::getRating, Collectors.counting())); |
|||
} |
|||
|
|||
public static Map<Integer, Double> getYearRatingCorrelation(List<Movie> movies) { |
|||
return movies.stream() |
|||
.collect(Collectors.groupingBy(Movie::getYear, |
|||
Collectors.averagingDouble(Movie::getRating))); |
|||
} |
|||
|
|||
public static Map<String, Long> getDirectorMovieCount(List<Movie> movies) { |
|||
return movies.stream() |
|||
.collect(Collectors.groupingBy(Movie::getDirector, Collectors.counting())) |
|||
.entrySet().stream() |
|||
.filter(entry -> entry.getValue() > 1) |
|||
.sorted(Map.Entry.<String, Long>comparingByValue().reversed()) |
|||
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e1, LinkedHashMap::new)); |
|||
} |
|||
|
|||
public static double getAverageRating(List<Movie> movies) { |
|||
return movies.stream() |
|||
.mapToDouble(Movie::getRating) |
|||
.average() |
|||
.orElse(0.0); |
|||
} |
|||
|
|||
public static List<Movie> getTopRatedMovies(List<Movie> movies, int count) { |
|||
return movies.stream() |
|||
.sorted(Comparator.comparingDouble(Movie::getRating).reversed()) |
|||
.limit(count) |
|||
.collect(Collectors.toList()); |
|||
} |
|||
} |
|||
@ -1,60 +0,0 @@ |
|||
package project.bean; |
|||
|
|||
/**
 * Mutable bean describing one movie: title, rating, release year and director.
 */
public class Movie {

    private String title;
    private String director;
    private int year;
    private double rating;

    /** Creates a movie with all fields at their Java defaults. */
    public Movie() {
    }

    /**
     * Creates a fully populated movie.
     *
     * @param title    movie title
     * @param rating   rating value
     * @param year     release year
     * @param director director name
     */
    public Movie(String title, double rating, int year, String director) {
        this.title = title;
        this.rating = rating;
        this.year = year;
        this.director = director;
    }

    /** @return the movie title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the movie title to set */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the rating value */
    public double getRating() {
        return this.rating;
    }

    /** @param rating the rating value to set */
    public void setRating(double rating) {
        this.rating = rating;
    }

    /** @return the release year */
    public int getYear() {
        return this.year;
    }

    /** @param year the release year to set */
    public void setYear(int year) {
        this.year = year;
    }

    /** @return the director name */
    public String getDirector() {
        return this.director;
    }

    /** @param director the director name to set */
    public void setDirector(String director) {
        this.director = director;
    }

    /**
     * Renders all four fields as
     * {@code Movie{title='..', rating=.., year=.., director='..'}}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Movie{");
        text.append("title='").append(title).append('\'');
        text.append(", rating=").append(rating);
        text.append(", year=").append(year);
        text.append(", director='").append(director).append('\'');
        text.append('}');
        return text.toString();
    }
}
|||
@ -1,194 +0,0 @@ |
|||
package project.crawler; |
|||
|
|||
import project.bean.Movie; |
|||
import project.utils.DataCleaner; |
|||
import project.utils.HttpUtils; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class MovieCrawler { |
|||
public static List<Movie> crawlMovies(int pageCount) throws Exception { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
|
|||
for (int page = 1; page <= pageCount; page++) { |
|||
String url = "https://movie.douban.com/top250?start=" + (page - 1) * 25; |
|||
System.out.println("Crawling page " + page + " from " + url); |
|||
try { |
|||
String html = HttpUtils.getHtml(url); |
|||
System.out.println("Got HTML content, length: " + html.length()); |
|||
|
|||
// 打印 HTML 内容的前 500 个字符,了解实际结构
|
|||
if (html.length() > 500) { |
|||
System.out.println("HTML preview: " + html.substring(0, 500) + "..."); |
|||
} |
|||
|
|||
List<Movie> pageMovies = parseMovies(html); |
|||
System.out.println("Parsed " + pageMovies.size() + " movies from page " + page); |
|||
movies.addAll(pageMovies); |
|||
} catch (Exception e) { |
|||
System.out.println("Error crawling page " + page + ": " + e.getMessage()); |
|||
} |
|||
Thread.sleep(1000); // 控制请求频率
|
|||
} |
|||
|
|||
System.out.println("Total movies crawled: " + movies.size()); |
|||
return movies; |
|||
} |
|||
|
|||
private static List<Movie> parseMovies(String html) { |
|||
List<Movie> movies = new ArrayList<>(); |
|||
|
|||
// Find all movie items by looking for <div class="item"> and matching until </div> at the same nesting level
|
|||
int startIndex = 0; |
|||
int count = 0; |
|||
|
|||
while (true) { |
|||
int itemStart = html.indexOf("<div class=\"item\">", startIndex); |
|||
if (itemStart < 0) break; |
|||
|
|||
// Find the matching </div> by counting nested divs
|
|||
int pos = itemStart + "<div class=\"item\">".length(); |
|||
int depth = 1; |
|||
int itemEnd = -1; |
|||
|
|||
while (pos < html.length() && depth > 0) { |
|||
int nextOpen = html.indexOf("<div", pos); |
|||
int nextClose = html.indexOf("</div>", pos); |
|||
|
|||
if (nextClose < 0) break; // No closing tag found
|
|||
|
|||
if (nextOpen >= 0 && nextOpen < nextClose) { |
|||
// Found an opening div before closing
|
|||
depth++; |
|||
pos = nextOpen + 4; |
|||
} else { |
|||
// Found a closing div
|
|||
depth--; |
|||
if (depth == 0) { |
|||
itemEnd = nextClose + 6; |
|||
} |
|||
pos = nextClose + 6; |
|||
} |
|||
} |
|||
|
|||
if (itemEnd > itemStart) { |
|||
count++; |
|||
String movieHtml = html.substring(itemStart, itemEnd); |
|||
// Don't print movie HTML to avoid excessive output
|
|||
Movie movie = parseMovie(movieHtml); |
|||
if (movie != null) { |
|||
movies.add(movie); |
|||
} |
|||
startIndex = itemEnd; |
|||
} else { |
|||
break; |
|||
} |
|||
} |
|||
|
|||
System.out.println("Found " + count + " movie items, parsed " + movies.size() + " valid movies"); |
|||
return movies; |
|||
} |
|||
|
|||
private static Movie parseMovie(String movieHtml) { |
|||
try { |
|||
// Extract title from img alt attribute
|
|||
String title = ""; |
|||
int altIndex = movieHtml.indexOf("alt="); |
|||
if (altIndex > 0) { |
|||
int start = movieHtml.indexOf('"', altIndex); |
|||
int end = movieHtml.indexOf('"', start + 1); |
|||
if (start > 0 && end > 0) { |
|||
title = movieHtml.substring(start + 1, end).trim(); |
|||
} |
|||
} |
|||
|
|||
// Extract rating
|
|||
double rating = 0.0; |
|||
int ratingIndex = movieHtml.indexOf("rating_num"); |
|||
if (ratingIndex > 0) { |
|||
int start = movieHtml.indexOf('>', ratingIndex); |
|||
int end = movieHtml.indexOf("</span>", start); |
|||
if (start > 0 && end > 0) { |
|||
String ratingStr = movieHtml.substring(start + 1, end).trim(); |
|||
try { |
|||
rating = Double.parseDouble(ratingStr); |
|||
} catch (NumberFormatException e) { |
|||
rating = 0.0; |
|||
} |
|||
} |
|||
} |
|||
|
|||
// Extract year and director from movie info
|
|||
int year = 0; |
|||
String director = "Unknown"; |
|||
|
|||
// Find the info section which contains year and director
|
|||
// Look for <p> tag without class or with specific class
|
|||
int infoStart = -1; |
|||
int pStart = movieHtml.indexOf("<p>"); |
|||
int pClassStart = movieHtml.indexOf("<p class=\"\">"); |
|||
|
|||
if (pStart >= 0) { |
|||
infoStart = pStart; |
|||
} |
|||
if (pClassStart >= 0 && (pStart < 0 || pClassStart < pStart)) { |
|||
infoStart = pClassStart; |
|||
} |
|||
|
|||
if (infoStart > 0) { |
|||
int infoEnd = movieHtml.indexOf("</p>", infoStart); |
|||
if (infoEnd > infoStart) { |
|||
String infoSection = movieHtml.substring(infoStart, infoEnd); |
|||
|
|||
// Extract year - look for 4-digit year after <br> tag
|
|||
int brIndex = infoSection.indexOf("<br>"); |
|||
if (brIndex > 0) { |
|||
String afterBr = infoSection.substring(brIndex + 4).trim(); |
|||
// Find first 4-digit number
|
|||
for (int i = 0; i <= afterBr.length() - 4; i++) { |
|||
String possibleYear = afterBr.substring(i, i + 4); |
|||
if (possibleYear.matches("\\d{4}")) { |
|||
try { |
|||
year = Integer.parseInt(possibleYear); |
|||
break; |
|||
} catch (NumberFormatException e) { |
|||
// Continue
|
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
// Extract director - director info is between "导演:" and " "
|
|||
// Look for the pattern: 导演: [director name]
|
|||
int directorLabelIdx = infoSection.indexOf("\u5bfc\u6f14:"); // Unicode for "导演:"
|
|||
if (directorLabelIdx >= 0) { |
|||
int directorStart = directorLabelIdx + 3; // Skip "导演:"
|
|||
int directorEnd = infoSection.indexOf(" ", directorStart); |
|||
if (directorEnd > directorStart) { |
|||
director = infoSection.substring(directorStart, directorEnd).trim(); |
|||
// Clean up any remaining HTML
|
|||
director = director.replaceAll("<[^>]*>", "").trim(); |
|||
// Extract only Chinese name (before space)
|
|||
int spaceIdx = director.indexOf(" "); |
|||
if (spaceIdx > 0) { |
|||
director = director.substring(0, spaceIdx).trim(); |
|||
} |
|||
if (director.isEmpty()) director = "Unknown"; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
// If title and rating are valid, create movie object
|
|||
if (!title.isEmpty() && rating > 0) { |
|||
return new Movie(title, rating, year, director); |
|||
} |
|||
} catch (Exception e) { |
|||
// Silently handle exceptions
|
|||
} |
|||
return null; |
|||
} |
|||
} |
|||
@ -1,47 +0,0 @@ |
|||
package project.display; |
|||
|
|||
import project.bean.Movie; |
|||
import project.analysis.MovieAnalyzer; |
|||
|
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
public class ResultDisplay { |
|||
public static void displayResults(List<Movie> movies) { |
|||
System.out.println("===== Movie Data Analysis Results ====="); |
|||
System.out.println("Total movies: " + movies.size()); |
|||
System.out.printf("Average rating: %.2f\n\n", MovieAnalyzer.getAverageRating(movies)); |
|||
|
|||
System.out.println("===== Rating Distribution ====="); |
|||
Map<Double, Long> ratingDistribution = MovieAnalyzer.getRatingDistribution(movies); |
|||
ratingDistribution.entrySet().stream() |
|||
.sorted(Map.Entry.comparingByKey()) |
|||
.forEach(entry -> System.out.printf("Rating %.1f: %d movies\n", entry.getKey(), entry.getValue())); |
|||
|
|||
System.out.println("\n===== Year-Rating Correlation ====="); |
|||
Map<Integer, Double> yearRating = MovieAnalyzer.getYearRatingCorrelation(movies); |
|||
yearRating.entrySet().stream() |
|||
.sorted(Map.Entry.comparingByKey()) |
|||
.forEach(entry -> System.out.printf("%d: %.2f\n", entry.getKey(), entry.getValue())); |
|||
|
|||
System.out.println("\n===== Director Movie Count Ranking ====="); |
|||
Map<String, Long> directorCount = MovieAnalyzer.getDirectorMovieCount(movies); |
|||
directorCount.entrySet().stream() |
|||
.limit(10) |
|||
.forEach(entry -> System.out.printf("%s: %d movies\n", entry.getKey(), entry.getValue())); |
|||
|
|||
System.out.println("\n===== Top 10 Highest Rated Movies ====="); |
|||
List<Movie> topRated = MovieAnalyzer.getTopRatedMovies(movies, 10); |
|||
for (int i = 0; i < topRated.size(); i++) { |
|||
Movie movie = topRated.get(i); |
|||
System.out.printf("%d. %s (%.1f) - %d - Director: %s\n", |
|||
i + 1, movie.getTitle(), movie.getRating(), movie.getYear(), movie.getDirector()); |
|||
} |
|||
} |
|||
|
|||
public static void generateCharts(List<Movie> movies) throws Exception { |
|||
System.out.println("\n===== Chart Generation ====="); |
|||
System.out.println("Due to environment limitations, chart generation is not implemented"); |
|||
System.out.println("Suggest using JFreeChart or other chart libraries for visualization"); |
|||
} |
|||
} |
|||
@ -1,29 +0,0 @@ |
|||
package project.utils; |
|||
|
|||
/**
 * Null-safe helpers that normalize scraped text and convert it to numbers.
 */
public class DataCleaner {

    /**
     * A run of exactly four digits that is not part of a longer number.
     * Fully-qualified so the file needs no additional import.
     */
    private static final java.util.regex.Pattern STANDALONE_YEAR =
            java.util.regex.Pattern.compile("(?<!\\d)\\d{4}(?!\\d)");

    /**
     * Removes HTML tags, collapses every run of whitespace (including line
     * breaks) into a single space and trims the result.
     *
     * <p>Trimming happens last so that whitespace exposed by removing a
     * leading or trailing tag (for example {@code "<p> hi </p>"}) does not
     * survive in the result.
     *
     * @param text raw text, may be {@code null}
     * @return the cleaned text; an empty string for {@code null}
     */
    public static String cleanText(String text) {
        if (text == null) {
            return "";
        }
        return text
                .replaceAll("<[^>]*>", "")
                .replaceAll("\\s+", " ")
                .trim();
    }

    /**
     * Parses a rating.
     *
     * @param ratingStr text holding a decimal number, may be {@code null}
     * @return the parsed value, or {@code 0.0} for {@code null}, empty or
     *         non-numeric input
     */
    public static double parseRating(String ratingStr) {
        if (ratingStr == null || ratingStr.isEmpty()) {
            return 0.0;
        }
        try {
            return Double.parseDouble(ratingStr.trim());
        } catch (NumberFormatException ignored) {
            return 0.0;
        }
    }

    /**
     * Parses a year out of free text.
     *
     * <p>The first standalone four-digit number is used, so text containing
     * several numbers such as {@code "1994 / 2008"} yields 1994 instead of the
     * digits of all numbers glued together. When there is no such number, all
     * digits of the input are concatenated and parsed.
     *
     * @param yearStr text containing a year, may be {@code null}
     * @return the year, or {@code 0} for {@code null}, empty input, input
     *         without digits, or digits that do not fit an {@code int}
     */
    public static int parseYear(String yearStr) {
        if (yearStr == null || yearStr.isEmpty()) {
            return 0;
        }
        java.util.regex.Matcher year = STANDALONE_YEAR.matcher(yearStr);
        String digits = year.find() ? year.group() : yearStr.replaceAll("[^0-9]", "");
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException ignored) {
            return 0;
        }
    }
}
|||
@ -1,26 +0,0 @@ |
|||
package project.utils; |
|||
|
|||
import project.bean.Movie; |
|||
|
|||
import java.io.OutputStreamWriter; |
|||
import java.io.FileOutputStream; |
|||
import java.io.IOException; |
|||
import java.util.List; |
|||
|
|||
public class DataStorage { |
|||
public static void saveToCsv(List<Movie> movies, String filePath) throws IOException { |
|||
// Use UTF-8 encoding to properly handle Chinese characters
|
|||
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(filePath), "UTF-8"); |
|||
writer.write("Title,Rating,Year,Director\n"); |
|||
|
|||
for (Movie movie : movies) { |
|||
writer.write(String.format("%s,%.1f,%d,%s\n", |
|||
movie.getTitle(), |
|||
movie.getRating(), |
|||
movie.getYear(), |
|||
movie.getDirector())); |
|||
} |
|||
|
|||
writer.close(); |
|||
} |
|||
} |
|||
@ -1,30 +0,0 @@ |
|||
package project.utils; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
|
|||
/**
 * Minimal HTTP helper for downloading a page as text.
 */
public class HttpUtils {

    /** Maximum time to wait for the connection to be established. */
    private static final int CONNECT_TIMEOUT_MS = 10000;

    /** Maximum time to wait for data once connected. */
    private static final int READ_TIMEOUT_MS = 15000;

    /**
     * Performs a GET request and returns the response body decoded as UTF-8.
     *
     * <p>The body is read line by line and the lines are concatenated without
     * their line terminators, so the result contains no line breaks.
     * Timeouts are set so a stalled server cannot block the caller forever,
     * and the reader and connection are released on every path.
     *
     * @param url absolute HTTP(S) URL to fetch
     * @return the response body with line terminators removed
     * @throws Exception if the URL is invalid, the request fails or times out,
     *                   or the status code is not 200 (reported as an
     *                   {@link java.io.IOException})
     */
    public static String getHtml(String url) throws Exception {
        HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();
        try {
            con.setRequestMethod("GET");
            con.setRequestProperty("User-Agent", "Mozilla/5.0");
            con.setConnectTimeout(CONNECT_TIMEOUT_MS);
            con.setReadTimeout(READ_TIMEOUT_MS);

            int responseCode = con.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new java.io.IOException("HTTP error code: " + responseCode);
            }

            StringBuilder html = new StringBuilder();
            try (BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    html.append(inputLine);
                }
            }
            return html.toString();
        } finally {
            con.disconnect();
        }
    }
}
|||
Loading…
Reference in new issue