6 changed files with 517 additions and 0 deletions
@ -0,0 +1,142 @@ |
|||||
|
package java01; |
||||
|
|
||||
|
import java.util.*; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
public class DataAnalyzer { |
||||
|
private DataCleaner cleaner; |
||||
|
|
||||
|
public DataAnalyzer() { |
||||
|
this.cleaner = new DataCleaner(); |
||||
|
} |
||||
|
|
||||
|
// 统计技能词频
|
||||
|
public Map<String, Integer> analyzeSkillFrequency(List<Job> jobs) { |
||||
|
Map<String, Integer> skillMap = new HashMap<>(); |
||||
|
|
||||
|
for (Job job : jobs) { |
||||
|
String skills = job.getSkills(); |
||||
|
if (skills != null && !skills.isEmpty()) { |
||||
|
String[] skillArray = skills.split(" "); |
||||
|
for (String skill : skillArray) { |
||||
|
if (!skill.isEmpty()) { |
||||
|
skillMap.put(skill, skillMap.getOrDefault(skill, 0) + 1); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 按词频排序
|
||||
|
return skillMap.entrySet().stream() |
||||
|
.sorted(Map.Entry.<String, Integer>comparingByValue().reversed()) |
||||
|
.collect(Collectors.toMap( |
||||
|
Map.Entry::getKey, |
||||
|
Map.Entry::getValue, |
||||
|
(e1, e2) -> e1, |
||||
|
LinkedHashMap::new |
||||
|
)); |
||||
|
} |
||||
|
|
||||
|
// 分析薪资与经验的关系
|
||||
|
public Map<String, Double> analyzeSalaryByExperience(List<Job> jobs) { |
||||
|
Map<String, List<Integer>> experienceSalaryMap = new HashMap<>(); |
||||
|
|
||||
|
for (Job job : jobs) { |
||||
|
String experience = job.getExperience(); |
||||
|
int minSalary = cleaner.extractMinSalary(job.getSalary()); |
||||
|
int maxSalary = cleaner.extractMaxSalary(job.getSalary()); |
||||
|
int avgSalary = (minSalary + maxSalary) / 2; |
||||
|
|
||||
|
if (!experience.isEmpty() && avgSalary > 0) { |
||||
|
experienceSalaryMap.computeIfAbsent(experience, k -> new ArrayList<>()).add(avgSalary); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 计算每个经验级别的平均薪资
|
||||
|
Map<String, Double> result = new HashMap<>(); |
||||
|
for (Map.Entry<String, List<Integer>> entry : experienceSalaryMap.entrySet()) { |
||||
|
double avgSalary = entry.getValue().stream().mapToInt(Integer::intValue).average().orElse(0); |
||||
|
result.put(entry.getKey(), avgSalary); |
||||
|
} |
||||
|
|
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
// 分析薪资与学历的关系
|
||||
|
public Map<String, Double> analyzeSalaryByEducation(List<Job> jobs) { |
||||
|
Map<String, List<Integer>> educationSalaryMap = new HashMap<>(); |
||||
|
|
||||
|
for (Job job : jobs) { |
||||
|
String education = job.getEducation(); |
||||
|
int minSalary = cleaner.extractMinSalary(job.getSalary()); |
||||
|
int maxSalary = cleaner.extractMaxSalary(job.getSalary()); |
||||
|
int avgSalary = (minSalary + maxSalary) / 2; |
||||
|
|
||||
|
if (!education.isEmpty() && avgSalary > 0) { |
||||
|
educationSalaryMap.computeIfAbsent(education, k -> new ArrayList<>()).add(avgSalary); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 计算每个学历级别的平均薪资
|
||||
|
Map<String, Double> result = new HashMap<>(); |
||||
|
for (Map.Entry<String, List<Integer>> entry : educationSalaryMap.entrySet()) { |
||||
|
double avgSalary = entry.getValue().stream().mapToInt(Integer::intValue).average().orElse(0); |
||||
|
result.put(entry.getKey(), avgSalary); |
||||
|
} |
||||
|
|
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
// 分析不同地点的薪资水平
|
||||
|
public Map<String, Double> analyzeSalaryByLocation(List<Job> jobs) { |
||||
|
Map<String, List<Integer>> locationSalaryMap = new HashMap<>(); |
||||
|
|
||||
|
for (Job job : jobs) { |
||||
|
String location = job.getLocation(); |
||||
|
int minSalary = cleaner.extractMinSalary(job.getSalary()); |
||||
|
int maxSalary = cleaner.extractMaxSalary(job.getSalary()); |
||||
|
int avgSalary = (minSalary + maxSalary) / 2; |
||||
|
|
||||
|
if (!location.isEmpty() && avgSalary > 0) { |
||||
|
locationSalaryMap.computeIfAbsent(location, k -> new ArrayList<>()).add(avgSalary); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 计算每个地点的平均薪资
|
||||
|
Map<String, Double> result = new HashMap<>(); |
||||
|
for (Map.Entry<String, List<Integer>> entry : locationSalaryMap.entrySet()) { |
||||
|
double avgSalary = entry.getValue().stream().mapToInt(Integer::intValue).average().orElse(0); |
||||
|
result.put(entry.getKey(), avgSalary); |
||||
|
} |
||||
|
|
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
// 获取薪资分布
|
||||
|
public Map<String, Integer> analyzeSalaryDistribution(List<Job> jobs) { |
||||
|
Map<String, Integer> salaryDistribution = new HashMap<>(); |
||||
|
|
||||
|
for (Job job : jobs) { |
||||
|
int avgSalary = (cleaner.extractMinSalary(job.getSalary()) + cleaner.extractMaxSalary(job.getSalary())) / 2; |
||||
|
|
||||
|
String salaryRange; |
||||
|
if (avgSalary < 5000) { |
||||
|
salaryRange = "5K以下"; |
||||
|
} else if (avgSalary < 10000) { |
||||
|
salaryRange = "5K-10K"; |
||||
|
} else if (avgSalary < 15000) { |
||||
|
salaryRange = "10K-15K"; |
||||
|
} else if (avgSalary < 20000) { |
||||
|
salaryRange = "15K-20K"; |
||||
|
} else if (avgSalary < 30000) { |
||||
|
salaryRange = "20K-30K"; |
||||
|
} else { |
||||
|
salaryRange = "30K以上"; |
||||
|
} |
||||
|
|
||||
|
salaryDistribution.put(salaryRange, salaryDistribution.getOrDefault(salaryRange, 0) + 1); |
||||
|
} |
||||
|
|
||||
|
return salaryDistribution; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,124 @@ |
|||||
|
package java01; |
||||
|
|
||||
|
import java.util.List; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class DataCleaner { |
||||
|
|
||||
|
public List<Job> cleanJobs(List<Job> jobs) { |
||||
|
for (Job job : jobs) { |
||||
|
cleanJob(job); |
||||
|
} |
||||
|
return jobs; |
||||
|
} |
||||
|
|
||||
|
private void cleanJob(Job job) { |
||||
|
// 清理职位标题
|
||||
|
if (job.getTitle() != null) { |
||||
|
job.setTitle(job.getTitle().trim()); |
||||
|
} |
||||
|
|
||||
|
// 清理公司名称
|
||||
|
if (job.getCompany() != null) { |
||||
|
job.setCompany(job.getCompany().trim()); |
||||
|
} |
||||
|
|
||||
|
// 清理薪资
|
||||
|
if (job.getSalary() != null) { |
||||
|
job.setSalary(job.getSalary().trim()); |
||||
|
} |
||||
|
|
||||
|
// 清理地点
|
||||
|
if (job.getLocation() != null) { |
||||
|
job.setLocation(job.getLocation().trim()); |
||||
|
} |
||||
|
|
||||
|
// 清理经验
|
||||
|
if (job.getExperience() != null) { |
||||
|
job.setExperience(job.getExperience().trim()); |
||||
|
} |
||||
|
|
||||
|
// 清理学历
|
||||
|
if (job.getEducation() != null) { |
||||
|
job.setEducation(job.getEducation().trim()); |
||||
|
} |
||||
|
|
||||
|
// 清理技能
|
||||
|
if (job.getSkills() != null) { |
||||
|
job.setSkills(job.getSkills().trim()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 提取薪资范围的最小值
|
||||
|
public int extractMinSalary(String salary) { |
||||
|
if (salary == null || salary.isEmpty()) { |
||||
|
return 0; |
||||
|
} |
||||
|
|
||||
|
// 匹配薪资范围,如"10K-20K"
|
||||
|
Pattern pattern = Pattern.compile("(\\d+)K-(\\d+)K"); |
||||
|
Matcher matcher = pattern.matcher(salary); |
||||
|
|
||||
|
if (matcher.find()) { |
||||
|
return Integer.parseInt(matcher.group(1)) * 1000; |
||||
|
} |
||||
|
|
||||
|
// 匹配固定薪资,如"15K"
|
||||
|
pattern = Pattern.compile("(\\d+)K"); |
||||
|
matcher = pattern.matcher(salary); |
||||
|
if (matcher.find()) { |
||||
|
return Integer.parseInt(matcher.group(1)) * 1000; |
||||
|
} |
||||
|
|
||||
|
return 0; |
||||
|
} |
||||
|
|
||||
|
// 提取薪资范围的最大值
|
||||
|
public int extractMaxSalary(String salary) { |
||||
|
if (salary == null || salary.isEmpty()) { |
||||
|
return 0; |
||||
|
} |
||||
|
|
||||
|
// 匹配薪资范围,如"10K-20K"
|
||||
|
Pattern pattern = Pattern.compile("(\\d+)K-(\\d+)K"); |
||||
|
Matcher matcher = pattern.matcher(salary); |
||||
|
|
||||
|
if (matcher.find()) { |
||||
|
return Integer.parseInt(matcher.group(2)) * 1000; |
||||
|
} |
||||
|
|
||||
|
// 匹配固定薪资,如"15K"
|
||||
|
pattern = Pattern.compile("(\\d+)K"); |
||||
|
matcher = pattern.matcher(salary); |
||||
|
if (matcher.find()) { |
||||
|
return Integer.parseInt(matcher.group(1)) * 1000; |
||||
|
} |
||||
|
|
||||
|
return 0; |
||||
|
} |
||||
|
|
||||
|
// 提取经验年限
|
||||
|
public int extractExperienceYears(String experience) { |
||||
|
if (experience == null || experience.isEmpty()) { |
||||
|
return 0; |
||||
|
} |
||||
|
|
||||
|
// 匹配经验年限,如"3-5年"
|
||||
|
Pattern pattern = Pattern.compile("(\\d+)-(\\d+)年"); |
||||
|
Matcher matcher = pattern.matcher(experience); |
||||
|
|
||||
|
if (matcher.find()) { |
||||
|
return Integer.parseInt(matcher.group(1)); |
||||
|
} |
||||
|
|
||||
|
// 匹配固定经验,如"3年以上"
|
||||
|
pattern = Pattern.compile("(\\d+)年"); |
||||
|
matcher = pattern.matcher(experience); |
||||
|
if (matcher.find()) { |
||||
|
return Integer.parseInt(matcher.group(1)); |
||||
|
} |
||||
|
|
||||
|
return 0; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,44 @@ |
|||||
|
package java01; |
||||
|
|
||||
|
import java.io.FileWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class DataStorage { |
||||
|
|
||||
|
public void writeJobsToCSV(List<Job> jobs, String fileName) { |
||||
|
try (FileWriter writer = new FileWriter(fileName)) { |
||||
|
// 写入表头
|
||||
|
writer.write("职位标题,公司名称,薪资,地点,经验,学历,技能要求\n"); |
||||
|
|
||||
|
for (Job job : jobs) { |
||||
|
// 写入数据行,处理逗号和引号
|
||||
|
writer.write(escapeCsvField(job.getTitle()) + ","); |
||||
|
writer.write(escapeCsvField(job.getCompany()) + ","); |
||||
|
writer.write(escapeCsvField(job.getSalary()) + ","); |
||||
|
writer.write(escapeCsvField(job.getLocation()) + ","); |
||||
|
writer.write(escapeCsvField(job.getExperience()) + ","); |
||||
|
writer.write(escapeCsvField(job.getEducation()) + ","); |
||||
|
writer.write(escapeCsvField(job.getSkills()) + "\n"); |
||||
|
} |
||||
|
|
||||
|
System.out.println("数据已成功写入CSV文件: " + fileName); |
||||
|
} catch (IOException e) { |
||||
|
e.printStackTrace(); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private String escapeCsvField(String field) { |
||||
|
if (field == null) { |
||||
|
return ""; |
||||
|
} |
||||
|
// 如果字段包含逗号、引号或换行符,需要用引号包围
|
||||
|
if (field.contains(",") || field.contains("\"") || field.contains("\n")) { |
||||
|
// 转义字段中的引号
|
||||
|
field = field.replace("\"", "\"\""); |
||||
|
// 用引号包围字段
|
||||
|
return "\"" + field + "\""; |
||||
|
} |
||||
|
return field; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,90 @@ |
|||||
|
package java01; |
||||
|
|
||||
|
/**
 * A single job posting. All attributes are free-text strings exactly as supplied by the caller;
 * the class performs no validation and any attribute may be {@code null}.
 */
public class Job {

    private String title;
    private String company;
    private String salary;
    private String location;
    private String experience;
    private String education;
    private String skills;

    /**
     * Creates a job posting with every attribute set.
     *
     * @param title      job title
     * @param company    company name
     * @param salary     salary text
     * @param location   work location
     * @param experience required experience text
     * @param education  required education text
     * @param skills     required skills text
     */
    public Job(String title, String company, String salary, String location, String experience,
               String education, String skills) {
        this.title = title;
        this.company = company;
        this.salary = salary;
        this.location = location;
        this.experience = experience;
        this.education = education;
        this.skills = skills;
    }

    /** @return the job title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the new job title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the company name */
    public String getCompany() {
        return this.company;
    }

    /** @param company the new company name */
    public void setCompany(String company) {
        this.company = company;
    }

    /** @return the salary text */
    public String getSalary() {
        return this.salary;
    }

    /** @param salary the new salary text */
    public void setSalary(String salary) {
        this.salary = salary;
    }

    /** @return the work location */
    public String getLocation() {
        return this.location;
    }

    /** @param location the new work location */
    public void setLocation(String location) {
        this.location = location;
    }

    /** @return the required experience text */
    public String getExperience() {
        return this.experience;
    }

    /** @param experience the new required experience text */
    public void setExperience(String experience) {
        this.experience = experience;
    }

    /** @return the required education text */
    public String getEducation() {
        return this.education;
    }

    /** @param education the new required education text */
    public void setEducation(String education) {
        this.education = education;
    }

    /** @return the required skills text */
    public String getSkills() {
        return this.skills;
    }

    /** @param skills the new required skills text */
    public void setSkills(String skills) {
        this.skills = skills;
    }

    /**
     * Renders every attribute as {@code name='value'} inside {@code Job{...}}; a {@code null}
     * attribute is rendered as the text {@code null}.
     */
    @Override
    public String toString() {
        return String.format(
                "Job{title='%s', company='%s', salary='%s', location='%s', experience='%s', "
                        + "education='%s', skills='%s'}",
                title, company, salary, location, experience, education, skills);
    }
}
||||
@ -0,0 +1,86 @@ |
|||||
|
package java01; |
||||
|
|
||||
|
import java.io.BufferedReader; |
||||
|
import java.io.InputStreamReader; |
||||
|
import java.net.HttpURLConnection; |
||||
|
import java.net.URL; |
||||
|
import java.net.URLEncoder; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
public class JobCrawler { |
||||
|
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"; |
||||
|
|
||||
|
public List<Job> crawlJobs(String keyword, int pageCount) { |
||||
|
List<Job> jobs = new ArrayList<>(); |
||||
|
|
||||
|
try { |
||||
|
for (int page = 1; page <= pageCount; page++) { |
||||
|
String encodedKeyword = URLEncoder.encode(keyword, "UTF-8"); |
||||
|
String url = String.format("https://sou.zhaopin.com/?jl=489&kw=%s&p=%d", encodedKeyword, page); |
||||
|
System.out.println("爬取页面: " + url); |
||||
|
|
||||
|
// 发送HTTP请求
|
||||
|
String html = sendHttpRequest(url); |
||||
|
|
||||
|
// 解析HTML,提取职位信息
|
||||
|
List<Job> pageJobs = extractJobsFromHtml(html); |
||||
|
jobs.addAll(pageJobs); |
||||
|
|
||||
|
// 添加延迟,避免被反爬虫
|
||||
|
Thread.sleep(2000); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
e.printStackTrace(); |
||||
|
} |
||||
|
|
||||
|
return jobs; |
||||
|
} |
||||
|
|
||||
|
private String sendHttpRequest(String urlString) throws Exception { |
||||
|
URL url = new URL(urlString); |
||||
|
HttpURLConnection conn = (HttpURLConnection) url.openConnection(); |
||||
|
conn.setRequestMethod("GET"); |
||||
|
conn.setRequestProperty("User-Agent", USER_AGENT); |
||||
|
conn.setConnectTimeout(10000); |
||||
|
conn.setReadTimeout(10000); |
||||
|
|
||||
|
BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8")); |
||||
|
StringBuilder response = new StringBuilder(); |
||||
|
String inputLine; |
||||
|
|
||||
|
while ((inputLine = in.readLine()) != null) { |
||||
|
response.append(inputLine); |
||||
|
} |
||||
|
in.close(); |
||||
|
conn.disconnect(); |
||||
|
|
||||
|
return response.toString(); |
||||
|
} |
||||
|
|
||||
|
private List<Job> extractJobsFromHtml(String html) { |
||||
|
List<Job> jobs = new ArrayList<>(); |
||||
|
|
||||
|
// 简化的正则表达式,实际项目中可能需要更复杂的解析
|
||||
|
// 这里使用模拟数据,因为实际解析HTML需要更复杂的逻辑
|
||||
|
// 在真实环境中,建议使用Jsoup等库进行HTML解析
|
||||
|
|
||||
|
// 模拟数据
|
||||
|
for (int i = 1; i <= 10; i++) { |
||||
|
Job job = new Job( |
||||
|
"Java开发工程师" + i, |
||||
|
"科技公司" + i, |
||||
|
(10 + i) + "K-" + (20 + i) + "K", |
||||
|
"北京", |
||||
|
"3-5年", |
||||
|
"本科", |
||||
|
"Java Spring Boot MySQL Redis" |
||||
|
); |
||||
|
jobs.add(job); |
||||
|
} |
||||
|
|
||||
|
return jobs; |
||||
|
} |
||||
|
} |
||||
|
Loading…
Reference in new issue