6 changed files with 517 additions and 0 deletions
@ -0,0 +1,142 @@ |
|||
package java01; |
|||
|
|||
import java.util.*; |
|||
import java.util.stream.Collectors; |
|||
|
|||
public class DataAnalyzer { |
|||
private DataCleaner cleaner; |
|||
|
|||
public DataAnalyzer() { |
|||
this.cleaner = new DataCleaner(); |
|||
} |
|||
|
|||
// 统计技能词频
|
|||
public Map<String, Integer> analyzeSkillFrequency(List<Job> jobs) { |
|||
Map<String, Integer> skillMap = new HashMap<>(); |
|||
|
|||
for (Job job : jobs) { |
|||
String skills = job.getSkills(); |
|||
if (skills != null && !skills.isEmpty()) { |
|||
String[] skillArray = skills.split(" "); |
|||
for (String skill : skillArray) { |
|||
if (!skill.isEmpty()) { |
|||
skillMap.put(skill, skillMap.getOrDefault(skill, 0) + 1); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
// 按词频排序
|
|||
return skillMap.entrySet().stream() |
|||
.sorted(Map.Entry.<String, Integer>comparingByValue().reversed()) |
|||
.collect(Collectors.toMap( |
|||
Map.Entry::getKey, |
|||
Map.Entry::getValue, |
|||
(e1, e2) -> e1, |
|||
LinkedHashMap::new |
|||
)); |
|||
} |
|||
|
|||
// 分析薪资与经验的关系
|
|||
public Map<String, Double> analyzeSalaryByExperience(List<Job> jobs) { |
|||
Map<String, List<Integer>> experienceSalaryMap = new HashMap<>(); |
|||
|
|||
for (Job job : jobs) { |
|||
String experience = job.getExperience(); |
|||
int minSalary = cleaner.extractMinSalary(job.getSalary()); |
|||
int maxSalary = cleaner.extractMaxSalary(job.getSalary()); |
|||
int avgSalary = (minSalary + maxSalary) / 2; |
|||
|
|||
if (!experience.isEmpty() && avgSalary > 0) { |
|||
experienceSalaryMap.computeIfAbsent(experience, k -> new ArrayList<>()).add(avgSalary); |
|||
} |
|||
} |
|||
|
|||
// 计算每个经验级别的平均薪资
|
|||
Map<String, Double> result = new HashMap<>(); |
|||
for (Map.Entry<String, List<Integer>> entry : experienceSalaryMap.entrySet()) { |
|||
double avgSalary = entry.getValue().stream().mapToInt(Integer::intValue).average().orElse(0); |
|||
result.put(entry.getKey(), avgSalary); |
|||
} |
|||
|
|||
return result; |
|||
} |
|||
|
|||
// 分析薪资与学历的关系
|
|||
public Map<String, Double> analyzeSalaryByEducation(List<Job> jobs) { |
|||
Map<String, List<Integer>> educationSalaryMap = new HashMap<>(); |
|||
|
|||
for (Job job : jobs) { |
|||
String education = job.getEducation(); |
|||
int minSalary = cleaner.extractMinSalary(job.getSalary()); |
|||
int maxSalary = cleaner.extractMaxSalary(job.getSalary()); |
|||
int avgSalary = (minSalary + maxSalary) / 2; |
|||
|
|||
if (!education.isEmpty() && avgSalary > 0) { |
|||
educationSalaryMap.computeIfAbsent(education, k -> new ArrayList<>()).add(avgSalary); |
|||
} |
|||
} |
|||
|
|||
// 计算每个学历级别的平均薪资
|
|||
Map<String, Double> result = new HashMap<>(); |
|||
for (Map.Entry<String, List<Integer>> entry : educationSalaryMap.entrySet()) { |
|||
double avgSalary = entry.getValue().stream().mapToInt(Integer::intValue).average().orElse(0); |
|||
result.put(entry.getKey(), avgSalary); |
|||
} |
|||
|
|||
return result; |
|||
} |
|||
|
|||
// 分析不同地点的薪资水平
|
|||
public Map<String, Double> analyzeSalaryByLocation(List<Job> jobs) { |
|||
Map<String, List<Integer>> locationSalaryMap = new HashMap<>(); |
|||
|
|||
for (Job job : jobs) { |
|||
String location = job.getLocation(); |
|||
int minSalary = cleaner.extractMinSalary(job.getSalary()); |
|||
int maxSalary = cleaner.extractMaxSalary(job.getSalary()); |
|||
int avgSalary = (minSalary + maxSalary) / 2; |
|||
|
|||
if (!location.isEmpty() && avgSalary > 0) { |
|||
locationSalaryMap.computeIfAbsent(location, k -> new ArrayList<>()).add(avgSalary); |
|||
} |
|||
} |
|||
|
|||
// 计算每个地点的平均薪资
|
|||
Map<String, Double> result = new HashMap<>(); |
|||
for (Map.Entry<String, List<Integer>> entry : locationSalaryMap.entrySet()) { |
|||
double avgSalary = entry.getValue().stream().mapToInt(Integer::intValue).average().orElse(0); |
|||
result.put(entry.getKey(), avgSalary); |
|||
} |
|||
|
|||
return result; |
|||
} |
|||
|
|||
// 获取薪资分布
|
|||
public Map<String, Integer> analyzeSalaryDistribution(List<Job> jobs) { |
|||
Map<String, Integer> salaryDistribution = new HashMap<>(); |
|||
|
|||
for (Job job : jobs) { |
|||
int avgSalary = (cleaner.extractMinSalary(job.getSalary()) + cleaner.extractMaxSalary(job.getSalary())) / 2; |
|||
|
|||
String salaryRange; |
|||
if (avgSalary < 5000) { |
|||
salaryRange = "5K以下"; |
|||
} else if (avgSalary < 10000) { |
|||
salaryRange = "5K-10K"; |
|||
} else if (avgSalary < 15000) { |
|||
salaryRange = "10K-15K"; |
|||
} else if (avgSalary < 20000) { |
|||
salaryRange = "15K-20K"; |
|||
} else if (avgSalary < 30000) { |
|||
salaryRange = "20K-30K"; |
|||
} else { |
|||
salaryRange = "30K以上"; |
|||
} |
|||
|
|||
salaryDistribution.put(salaryRange, salaryDistribution.getOrDefault(salaryRange, 0) + 1); |
|||
} |
|||
|
|||
return salaryDistribution; |
|||
} |
|||
} |
|||
@ -0,0 +1,124 @@ |
|||
package java01; |
|||
|
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class DataCleaner { |
|||
|
|||
public List<Job> cleanJobs(List<Job> jobs) { |
|||
for (Job job : jobs) { |
|||
cleanJob(job); |
|||
} |
|||
return jobs; |
|||
} |
|||
|
|||
private void cleanJob(Job job) { |
|||
// 清理职位标题
|
|||
if (job.getTitle() != null) { |
|||
job.setTitle(job.getTitle().trim()); |
|||
} |
|||
|
|||
// 清理公司名称
|
|||
if (job.getCompany() != null) { |
|||
job.setCompany(job.getCompany().trim()); |
|||
} |
|||
|
|||
// 清理薪资
|
|||
if (job.getSalary() != null) { |
|||
job.setSalary(job.getSalary().trim()); |
|||
} |
|||
|
|||
// 清理地点
|
|||
if (job.getLocation() != null) { |
|||
job.setLocation(job.getLocation().trim()); |
|||
} |
|||
|
|||
// 清理经验
|
|||
if (job.getExperience() != null) { |
|||
job.setExperience(job.getExperience().trim()); |
|||
} |
|||
|
|||
// 清理学历
|
|||
if (job.getEducation() != null) { |
|||
job.setEducation(job.getEducation().trim()); |
|||
} |
|||
|
|||
// 清理技能
|
|||
if (job.getSkills() != null) { |
|||
job.setSkills(job.getSkills().trim()); |
|||
} |
|||
} |
|||
|
|||
// 提取薪资范围的最小值
|
|||
public int extractMinSalary(String salary) { |
|||
if (salary == null || salary.isEmpty()) { |
|||
return 0; |
|||
} |
|||
|
|||
// 匹配薪资范围,如"10K-20K"
|
|||
Pattern pattern = Pattern.compile("(\\d+)K-(\\d+)K"); |
|||
Matcher matcher = pattern.matcher(salary); |
|||
|
|||
if (matcher.find()) { |
|||
return Integer.parseInt(matcher.group(1)) * 1000; |
|||
} |
|||
|
|||
// 匹配固定薪资,如"15K"
|
|||
pattern = Pattern.compile("(\\d+)K"); |
|||
matcher = pattern.matcher(salary); |
|||
if (matcher.find()) { |
|||
return Integer.parseInt(matcher.group(1)) * 1000; |
|||
} |
|||
|
|||
return 0; |
|||
} |
|||
|
|||
// 提取薪资范围的最大值
|
|||
public int extractMaxSalary(String salary) { |
|||
if (salary == null || salary.isEmpty()) { |
|||
return 0; |
|||
} |
|||
|
|||
// 匹配薪资范围,如"10K-20K"
|
|||
Pattern pattern = Pattern.compile("(\\d+)K-(\\d+)K"); |
|||
Matcher matcher = pattern.matcher(salary); |
|||
|
|||
if (matcher.find()) { |
|||
return Integer.parseInt(matcher.group(2)) * 1000; |
|||
} |
|||
|
|||
// 匹配固定薪资,如"15K"
|
|||
pattern = Pattern.compile("(\\d+)K"); |
|||
matcher = pattern.matcher(salary); |
|||
if (matcher.find()) { |
|||
return Integer.parseInt(matcher.group(1)) * 1000; |
|||
} |
|||
|
|||
return 0; |
|||
} |
|||
|
|||
// 提取经验年限
|
|||
public int extractExperienceYears(String experience) { |
|||
if (experience == null || experience.isEmpty()) { |
|||
return 0; |
|||
} |
|||
|
|||
// 匹配经验年限,如"3-5年"
|
|||
Pattern pattern = Pattern.compile("(\\d+)-(\\d+)年"); |
|||
Matcher matcher = pattern.matcher(experience); |
|||
|
|||
if (matcher.find()) { |
|||
return Integer.parseInt(matcher.group(1)); |
|||
} |
|||
|
|||
// 匹配固定经验,如"3年以上"
|
|||
pattern = Pattern.compile("(\\d+)年"); |
|||
matcher = pattern.matcher(experience); |
|||
if (matcher.find()) { |
|||
return Integer.parseInt(matcher.group(1)); |
|||
} |
|||
|
|||
return 0; |
|||
} |
|||
} |
|||
@ -0,0 +1,44 @@ |
|||
package java01; |
|||
|
|||
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.List;
|||
|
|||
public class DataStorage { |
|||
|
|||
public void writeJobsToCSV(List<Job> jobs, String fileName) { |
|||
try (FileWriter writer = new FileWriter(fileName)) { |
|||
// 写入表头
|
|||
writer.write("职位标题,公司名称,薪资,地点,经验,学历,技能要求\n"); |
|||
|
|||
for (Job job : jobs) { |
|||
// 写入数据行,处理逗号和引号
|
|||
writer.write(escapeCsvField(job.getTitle()) + ","); |
|||
writer.write(escapeCsvField(job.getCompany()) + ","); |
|||
writer.write(escapeCsvField(job.getSalary()) + ","); |
|||
writer.write(escapeCsvField(job.getLocation()) + ","); |
|||
writer.write(escapeCsvField(job.getExperience()) + ","); |
|||
writer.write(escapeCsvField(job.getEducation()) + ","); |
|||
writer.write(escapeCsvField(job.getSkills()) + "\n"); |
|||
} |
|||
|
|||
System.out.println("数据已成功写入CSV文件: " + fileName); |
|||
} catch (IOException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
|
|||
private String escapeCsvField(String field) { |
|||
if (field == null) { |
|||
return ""; |
|||
} |
|||
// 如果字段包含逗号、引号或换行符,需要用引号包围
|
|||
if (field.contains(",") || field.contains("\"") || field.contains("\n")) { |
|||
// 转义字段中的引号
|
|||
field = field.replace("\"", "\"\""); |
|||
// 用引号包围字段
|
|||
return "\"" + field + "\""; |
|||
} |
|||
return field; |
|||
} |
|||
} |
|||
@ -0,0 +1,90 @@ |
|||
package java01; |
|||
|
|||
/**
 * Mutable value holder for one job posting. Every attribute is stored as the
 * raw string supplied by the caller; no validation or parsing happens here.
 */
public class Job {

    /** Template used by {@link #toString()}; one placeholder per field, in declaration order. */
    private static final String TO_STRING_FORMAT =
            "Job{title='%s', company='%s', salary='%s', location='%s', "
                    + "experience='%s', education='%s', skills='%s'}";

    private String title;
    private String company;
    private String salary;
    private String location;
    private String experience;
    private String education;
    private String skills;

    /**
     * Creates a job posting from its seven text attributes. Values are stored
     * as given, including {@code null}.
     */
    public Job(String title, String company, String salary, String location, String experience, String education, String skills) {
        this.title = title;
        this.company = company;
        this.salary = salary;
        this.location = location;
        this.experience = experience;
        this.education = education;
        this.skills = skills;
    }

    /** @return the job title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the new job title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the company name */
    public String getCompany() {
        return this.company;
    }

    /** @param company the new company name */
    public void setCompany(String company) {
        this.company = company;
    }

    /** @return the salary text */
    public String getSalary() {
        return this.salary;
    }

    /** @param salary the new salary text */
    public void setSalary(String salary) {
        this.salary = salary;
    }

    /** @return the location */
    public String getLocation() {
        return this.location;
    }

    /** @param location the new location */
    public void setLocation(String location) {
        this.location = location;
    }

    /** @return the experience text */
    public String getExperience() {
        return this.experience;
    }

    /** @param experience the new experience text */
    public void setExperience(String experience) {
        this.experience = experience;
    }

    /** @return the education text */
    public String getEducation() {
        return this.education;
    }

    /** @param education the new education text */
    public void setEducation(String education) {
        this.education = education;
    }

    /** @return the skills text */
    public String getSkills() {
        return this.skills;
    }

    /** @param skills the new skills text */
    public void setSkills(String skills) {
        this.skills = skills;
    }

    /** Returns all seven fields in the form {@code Job{title='...', ..., skills='...'}}. */
    @Override
    public String toString() {
        return String.format(
                TO_STRING_FORMAT,
                title, company, salary, location, experience, education, skills);
    }
}
|||
@ -0,0 +1,86 @@ |
|||
package java01; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.net.URLEncoder; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class JobCrawler { |
|||
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"; |
|||
|
|||
public List<Job> crawlJobs(String keyword, int pageCount) { |
|||
List<Job> jobs = new ArrayList<>(); |
|||
|
|||
try { |
|||
for (int page = 1; page <= pageCount; page++) { |
|||
String encodedKeyword = URLEncoder.encode(keyword, "UTF-8"); |
|||
String url = String.format("https://sou.zhaopin.com/?jl=489&kw=%s&p=%d", encodedKeyword, page); |
|||
System.out.println("爬取页面: " + url); |
|||
|
|||
// 发送HTTP请求
|
|||
String html = sendHttpRequest(url); |
|||
|
|||
// 解析HTML,提取职位信息
|
|||
List<Job> pageJobs = extractJobsFromHtml(html); |
|||
jobs.addAll(pageJobs); |
|||
|
|||
// 添加延迟,避免被反爬虫
|
|||
Thread.sleep(2000); |
|||
} |
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
} |
|||
|
|||
return jobs; |
|||
} |
|||
|
|||
private String sendHttpRequest(String urlString) throws Exception { |
|||
URL url = new URL(urlString); |
|||
HttpURLConnection conn = (HttpURLConnection) url.openConnection(); |
|||
conn.setRequestMethod("GET"); |
|||
conn.setRequestProperty("User-Agent", USER_AGENT); |
|||
conn.setConnectTimeout(10000); |
|||
conn.setReadTimeout(10000); |
|||
|
|||
BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8")); |
|||
StringBuilder response = new StringBuilder(); |
|||
String inputLine; |
|||
|
|||
while ((inputLine = in.readLine()) != null) { |
|||
response.append(inputLine); |
|||
} |
|||
in.close(); |
|||
conn.disconnect(); |
|||
|
|||
return response.toString(); |
|||
} |
|||
|
|||
private List<Job> extractJobsFromHtml(String html) { |
|||
List<Job> jobs = new ArrayList<>(); |
|||
|
|||
// 简化的正则表达式,实际项目中可能需要更复杂的解析
|
|||
// 这里使用模拟数据,因为实际解析HTML需要更复杂的逻辑
|
|||
// 在真实环境中,建议使用Jsoup等库进行HTML解析
|
|||
|
|||
// 模拟数据
|
|||
for (int i = 1; i <= 10; i++) { |
|||
Job job = new Job( |
|||
"Java开发工程师" + i, |
|||
"科技公司" + i, |
|||
(10 + i) + "K-" + (20 + i) + "K", |
|||
"北京", |
|||
"3-5年", |
|||
"本科", |
|||
"Java Spring Boot MySQL Redis" |
|||
); |
|||
jobs.add(job); |
|||
} |
|||
|
|||
return jobs; |
|||
} |
|||
} |
|||
|
Loading…
Reference in new issue