diff --git a/project/DataAnalyzer.java b/project/DataAnalyzer.java
new file mode 100644
index 0000000..9323e29
--- /dev/null
+++ b/project/DataAnalyzer.java
@@ -0,0 +1,145 @@
+package java01;
+
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+/**
+ * Computes aggregate statistics (skill frequency, salary breakdowns) over job postings.
+ */
+public class DataAnalyzer {
+    private final DataCleaner cleaner;
+
+    public DataAnalyzer() {
+        this.cleaner = new DataCleaner();
+    }
+
+    /**
+     * Counts how often each whitespace-separated skill token occurs across all jobs.
+     *
+     * @param jobs postings to scan; postings whose skills are null or empty are skipped
+     * @return skill to occurrence count, iterating from most to least frequent
+     */
+    public Map<String, Integer> analyzeSkillFrequency(List<Job> jobs) {
+        Map<String, Integer> skillMap = new HashMap<>();
+
+        for (Job job : jobs) {
+            String skills = job.getSkills();
+            if (skills == null || skills.isEmpty()) {
+                continue;
+            }
+            // Split on whitespace runs so tabs or repeated spaces do not glue tokens together.
+            for (String skill : skills.split("\\s+")) {
+                if (!skill.isEmpty()) {
+                    skillMap.merge(skill, 1, Integer::sum);
+                }
+            }
+        }
+
+        // Explicit type witness: a chained reversed() gives the compiler no target type to infer from.
+        // LinkedHashMap keeps the descending-frequency order produced by the sort.
+        return skillMap.entrySet().stream()
+                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
+                .collect(Collectors.toMap(
+                        Map.Entry::getKey,
+                        Map.Entry::getValue,
+                        (e1, e2) -> e1,
+                        LinkedHashMap::new
+                ));
+    }
+
+    /**
+     * Averages the mid-point salary of the jobs sharing each experience label.
+     *
+     * @param jobs postings to aggregate
+     * @return experience label to mean mid-point salary
+     */
+    public Map<String, Double> analyzeSalaryByExperience(List<Job> jobs) {
+        return averageSalaryBy(jobs, Job::getExperience);
+    }
+
+    /**
+     * Averages the mid-point salary of the jobs sharing each education label.
+     *
+     * @param jobs postings to aggregate
+     * @return education label to mean mid-point salary
+     */
+    public Map<String, Double> analyzeSalaryByEducation(List<Job> jobs) {
+        return averageSalaryBy(jobs, Job::getEducation);
+    }
+
+    /**
+     * Averages the mid-point salary of the jobs sharing each location.
+     *
+     * @param jobs postings to aggregate
+     * @return location to mean mid-point salary
+     */
+    public Map<String, Double> analyzeSalaryByLocation(List<Job> jobs) {
+        return averageSalaryBy(jobs, Job::getLocation);
+    }
+
+    /**
+     * Counts jobs per salary bucket, keyed by the bucket's display label.
+     *
+     * @param jobs postings to aggregate; postings without a positive mid-point salary are skipped
+     * @return bucket label to number of postings in that bucket
+     */
+    public Map<String, Integer> analyzeSalaryDistribution(List<Job> jobs) {
+        Map<String, Integer> salaryDistribution = new HashMap<>();
+
+        for (Job job : jobs) {
+            int avgSalary = averageSalary(job);
+            if (avgSalary <= 0) {
+                // The extractors return 0 when nothing matches; counting it would inflate "5K以下".
+                continue;
+            }
+            salaryDistribution.merge(salaryBucket(avgSalary), 1, Integer::sum);
+        }
+
+        return salaryDistribution;
+    }
+
+    /** Groups jobs by {@code keyOf} and averages the mid-point salary within each group. */
+    private Map<String, Double> averageSalaryBy(List<Job> jobs, Function<Job, String> keyOf) {
+        Map<String, List<Integer>> salariesByKey = new HashMap<>();
+
+        for (Job job : jobs) {
+            String key = keyOf.apply(job);
+            int avgSalary = averageSalary(job);
+            // Null keys are skipped like empty ones instead of throwing.
+            if (key != null && !key.isEmpty() && avgSalary > 0) {
+                salariesByKey.computeIfAbsent(key, k -> new ArrayList<>()).add(avgSalary);
+            }
+        }
+
+        Map<String, Double> result = new HashMap<>();
+        for (Map.Entry<String, List<Integer>> entry : salariesByKey.entrySet()) {
+            double mean = entry.getValue().stream().mapToInt(Integer::intValue).average().orElse(0);
+            result.put(entry.getKey(), mean);
+        }
+        return result;
+    }
+
+    /** Returns the mid-point of the job's parsed salary range, or 0 when nothing could be parsed. */
+    private int averageSalary(Job job) {
+        int minSalary = cleaner.extractMinSalary(job.getSalary());
+        int maxSalary = cleaner.extractMaxSalary(job.getSalary());
+        return (minSalary + maxSalary) / 2;
+    }
+
+    /** Maps a positive mid-point salary to its display bucket. */
+    private static String salaryBucket(int avgSalary) {
+        if (avgSalary < 5000) {
+            return "5K以下";
+        } else if (avgSalary < 10000) {
+            return "5K-10K";
+        } else if (avgSalary < 15000) {
+            return "10K-15K";
+        } else if (avgSalary < 20000) {
+            return "15K-20K";
+        } else if (avgSalary < 30000) {
+            return "20K-30K";
+        }
+        return "30K以上";
+    }
+}
\ No newline at end of file
diff --git a/project/DataCleaner.java b/project/DataCleaner.java
new file mode 100644
index 0000000..01bd411
--- /dev/null
+++ b/project/DataCleaner.java
@@ -0,0 +1,111 @@
+package java01;
+
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Normalizes raw job fields and extracts numbers from salary and experience text.
+ */
+public class DataCleaner {
+    // Compiled once and shared; the extractors run for every job in every analysis pass.
+    private static final Pattern SALARY_RANGE = Pattern.compile("(\\d+)K-(\\d+)K");
+    private static final Pattern SALARY_SINGLE = Pattern.compile("(\\d+)K");
+    private static final Pattern EXPERIENCE_RANGE = Pattern.compile("(\\d+)-(\\d+)年");
+    private static final Pattern EXPERIENCE_SINGLE = Pattern.compile("(\\d+)年");
+
+    /**
+     * Trims every text field of every job in place.
+     *
+     * @param jobs postings to clean
+     * @return the same list instance, for chaining
+     */
+    public List<Job> cleanJobs(List<Job> jobs) {
+        for (Job job : jobs) {
+            cleanJob(job);
+        }
+        return jobs;
+    }
+
+    /** Trims surrounding whitespace from each field; null fields stay null. */
+    private void cleanJob(Job job) {
+        job.setTitle(trimOrNull(job.getTitle()));
+        job.setCompany(trimOrNull(job.getCompany()));
+        job.setSalary(trimOrNull(job.getSalary()));
+        job.setLocation(trimOrNull(job.getLocation()));
+        job.setExperience(trimOrNull(job.getExperience()));
+        job.setEducation(trimOrNull(job.getEducation()));
+        job.setSkills(trimOrNull(job.getSkills()));
+    }
+
+    private static String trimOrNull(String value) {
+        return value == null ? null : value.trim();
+    }
+
+    /**
+     * Extracts the lower bound of a salary such as "10K-20K", or the single value of "15K".
+     *
+     * @param salary raw salary text, may be null
+     * @return the amount (thousands multiplied out), or 0 when nothing matches
+     */
+    public int extractMinSalary(String salary) {
+        return extractSalary(salary, 1);
+    }
+
+    /**
+     * Extracts the upper bound of a salary such as "10K-20K", or the single value of "15K".
+     *
+     * @param salary raw salary text, may be null
+     * @return the amount (thousands multiplied out), or 0 when nothing matches
+     */
+    public int extractMaxSalary(String salary) {
+        return extractSalary(salary, 2);
+    }
+
+    /**
+     * Extracts the first year count from text such as "3-5年" (yields 3) or "3年以上" (yields 3).
+     *
+     * @param experience raw experience text, may be null
+     * @return the year count, or 0 when nothing matches
+     */
+    public int extractExperienceYears(String experience) {
+        if (experience == null || experience.isEmpty()) {
+            return 0;
+        }
+        Matcher range = EXPERIENCE_RANGE.matcher(experience);
+        if (range.find()) {
+            return toInt(range.group(1), 1);
+        }
+        Matcher single = EXPERIENCE_SINGLE.matcher(experience);
+        if (single.find()) {
+            return toInt(single.group(1), 1);
+        }
+        return 0;
+    }
+
+    /** Shared salary parser; {@code rangeGroup} selects the lower (1) or upper (2) bound. */
+    private static int extractSalary(String salary, int rangeGroup) {
+        if (salary == null || salary.isEmpty()) {
+            return 0;
+        }
+        Matcher range = SALARY_RANGE.matcher(salary);
+        if (range.find()) {
+            return toInt(range.group(rangeGroup), 1000);
+        }
+        Matcher single = SALARY_SINGLE.matcher(salary);
+        if (single.find()) {
+            return toInt(single.group(1), 1000);
+        }
+        return 0;
+    }
+
+    /** Parses {@code digits} times {@code factor}; returns 0 instead of throwing on overflow. */
+    private static int toInt(String digits, int factor) {
+        try {
+            return Math.toIntExact(Math.multiplyExact(Long.parseLong(digits), (long) factor));
+        } catch (NumberFormatException | ArithmeticException e) {
+            // Absurdly long digit runs in scraped text are treated as "no value".
+            return 0;
+        }
+    }
+}
\ No newline at end of file
diff --git a/project/DataStorage.java b/project/DataStorage.java
new file mode 100644
index 0000000..21b6a19
--- /dev/null
+++ b/project/DataStorage.java
@@ -0,0 +1,64 @@
+package java01;
+
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+/**
+ * Persists job postings as a CSV file.
+ */
+public class DataStorage {
+
+    /**
+     * Writes a header row plus one row per job, overwriting any existing file.
+     *
+     * <p>The file is always encoded as UTF-8; a platform-default charset does not reliably preserve
+     * the Chinese header and field values across systems.
+     *
+     * @param jobs postings to write
+     * @param fileName destination path
+     */
+    public void writeJobsToCSV(List<Job> jobs, String fileName) {
+        try (BufferedWriter writer = new BufferedWriter(
+                new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8))) {
+            writer.write("职位标题,公司名称,薪资,地点,经验,学历,技能要求\n");
+
+            for (Job job : jobs) {
+                writer.write(String.join(",",
+                        escapeCsvField(job.getTitle()),
+                        escapeCsvField(job.getCompany()),
+                        escapeCsvField(job.getSalary()),
+                        escapeCsvField(job.getLocation()),
+                        escapeCsvField(job.getExperience()),
+                        escapeCsvField(job.getEducation()),
+                        escapeCsvField(job.getSkills())));
+                writer.write("\n");
+            }
+
+            // Flush before reporting success so buffered write errors surface first.
+            writer.flush();
+            System.out.println("数据已成功写入CSV文件: " + fileName);
+        } catch (IOException e) {
+            // Best effort: report the failure and return normally.
+            e.printStackTrace();
+        }
+    }
+
+    /** Quotes a field when it contains a comma, quote or line break; null becomes empty. */
+    private String escapeCsvField(String field) {
+        if (field == null) {
+            return "";
+        }
+        boolean needsQuoting = field.contains(",") || field.contains("\"")
+                || field.contains("\n") || field.contains("\r");
+        if (!needsQuoting) {
+            return field;
+        }
+        // Embedded quotes are doubled, then the whole field is wrapped in quotes.
+        return "\"" + field.replace("\"", "\"\"") + "\"";
+    }
+}
\ No newline at end of file
diff --git a/project/Job.java b/project/Job.java
new file mode 100644
index 0000000..5e4d5ce
--- /dev/null
+++ b/project/Job.java
@@ -0,0 +1,93 @@
+package java01;
+
+/**
+ * A single job posting; every field is free text and is stored without validation.
+ */
+public class Job {
+    private String title;
+    private String company;
+    private String salary;
+    private String location;
+    private String experience;
+    private String education;
+    private String skills;
+
+    public Job(String title, String company, String salary, String location, String experience, String education, String skills) {
+        this.title = title;
+        this.company = company;
+        this.salary = salary;
+        this.location = location;
+        this.experience = experience;
+        this.education = education;
+        this.skills = skills;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public void setTitle(String title) {
+        this.title = title;
+    }
+
+    public String getCompany() {
+        return company;
+    }
+
+    public void setCompany(String company) {
+        this.company = company;
+    }
+
+    public String getSalary() {
+        return salary;
+    }
+
+    public void setSalary(String salary) {
+        this.salary = salary;
+    }
+
+    public String getLocation() {
+        return location;
+    }
+
+    public void setLocation(String location) {
+        this.location = location;
+    }
+
+    public String getExperience() {
+        return experience;
+    }
+
+    public void setExperience(String experience) {
+        this.experience = experience;
+    }
+
+    public String getEducation() {
+        return education;
+    }
+
+    public void setEducation(String education) {
+        this.education = education;
+    }
+
+    public String getSkills() {
+        return skills;
+    }
+
+    public void setSkills(String skills) {
+        this.skills = skills;
+    }
+
+    @Override
+    public String toString() {
+        return "Job{" +
+                "title='" + title + '\'' +
+                ", company='" + company + '\'' +
+                ", salary='" + salary + '\'' +
+                ", location='" + location + '\'' +
+                ", experience='" + experience + '\'' +
+                ", education='" + education + '\'' +
+                ", skills='" + skills + '\'' +
+                '}';
+    }
+}
\ No newline at end of file
diff --git a/project/JobCrawler.java b/project/JobCrawler.java
new file mode 100644
index 0000000..f6e615b
--- /dev/null
+++ b/project/JobCrawler.java
@@ -0,0 +1,106 @@
+package java01;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Fetches job-search result pages and turns them into {@link Job} objects.
+ */
+public class JobCrawler {
+    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
+    private static final long PAGE_DELAY_MILLIS = 2000;
+
+    /**
+     * Crawls result pages 1..pageCount for the keyword.
+     *
+     * <p>Best effort: on the first failure the crawl stops and the jobs collected so far are
+     * returned; the error is printed rather than thrown.
+     *
+     * @param keyword search keyword; URL-encoded before use
+     * @param pageCount number of pages to request
+     * @return jobs gathered from all pages fetched successfully
+     */
+    public List<Job> crawlJobs(String keyword, int pageCount) {
+        List<Job> jobs = new ArrayList<>();
+
+        try {
+            String encodedKeyword = URLEncoder.encode(keyword, "UTF-8");
+            for (int page = 1; page <= pageCount; page++) {
+                String url = String.format("https://sou.zhaopin.com/?jl=489&kw=%s&p=%d", encodedKeyword, page);
+                System.out.println("爬取页面: " + url);
+
+                String html = sendHttpRequest(url);
+                jobs.addAll(extractJobsFromHtml(html));
+
+                // Throttle between requests only; there is nothing to wait for after the last page.
+                if (page < pageCount) {
+                    Thread.sleep(PAGE_DELAY_MILLIS);
+                }
+            }
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag so the caller can observe the cancellation.
+            Thread.currentThread().interrupt();
+            e.printStackTrace();
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+
+        return jobs;
+    }
+
+    /** Performs a GET request and returns the body decoded as UTF-8 with line breaks removed. */
+    private String sendHttpRequest(String urlString) throws IOException {
+        HttpURLConnection conn = (HttpURLConnection) new URL(urlString).openConnection();
+        try {
+            conn.setRequestMethod("GET");
+            conn.setRequestProperty("User-Agent", USER_AGENT);
+            conn.setConnectTimeout(10000);
+            conn.setReadTimeout(10000);
+
+            StringBuilder response = new StringBuilder();
+            // try-with-resources closes the stream even when a read fails part-way.
+            try (BufferedReader in = new BufferedReader(
+                    new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
+                String inputLine;
+                while ((inputLine = in.readLine()) != null) {
+                    response.append(inputLine);
+                }
+            }
+            return response.toString();
+        } finally {
+            conn.disconnect();
+        }
+    }
+
+    /**
+     * Placeholder parser: ignores {@code html} and returns ten fixed sample jobs.
+     *
+     * <p>Real extraction would need an HTML parser such as Jsoup.
+     */
+    private List<Job> extractJobsFromHtml(String html) {
+        List<Job> jobs = new ArrayList<>();
+
+        for (int i = 1; i <= 10; i++) {
+            jobs.add(new Job(
+                    "Java开发工程师" + i,
+                    "科技公司" + i,
+                    (10 + i) + "K-" + (20 + i) + "K",
+                    "北京",
+                    "3-5年",
+                    "本科",
+                    "Java Spring Boot MySQL Redis"
+            ));
+        }
+
+        return jobs;
+    }
+}
\ No newline at end of file
diff --git a/project/jobs.csv b/project/jobs.csv
new file mode 100644
index 0000000..5baba60
--- /dev/null
+++ b/project/jobs.csv
@@ -0,0 +1,31 @@
+职位标题,公司名称,薪资,地点,经验,学历,技能要求
+Java开发工程师1,科技公司1,11K-21K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师2,科技公司2,12K-22K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师3,科技公司3,13K-23K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师4,科技公司4,14K-24K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师5,科技公司5,15K-25K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师6,科技公司6,16K-26K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师7,科技公司7,17K-27K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师8,科技公司8,18K-28K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师9,科技公司9,19K-29K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师10,科技公司10,20K-30K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师1,科技公司1,11K-21K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师2,科技公司2,12K-22K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师3,科技公司3,13K-23K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师4,科技公司4,14K-24K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师5,科技公司5,15K-25K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师6,科技公司6,16K-26K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师7,科技公司7,17K-27K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师8,科技公司8,18K-28K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师9,科技公司9,19K-29K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师10,科技公司10,20K-30K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师1,科技公司1,11K-21K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师2,科技公司2,12K-22K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师3,科技公司3,13K-23K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师4,科技公司4,14K-24K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师5,科技公司5,15K-25K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师6,科技公司6,16K-26K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师7,科技公司7,17K-27K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师8,科技公司8,18K-28K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师9,科技公司9,19K-29K,北京,3-5年,本科,Java Spring Boot MySQL Redis
+Java开发工程师10,科技公司10,20K-30K,北京,3-5年,本科,Java Spring Boot MySQL Redis