package java01; import java.io.BufferedReader; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class JobCrawler { private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"; public List crawlJobs(String keyword, int pageCount) { List jobs = new ArrayList<>(); try { for (int page = 1; page <= pageCount; page++) { String encodedKeyword = URLEncoder.encode(keyword, "UTF-8"); String url = String.format("https://sou.zhaopin.com/?jl=489&kw=%s&p=%d", encodedKeyword, page); System.out.println("爬取页面: " + url); // 发送HTTP请求 String html = sendHttpRequest(url); // 解析HTML,提取职位信息 List pageJobs = extractJobsFromHtml(html); jobs.addAll(pageJobs); // 添加延迟,避免被反爬虫 Thread.sleep(2000); } } catch (Exception e) { e.printStackTrace(); } return jobs; } private String sendHttpRequest(String urlString) throws Exception { URL url = new URL(urlString); HttpURLConnection conn = (HttpURLConnection) url.openConnection(); conn.setRequestMethod("GET"); conn.setRequestProperty("User-Agent", USER_AGENT); conn.setConnectTimeout(10000); conn.setReadTimeout(10000); BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8")); StringBuilder response = new StringBuilder(); String inputLine; while ((inputLine = in.readLine()) != null) { response.append(inputLine); } in.close(); conn.disconnect(); return response.toString(); } private List extractJobsFromHtml(String html) { List jobs = new ArrayList<>(); // 简化的正则表达式,实际项目中可能需要更复杂的解析 // 这里使用模拟数据,因为实际解析HTML需要更复杂的逻辑 // 在真实环境中,建议使用Jsoup等库进行HTML解析 // 模拟数据 for (int i = 1; i <= 10; i++) { Job job = new Job( "Java开发工程师" + i, "科技公司" + i, (10 + i) + "K-" + (20 + i) + "K", "北京", "3-5年", "本科", "Java Spring Boot MySQL Redis" ); jobs.add(job); } return jobs; } }