1 changed files with 0 additions and 86 deletions
@ -1,86 +0,0 @@ |
|||||
package java01; |
|
||||
|
|
||||
import java.io.BufferedReader; |
|
||||
import java.io.InputStreamReader; |
|
||||
import java.net.HttpURLConnection; |
|
||||
import java.net.URL; |
|
||||
import java.net.URLEncoder; |
|
||||
import java.util.ArrayList; |
|
||||
import java.util.List; |
|
||||
import java.util.regex.Matcher; |
|
||||
import java.util.regex.Pattern; |
|
||||
|
|
||||
public class JobCrawler { |
|
||||
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"; |
|
||||
|
|
||||
public List<Job> crawlJobs(String keyword, int pageCount) { |
|
||||
List<Job> jobs = new ArrayList<>(); |
|
||||
|
|
||||
try { |
|
||||
for (int page = 1; page <= pageCount; page++) { |
|
||||
String encodedKeyword = URLEncoder.encode(keyword, "UTF-8"); |
|
||||
String url = String.format("https://sou.zhaopin.com/?jl=489&kw=%s&p=%d", encodedKeyword, page); |
|
||||
System.out.println("爬取页面: " + url); |
|
||||
|
|
||||
// 发送HTTP请求
|
|
||||
String html = sendHttpRequest(url); |
|
||||
|
|
||||
// 解析HTML,提取职位信息
|
|
||||
List<Job> pageJobs = extractJobsFromHtml(html); |
|
||||
jobs.addAll(pageJobs); |
|
||||
|
|
||||
// 添加延迟,避免被反爬虫
|
|
||||
Thread.sleep(2000); |
|
||||
} |
|
||||
} catch (Exception e) { |
|
||||
e.printStackTrace(); |
|
||||
} |
|
||||
|
|
||||
return jobs; |
|
||||
} |
|
||||
|
|
||||
private String sendHttpRequest(String urlString) throws Exception { |
|
||||
URL url = new URL(urlString); |
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection(); |
|
||||
conn.setRequestMethod("GET"); |
|
||||
conn.setRequestProperty("User-Agent", USER_AGENT); |
|
||||
conn.setConnectTimeout(10000); |
|
||||
conn.setReadTimeout(10000); |
|
||||
|
|
||||
BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8")); |
|
||||
StringBuilder response = new StringBuilder(); |
|
||||
String inputLine; |
|
||||
|
|
||||
while ((inputLine = in.readLine()) != null) { |
|
||||
response.append(inputLine); |
|
||||
} |
|
||||
in.close(); |
|
||||
conn.disconnect(); |
|
||||
|
|
||||
return response.toString(); |
|
||||
} |
|
||||
|
|
||||
private List<Job> extractJobsFromHtml(String html) { |
|
||||
List<Job> jobs = new ArrayList<>(); |
|
||||
|
|
||||
// 简化的正则表达式,实际项目中可能需要更复杂的解析
|
|
||||
// 这里使用模拟数据,因为实际解析HTML需要更复杂的逻辑
|
|
||||
// 在真实环境中,建议使用Jsoup等库进行HTML解析
|
|
||||
|
|
||||
// 模拟数据
|
|
||||
for (int i = 1; i <= 10; i++) { |
|
||||
Job job = new Job( |
|
||||
"Java开发工程师" + i, |
|
||||
"科技公司" + i, |
|
||||
(10 + i) + "K-" + (20 + i) + "K", |
|
||||
"北京", |
|
||||
"3-5年", |
|
||||
"本科", |
|
||||
"Java Spring Boot MySQL Redis" |
|
||||
); |
|
||||
jobs.add(job); |
|
||||
} |
|
||||
|
|
||||
return jobs; |
|
||||
} |
|
||||
} |
|
||||
Loading…
Reference in new issue