You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
86 lines
2.9 KiB
86 lines
2.9 KiB
package java01;
|
|
|
|
import java.io.BufferedReader;
|
|
import java.io.InputStreamReader;
|
|
import java.net.HttpURLConnection;
|
|
import java.net.URL;
|
|
import java.net.URLEncoder;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
public class JobCrawler {
|
|
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
|
|
|
|
public List<Job> crawlJobs(String keyword, int pageCount) {
|
|
List<Job> jobs = new ArrayList<>();
|
|
|
|
try {
|
|
for (int page = 1; page <= pageCount; page++) {
|
|
String encodedKeyword = URLEncoder.encode(keyword, "UTF-8");
|
|
String url = String.format("https://sou.zhaopin.com/?jl=489&kw=%s&p=%d", encodedKeyword, page);
|
|
System.out.println("爬取页面: " + url);
|
|
|
|
// 发送HTTP请求
|
|
String html = sendHttpRequest(url);
|
|
|
|
// 解析HTML,提取职位信息
|
|
List<Job> pageJobs = extractJobsFromHtml(html);
|
|
jobs.addAll(pageJobs);
|
|
|
|
// 添加延迟,避免被反爬虫
|
|
Thread.sleep(2000);
|
|
}
|
|
} catch (Exception e) {
|
|
e.printStackTrace();
|
|
}
|
|
|
|
return jobs;
|
|
}
|
|
|
|
private String sendHttpRequest(String urlString) throws Exception {
|
|
URL url = new URL(urlString);
|
|
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
|
conn.setRequestMethod("GET");
|
|
conn.setRequestProperty("User-Agent", USER_AGENT);
|
|
conn.setConnectTimeout(10000);
|
|
conn.setReadTimeout(10000);
|
|
|
|
BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
|
|
StringBuilder response = new StringBuilder();
|
|
String inputLine;
|
|
|
|
while ((inputLine = in.readLine()) != null) {
|
|
response.append(inputLine);
|
|
}
|
|
in.close();
|
|
conn.disconnect();
|
|
|
|
return response.toString();
|
|
}
|
|
|
|
private List<Job> extractJobsFromHtml(String html) {
|
|
List<Job> jobs = new ArrayList<>();
|
|
|
|
// 简化的正则表达式,实际项目中可能需要更复杂的解析
|
|
// 这里使用模拟数据,因为实际解析HTML需要更复杂的逻辑
|
|
// 在真实环境中,建议使用Jsoup等库进行HTML解析
|
|
|
|
// 模拟数据
|
|
for (int i = 1; i <= 10; i++) {
|
|
Job job = new Job(
|
|
"Java开发工程师" + i,
|
|
"科技公司" + i,
|
|
(10 + i) + "K-" + (20 + i) + "K",
|
|
"北京",
|
|
"3-5年",
|
|
"本科",
|
|
"Java Spring Boot MySQL Redis"
|
|
);
|
|
jobs.add(job);
|
|
}
|
|
|
|
return jobs;
|
|
}
|
|
}
|