Browse Source

微博热搜爬虫

main
Bilei 2 months ago
parent
commit
30ed53f6ec
  1. 3
      w3/.vscode/settings.json
  2. 76
      w3/pom.xml
  3. 133
      w3/src/main/java/WeiboStarHotSearcha.java
  4. 30
      w3/src/main/java/com/weibo/hotsearch/ConsoleOutputHandler.java
  5. 85
      w3/src/main/java/com/weibo/hotsearch/HotSearchApp.java
  6. 66
      w3/src/main/java/com/weibo/hotsearch/HotSearchDataSource.java
  7. 24
      w3/src/main/java/com/weibo/hotsearch/HotSearchFilter.java
  8. 46
      w3/src/main/java/com/weibo/hotsearch/OutputHandler.java
  9. 18
      w3/src/main/java/com/weibo/hotsearch/StarFilter.java
  10. 18
      w3/src/main/java/com/weibo/hotsearch/TiebaFilter.java
  11. 49
      w3/src/main/java/com/weibo/hotsearch/WeiboDataSource.java
  12. 4
      w3/src/run.bat
  13. BIN
      w3/target/classes/WeiboStarHotSearcha.class
  14. BIN
      w3/target/classes/com/weibo/hotsearch/ConsoleOutputHandler.class
  15. BIN
      w3/target/classes/com/weibo/hotsearch/HotSearchApp.class
  16. BIN
      w3/target/classes/com/weibo/hotsearch/HotSearchDataSource.class
  17. BIN
      w3/target/classes/com/weibo/hotsearch/HotSearchFilter.class
  18. BIN
      w3/target/classes/com/weibo/hotsearch/HotSearchProcessor.class
  19. BIN
      w3/target/classes/com/weibo/hotsearch/OutputHandler.class
  20. BIN
      w3/target/classes/com/weibo/hotsearch/StarFilter.class
  21. BIN
      w3/target/classes/com/weibo/hotsearch/TiebaFilter.class
  22. BIN
      w3/target/classes/com/weibo/hotsearch/WeiboDataSource.class
  23. 3
      w3/target/maven-archiver/pom.properties
  24. 2
      w3/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst
  25. 2
      w3/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst
  26. BIN
      w3/target/weibo-hotsearch-1.0-SNAPSHOT-jar-with-dependencies.jar
  27. BIN
      w3/target/weibo-hotsearch-1.0-SNAPSHOT.jar

3
w3/.vscode/settings.json

@ -0,0 +1,3 @@
{
"git.ignoreLimitWarning": true
}

76
w3/pom.xml

@ -0,0 +1,76 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Maven build descriptor for the weibo-hotsearch artifact.
Compiles for Java 11 with UTF-8 sources, declares the HTML/HTTP/JSON libraries,
and assembles an additional jar that bundles all dependencies.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.example</groupId>
<artifactId>weibo-hotsearch</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<!-- Source and bytecode level are both Java 11. -->
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- jsoup dependency -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<!-- Apache HttpClient 5 -->
<dependency>
<groupId>org.apache.httpcomponents.client5</groupId>
<artifactId>httpclient5</artifactId>
<version>5.3.1</version>
</dependency>
<!-- Apache HttpClient 5 Fluent -->
<dependency>
<groupId>org.apache.httpcomponents.client5</groupId>
<artifactId>httpclient5-fluent</artifactId>
<version>5.3.1</version>
</dependency>
<!-- FastJSON2 -->
<dependency>
<groupId>com.alibaba.fastjson2</groupId>
<artifactId>fastjson2</artifactId>
<version>2.0.52</version>
</dependency>
</dependencies>
<build>
<plugins>
<!--
Runs assembly:single in the package phase with the jar-with-dependencies
descriptor; the manifest's main class is set to WeiboStarHotSearcha.
-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.6.0</version>
<configuration>
<archive>
<manifest>
<mainClass>WeiboStarHotSearcha</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

133
w3/src/main/java/WeiboStarHotSearcha.java

@ -0,0 +1,133 @@
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import org.apache.hc.client5.http.fluent.Request;
import org.apache.hc.core5.util.Timeout;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Standalone command-line program that downloads the Weibo hot-search list and
 * prints the entries whose title contains one of the celebrity-related keywords.
 */
public class WeiboStarHotSearcha {
    /** Endpoint queried for the hot-search JSON document. */
    private static final String WEIBO_HOT_URL = "https://weibo.com/ajax/side/hotSearch";

    /** Substrings that mark a hot-search title as celebrity related. */
    private static final String[] STAR_KEYWORDS = {
        "明星", "演员", "歌手", "爱豆", "艺人", "红毯", "综艺", "新剧",
        "恋情", "官宣", "演唱会", "代言", "造型", "封面"
    };

    /** Connect timeout in milliseconds. */
    private static final int CONNECT_TIMEOUT = 10000;

    /** Response timeout in milliseconds. */
    private static final int RESPONSE_TIMEOUT = 10000;

    /** Maximum number of request attempts. */
    private static final int MAX_RETRIES = 3;

    /** Pause between two attempts, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 2000L;

    /**
     * Fetches the hot-search document, walks {@code data.realtime} and prints every
     * entry whose {@code word} matches a keyword, followed by the match count.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            System.out.println("正在请求微博热搜数据...");
            String json = fetchWithRetry(WEIBO_HOT_URL, MAX_RETRIES);
            if (json == null || json.isEmpty()) {
                System.out.println("获取热搜数据失败");
                return;
            }

            JSONObject root = JSONObject.parseObject(json);
            if (root == null || !root.containsKey("data")) {
                System.out.println("数据格式错误或接口返回异常");
                return;
            }
            JSONObject data = root.getJSONObject("data");
            if (data == null || !data.containsKey("realtime")) {
                System.out.println("热搜数据为空");
                return;
            }
            JSONArray realtime = data.getJSONArray("realtime");
            if (realtime == null || realtime.isEmpty()) {
                System.out.println("热搜列表为空");
                return;
            }

            List<JSONObject> starHotList = new ArrayList<>();
            System.out.println("\n===== 明星相关热搜 =====");
            for (int i = 0; i < realtime.size(); i++) {
                JSONObject item = realtime.getJSONObject(i);
                if (item == null) {
                    continue;
                }
                String word = item.getString("word");
                if (word == null || word.isEmpty()) {
                    continue;
                }
                // Missing numeric fields fall back to 0 instead of failing the whole run.
                long num = item.getLongValue("num", 0);
                int rank = item.getIntValue("rank", 0);
                if (isStarRelated(word)) {
                    starHotList.add(item);
                    System.out.printf("排名:%d\t热度:%d\t热搜:%s%n", rank, num, word);
                }
            }

            System.out.println("\n===== 明星相关热搜总数:" + starHotList.size() + " 条 =====");
            if (starHotList.isEmpty()) {
                System.out.println("当前热搜暂无明星相关内容");
            }
        } catch (IOException e) {
            System.err.println("网络请求失败: " + e.getMessage());
            e.printStackTrace();
        } catch (Exception e) {
            System.err.println("数据解析失败: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Issues a GET request and returns the response body as a string, retrying on
     * {@link IOException} with a fixed pause between attempts.
     *
     * @param url        address to request
     * @param maxRetries maximum number of attempts
     * @return the response body
     * @throws IOException the last failure once every attempt has failed, a generic
     *                     failure when no attempt was made, or an interruption of
     *                     the pause between attempts
     */
    private static String fetchWithRetry(String url, int maxRetries) throws IOException {
        int retryCount = 0;
        IOException lastException = null;
        while (retryCount < maxRetries) {
            try {
                // No explicit Accept-Encoding header: HttpClient negotiates content
                // encoding itself and only offers encodings it is able to decode.
                // Advertising "br" by hand invites a Brotli response, and the visible
                // pom declares no Brotli decoder, so the body could not be read as JSON.
                return Request.get(url)
                    .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
                    .addHeader("Referer", "https://weibo.com/")
                    .addHeader("Accept", "application/json, text/plain, */*")
                    .addHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
                    .addHeader("Connection", "keep-alive")
                    .connectTimeout(Timeout.ofMilliseconds(CONNECT_TIMEOUT))
                    .responseTimeout(Timeout.ofMilliseconds(RESPONSE_TIMEOUT))
                    .execute()
                    .returnContent()
                    .asString();
            } catch (IOException e) {
                lastException = e;
                retryCount++;
                if (retryCount < maxRetries) {
                    System.out.println("请求失败,正在重试 (" + retryCount + "/" + maxRetries + ")...");
                    try {
                        Thread.sleep(RETRY_DELAY_MILLIS);
                    } catch (InterruptedException ie) {
                        // Restore the interrupt flag before reporting the aborted retry.
                        Thread.currentThread().interrupt();
                        throw new IOException("重试被中断", ie);
                    }
                }
            }
        }
        throw lastException != null ? lastException : new IOException("请求失败");
    }

    /**
     * Tells whether a hot-search title contains any celebrity keyword.
     *
     * @param word hot-search title, may be {@code null}
     * @return {@code true} when at least one keyword is a substring of {@code word}
     */
    private static boolean isStarRelated(String word) {
        if (word == null || word.isEmpty()) {
            return false;
        }
        for (String keyword : STAR_KEYWORDS) {
            if (word.contains(keyword)) {
                return true;
            }
        }
        return false;
    }
}

30
w3/src/main/java/com/weibo/hotsearch/ConsoleOutputHandler.java

@ -0,0 +1,30 @@
package com.weibo.hotsearch;
import com.alibaba.fastjson2.JSONObject;
import java.util.List;
/**
 * {@link OutputHandler} that writes the filtered hot-search entries to standard output.
 */
public class ConsoleOutputHandler extends OutputHandler {

    /**
     * Prints a heading, then one formatted line per entry and a closing count line;
     * prints a placeholder message instead when there is nothing to show.
     *
     * @param hotList    entries to print, may be {@code null} or empty
     * @param filterName text shown in the heading
     */
    @Override
    public void output(List<JSONObject> hotList, String filterName) {
        System.out.println("\n===== " + filterName + " =====");

        boolean nothingToShow = hotList == null || hotList.isEmpty();
        if (nothingToShow) {
            System.out.println("当前暂无符合条件的热搜内容");
            return;
        }

        int position = 0;
        for (JSONObject entry : hotList) {
            String line = formatHotItem(entry, position, null);
            System.out.println(line);
            position++;
        }

        System.out.println("\n===== 热搜总数:" + hotList.size() + " 条 =====");
    }

    /**
     * @return the display name of this output channel
     */
    @Override
    public String getOutputType() {
        return "控制台输出";
    }
}

85
w3/src/main/java/com/weibo/hotsearch/HotSearchApp.java

@ -0,0 +1,85 @@
package com.weibo.hotsearch;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Entry point that wires a Weibo data source, the celebrity filter and console
 * output into a {@code HotSearchProcessor} and runs it once.
 */
public class HotSearchApp {

    /**
     * Prints a banner, runs one processing pass and prints a closing banner.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("========== 热搜监控系统 ==========\n");

        // Arguments are evaluated left to right: source, filter, then output.
        new HotSearchProcessor(
            new WeiboDataSource(),
            new StarFilter(),
            new ConsoleOutputHandler()
        ).run();

        System.out.println("\n========== 监控结束 ==========");
    }
}
/**
 * Runs one fetch / filter / output pass over a hot-search data source.
 */
class HotSearchProcessor {
    /** JSON keys tried, in order, when looking for an entry's title. */
    private static final String[] TITLE_KEYS = {"word", "topic_name", "title"};

    private HotSearchDataSource dataSource;
    private HotSearchFilter filter;
    private OutputHandler output;

    /**
     * @param dataSource provider of the raw hot-search entries
     * @param filter     decides which titles are kept
     * @param output     receives the kept entries
     */
    public HotSearchProcessor(HotSearchDataSource dataSource, HotSearchFilter filter, OutputHandler output) {
        this.dataSource = dataSource;
        this.filter = filter;
        this.output = output;
    }

    /**
     * Fetches the entries, keeps the ones the filter accepts and hands them to the
     * output handler. Failures are reported on standard error and not rethrown.
     */
    public void run() {
        try {
            List<JSONObject> matched = filterHotSearch(dataSource.fetchData());
            String heading = dataSource.getName() + " - " + filter.getFilterName();
            output.output(matched, heading);
        } catch (IOException e) {
            System.err.println("[" + dataSource.getName() + "] 处理失败: " + e.getMessage());
        } catch (Exception e) {
            System.err.println("[" + dataSource.getName() + "] 数据解析失败: " + e.getMessage());
        }
    }

    /**
     * Collects the entries whose title is non-empty and accepted by the filter.
     *
     * @param data raw entries, may be {@code null}
     * @return the accepted entries in their original order, never {@code null}
     */
    private List<JSONObject> filterHotSearch(JSONArray data) {
        List<JSONObject> kept = new ArrayList<>();
        int total = (data == null) ? 0 : data.size();
        for (int idx = 0; idx < total; idx++) {
            JSONObject entry = data.getJSONObject(idx);
            // A null entry yields a null title and is skipped by the same check.
            String title = getHotSearchWord(entry);
            boolean hasTitle = title != null && !title.isEmpty();
            if (hasTitle && filter.matches(title)) {
                kept.add(entry);
            }
        }
        return kept;
    }

    /**
     * Returns the first non-empty value among the title keys; when none is
     * non-empty, the value stored under the last key is returned as is.
     *
     * @param item entry to inspect, may be {@code null}
     * @return the title, possibly {@code null} or empty
     */
    private String getHotSearchWord(JSONObject item) {
        if (item == null) {
            return null;
        }
        String candidate = null;
        for (String key : TITLE_KEYS) {
            candidate = item.getString(key);
            if (candidate != null && !candidate.isEmpty()) {
                return candidate;
            }
        }
        return candidate;
    }
}

66
w3/src/main/java/com/weibo/hotsearch/HotSearchDataSource.java

@ -0,0 +1,66 @@
package com.weibo.hotsearch;
import com.alibaba.fastjson2.JSONArray;
import org.apache.hc.client5.http.fluent.Request;
import org.apache.hc.core5.util.Timeout;
import java.io.IOException;
/**
 * Base class for hot-search providers: holds the display name, the URL and the
 * request settings, and offers a retrying HTTP GET for subclasses to build on.
 */
public abstract class HotSearchDataSource {
    /** Pause between two attempts, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 2000L;

    /** Display name used in log messages. */
    protected String name;
    /** Address requested by {@link #fetchWithRetry()}. */
    protected String url;
    /** Connect timeout in milliseconds. */
    protected int connectTimeout = 10000;
    /** Response timeout in milliseconds. */
    protected int responseTimeout = 10000;
    /** Maximum number of request attempts. */
    protected int maxRetries = 3;

    /**
     * @param name display name of the source
     * @param url  address to request
     */
    public HotSearchDataSource(String name, String url) {
        this.name = name;
        this.url = url;
    }

    /**
     * Retrieves the hot-search entries of this source.
     *
     * @return the entries as a JSON array
     * @throws IOException when the data cannot be obtained
     */
    public abstract JSONArray fetchData() throws IOException;

    /**
     * Issues a GET request to {@link #url} and returns the response body as a
     * string, retrying on {@link IOException} with a fixed pause between attempts.
     *
     * @return the response body
     * @throws IOException the last failure once every attempt has failed, a generic
     *                     failure when no attempt was made, or an interruption of
     *                     the pause between attempts
     */
    protected String fetchWithRetry() throws IOException {
        int retryCount = 0;
        IOException lastException = null;
        while (retryCount < maxRetries) {
            try {
                // No explicit Accept-Encoding header: HttpClient negotiates content
                // encoding itself and only offers encodings it is able to decode.
                // Advertising "br" by hand invites a Brotli response, and the visible
                // pom declares no Brotli decoder, so the body could not be read as JSON.
                return Request.get(url)
                    .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
                    .addHeader("Referer", getReferer())
                    .addHeader("Accept", "application/json, text/plain, */*")
                    .addHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
                    .addHeader("Connection", "keep-alive")
                    .connectTimeout(Timeout.ofMilliseconds(connectTimeout))
                    .responseTimeout(Timeout.ofMilliseconds(responseTimeout))
                    .execute()
                    .returnContent()
                    .asString();
            } catch (IOException e) {
                lastException = e;
                retryCount++;
                if (retryCount < maxRetries) {
                    System.out.println("[" + name + "] 请求失败,正在重试 (" + retryCount + "/" + maxRetries + ")...");
                    try {
                        Thread.sleep(RETRY_DELAY_MILLIS);
                    } catch (InterruptedException ie) {
                        // Restore the interrupt flag before reporting the aborted retry.
                        Thread.currentThread().interrupt();
                        throw new IOException("重试被中断", ie);
                    }
                }
            }
        }
        throw lastException != null ? lastException : new IOException("请求失败");
    }

    /**
     * Value sent in the {@code Referer} request header; subclasses may override it.
     *
     * @return the referer URL
     */
    protected String getReferer() {
        return "https://www.baidu.com/";
    }

    /**
     * @return the display name of this source
     */
    public String getName() {
        return name;
    }
}

24
w3/src/main/java/com/weibo/hotsearch/HotSearchFilter.java

@ -0,0 +1,24 @@
package com.weibo.hotsearch;
/**
 * Base class for keyword filters: a title matches when it contains at least one
 * of the configured keywords as a substring.
 */
public abstract class HotSearchFilter {
    /** Keywords searched for in a title; a private copy of the constructor argument. */
    protected String[] keywords;

    /**
     * @param keywords substrings to look for; {@code null} is treated as "no keywords".
     *                 The array is copied so later changes by the caller (for example
     *                 to a shared constant array) do not alter this filter.
     */
    public HotSearchFilter(String[] keywords) {
        this.keywords = (keywords == null) ? new String[0] : keywords.clone();
    }

    /**
     * Tells whether the given title contains any configured keyword.
     *
     * @param word title to test, may be {@code null}
     * @return {@code true} when a keyword is a substring of {@code word};
     *         {@code false} for a {@code null} or empty title or when no keyword matches
     */
    public boolean matches(String word) {
        // The field is protected and mutable, so guard against a subclass nulling it.
        if (word == null || word.isEmpty() || keywords == null) {
            return false;
        }
        for (String keyword : keywords) {
            // A null element cannot match anything; skip it rather than fail in contains().
            if (keyword != null && word.contains(keyword)) {
                return true;
            }
        }
        return false;
    }

    /**
     * @return the display name of this filter
     */
    public abstract String getFilterName();
}

46
w3/src/main/java/com/weibo/hotsearch/OutputHandler.java

@ -0,0 +1,46 @@
package com.weibo.hotsearch;
import com.alibaba.fastjson2.JSONObject;
import java.util.List;
/**
 * Base class for output channels; also provides shared helpers that turn a
 * hot-search entry into a display line.
 */
public abstract class OutputHandler {
    /** JSON keys tried, in order, when looking for an entry's title. */
    private static final String[] TITLE_KEYS = {"word", "topic_name", "title"};

    /**
     * Emits the given entries.
     *
     * @param hotList    entries to emit
     * @param filterName label describing the selection
     */
    public abstract void output(List<JSONObject> hotList, String filterName);

    /**
     * @return the display name of this output channel
     */
    public abstract String getOutputType();

    /**
     * Builds the display line for one entry. A positive {@code rank} field is shown
     * as the rank; otherwise the one-based list position is shown instead.
     *
     * @param item           entry to format
     * @param index          zero-based position of the entry in the list
     * @param dataSourceName not used by this implementation
     * @return the formatted line
     */
    protected String formatHotItem(JSONObject item, int index, String dataSourceName) {
        String title = getHotSearchWord(item);
        long heat = getHotSearchNum(item);
        int rank = getHotSearchRank(item);
        if (rank <= 0) {
            return String.format("序号:%d\t热度:%d\t热搜:%s", index + 1, heat, title);
        }
        return String.format("排名:%d\t热度:%d\t热搜:%s", rank, heat, title);
    }

    /**
     * Returns the first non-empty value among the title keys. When none is non-empty
     * the value of the last key is used, with {@code null} replaced by a placeholder.
     *
     * @param item entry to inspect, may be {@code null}
     * @return the title, or the placeholder when the entry or the value is {@code null}
     */
    protected String getHotSearchWord(JSONObject item) {
        if (item == null) {
            return "未知";
        }
        String candidate = null;
        for (String key : TITLE_KEYS) {
            candidate = item.getString(key);
            if (candidate != null && !candidate.isEmpty()) {
                return candidate;
            }
        }
        if (candidate == null) {
            return "未知";
        }
        return candidate;
    }

    /**
     * @param item entry to inspect, may be {@code null}
     * @return the {@code num} field, or 0 when the entry is {@code null} or the field is absent
     */
    protected long getHotSearchNum(JSONObject item) {
        return (item == null) ? 0 : item.getLongValue("num", 0);
    }

    /**
     * @param item entry to inspect, may be {@code null}
     * @return the {@code rank} field, or 0 when the entry is {@code null} or the field is absent
     */
    protected int getHotSearchRank(JSONObject item) {
        return (item == null) ? 0 : item.getIntValue("rank", 0);
    }
}

18
w3/src/main/java/com/weibo/hotsearch/StarFilter.java

@ -0,0 +1,18 @@
package com.weibo.hotsearch;
/**
 * Keyword filter that selects celebrity-related hot-search titles.
 */
public class StarFilter extends HotSearchFilter {
    /** Substrings that mark a title as celebrity related. */
    private static final String[] KEYWORD_LIST = {
        "明星", "演员", "歌手", "爱豆", "艺人", "红毯", "综艺", "新剧",
        "恋情", "官宣", "演唱会", "代言", "造型", "封面"
    };

    /** Creates the filter with the celebrity keyword list. */
    public StarFilter() {
        super(KEYWORD_LIST);
    }

    /**
     * @return the display name of this filter
     */
    @Override
    public String getFilterName() {
        return "明星相关热搜";
    }
}

18
w3/src/main/java/com/weibo/hotsearch/TiebaFilter.java

@ -0,0 +1,18 @@
package com.weibo.hotsearch;
/**
 * Keyword filter that selects Tieba (forum) related hot-search titles.
 */
public class TiebaFilter extends HotSearchFilter {
    /** Substrings that mark a title as forum related. */
    private static final String[] KEYWORD_LIST = {
        "贴吧", "帖子", "楼", "吧友", "吧主", "爆照", "打卡", "求助",
        "分享", "吐槽", "围观", "精华", "置顶", "话题"
    };

    /** Creates the filter with the forum keyword list. */
    public TiebaFilter() {
        super(KEYWORD_LIST);
    }

    /**
     * @return the display name of this filter
     */
    @Override
    public String getFilterName() {
        return "贴吧相关热搜";
    }
}

49
w3/src/main/java/com/weibo/hotsearch/WeiboDataSource.java

@ -0,0 +1,49 @@
package com.weibo.hotsearch;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import java.io.IOException;
/**
 * Hot-search data source backed by the Weibo side-bar hot-search endpoint.
 */
public class WeiboDataSource extends HotSearchDataSource {
    private static final String WEIBO_HOT_URL = "https://weibo.com/ajax/side/hotSearch";

    /** Creates the source with its display name and endpoint URL. */
    public WeiboDataSource() {
        super("微博", WEIBO_HOT_URL);
    }

    /**
     * Downloads the document and returns the array stored at {@code data.realtime}.
     *
     * @return the non-empty array of hot-search entries
     * @throws IOException when the request fails, or when the body, the {@code data}
     *                     object or the {@code realtime} array is missing or empty
     */
    @Override
    public JSONArray fetchData() throws IOException {
        System.out.println("正在获取微博热搜数据...");

        String body = fetchWithRetry();
        boolean hasBody = body != null && !body.isEmpty();
        if (!hasBody) {
            throw new IOException("获取微博热搜数据失败");
        }

        JSONObject envelope = JSONObject.parseObject(body);
        boolean hasData = envelope != null && envelope.containsKey("data");
        if (!hasData) {
            throw new IOException("微博数据格式错误或接口返回异常");
        }

        JSONObject payload = envelope.getJSONObject("data");
        boolean hasRealtime = payload != null && payload.containsKey("realtime");
        if (!hasRealtime) {
            throw new IOException("微博热搜数据为空");
        }

        JSONArray entries = payload.getJSONArray("realtime");
        boolean hasEntries = entries != null && !entries.isEmpty();
        if (!hasEntries) {
            throw new IOException("微博热搜列表为空");
        }

        System.out.println("成功获取微博热搜数据,共 " + entries.size() + " 条");
        return entries;
    }

    /**
     * @return the Weibo site URL, sent as the {@code Referer} header
     */
    @Override
    protected String getReferer() {
        return "https://weibo.com/";
    }
}

4
w3/src/run.bat

@ -0,0 +1,4 @@
@echo off
setlocal
rem Launches the Weibo hot-search crawler.
rem
rem Runs the jar-with-dependencies produced by "mvn package": it bundles every
rem library and its manifest names the main class, so no machine-specific
rem CLASSPATH and no class name have to be spelled out here.
rem %~dp0 is the directory of this script, so the jar is located relative to the
rem script and the launcher works from any current directory.
set "APP_JAR=%~dp0..\target\weibo-hotsearch-1.0-SNAPSHOT-jar-with-dependencies.jar"
if not exist "%APP_JAR%" goto :missing
java -jar "%APP_JAR%"
pause
exit /b %ERRORLEVEL%

:missing
echo Jar not found: "%APP_JAR%"
echo Run "mvn package" in the project directory first.
pause
exit /b 1

BIN
w3/target/classes/WeiboStarHotSearcha.class

Binary file not shown.

BIN
w3/target/classes/com/weibo/hotsearch/ConsoleOutputHandler.class

Binary file not shown.

BIN
w3/target/classes/com/weibo/hotsearch/HotSearchApp.class

Binary file not shown.

BIN
w3/target/classes/com/weibo/hotsearch/HotSearchDataSource.class

Binary file not shown.

BIN
w3/target/classes/com/weibo/hotsearch/HotSearchFilter.class

Binary file not shown.

BIN
w3/target/classes/com/weibo/hotsearch/HotSearchProcessor.class

Binary file not shown.

BIN
w3/target/classes/com/weibo/hotsearch/OutputHandler.class

Binary file not shown.

BIN
w3/target/classes/com/weibo/hotsearch/StarFilter.class

Binary file not shown.

BIN
w3/target/classes/com/weibo/hotsearch/TiebaFilter.class

Binary file not shown.

BIN
w3/target/classes/com/weibo/hotsearch/WeiboDataSource.class

Binary file not shown.

3
w3/target/maven-archiver/pom.properties

@ -0,0 +1,3 @@
artifactId=weibo-hotsearch
groupId=com.example
version=1.0-SNAPSHOT

2
w3/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst

@ -0,0 +1,2 @@
WeiboStarHotSearcha.class
WeiboHotSearch.class

2
w3/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst

@ -0,0 +1,2 @@
C:\Users\ruiruirui\java\w3\src\main\java\WeiboHotSearch.java
C:\Users\ruiruirui\java\w3\src\main\java\WeiboStarHotSearcha.java

BIN
w3/target/weibo-hotsearch-1.0-SNAPSHOT-jar-with-dependencies.jar

Binary file not shown.

BIN
w3/target/weibo-hotsearch-1.0-SNAPSHOT.jar

Binary file not shown.
Loading…
Cancel
Save