diff --git a/.gitignore b/.gitignore index 3c66885..deec77a 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ target/ !**/src/test/**/target/ .kotlin +logs/* *.output.json ### IntelliJ IDEA ### diff --git a/pom.xml b/pom.xml index d4b92f0..4f2c756 100644 --- a/pom.xml +++ b/pom.xml @@ -24,5 +24,15 @@ 2.14.0 compile + + org.slf4j + slf4j-api + 2.0.16 + + + ch.qos.logback + logback-classic + 1.5.25 + \ No newline at end of file diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java index 844cf4a..2bbc11a 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/IthomeCrawlStrategy.java @@ -5,6 +5,8 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.net.URL; import java.util.ArrayList; @@ -15,6 +17,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; public class IthomeCrawlStrategy implements CrawlStrategy { + private static final Logger log = LoggerFactory.getLogger(IthomeCrawlStrategy.class); private final Pattern idRegex = Pattern.compile("(\\d+)/(\\d+)/(\\d+)\\.htm"); @Override @@ -47,7 +50,8 @@ public class IthomeCrawlStrategy implements CrawlStrategy { URL articleUrl = new URL(href); Document articleDoc = Jsoup.parse(articleUrl, 5000); articles.add(parseSingle(articleUrl, articleDoc)); - } catch (Exception ignored) { + } catch (Exception e) { + log.warn("Failed to fetch article: {}", href, e); } } return articles; diff --git a/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java b/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java index f166af5..0ce0f93 100644 --- a/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java +++ b/src/main/java/internal/hw/crawler/strategies/crawl/PeopleCnCrawlStrategy.java @@ -5,6 +5,8 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.net.URL; import java.util.ArrayList; @@ -16,6 +18,7 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; public class PeopleCnCrawlStrategy implements CrawlStrategy { + private static final Logger log = LoggerFactory.getLogger(PeopleCnCrawlStrategy.class); private final List supportedDomains = List.of("people.cn", "people.com.cn"); /* 示例 URL:http://env.people.com.cn/n1/2026/0530/c1010-40730688.html */ private final Pattern idRegex = Pattern.compile("(\\d+)/(\\d+)/c(\\d+)-(\\d+).html"); @@ -57,7 +60,8 @@ public class PeopleCnCrawlStrategy implements CrawlStrategy { URL articleUrl = new URL(href); Document articleDoc = Jsoup.parse(articleUrl, 5000); articles.add(parseSingle(articleUrl, articleDoc)); - } catch (Exception ignored) { + } catch (Exception e) { + log.warn("Failed to fetch article: {}", href, e); } } return articles; diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml new file mode 100644 index 0000000..12920e1 --- /dev/null +++ b/src/main/resources/logback.xml @@ -0,0 +1,20 @@ + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + logs/default.log + true + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + +