Browse Source

commit

main
unknown 3 weeks ago
parent
commit
1380c2d15e
  1. 3
      project/crawl/.gitignore
  2. BIN
      project/crawl/202506050209 林晞宸 期末实验报告.docx
  3. 55
      project/crawl/README.md
  4. 67
      project/crawl/data/sample-news.json
  5. 58
      project/crawl/pom.xml
  6. 12
      project/crawl/src/main/java/edu/homework/crawler/Main.java
  7. 74
      project/crawl/src/main/java/edu/homework/crawler/cli/CliApplication.java
  8. 11
      project/crawl/src/main/java/edu/homework/crawler/command/Command.java
  9. 44
      project/crawl/src/main/java/edu/homework/crawler/command/CommandContext.java
  10. 82
      project/crawl/src/main/java/edu/homework/crawler/command/CommandRegistry.java
  11. 55
      project/crawl/src/main/java/edu/homework/crawler/command/CrawlCommand.java
  12. 21
      project/crawl/src/main/java/edu/homework/crawler/command/ExitCommand.java
  13. 28
      project/crawl/src/main/java/edu/homework/crawler/command/HelpCommand.java
  14. 22
      project/crawl/src/main/java/edu/homework/crawler/command/SitesCommand.java
  15. 30
      project/crawl/src/main/java/edu/homework/crawler/controller/CrawlerController.java
  16. 7
      project/crawl/src/main/java/edu/homework/crawler/exception/CommandException.java
  17. 11
      project/crawl/src/main/java/edu/homework/crawler/exception/CrawlException.java
  18. 7
      project/crawl/src/main/java/edu/homework/crawler/exception/NetworkException.java
  19. 11
      project/crawl/src/main/java/edu/homework/crawler/exception/ParseException.java
  20. 7
      project/crawl/src/main/java/edu/homework/crawler/exception/SiteNotFoundException.java
  21. 7
      project/crawl/src/main/java/edu/homework/crawler/exception/StorageException.java
  22. 8
      project/crawl/src/main/java/edu/homework/crawler/model/CrawlRequest.java
  23. 11
      project/crawl/src/main/java/edu/homework/crawler/model/CrawlSummary.java
  24. 4
      project/crawl/src/main/java/edu/homework/crawler/model/NewsCandidate.java
  25. 107
      project/crawl/src/main/java/edu/homework/crawler/model/NewsItem.java
  26. 67
      project/crawl/src/main/java/edu/homework/crawler/repository/FileNewsRepository.java
  27. 17
      project/crawl/src/main/java/edu/homework/crawler/repository/OutputFormat.java
  28. 37
      project/crawl/src/main/java/edu/homework/crawler/service/NewsCrawlerService.java
  29. 177
      project/crawl/src/main/java/edu/homework/crawler/strategy/AbstractVisualSiteBuilderStrategy.java
  30. 17
      project/crawl/src/main/java/edu/homework/crawler/strategy/CrawlStrategy.java
  31. 25
      project/crawl/src/main/java/edu/homework/crawler/strategy/CsuNewsStrategy.java
  32. 25
      project/crawl/src/main/java/edu/homework/crawler/strategy/HnuNewsStrategy.java
  33. 30
      project/crawl/src/main/java/edu/homework/crawler/strategy/HunnuNewsStrategy.java
  34. 36
      project/crawl/src/main/java/edu/homework/crawler/strategy/SiteRegistry.java
  35. 39
      project/crawl/src/main/java/edu/homework/crawler/util/HttpFetcher.java
  36. 98
      project/crawl/src/main/java/edu/homework/crawler/util/TextExtractors.java
  37. 37
      project/crawl/src/main/java/edu/homework/crawler/view/ConsoleView.java
  38. 38
      project/crawl/src/test/java/edu/homework/crawler/command/CommandRegistryTest.java
  39. 29
      project/crawl/src/test/java/edu/homework/crawler/repository/FileNewsRepositoryTest.java
  40. 29
      project/crawl/src/test/java/edu/homework/crawler/util/TextExtractorsTest.java

3
project/crawl/.gitignore

@ -0,0 +1,3 @@
target/
*.log
*.tmp

BIN
project/crawl/202506050209 林晞宸 期末实验报告.docx

Binary file not shown.

55
project/crawl/README.md

@ -0,0 +1,55 @@
# University News Crawler
A Java homework project that crawls news from three university websites:
- `https://news.hnu.edu.cn/`
- `https://news.csu.edu.cn/`
- `https://news.hunnu.edu.cn/`
The code demonstrates the required architecture:
- Interactive command-line interface (CLI)
- MVC: `model`, `view`, `controller`
- Command pattern: `command` package
- Strategy pattern: `strategy` package, one strategy per target website
- Custom exception hierarchy: `exception` package
- File persistence: JSON or CSV output
## Run
```powershell
mvn test
mvn exec:java -Dexec.args="crawl --site all --limit 5 --format json --out data/news.json"
```
Interactive CLI:
```powershell
mvn exec:java
```
Useful commands:
```text
help
sites
crawl --site all --limit 10 --format json --out data/news.json
crawl --site hnu --limit 5 --format csv --out data/hnu.csv
exit
```
## Output Fields
Each crawled news item includes:
- school
- site key
- title
- url
- publish time
- source
- author
- summary
- content preview
- crawled time

67
project/crawl/data/sample-news.json

File diff suppressed because one or more lines are too long

58
project/crawl/pom.xml

@ -0,0 +1,58 @@
<!-- Maven build descriptor for the University News Crawler homework project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates. -->
<groupId>edu.homework</groupId>
<artifactId>university-news-crawler</artifactId>
<version>1.0.0</version>
<name>University News Crawler</name>
<properties>
<!-- Compile for Java 17; the sources use records. -->
<maven.compiler.release>17</maven.compiler.release>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<!-- Shared version for the JUnit dependency declared below. -->
<junit.version>5.10.2</junit.version>
</properties>
<dependencies>
<!-- HTML parsing (org.jsoup.nodes.Document / Element) used by the crawl strategies. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
<!-- JSON output: ObjectMapper used by FileNewsRepository. -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.17.2</version>
</dependency>
<!-- Provides JavaTimeModule so LocalDateTime fields can be serialized. -->
<dependency>
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-jsr310</artifactId>
<version>2.17.2</version>
</dependency>
<!-- Unit testing; only on the test classpath. -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Test runner plugin, pinned to an explicit version. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.2.5</version>
</plugin>
<!-- Lets "mvn exec:java" start the CLI through the configured main class. -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>3.3.0</version>
<configuration>
<mainClass>edu.homework.crawler.Main</mainClass>
</configuration>
</plugin>
</plugins>
</build>
</project>

12
project/crawl/src/main/java/edu/homework/crawler/Main.java

@ -0,0 +1,12 @@
package edu.homework.crawler;
import edu.homework.crawler.cli.CliApplication;
/**
 * Program entry point of the university news crawler.
 */
public final class Main {

    /** Hidden constructor: this class only hosts {@link #main(String[])}. */
    private Main() {
    }

    /**
     * Builds the default CLI application and hands it the command-line arguments.
     *
     * @param args arguments received from the launcher
     */
    public static void main(String[] args) {
        CliApplication application = CliApplication.defaultApplication();
        application.run(args);
    }
}

74
project/crawl/src/main/java/edu/homework/crawler/cli/CliApplication.java

@ -0,0 +1,74 @@
package edu.homework.crawler.cli;
import edu.homework.crawler.command.CommandContext;
import edu.homework.crawler.command.CommandRegistry;
import edu.homework.crawler.command.CrawlCommand;
import edu.homework.crawler.command.ExitCommand;
import edu.homework.crawler.command.HelpCommand;
import edu.homework.crawler.command.SitesCommand;
import edu.homework.crawler.controller.CrawlerController;
import edu.homework.crawler.exception.CrawlException;
import edu.homework.crawler.repository.FileNewsRepository;
import edu.homework.crawler.service.NewsCrawlerService;
import edu.homework.crawler.strategy.SiteRegistry;
import edu.homework.crawler.view.ConsoleView;
import java.nio.charset.StandardCharsets;
import java.util.Scanner;
/**
 * Command-line front end: runs a single command given as program arguments, or an
 * interactive read-execute loop on standard input when no arguments are given.
 */
public class CliApplication {
    private final CommandContext context;
    private final CommandRegistry commandRegistry;

    /**
     * @param context         shared state passed to every executed command
     * @param commandRegistry registry used to tokenize and dispatch command lines
     */
    public CliApplication(CommandContext context, CommandRegistry commandRegistry) {
        this.context = context;
        this.commandRegistry = commandRegistry;
    }

    /**
     * Wires the default object graph (view, site registry, service, repository,
     * controller) and registers the help, sites, crawl and exit commands.
     *
     * @return a ready-to-run application
     */
    public static CliApplication defaultApplication() {
        ConsoleView view = new ConsoleView();
        SiteRegistry siteRegistry = SiteRegistry.defaults();
        NewsCrawlerService service = new NewsCrawlerService(siteRegistry);
        FileNewsRepository repository = new FileNewsRepository();
        CrawlerController controller = new CrawlerController(service, repository);
        CommandRegistry registry = new CommandRegistry();
        CommandContext context = new CommandContext(controller, view, registry, siteRegistry);
        registry.register(new HelpCommand());
        registry.register(new SitesCommand());
        registry.register(new CrawlCommand());
        registry.register(new ExitCommand());
        return new CliApplication(context, registry);
    }

    /**
     * Runs the application.
     *
     * <p>With arguments, they are executed as one command and the method returns.
     * Without arguments (or with {@code null}), commands are read line by line from
     * standard input until the context is stopped or the input ends.
     *
     * @param args program arguments; may be empty or {@code null}
     */
    public void run(String[] args) {
        if (args != null && args.length > 0) {
            executeLine(joinArguments(args));
            return;
        }
        context.view().printWelcome();
        try (Scanner scanner = new Scanner(System.in, StandardCharsets.UTF_8)) {
            while (context.isRunning()) {
                context.view().printPrompt();
                if (!scanner.hasNextLine()) {
                    break;
                }
                executeLine(scanner.nextLine());
            }
        }
    }

    /**
     * Rebuilds a command line from already-split program arguments.
     *
     * <p>The line is tokenized again by the registry, so an argument that contains
     * whitespace is wrapped in double quotes to keep it a single token. Arguments that
     * already contain a double quote are passed through untouched because the tokenizer
     * has no escape syntax for them.
     */
    private static String joinArguments(String[] args) {
        StringBuilder line = new StringBuilder();
        for (String arg : args) {
            if (line.length() > 0) {
                line.append(' ');
            }
            boolean needsQuotes = arg.indexOf('"') < 0 && arg.chars().anyMatch(Character::isWhitespace);
            if (needsQuotes) {
                line.append('"').append(arg).append('"');
            } else {
                line.append(arg);
            }
        }
        return line.toString();
    }

    /**
     * Executes one command line and reports failures through the view instead of
     * letting them terminate the loop.
     */
    private void executeLine(String line) {
        try {
            commandRegistry.execute(context, line);
        } catch (CrawlException e) {
            context.view().printError(e.getMessage());
            if (e.getCause() != null) {
                context.view().printError("Cause: " + e.getCause().getMessage());
            }
        } catch (RuntimeException e) {
            // Many runtime exceptions (e.g. NullPointerException) carry no message;
            // fall back to the exception type so the user never sees "null".
            String detail = e.getMessage() != null ? e.getMessage() : e.getClass().getSimpleName();
            context.view().printError("Unexpected error: " + detail);
        }
    }
}

11
project/crawl/src/main/java/edu/homework/crawler/command/Command.java

@ -0,0 +1,11 @@
package edu.homework.crawler.command;
import edu.homework.crawler.exception.CrawlException;
/**
 * A CLI command that can be registered in a {@link CommandRegistry} and run by name.
 */
public interface Command {
/** Returns the command word; the registry uses it as the lookup key. */
String name();
/** Returns the one-line description that the help command prints next to the name. */
String description();
/**
 * Runs the command.
 *
 * @param context shared collaborators and CLI state
 * @param args the tokens that followed the command word on the command line
 * @throws CrawlException when the command cannot complete
 */
void execute(CommandContext context, String[] args) throws CrawlException;
}

44
project/crawl/src/main/java/edu/homework/crawler/command/CommandContext.java

@ -0,0 +1,44 @@
package edu.homework.crawler.command;
import edu.homework.crawler.controller.CrawlerController;
import edu.homework.crawler.strategy.SiteRegistry;
import edu.homework.crawler.view.ConsoleView;
/**
 * Bundle of collaborators handed to every command, together with the flag that keeps
 * the interactive loop running.
 */
public class CommandContext {
    private final CrawlerController controller;
    private final ConsoleView view;
    private final CommandRegistry commandRegistry;
    private final SiteRegistry siteRegistry;
    private boolean running;

    /**
     * Creates a context that starts in the running state.
     *
     * @param controller      controller that performs crawl requests
     * @param view            console output sink
     * @param commandRegistry registry of the available commands
     * @param siteRegistry    registry of the supported sites
     */
    public CommandContext(CrawlerController controller, ConsoleView view, CommandRegistry commandRegistry, SiteRegistry siteRegistry) {
        this.running = true;
        this.siteRegistry = siteRegistry;
        this.commandRegistry = commandRegistry;
        this.view = view;
        this.controller = controller;
    }

    /** @return the controller that performs crawl requests */
    public CrawlerController controller() {
        return this.controller;
    }

    /** @return the console view */
    public ConsoleView view() {
        return this.view;
    }

    /** @return the command registry */
    public CommandRegistry commandRegistry() {
        return this.commandRegistry;
    }

    /** @return the site registry */
    public SiteRegistry siteRegistry() {
        return this.siteRegistry;
    }

    /** @return {@code true} until {@link #stop()} has been called */
    public boolean isRunning() {
        return this.running;
    }

    /** Clears the running flag. */
    public void stop() {
        running = false;
    }
}

82
project/crawl/src/main/java/edu/homework/crawler/command/CommandRegistry.java

@ -0,0 +1,82 @@
package edu.homework.crawler.command;
import edu.homework.crawler.exception.CommandException;
import edu.homework.crawler.exception.CrawlException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * Holds the available commands by name and turns raw command lines into command
 * invocations. Also provides the tokenizer and the {@code --key value} option parser
 * used by the commands.
 */
public class CommandRegistry {
    /** Registered commands keyed by name, kept in registration order. */
    private final Map<String, Command> commands = new LinkedHashMap<>();

    /**
     * Registers a command under its {@link Command#name()}; a later registration with
     * the same name replaces the earlier one.
     *
     * @param command the command to add
     */
    public void register(Command command) {
        commands.put(command.name(), command);
    }

    /**
     * @return a read-only view of the registered commands in registration order;
     *         callers cannot add or remove commands through it
     */
    public Collection<Command> commands() {
        return java.util.Collections.unmodifiableCollection(commands.values());
    }

    /**
     * Tokenizes a line and runs the command named by its first token.
     *
     * @param context context passed through to the command
     * @param line    raw command line; {@code null} or blank lines are ignored
     * @throws CommandException if the line has an unclosed quote or names an unknown command
     * @throws CrawlException   if the command itself fails
     */
    public void execute(CommandContext context, String line) throws CrawlException {
        List<String> tokens = tokenize(line);
        if (tokens.isEmpty()) {
            return;
        }
        String name = tokens.get(0);
        Command command = commands.get(name);
        if (command == null) {
            throw new CommandException("Unknown command: " + name + ". Type help to see commands.");
        }
        String[] args = tokens.subList(1, tokens.size()).toArray(String[]::new);
        command.execute(context, args);
    }

    /**
     * Splits a line on whitespace. Double quotes group text (including whitespace) into
     * one token and are themselves removed; there is no escape character.
     *
     * @param line the line to split; {@code null} yields an empty list
     * @return the tokens in order, never {@code null}
     * @throws CommandException if a double quote is left unclosed
     */
    public List<String> tokenize(String line) throws CommandException {
        List<String> tokens = new ArrayList<>();
        if (line == null) {
            return tokens;
        }
        StringBuilder current = new StringBuilder();
        boolean inQuotes = false;
        for (int i = 0; i < line.length(); i++) {
            char ch = line.charAt(i);
            if (ch == '"') {
                inQuotes = !inQuotes;
                continue;
            }
            if (Character.isWhitespace(ch) && !inQuotes) {
                addToken(tokens, current);
            } else {
                current.append(ch);
            }
        }
        if (inQuotes) {
            throw new CommandException("Missing closing quote in command line.");
        }
        addToken(tokens, current);
        return tokens;
    }

    /** Flushes the pending token, if any, into the list and resets the buffer. */
    private void addToken(List<String> tokens, StringBuilder current) {
        if (current.length() > 0) {
            tokens.add(current.toString());
            current.setLength(0);
        }
    }

    /**
     * Parses {@code --key value} pairs.
     *
     * @param args option tokens; {@code null} is treated as no options
     * @return option names (without the leading {@code --}) mapped to their values, in
     *         the order given; when a name repeats, the last value wins
     * @throws CommandException if a token is not an option, an option has no name, or
     *                          an option is not followed by a value
     */
    public Map<String, String> parseOptions(String[] args) throws CommandException {
        Map<String, String> options = new LinkedHashMap<>();
        if (args == null) {
            return options;
        }
        for (int i = 0; i < args.length; i++) {
            String key = args[i];
            if (!key.startsWith("--")) {
                throw new CommandException("Invalid argument: " + key + ". Options must start with --.");
            }
            if (key.length() == 2) {
                // A bare "--" would otherwise be stored under an empty option name.
                throw new CommandException("Invalid argument: --. Option name is missing.");
            }
            if (i + 1 >= args.length || args[i + 1].startsWith("--")) {
                throw new CommandException("Missing value for option " + key + ".");
            }
            options.put(key.substring(2), args[++i]);
        }
        return options;
    }
}

55
project/crawl/src/main/java/edu/homework/crawler/command/CrawlCommand.java

@ -0,0 +1,55 @@
package edu.homework.crawler.command;
import edu.homework.crawler.exception.CommandException;
import edu.homework.crawler.exception.CrawlException;
import edu.homework.crawler.model.CrawlRequest;
import edu.homework.crawler.repository.OutputFormat;
import java.nio.file.Path;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Map;
/**
 * The {@code crawl} command: parses {@code --site}, {@code --limit}, {@code --format}
 * and {@code --out}, runs the crawl through the controller and prints the summary.
 */
public class CrawlCommand implements Command {
    /** Timestamp pattern used in the default output file name. */
    private static final DateTimeFormatter FILE_TIME = DateTimeFormatter.ofPattern("yyyyMMdd-HHmmss");

    @Override
    public String name() {
        return "crawl";
    }

    @Override
    public String description() {
        return "Crawl news and save to a JSON or CSV file.";
    }

    /**
     * Defaults: site {@code all}, limit {@code 10}, format {@code json}, and an output
     * file named {@code data/news-<timestamp>} with the extension of the format.
     *
     * @throws CommandException if an option is unknown or malformed, the limit is not a
     *                          number between 1 and 100, the format is unsupported, or
     *                          the output path is not a valid path
     * @throws CrawlException   if crawling or saving fails
     */
    @Override
    public void execute(CommandContext context, String[] args) throws CrawlException {
        Map<String, String> options = context.commandRegistry().parseOptions(args);
        rejectUnknownOptions(options);
        String site = options.getOrDefault("site", "all");
        int limit = parseLimit(options.getOrDefault("limit", "10"));
        OutputFormat format = OutputFormat.from(options.getOrDefault("format", "json"));
        // Only build the timestamped default name when --out was not supplied.
        String rawOutput = options.containsKey("out") ? options.get("out") : defaultOutput(format);
        Path outputPath = parseOutputPath(rawOutput);
        // Locale.ROOT keeps the echoed format name stable regardless of the user's locale.
        context.view().printInfo("Starting crawl: site=" + site + ", limit=" + limit
                + ", format=" + format.name().toLowerCase(java.util.Locale.ROOT));
        CrawlRequest request = new CrawlRequest(site, limit, format, outputPath);
        context.view().printSummary(context.controller().crawl(request));
    }

    /** Fails on a mistyped option instead of silently falling back to its default. */
    private void rejectUnknownOptions(Map<String, String> options) throws CommandException {
        for (String option : options.keySet()) {
            switch (option) {
                case "site", "limit", "format", "out" -> {
                    // supported
                }
                default -> throw new CommandException("Unknown option: --" + option
                        + ". Supported options: --site, --limit, --format, --out.");
            }
        }
    }

    /** Converts the output option to a path, reporting invalid paths as a command error. */
    private Path parseOutputPath(String value) throws CommandException {
        try {
            return Path.of(value);
        } catch (java.nio.file.InvalidPathException e) {
            CommandException failure = new CommandException("Invalid output path: " + value);
            failure.initCause(e);
            throw failure;
        }
    }

    /** Parses the limit and enforces the 1..100 range. */
    private int parseLimit(String value) throws CommandException {
        try {
            int limit = Integer.parseInt(value);
            if (limit <= 0 || limit > 100) {
                throw new CommandException("Limit must be between 1 and 100.");
            }
            return limit;
        } catch (NumberFormatException e) {
            throw new CommandException("Limit must be a number: " + value);
        }
    }

    /** Builds the default output file name from the current local time and the format. */
    private String defaultOutput(OutputFormat format) {
        String suffix = format == OutputFormat.JSON ? ".json" : ".csv";
        return "data/news-" + LocalDateTime.now().format(FILE_TIME) + suffix;
    }
}

21
project/crawl/src/main/java/edu/homework/crawler/command/ExitCommand.java

@ -0,0 +1,21 @@
package edu.homework.crawler.command;
import edu.homework.crawler.exception.CrawlException;
/**
 * The {@code exit} command: stops the interactive loop and prints a farewell.
 */
public class ExitCommand implements Command {

    /**
     * Marks the context as stopped, then prints the farewell line. Arguments are ignored.
     */
    @Override
    public void execute(CommandContext context, String[] args) throws CrawlException {
        context.stop();
        context.view().printInfo("Bye.");
    }

    /** @return the command word {@code exit} */
    @Override
    public String name() {
        return "exit";
    }

    /** @return the help text for this command */
    @Override
    public String description() {
        return "Exit interactive CLI.";
    }
}

28
project/crawl/src/main/java/edu/homework/crawler/command/HelpCommand.java

@ -0,0 +1,28 @@
package edu.homework.crawler.command;
import edu.homework.crawler.exception.CrawlException;
/**
 * The {@code help} command: lists every registered command with its description,
 * followed by two example invocations.
 */
public class HelpCommand implements Command {

    /** @return the command word {@code help} */
    @Override
    public String name() {
        return "help";
    }

    /** @return the help text for this command */
    @Override
    public String description() {
        return "Show command usage.";
    }

    /**
     * Assembles the usage text and passes it to the view. Arguments are ignored.
     */
    @Override
    public void execute(CommandContext context, String[] args) throws CrawlException {
        StringBuilder usage = new StringBuilder("Commands:\n");
        context.commandRegistry().commands().forEach(command ->
                usage.append(" " + command.name() + " - " + command.description() + "\n"));
        usage.append("\nExamples:\n"
                + " crawl --site all --limit 10 --format json --out data/news.json\n"
                + " crawl --site hnu --limit 5 --format csv --out data/hnu.csv\n");
        context.view().printHelp(usage.toString());
    }
}

22
project/crawl/src/main/java/edu/homework/crawler/command/SitesCommand.java

@ -0,0 +1,22 @@
package edu.homework.crawler.command;
import edu.homework.crawler.exception.CrawlException;
/**
 * The {@code sites} command: prints one line per supported site with its key, school
 * name and base URL.
 */
public class SitesCommand implements Command {

    /** @return the command word {@code sites} */
    @Override
    public String name() {
        return "sites";
    }

    /** @return the help text for this command */
    @Override
    public String description() {
        return "List supported websites.";
    }

    /**
     * Prints a heading and then every strategy known to the site registry.
     * Arguments are ignored.
     */
    @Override
    public void execute(CommandContext context, String[] args) throws CrawlException {
        context.view().printInfo("Supported sites:");
        for (var strategy : context.siteRegistry().all()) {
            String line = " - " + strategy.key() + ": " + strategy.schoolName() + " (" + strategy.baseUrl() + ")";
            context.view().printInfo(line);
        }
    }
}

30
project/crawl/src/main/java/edu/homework/crawler/controller/CrawlerController.java

@ -0,0 +1,30 @@
package edu.homework.crawler.controller;
import edu.homework.crawler.exception.CrawlException;
import edu.homework.crawler.model.CrawlRequest;
import edu.homework.crawler.model.CrawlSummary;
import edu.homework.crawler.model.NewsItem;
import edu.homework.crawler.repository.FileNewsRepository;
import edu.homework.crawler.service.NewsCrawlerService;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Coordinates one crawl request: fetches the items through the service, persists them
 * through the repository and builds the summary shown to the user.
 */
public class CrawlerController {
    private final NewsCrawlerService crawlerService;
    private final FileNewsRepository newsRepository;

    /**
     * @param crawlerService service that performs the crawling
     * @param newsRepository repository that writes the result file
     */
    public CrawlerController(NewsCrawlerService crawlerService, FileNewsRepository newsRepository) {
        this.crawlerService = crawlerService;
        this.newsRepository = newsRepository;
    }

    /**
     * Crawls, saves, then summarizes.
     *
     * @param request site key, per-site limit, output format and output path
     * @return the crawled items, the item count per site key and the output path
     * @throws CrawlException if crawling or saving fails; no summary is produced then
     */
    public CrawlSummary crawl(CrawlRequest request) throws CrawlException {
        List<NewsItem> items = crawlerService.crawl(request.siteKey(), request.limitPerSite());
        newsRepository.save(items, request.outputFormat(), request.outputPath());
        // LinkedHashMap keeps the sites in the order their items were crawled, so the
        // summary is listed in the same order as the saved data.
        Map<String, Integer> counts = items.stream()
                .collect(Collectors.groupingBy(
                        NewsItem::getSiteKey,
                        java.util.LinkedHashMap::new,
                        Collectors.summingInt(item -> 1)));
        return new CrawlSummary(items, counts, request.outputPath());
    }
}

7
project/crawl/src/main/java/edu/homework/crawler/exception/CommandException.java

@ -0,0 +1,7 @@
package edu.homework.crawler.exception;
/**
 * Signals a problem with the command line itself, such as an unknown command or a
 * malformed option.
 */
public class CommandException extends CrawlException {

    /**
     * @param message description shown to the user
     */
    public CommandException(final String message) {
        super(message);
    }
}

11
project/crawl/src/main/java/edu/homework/crawler/exception/CrawlException.java

@ -0,0 +1,11 @@
package edu.homework.crawler.exception;
/**
 * Checked base type of the crawler's exception hierarchy.
 */
public class CrawlException extends Exception {

    /**
     * Creates an exception without an underlying cause.
     *
     * @param message description of the failure
     */
    public CrawlException(final String message) {
        super(message);
    }

    /**
     * Creates an exception that wraps the failure that triggered it.
     *
     * @param message description of the failure
     * @param cause   the underlying failure
     */
    public CrawlException(final String message, final Throwable cause) {
        super(message, cause);
    }
}

7
project/crawl/src/main/java/edu/homework/crawler/exception/NetworkException.java

@ -0,0 +1,7 @@
package edu.homework.crawler.exception;
/**
 * Crawl failure that wraps an underlying network-level cause.
 */
public class NetworkException extends CrawlException {

    /**
     * @param message description of the failure
     * @param cause   the underlying failure
     */
    public NetworkException(final String message, final Throwable cause) {
        super(message, cause);
    }
}

11
project/crawl/src/main/java/edu/homework/crawler/exception/ParseException.java

@ -0,0 +1,11 @@
package edu.homework.crawler.exception;
/**
 * Crawl failure raised when fetched content cannot be turned into news data.
 */
public class ParseException extends CrawlException {

    /**
     * @param message description of the failure
     */
    public ParseException(final String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   the underlying failure
     */
    public ParseException(final String message, final Throwable cause) {
        super(message, cause);
    }
}

7
project/crawl/src/main/java/edu/homework/crawler/exception/SiteNotFoundException.java

@ -0,0 +1,7 @@
package edu.homework.crawler.exception;
/**
 * Crawl failure raised when a requested site cannot be resolved.
 */
public class SiteNotFoundException extends CrawlException {

    /**
     * @param message description of the failure
     */
    public SiteNotFoundException(final String message) {
        super(message);
    }
}

7
project/crawl/src/main/java/edu/homework/crawler/exception/StorageException.java

@ -0,0 +1,7 @@
package edu.homework.crawler.exception;
/**
 * Crawl failure raised when the crawled data cannot be written to its output file.
 */
public class StorageException extends CrawlException {

    /**
     * @param message description of the failure
     * @param cause   the underlying failure
     */
    public StorageException(final String message, final Throwable cause) {
        super(message, cause);
    }
}

8
project/crawl/src/main/java/edu/homework/crawler/model/CrawlRequest.java

@ -0,0 +1,8 @@
package edu.homework.crawler.model;
import edu.homework.crawler.repository.OutputFormat;
import java.nio.file.Path;
/**
 * Immutable description of one crawl run.
 *
 * @param siteKey key of the site to crawl; the service treats "all" (case-insensitive) as every site
 * @param limitPerSite maximum number of items requested per site
 * @param outputFormat file format used when saving the result
 * @param outputPath file the result is written to
 */
public record CrawlRequest(String siteKey, int limitPerSite, OutputFormat outputFormat, Path outputPath) {
}

11
project/crawl/src/main/java/edu/homework/crawler/model/CrawlSummary.java

@ -0,0 +1,11 @@
package edu.homework.crawler.model;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
/**
 * Result of one crawl run.
 *
 * @param items      every crawled item
 * @param siteCounts number of items per site key
 * @param outputPath file the items were written to
 */
public record CrawlSummary(List<NewsItem> items, Map<String, Integer> siteCounts, Path outputPath) {

    /**
     * @return the number of crawled items across all sites
     */
    public int totalCount() {
        return items().size();
    }
}

4
project/crawl/src/main/java/edu/homework/crawler/model/NewsCandidate.java

@ -0,0 +1,4 @@
package edu.homework.crawler.model;
/**
 * A news link discovered on a list page, before its detail page has been fetched.
 *
 * @param title       link title taken from the list page
 * @param url         absolute URL of the detail page
 * @param publishTime date text found near the link; never {@code null}, empty when no
 *                    date was found
 */
public record NewsCandidate(String title, String url, String publishTime) {

    /**
     * Normalizes a missing publish time to the empty string so code that sorts or
     * compares candidates by {@code publishTime()} never dereferences {@code null}.
     */
    public NewsCandidate {
        if (publishTime == null) {
            publishTime = "";
        }
    }
}

107
project/crawl/src/main/java/edu/homework/crawler/model/NewsItem.java

@ -0,0 +1,107 @@
package edu.homework.crawler.model;
import java.time.LocalDateTime;
/**
 * Mutable bean describing one crawled news article. It has a no-argument constructor
 * and a getter/setter pair for every field.
 */
public class NewsItem {
    private String school;
    private String siteKey;
    private String title;
    private String url;
    private String publishTime;
    private String source;
    private String author;
    private String summary;
    private String contentPreview;
    private LocalDateTime crawledAt;

    /** Creates an item with every field unset. */
    public NewsItem() {
    }

    /**
     * Creates an item with its identifying fields and stamps it with the current local time.
     *
     * @param school  display name of the school
     * @param siteKey key of the site the item came from
     * @param title   article title
     * @param url     article URL
     */
    public NewsItem(String school, String siteKey, String title, String url) {
        this.crawledAt = LocalDateTime.now();
        this.url = url;
        this.title = title;
        this.siteKey = siteKey;
        this.school = school;
    }

    /** @return display name of the school */
    public String getSchool() {
        return this.school;
    }

    /** @param value display name of the school */
    public void setSchool(String value) {
        school = value;
    }

    /** @return key of the site the item came from */
    public String getSiteKey() {
        return this.siteKey;
    }

    /** @param value key of the site the item came from */
    public void setSiteKey(String value) {
        siteKey = value;
    }

    /** @return article title */
    public String getTitle() {
        return this.title;
    }

    /** @param value article title */
    public void setTitle(String value) {
        title = value;
    }

    /** @return article URL */
    public String getUrl() {
        return this.url;
    }

    /** @param value article URL */
    public void setUrl(String value) {
        url = value;
    }

    /** @return publish time as text */
    public String getPublishTime() {
        return this.publishTime;
    }

    /** @param value publish time as text */
    public void setPublishTime(String value) {
        publishTime = value;
    }

    /** @return source of the article */
    public String getSource() {
        return this.source;
    }

    /** @param value source of the article */
    public void setSource(String value) {
        source = value;
    }

    /** @return author of the article */
    public String getAuthor() {
        return this.author;
    }

    /** @param value author of the article */
    public void setAuthor(String value) {
        author = value;
    }

    /** @return short summary of the article */
    public String getSummary() {
        return this.summary;
    }

    /** @param value short summary of the article */
    public void setSummary(String value) {
        summary = value;
    }

    /** @return preview of the article body */
    public String getContentPreview() {
        return this.contentPreview;
    }

    /** @param value preview of the article body */
    public void setContentPreview(String value) {
        contentPreview = value;
    }

    /** @return when the item was crawled */
    public LocalDateTime getCrawledAt() {
        return this.crawledAt;
    }

    /** @param value when the item was crawled */
    public void setCrawledAt(LocalDateTime value) {
        crawledAt = value;
    }
}

67
project/crawl/src/main/java/edu/homework/crawler/repository/FileNewsRepository.java

@ -0,0 +1,67 @@
package edu.homework.crawler.repository;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import edu.homework.crawler.exception.StorageException;
import edu.homework.crawler.model.NewsItem;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
/**
 * Writes crawled news items to a file, either as indented JSON or as CSV.
 */
public class FileNewsRepository {
    private final ObjectMapper objectMapper;

    /**
     * Configures the JSON mapper: java.time support, indented output, and dates written
     * as text rather than numeric timestamps.
     */
    public FileNewsRepository() {
        ObjectMapper mapper = new ObjectMapper();
        mapper.registerModule(new JavaTimeModule());
        mapper.enable(SerializationFeature.INDENT_OUTPUT);
        mapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
        this.objectMapper = mapper;
    }

    /**
     * Saves the items, creating missing parent directories first.
     *
     * @param items      items to write
     * @param format     {@link OutputFormat#JSON} writes JSON; anything else writes CSV
     * @param outputPath destination file
     * @throws StorageException if creating the directories or writing the file fails
     */
    public void save(List<NewsItem> items, OutputFormat format, Path outputPath) throws StorageException {
        try {
            Path directory = outputPath.toAbsolutePath().getParent();
            if (directory != null) {
                Files.createDirectories(directory);
            }
            if (format != OutputFormat.JSON) {
                writeCsv(items, outputPath);
                return;
            }
            objectMapper.writeValue(outputPath.toFile(), items);
        } catch (IOException e) {
            throw new StorageException("Failed to save crawler data to " + outputPath, e);
        }
    }

    /** Writes a header row followed by one fully quoted row per item, encoded as UTF-8. */
    private void writeCsv(List<NewsItem> items, Path outputPath) throws IOException {
        try (BufferedWriter out = Files.newBufferedWriter(outputPath, StandardCharsets.UTF_8)) {
            out.write("school,siteKey,title,url,publishTime,source,author,summary,contentPreview,crawledAt");
            out.newLine();
            for (NewsItem item : items) {
                String crawledAt = item.getCrawledAt() == null ? "" : item.getCrawledAt().toString();
                String[] columns = {
                    item.getSchool(),
                    item.getSiteKey(),
                    item.getTitle(),
                    item.getUrl(),
                    item.getPublishTime(),
                    item.getSource(),
                    item.getAuthor(),
                    item.getSummary(),
                    item.getContentPreview(),
                    crawledAt
                };
                StringBuilder row = new StringBuilder();
                for (int i = 0; i < columns.length; i++) {
                    if (i > 0) {
                        row.append(',');
                    }
                    row.append(csv(columns[i]));
                }
                out.write(row.toString());
                out.newLine();
            }
        }
    }

    /** Quotes a field, doubling embedded quotes; {@code null} becomes an empty quoted field. */
    private String csv(String value) {
        if (value == null) {
            return "\"\"";
        }
        StringBuilder quoted = new StringBuilder();
        quoted.append("\"");
        quoted.append(value.replace("\"", "\"\""));
        quoted.append("\"");
        return quoted.toString();
    }
}

17
project/crawl/src/main/java/edu/homework/crawler/repository/OutputFormat.java

@ -0,0 +1,17 @@
package edu.homework.crawler.repository;
import edu.homework.crawler.exception.CommandException;
/**
 * File formats the crawler can write.
 */
public enum OutputFormat {
    JSON,
    CSV;

    /**
     * Resolves a format from user input, ignoring case.
     *
     * @param value the text to resolve, e.g. {@code "json"} or {@code "CSV"}
     * @return the matching constant
     * @throws CommandException if no constant has that name
     */
    public static OutputFormat from(String value) throws CommandException {
        OutputFormat[] known = values();
        int index = 0;
        while (index < known.length) {
            OutputFormat candidate = known[index];
            if (candidate.name().equalsIgnoreCase(value)) {
                return candidate;
            }
            index++;
        }
        throw new CommandException("Unsupported output format: " + value + ". Use json or csv.");
    }
}

37
project/crawl/src/main/java/edu/homework/crawler/service/NewsCrawlerService.java

@ -0,0 +1,37 @@
package edu.homework.crawler.service;
import edu.homework.crawler.exception.CrawlException;
import edu.homework.crawler.exception.SiteNotFoundException;
import edu.homework.crawler.model.NewsItem;
import edu.homework.crawler.strategy.CrawlStrategy;
import edu.homework.crawler.strategy.SiteRegistry;
import edu.homework.crawler.util.HttpFetcher;
import java.util.ArrayList;
import java.util.List;
/**
 * Runs the crawl strategies selected by a site key and concatenates their results.
 */
public class NewsCrawlerService {
    private final SiteRegistry siteRegistry;
    private final HttpFetcher httpFetcher;

    /**
     * Creates a service that uses a default {@link HttpFetcher}.
     *
     * @param siteRegistry registry used to resolve site keys to strategies
     */
    public NewsCrawlerService(SiteRegistry siteRegistry) {
        this(siteRegistry, new HttpFetcher());
    }

    /**
     * Creates a service with an explicit fetcher, so callers can supply their own
     * fetcher instance (for example in tests).
     *
     * @param siteRegistry registry used to resolve site keys to strategies
     * @param httpFetcher  fetcher handed to every strategy
     */
    public NewsCrawlerService(SiteRegistry siteRegistry, HttpFetcher httpFetcher) {
        this.siteRegistry = siteRegistry;
        this.httpFetcher = httpFetcher;
    }

    /**
     * Crawls the selected site or sites.
     *
     * @param siteKey      a site key, or {@code "all"} (case-insensitive) for every registered site
     * @param limitPerSite limit passed to each strategy
     * @return the items of all selected strategies, in strategy order
     * @throws CrawlException if the site cannot be resolved or a strategy fails; the
     *                        first failing strategy aborts the whole crawl
     */
    public List<NewsItem> crawl(String siteKey, int limitPerSite) throws CrawlException {
        List<CrawlStrategy> strategies = resolveStrategies(siteKey);
        List<NewsItem> items = new ArrayList<>();
        for (CrawlStrategy strategy : strategies) {
            items.addAll(strategy.crawl(httpFetcher, limitPerSite));
        }
        return items;
    }

    /** Maps {@code "all"} to every registered strategy and any other key to a single one. */
    private List<CrawlStrategy> resolveStrategies(String siteKey) throws SiteNotFoundException {
        if ("all".equalsIgnoreCase(siteKey)) {
            return siteRegistry.all();
        }
        return List.of(siteRegistry.get(siteKey));
    }
}

177
project/crawl/src/main/java/edu/homework/crawler/strategy/AbstractVisualSiteBuilderStrategy.java

@ -0,0 +1,177 @@
package edu.homework.crawler.strategy;
import edu.homework.crawler.exception.CrawlException;
import edu.homework.crawler.exception.ParseException;
import edu.homework.crawler.model.NewsCandidate;
import edu.homework.crawler.model.NewsItem;
import edu.homework.crawler.util.HttpFetcher;
import edu.homework.crawler.util.TextExtractors;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.URI;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
public abstract class AbstractVisualSiteBuilderStrategy implements CrawlStrategy {
private static final String NEWS_LINK_SELECTOR = "a[href*=info/][href$=.htm],a[href$=.htm]";
@Override
public List<NewsItem> crawl(HttpFetcher fetcher, int limit) throws CrawlException {
Map<String, NewsCandidate> mergedCandidates = new LinkedHashMap<>();
CrawlException lastFailure = null;
for (String startUrl : startUrls()) {
try {
Document page = fetcher.fetch(startUrl);
for (NewsCandidate candidate : extractCandidates(page, limit * 8)) {
mergedCandidates.putIfAbsent(candidate.url(), candidate);
}
} catch (CrawlException e) {
lastFailure = e;
}
}
List<NewsCandidate> candidates = sortCandidates(new ArrayList<>(mergedCandidates.values()), limit * 8);
if (candidates.isEmpty()) {
if (lastFailure != null) {
throw lastFailure;
}
throw new ParseException("No news links found on " + baseUrl());
}
List<NewsItem> items = new ArrayList<>();
for (NewsCandidate candidate : candidates) {
if (items.size() >= limit) {
break;
}
try {
fetcher.politePause();
Document detail = fetcher.fetch(candidate.url());
items.add(parseDetail(candidate, detail));
} catch (CrawlException e) {
NewsItem fallback = new NewsItem(schoolName(), key(), candidate.title(), candidate.url());
fallback.setPublishTime(candidate.publishTime());
fallback.setSummary("Detail page failed: " + e.getMessage());
fallback.setCrawledAt(LocalDateTime.now());
items.add(fallback);
}
}
return items;
}
protected List<String> startUrls() {
return List.of(baseUrl());
}
protected List<NewsCandidate> extractCandidates(Document document, int maxCandidates) {
Map<String, NewsCandidate> candidates = new LinkedHashMap<>();
Elements links = document.select(candidateSelector());
for (Element link : links) {
String url = link.absUrl("href");
if (!isAcceptableUrl(url)) {
continue;
}
String title = extractCandidateTitle(link);
if (title.isBlank() || title.length() < 4) {
continue;
}
String date = TextExtractors.findDate(neighborText(link));
candidates.putIfAbsent(url, new NewsCandidate(title, url, date));
if (candidates.size() >= maxCandidates) {
break;
}
}
return sortCandidates(new ArrayList<>(candidates.values()), maxCandidates);
}
private List<NewsCandidate> sortCandidates(List<NewsCandidate> sorted, int maxCandidates) {
sorted.sort((left, right) -> {
boolean leftHasDate = !left.publishTime().isBlank();
boolean rightHasDate = !right.publishTime().isBlank();
if (leftHasDate != rightHasDate) {
return leftHasDate ? -1 : 1;
}
return right.publishTime().compareTo(left.publishTime());
});
if (sorted.size() > maxCandidates) {
return sorted.subList(0, maxCandidates);
}
return sorted;
}
protected String candidateSelector() {
return NEWS_LINK_SELECTOR;
}
protected NewsItem parseDetail(NewsCandidate candidate, Document detail) {
NewsItem item = new NewsItem(schoolName(), key(), extractTitle(candidate, detail), candidate.url());
String pageText = TextExtractors.clean(detail.text());
item.setPublishTime(TextExtractors.firstNonBlank(
TextExtractors.findPublishTime(pageText),
candidate.publishTime()));
item.setSource(TextExtractors.findLabelValue(pageText, "来源"));
item.setAuthor(TextExtractors.findLabelValue(pageText, "作者"));
item.setSummary(TextExtractors.clean(detail.select("meta[name=description]").attr("content")));
item.setContentPreview(extractContentPreview(detail));
item.setCrawledAt(LocalDateTime.now());
return item;
}
protected String extractTitle(NewsCandidate candidate, Document detail) {
String title = TextExtractors.firstNonBlank(
detail.select("h1").first() == null ? "" : detail.select("h1").first().text(),
detail.select(".ar_tit h3").first() == null ? "" : detail.select(".ar_tit h3").first().text(),
detail.select(".subTitle2 span").first() == null ? "" : detail.select(".subTitle2 span").first().text(),
detail.select("meta[name=pageTitle]").attr("content"),
candidate.title());
return TextExtractors.limit(TextExtractors.clean(title), 160);
}
protected String extractContentPreview(Document detail) {
String content = TextExtractors.firstNonBlank(
detail.select(".v_news_content").text(),
detail.select("#vsb_content").text(),
detail.select("#vsb_content_6").text(),
detail.body() == null ? "" : detail.body().text());
return TextExtractors.limit(TextExtractors.clean(content), 320);
}
private String extractCandidateTitle(Element link) {
String nestedHeading = "";
Element heading = link.selectFirst("h1,h2,h3,h4,h5,.tit,.title,.pXZCont,.c59665");
if (heading != null) {
nestedHeading = heading.text();
}
return TextExtractors.limit(TextExtractors.clean(TextExtractors.firstNonBlank(
link.attr("title"),
nestedHeading,
link.ownText(),
link.text())), 160);
}
/**
 * Collects the text surrounding a link: the link's text followed by the text of the
 * link itself and of up to three of its ancestors, separated by spaces.
 *
 * <p>Note that the link's own text therefore appears twice at the start of the result.
 *
 * @param link the anchor element to gather context for
 * @return the whitespace-normalised concatenation
 */
private String neighborText(Element link) {
    StringBuilder collected = new StringBuilder();
    collected.append(link.text()).append(' ');

    Element current = link;
    int level = 0;
    // Walk from the link upwards, stopping after four nodes or at the document root.
    while (level < 4 && current != null) {
        collected.append(current.text()).append(' ');
        current = current.parent();
        level++;
    }
    return TextExtractors.clean(collected.toString());
}
/**
 * Decides whether a discovered URL should be treated as a news detail page.
 *
 * <p>A URL is accepted when it parses as a URI, its host equals the host of
 * {@link #baseUrl()} (case-insensitively) and its path contains {@code /info/}.
 * Anything that cannot be parsed, or that lacks a host or path (for example
 * {@code mailto:} or {@code javascript:} links), is rejected rather than raising.
 *
 * @param url the absolute URL to test; may be {@code null}
 * @return {@code true} if the URL is on the site's host and looks like a detail page
 */
protected boolean isAcceptableUrl(String url) {
    if (url == null || url.isBlank()) {
        return false;
    }
    final URI base;
    final URI candidate;
    try {
        base = URI.create(baseUrl());
        candidate = URI.create(url);
    } catch (IllegalArgumentException e) {
        // Unparseable URLs (spaces, raw non-ASCII, ...) are simply not acceptable.
        return false;
    }
    String baseHost = base.getHost();
    String candidatePath = candidate.getPath();
    // getHost()/getPath() return null for opaque or authority-less URIs; guard instead of NPE.
    if (baseHost == null || candidatePath == null) {
        return false;
    }
    return baseHost.equalsIgnoreCase(candidate.getHost()) && candidatePath.contains("/info/");
}
}

17
project/crawl/src/main/java/edu/homework/crawler/strategy/CrawlStrategy.java

@ -0,0 +1,17 @@
package edu.homework.crawler.strategy;
import edu.homework.crawler.exception.CrawlException;
import edu.homework.crawler.model.NewsItem;
import edu.homework.crawler.util.HttpFetcher;
import java.util.List;
/**
 * Contract for a per-site news crawler.
 *
 * <p>Each implementation identifies one site through {@link #key()},
 * {@link #schoolName()} and {@link #baseUrl()} and knows how to collect
 * {@link NewsItem}s from it.
 */
public interface CrawlStrategy {
/**
 * @return the short identifier of the site (for example {@code "csu"}); used as the
 *         lookup key when the strategy is registered
 */
String key();
/**
 * @return the human-readable name of the school the site belongs to
 */
String schoolName();
/**
 * @return the root URL of the news site
 */
String baseUrl();
/**
 * Crawls the site and returns the news items found.
 *
 * @param fetcher the HTTP helper used to download pages
 * @param limit   NOTE(review): presumably the maximum number of items to return;
 *                confirm against the implementations
 * @return the crawled news items
 * @throws CrawlException if the crawl fails
 */
List<NewsItem> crawl(HttpFetcher fetcher, int limit) throws CrawlException;
}

25
project/crawl/src/main/java/edu/homework/crawler/strategy/CsuNewsStrategy.java

@ -0,0 +1,25 @@
package edu.homework.crawler.strategy;
import java.util.List;
/**
 * Crawl strategy for the news site at {@code https://news.csu.edu.cn/}
 * (school name "中南大学", key {@code csu}).
 */
public class CsuNewsStrategy extends AbstractVisualSiteBuilderStrategy {
    private static final String SITE_KEY = "csu";
    private static final String SCHOOL_NAME = "中南大学";
    private static final String ROOT_URL = "https://news.csu.edu.cn/";
    private static final String LIST_PAGE_URL = "https://news.csu.edu.cn/xxyw.htm";

    /** {@inheritDoc} */
    @Override
    public String key() {
        return SITE_KEY;
    }

    /** {@inheritDoc} */
    @Override
    public String schoolName() {
        return SCHOOL_NAME;
    }

    /** {@inheritDoc} */
    @Override
    public String baseUrl() {
        return ROOT_URL;
    }

    /**
     * @return the list page first, then the site root as a second entry point
     */
    @Override
    protected List<String> startUrls() {
        return List.of(LIST_PAGE_URL, baseUrl());
    }
}

25
project/crawl/src/main/java/edu/homework/crawler/strategy/HnuNewsStrategy.java

@ -0,0 +1,25 @@
package edu.homework.crawler.strategy;
import java.util.List;
/**
 * Crawl strategy for the news site at {@code https://news.hnu.edu.cn/}
 * (school name "湖南大学", key {@code hnu}).
 */
public class HnuNewsStrategy extends AbstractVisualSiteBuilderStrategy {
    private static final String SITE_KEY = "hnu";
    private static final String SCHOOL_NAME = "湖南大学";
    private static final String ROOT_URL = "https://news.hnu.edu.cn/";
    private static final String LIST_PAGE_URL = "https://news.hnu.edu.cn/xw/zhxw.htm";

    /** {@inheritDoc} */
    @Override
    public String key() {
        return SITE_KEY;
    }

    /** {@inheritDoc} */
    @Override
    public String schoolName() {
        return SCHOOL_NAME;
    }

    /** {@inheritDoc} */
    @Override
    public String baseUrl() {
        return ROOT_URL;
    }

    /**
     * @return the list page first, then the site root as a second entry point
     */
    @Override
    protected List<String> startUrls() {
        return List.of(LIST_PAGE_URL, baseUrl());
    }
}

30
project/crawl/src/main/java/edu/homework/crawler/strategy/HunnuNewsStrategy.java

@ -0,0 +1,30 @@
package edu.homework.crawler.strategy;
import java.util.List;
/**
 * Crawl strategy for the news site at {@code https://news.hunnu.edu.cn/}
 * (school name "湖南师范大学", key {@code hunnu}).
 *
 * <p>On top of the inherited URL filter, only URLs containing {@code /info/1005/}
 * are accepted.
 */
public class HunnuNewsStrategy extends AbstractVisualSiteBuilderStrategy {
    private static final String SITE_KEY = "hunnu";
    private static final String SCHOOL_NAME = "湖南师范大学";
    private static final String ROOT_URL = "https://news.hunnu.edu.cn/";
    private static final String LIST_PAGE_URL = "https://news.hunnu.edu.cn/sdxw.htm";
    private static final String REQUIRED_URL_PART = "/info/1005/";

    /** {@inheritDoc} */
    @Override
    public String key() {
        return SITE_KEY;
    }

    /** {@inheritDoc} */
    @Override
    public String schoolName() {
        return SCHOOL_NAME;
    }

    /** {@inheritDoc} */
    @Override
    public String baseUrl() {
        return ROOT_URL;
    }

    /**
     * @return the list page first, then the site root as a second entry point
     */
    @Override
    protected List<String> startUrls() {
        return List.of(LIST_PAGE_URL, baseUrl());
    }

    /**
     * Accepts a URL only if the inherited check passes and the URL contains
     * {@code /info/1005/}.
     */
    @Override
    protected boolean isAcceptableUrl(String url) {
        if (!super.isAcceptableUrl(url)) {
            return false;
        }
        return url.contains(REQUIRED_URL_PART);
    }
}

36
project/crawl/src/main/java/edu/homework/crawler/strategy/SiteRegistry.java

@ -0,0 +1,36 @@
package edu.homework.crawler.strategy;
import edu.homework.crawler.exception.SiteNotFoundException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
 * Registry of {@link CrawlStrategy} instances, looked up by their site key.
 *
 * <p>Keys are compared case-insensitively: they are lower-cased with
 * {@link Locale#ROOT} both when a strategy is registered and when it is looked up,
 * so the result does not depend on the JVM's default locale. Registration order is
 * preserved by {@link #all()}.
 */
public class SiteRegistry {
    private final Map<String, CrawlStrategy> strategies = new LinkedHashMap<>();

    /**
     * Creates a registry pre-populated with the built-in strategies
     * (hnu, csu, hunnu, in that order).
     *
     * @return a new registry containing the default strategies
     */
    public static SiteRegistry defaults() {
        SiteRegistry registry = new SiteRegistry();
        registry.register(new HnuNewsStrategy());
        registry.register(new CsuNewsStrategy());
        registry.register(new HunnuNewsStrategy());
        return registry;
    }

    /**
     * Adds a strategy under its (normalised) key, replacing any strategy previously
     * registered under the same key.
     *
     * @param strategy the strategy to register
     */
    public void register(CrawlStrategy strategy) {
        // Normalise here as well, otherwise a mixed-case key could never be found by get().
        strategies.put(normalize(strategy.key()), strategy);
    }

    /**
     * Looks up the strategy registered for a site key.
     *
     * @param key the site key, in any letter case; {@code null} is treated as unknown
     * @return the matching strategy
     * @throws SiteNotFoundException if no strategy is registered for {@code key}
     */
    public CrawlStrategy get(String key) throws SiteNotFoundException {
        CrawlStrategy strategy = key == null ? null : strategies.get(normalize(key));
        if (strategy == null) {
            throw new SiteNotFoundException("Unsupported site: " + key + ". Use all, hnu, csu, or hunnu.");
        }
        return strategy;
    }

    /**
     * @return a new mutable list of all registered strategies in registration order
     */
    public List<CrawlStrategy> all() {
        return new ArrayList<>(strategies.values());
    }

    /** Lower-cases a key independently of the default locale. */
    private static String normalize(String key) {
        return key.toLowerCase(Locale.ROOT);
    }
}

39
project/crawl/src/main/java/edu/homework/crawler/util/HttpFetcher.java

@ -0,0 +1,39 @@
package edu.homework.crawler.util;
import edu.homework.crawler.exception.NetworkException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
/**
 * Downloads and parses HTML pages with jsoup, retrying failed requests a few times.
 */
public class HttpFetcher {
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36";
    /** Total number of requests made for one URL before giving up. */
    private static final int MAX_ATTEMPTS = 3;
    /** Connect/read timeout passed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 20_000;
    /** Upper bound on the response body jsoup will read, in bytes. */
    private static final int MAX_BODY_BYTES = 5 * 1024 * 1024;
    /** Delay used by {@link #politePause()}, in milliseconds. */
    private static final long PAUSE_MILLIS = 250;

    /**
     * Fetches a URL with GET and parses the response into a {@link Document}.
     *
     * <p>An {@link IOException} triggers a retry after a short pause, up to
     * {@value #MAX_ATTEMPTS} attempts in total. No pause is made after the final
     * attempt, and retrying stops early if the current thread has been interrupted.
     *
     * @param url the absolute URL to download
     * @return the parsed document
     * @throws NetworkException if every attempt failed; the cause is the last failure
     */
    public Document fetch(String url) throws NetworkException {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                return Jsoup.connect(url)
                        .userAgent(USER_AGENT)
                        .referrer("https://www.baidu.com/")
                        .timeout(TIMEOUT_MILLIS)
                        .maxBodySize(MAX_BODY_BYTES)
                        .followRedirects(true)
                        .get();
            } catch (IOException e) {
                lastFailure = e;
            }
            if (attempt == MAX_ATTEMPTS) {
                // Nothing left to retry: fail immediately instead of sleeping first.
                break;
            }
            politePause();
            if (Thread.currentThread().isInterrupted()) {
                // politePause() restored the interrupt flag; honour it and stop retrying.
                break;
            }
        }
        throw new NetworkException("Network request failed: " + url, lastFailure);
    }

    /**
     * Sleeps briefly between requests. If the sleep is interrupted, the thread's
     * interrupt flag is restored and the method returns early.
     */
    public void politePause() {
        try {
            Thread.sleep(PAUSE_MILLIS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}

98
project/crawl/src/main/java/edu/homework/crawler/util/TextExtractors.java

@ -0,0 +1,98 @@
package edu.homework.crawler.util;
import java.time.LocalDate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Stateless text helpers used when scraping news pages: whitespace normalisation,
 * truncation, date detection and "label: value" extraction.
 */
public final class TextExtractors {
    /** Full date such as 2026-05-28, 2026年5月28日 or 2026.5.28, optionally followed by HH:mm. */
    private static final Pattern DATE = Pattern.compile("(20\\d{2})[-年./](\\d{1,2})[-月./](\\d{1,2})日?(?:\\s+\\d{1,2}:\\d{2})?");
    /** Day printed before year-month, e.g. "28 2026-05". */
    private static final Pattern SPLIT_DAY_YEAR_MONTH = Pattern.compile("(?<!\\d)(\\d{1,2})\\s+(20\\d{2})[-年./](\\d{1,2})(?![-\\d])");
    /** Month and day without a year, e.g. "05-28" or "5月28日". */
    private static final Pattern MONTH_DAY = Pattern.compile("(?<!\\d)(\\d{1,2})[-月.](\\d{1,2})日?(?!\\d)");
    /** Regex class for the colon after a label: ASCII ':' or full-width colon U+FF1A. */
    private static final String LABEL_COLON = "[:\uFF1A]";

    private TextExtractors() {
    }

    /**
     * Normalises whitespace: non-breaking spaces become regular spaces, runs of
     * whitespace collapse to one space, and the result is trimmed.
     *
     * @param value the raw text; may be {@code null}
     * @return the normalised text, or {@code ""} for {@code null}
     */
    public static String clean(String value) {
        if (value == null) {
            return "";
        }
        return value.replace('\u00A0', ' ')
                .replaceAll("\\s+", " ")
                .trim();
    }

    /**
     * Returns the first argument that is non-blank after {@link #clean(String)}.
     *
     * @param values candidates in priority order; individual entries may be {@code null}
     * @return the cleaned first non-blank value, or {@code ""} if there is none
     */
    public static String firstNonBlank(String... values) {
        for (String value : values) {
            String cleaned = clean(value);
            if (!cleaned.isBlank()) {
                return cleaned;
            }
        }
        return "";
    }

    /**
     * Cleans the text and truncates it to {@code maxLength} characters, appending
     * {@code "..."} when truncation happened (so a truncated result is three
     * characters longer than {@code maxLength}).
     *
     * @param value     the raw text; may be {@code null}
     * @param maxLength the number of characters kept before the ellipsis
     * @return the cleaned, possibly truncated text
     */
    public static String limit(String value, int maxLength) {
        String cleaned = clean(value);
        if (cleaned.length() <= maxLength) {
            return cleaned;
        }
        return cleaned.substring(0, maxLength) + "...";
    }

    /**
     * Finds the publish date of an article in page text.
     *
     * <p>A date within 80 characters of the label "发布时间" wins; otherwise the first
     * date found anywhere in the text is used.
     *
     * @param text the page text; may be {@code null}
     * @return the date as {@code yyyy-MM-dd}, or {@code ""} if none was found
     */
    public static String findPublishTime(String text) {
        String cleaned = clean(text);
        int index = cleaned.indexOf("发布时间");
        if (index >= 0) {
            String slice = cleaned.substring(index, Math.min(cleaned.length(), index + 80));
            String date = findDate(slice);
            if (!date.isBlank()) {
                return date;
            }
        }
        return findDate(cleaned);
    }

    /**
     * Finds the first date in the text and normalises it to {@code yyyy-MM-dd}.
     *
     * <p>Patterns are tried in this order: day followed by year-month ("28 2026-05"),
     * a full date ("2026年5月28日"), then a bare month-day ("5月28日"). For a bare
     * month-day the current year is assumed unless that would lie more than seven days
     * in the future, in which case the previous year is used. Month-day matches that
     * are not real calendar dates (for example "85.6" or "2月30日") are skipped.
     *
     * @param text the text to search; may be {@code null}
     * @return the normalised date, or {@code ""} if no date was found
     */
    public static String findDate(String text) {
        String cleaned = clean(text);
        Matcher matcher = SPLIT_DAY_YEAR_MONTH.matcher(cleaned);
        if (matcher.find()) {
            return matcher.group(2) + "-" + pad(matcher.group(3)) + "-" + pad(matcher.group(1));
        }
        matcher = DATE.matcher(cleaned);
        if (matcher.find()) {
            return normalizeFullDate(matcher);
        }
        matcher = MONTH_DAY.matcher(cleaned);
        LocalDate today = LocalDate.now();
        // The month-day pattern also matches decimals and ranges, so keep scanning
        // until a match is an actual calendar date.
        while (matcher.find()) {
            int month = Integer.parseInt(matcher.group(1));
            int day = Integer.parseInt(matcher.group(2));
            String resolved = resolveMonthDay(month, day, today);
            if (!resolved.isEmpty()) {
                return resolved;
            }
        }
        return "";
    }

    /**
     * Extracts the value following {@code label} and a colon, stopping before the next
     * known label (来源, 作者, 发布时间, 点击) or at the end of the text.
     *
     * @param text  the text to search; may be {@code null}
     * @param label the literal label to look for (not interpreted as a regex)
     * @return the cleaned value, or {@code ""} if the label is absent, {@code null},
     *         or the captured value still contains "点击" or "发布时间"
     */
    public static String findLabelValue(String text, String label) {
        if (label == null) {
            return "";
        }
        String cleaned = clean(text);
        Pattern pattern = Pattern.compile(Pattern.quote(label) + LABEL_COLON
                + "\\s*(.*?)(?=\\s*(?:来源|作者|发布时间|点击)" + LABEL_COLON + "|$)");
        Matcher matcher = pattern.matcher(cleaned);
        if (matcher.find()) {
            String value = clean(matcher.group(1));
            if (!value.contains("点击") && !value.contains("发布时间")) {
                return value;
            }
        }
        return "";
    }

    /**
     * Resolves a month/day pair to a full date: the current year if that date exists
     * and is at most seven days ahead of {@code today}, otherwise the previous year.
     *
     * @return the ISO date, or {@code ""} if the pair is not a valid date in either year
     */
    private static String resolveMonthDay(int month, int day, LocalDate today) {
        if (month < 1 || month > 12 || day < 1) {
            return "";
        }
        LocalDate latestAllowed = today.plusDays(7);
        for (int year = today.getYear(); year >= today.getYear() - 1; year--) {
            LocalDate firstOfMonth = LocalDate.of(year, month, 1);
            if (day > firstOfMonth.lengthOfMonth()) {
                continue; // e.g. Feb 29 in a non-leap year
            }
            LocalDate candidate = firstOfMonth.withDayOfMonth(day);
            if (!candidate.isAfter(latestAllowed)) {
                return candidate.toString();
            }
        }
        return "";
    }

    /** Formats the three groups of a {@link #DATE} match as {@code yyyy-MM-dd}. */
    private static String normalizeFullDate(Matcher matcher) {
        return matcher.group(1) + "-" + pad(matcher.group(2)) + "-" + pad(matcher.group(3));
    }

    /** Left-pads a one-digit number with a zero. */
    private static String pad(String value) {
        return value.length() == 1 ? "0" + value : value;
    }
}

37
project/crawl/src/main/java/edu/homework/crawler/view/ConsoleView.java

@ -0,0 +1,37 @@
package edu.homework.crawler.view;
import edu.homework.crawler.model.CrawlSummary;
/**
 * Console front end: regular output goes to standard output, errors go to standard
 * error with an {@code [ERROR]} prefix.
 */
public class ConsoleView {

    /** Prints the two-line banner shown at start-up. */
    public void printWelcome() {
        emit("University News Crawler");
        emit("Type help to see commands.");
    }

    /** Prints the input prompt without a trailing newline. */
    public void printPrompt() {
        System.out.print("crawler> ");
    }

    /**
     * Prints help text.
     *
     * @param text the help text to show
     */
    public void printHelp(String text) {
        emit(text);
    }

    /**
     * Prints an informational message.
     *
     * @param text the message to show
     */
    public void printInfo(String text) {
        emit(text);
    }

    /**
     * Prints an error message to standard error.
     *
     * @param text the error description
     */
    public void printError(String text) {
        System.err.println("[ERROR] " + text);
    }

    /**
     * Prints the outcome of a crawl: total item count, one line per site with its
     * count, and the absolute path of the output file.
     *
     * @param summary the crawl result to report
     */
    public void printSummary(CrawlSummary summary) {
        emit("Crawl finished.");
        emit("Total items: " + summary.totalCount());
        summary.siteCounts().forEach((siteKey, itemCount) -> {
            emit(" - " + siteKey + ": " + itemCount);
        });
        emit("Saved to: " + summary.outputPath().toAbsolutePath());
    }

    /** Writes one line to standard output. */
    private void emit(String line) {
        System.out.println(line);
    }
}

38
project/crawl/src/test/java/edu/homework/crawler/command/CommandRegistryTest.java

@ -0,0 +1,38 @@
package edu.homework.crawler.command;
import edu.homework.crawler.exception.CommandException;
import org.junit.jupiter.api.Test;
import java.util.List;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
/**
 * Tests for the tokenising and option-parsing helpers of {@link CommandRegistry}.
 */
class CommandRegistryTest {

    /** A double-quoted argument containing a space must stay a single token. */
    @Test
    void tokenizesQuotedOutputPath() throws Exception {
        String commandLine = "crawl --site all --out \"data/my news.json\"";
        List<String> expected = List.of("crawl", "--site", "all", "--out", "data/my news.json");

        List<String> actual = new CommandRegistry().tokenize(commandLine);

        assertEquals(expected, actual);
    }

    /** Each {@code --name value} pair becomes a map entry keyed by the bare name. */
    @Test
    void parsesOptionsAsKeyValuePairs() throws Exception {
        String[] arguments = {"--site", "hnu", "--limit", "5"};

        Map<String, String> parsed = new CommandRegistry().parseOptions(arguments);

        assertEquals("hnu", parsed.get("site"));
        assertEquals("5", parsed.get("limit"));
    }

    /** An option flag without a following value is rejected. */
    @Test
    void rejectsMissingOptionValue() {
        CommandRegistry underTest = new CommandRegistry();
        String[] arguments = {"--site"};

        assertThrows(CommandException.class, () -> underTest.parseOptions(arguments));
    }
}

29
project/crawl/src/test/java/edu/homework/crawler/repository/FileNewsRepositoryTest.java

@ -0,0 +1,29 @@
package edu.homework.crawler.repository;
import edu.homework.crawler.model.NewsItem;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Tests for {@link FileNewsRepository}.
 */
class FileNewsRepositoryTest {
    /** Scratch directory supplied by JUnit for each test. */
    @TempDir
    Path tempDir;

    /** Saving as JSON writes a file that contains the item's title and site key. */
    @Test
    void savesJsonFile() throws Exception {
        FileNewsRepository underTest = new FileNewsRepository();
        NewsItem sample = new NewsItem("湖南大学", "hnu", "测试新闻", "https://example.com/news");
        Path target = tempDir.resolve("news.json");

        underTest.save(List.of(sample), OutputFormat.JSON, target);

        String written = Files.readString(target);
        assertTrue(written.contains("测试新闻"));
        assertTrue(written.contains("hnu"));
    }
}

29
project/crawl/src/test/java/edu/homework/crawler/util/TextExtractorsTest.java

@ -0,0 +1,29 @@
package edu.homework.crawler.util;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
 * Tests for the date and label helpers in {@link TextExtractors}.
 */
class TextExtractorsTest {

    /** A Chinese-style date after the publish-time label is normalised to ISO form. */
    @Test
    void normalizesChinesePublishDate() {
        String actual = TextExtractors.findPublishTime(
                "来源:新闻网 作者:张三 发布时间:2026年05月28日 17:14 点击:100次");

        assertEquals("2026-05-28", actual);
    }

    /** Each label yields only its own value, stopping before the next label. */
    @Test
    void extractsSimpleLabelValue() {
        String metaLine = "来源:新闻网 作者:李四 发布时间:2026-05-28";

        String source = TextExtractors.findLabelValue(metaLine, "来源");
        String author = TextExtractors.findLabelValue(metaLine, "作者");

        assertEquals("新闻网", source);
        assertEquals("李四", author);
    }

    /** A day printed before the year-month is reassembled into a full date. */
    @Test
    void normalizesSplitDayYearMonthDate() {
        String actual = TextExtractors.findDate("28 2026-05");

        assertEquals("2026-05-28", actual);
    }
}
Loading…
Cancel
Save