40 changed files with 1443 additions and 0 deletions
@ -0,0 +1,3 @@ |
|||
target/ |
|||
*.log |
|||
*.tmp |
|||
Binary file not shown.
@ -0,0 +1,55 @@ |
|||
# University News Crawler |
|||
|
|||
Java homework project for crawling: |
|||
|
|||
- `https://news.hnu.edu.cn/` |
|||
- `https://news.csu.edu.cn/` |
|||
- `https://news.hunnu.edu.cn/` |
|||
|
|||
The code demonstrates the required architecture: |
|||
|
|||
- Interactive command-line interface (CLI) |
|||
- MVC: `model`, `view`, `controller` |
|||
- Command pattern: `command` package |
|||
- Strategy pattern: `strategy` package, one strategy per target website |
|||
- Custom exception hierarchy: `exception` package |
|||
- File persistence: JSON or CSV output |
|||
|
|||
## Run |
|||
|
|||
```powershell |
|||
mvn test |
|||
mvn exec:java -Dexec.args="crawl --site all --limit 5 --format json --out data/news.json" |
|||
``` |
|||
|
|||
Interactive CLI: |
|||
|
|||
```powershell |
|||
mvn exec:java |
|||
``` |
|||
|
|||
Useful commands: |
|||
|
|||
```text |
|||
help |
|||
sites |
|||
crawl --site all --limit 10 --format json --out data/news.json |
|||
crawl --site hnu --limit 5 --format csv --out data/hnu.csv |
|||
exit |
|||
``` |
|||
|
|||
## Output Fields |
|||
|
|||
Each crawled news item includes: |
|||
|
|||
- school |
|||
- site key |
|||
- title |
|||
- url |
|||
- publish time |
|||
- source |
|||
- author |
|||
- summary |
|||
- content preview |
|||
- crawled time |
|||
|
|||
File diff suppressed because one or more lines are too long
@ -0,0 +1,58 @@ |
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" |
|||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|||
<modelVersion>4.0.0</modelVersion> |
|||
|
|||
<groupId>edu.homework</groupId> |
|||
<artifactId>university-news-crawler</artifactId> |
|||
<version>1.0.0</version> |
|||
<name>University News Crawler</name> |
|||
|
|||
<properties> |
|||
<maven.compiler.release>17</maven.compiler.release> |
|||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
|||
<junit.version>5.10.2</junit.version> |
|||
</properties> |
|||
|
|||
<dependencies> |
|||
<dependency> |
|||
<groupId>org.jsoup</groupId> |
|||
<artifactId>jsoup</artifactId> |
|||
<version>1.17.2</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>com.fasterxml.jackson.core</groupId> |
|||
<artifactId>jackson-databind</artifactId> |
|||
<version>2.17.2</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>com.fasterxml.jackson.datatype</groupId> |
|||
<artifactId>jackson-datatype-jsr310</artifactId> |
|||
<version>2.17.2</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.junit.jupiter</groupId> |
|||
<artifactId>junit-jupiter</artifactId> |
|||
<version>${junit.version}</version> |
|||
<scope>test</scope> |
|||
</dependency> |
|||
</dependencies> |
|||
|
|||
<build> |
|||
<plugins> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-surefire-plugin</artifactId> |
|||
<version>3.2.5</version> |
|||
</plugin> |
|||
<plugin> |
|||
<groupId>org.codehaus.mojo</groupId> |
|||
<artifactId>exec-maven-plugin</artifactId> |
|||
<version>3.3.0</version> |
|||
<configuration> |
|||
<mainClass>edu.homework.crawler.Main</mainClass> |
|||
</configuration> |
|||
</plugin> |
|||
</plugins> |
|||
</build> |
|||
</project> |
|||
@ -0,0 +1,12 @@ |
|||
package edu.homework.crawler; |
|||
|
|||
import edu.homework.crawler.cli.CliApplication; |
|||
|
|||
public final class Main { |
|||
private Main() { |
|||
} |
|||
|
|||
public static void main(String[] args) { |
|||
CliApplication.defaultApplication().run(args); |
|||
} |
|||
} |
|||
@ -0,0 +1,74 @@ |
|||
package edu.homework.crawler.cli; |
|||
|
|||
import edu.homework.crawler.command.CommandContext; |
|||
import edu.homework.crawler.command.CommandRegistry; |
|||
import edu.homework.crawler.command.CrawlCommand; |
|||
import edu.homework.crawler.command.ExitCommand; |
|||
import edu.homework.crawler.command.HelpCommand; |
|||
import edu.homework.crawler.command.SitesCommand; |
|||
import edu.homework.crawler.controller.CrawlerController; |
|||
import edu.homework.crawler.exception.CrawlException; |
|||
import edu.homework.crawler.repository.FileNewsRepository; |
|||
import edu.homework.crawler.service.NewsCrawlerService; |
|||
import edu.homework.crawler.strategy.SiteRegistry; |
|||
import edu.homework.crawler.view.ConsoleView; |
|||
|
|||
import java.nio.charset.StandardCharsets; |
|||
import java.util.Scanner; |
|||
|
|||
public class CliApplication { |
|||
private final CommandContext context; |
|||
private final CommandRegistry commandRegistry; |
|||
|
|||
public CliApplication(CommandContext context, CommandRegistry commandRegistry) { |
|||
this.context = context; |
|||
this.commandRegistry = commandRegistry; |
|||
} |
|||
|
|||
public static CliApplication defaultApplication() { |
|||
ConsoleView view = new ConsoleView(); |
|||
SiteRegistry siteRegistry = SiteRegistry.defaults(); |
|||
NewsCrawlerService service = new NewsCrawlerService(siteRegistry); |
|||
FileNewsRepository repository = new FileNewsRepository(); |
|||
CrawlerController controller = new CrawlerController(service, repository); |
|||
|
|||
CommandRegistry registry = new CommandRegistry(); |
|||
CommandContext context = new CommandContext(controller, view, registry, siteRegistry); |
|||
registry.register(new HelpCommand()); |
|||
registry.register(new SitesCommand()); |
|||
registry.register(new CrawlCommand()); |
|||
registry.register(new ExitCommand()); |
|||
return new CliApplication(context, registry); |
|||
} |
|||
|
|||
public void run(String[] args) { |
|||
if (args.length > 0) { |
|||
executeLine(String.join(" ", args)); |
|||
return; |
|||
} |
|||
|
|||
context.view().printWelcome(); |
|||
try (Scanner scanner = new Scanner(System.in, StandardCharsets.UTF_8)) { |
|||
while (context.isRunning()) { |
|||
context.view().printPrompt(); |
|||
if (!scanner.hasNextLine()) { |
|||
break; |
|||
} |
|||
executeLine(scanner.nextLine()); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private void executeLine(String line) { |
|||
try { |
|||
commandRegistry.execute(context, line); |
|||
} catch (CrawlException e) { |
|||
context.view().printError(e.getMessage()); |
|||
if (e.getCause() != null) { |
|||
context.view().printError("Cause: " + e.getCause().getMessage()); |
|||
} |
|||
} catch (RuntimeException e) { |
|||
context.view().printError("Unexpected error: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package edu.homework.crawler.command; |
|||
|
|||
import edu.homework.crawler.exception.CrawlException; |
|||
|
|||
public interface Command { |
|||
String name(); |
|||
|
|||
String description(); |
|||
|
|||
void execute(CommandContext context, String[] args) throws CrawlException; |
|||
} |
|||
@ -0,0 +1,44 @@ |
|||
package edu.homework.crawler.command; |
|||
|
|||
import edu.homework.crawler.controller.CrawlerController; |
|||
import edu.homework.crawler.strategy.SiteRegistry; |
|||
import edu.homework.crawler.view.ConsoleView; |
|||
|
|||
public class CommandContext { |
|||
private final CrawlerController controller; |
|||
private final ConsoleView view; |
|||
private final CommandRegistry commandRegistry; |
|||
private final SiteRegistry siteRegistry; |
|||
private boolean running = true; |
|||
|
|||
public CommandContext(CrawlerController controller, ConsoleView view, CommandRegistry commandRegistry, SiteRegistry siteRegistry) { |
|||
this.controller = controller; |
|||
this.view = view; |
|||
this.commandRegistry = commandRegistry; |
|||
this.siteRegistry = siteRegistry; |
|||
} |
|||
|
|||
public CrawlerController controller() { |
|||
return controller; |
|||
} |
|||
|
|||
public ConsoleView view() { |
|||
return view; |
|||
} |
|||
|
|||
public CommandRegistry commandRegistry() { |
|||
return commandRegistry; |
|||
} |
|||
|
|||
public SiteRegistry siteRegistry() { |
|||
return siteRegistry; |
|||
} |
|||
|
|||
public boolean isRunning() { |
|||
return running; |
|||
} |
|||
|
|||
public void stop() { |
|||
this.running = false; |
|||
} |
|||
} |
|||
@ -0,0 +1,82 @@ |
|||
package edu.homework.crawler.command; |
|||
|
|||
import edu.homework.crawler.exception.CommandException; |
|||
import edu.homework.crawler.exception.CrawlException; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.Arrays; |
|||
import java.util.Collection; |
|||
import java.util.LinkedHashMap; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
public class CommandRegistry { |
|||
private final Map<String, Command> commands = new LinkedHashMap<>(); |
|||
|
|||
public void register(Command command) { |
|||
commands.put(command.name(), command); |
|||
} |
|||
|
|||
public Collection<Command> commands() { |
|||
return commands.values(); |
|||
} |
|||
|
|||
public void execute(CommandContext context, String line) throws CrawlException { |
|||
List<String> tokens = tokenize(line); |
|||
if (tokens.isEmpty()) { |
|||
return; |
|||
} |
|||
Command command = commands.get(tokens.get(0)); |
|||
if (command == null) { |
|||
throw new CommandException("Unknown command: " + tokens.get(0) + ". Type help to see commands."); |
|||
} |
|||
String[] args = tokens.subList(1, tokens.size()).toArray(String[]::new); |
|||
command.execute(context, args); |
|||
} |
|||
|
|||
public List<String> tokenize(String line) throws CommandException { |
|||
List<String> tokens = new ArrayList<>(); |
|||
StringBuilder current = new StringBuilder(); |
|||
boolean inQuotes = false; |
|||
for (int i = 0; i < line.length(); i++) { |
|||
char ch = line.charAt(i); |
|||
if (ch == '"') { |
|||
inQuotes = !inQuotes; |
|||
continue; |
|||
} |
|||
if (Character.isWhitespace(ch) && !inQuotes) { |
|||
addToken(tokens, current); |
|||
} else { |
|||
current.append(ch); |
|||
} |
|||
} |
|||
if (inQuotes) { |
|||
throw new CommandException("Missing closing quote in command line."); |
|||
} |
|||
addToken(tokens, current); |
|||
return tokens; |
|||
} |
|||
|
|||
private void addToken(List<String> tokens, StringBuilder current) { |
|||
if (current.length() > 0) { |
|||
tokens.add(current.toString()); |
|||
current.setLength(0); |
|||
} |
|||
} |
|||
|
|||
public Map<String, String> parseOptions(String[] args) throws CommandException { |
|||
Map<String, String> options = new LinkedHashMap<>(); |
|||
List<String> list = Arrays.asList(args); |
|||
for (int i = 0; i < list.size(); i++) { |
|||
String key = list.get(i); |
|||
if (!key.startsWith("--")) { |
|||
throw new CommandException("Invalid argument: " + key + ". Options must start with --."); |
|||
} |
|||
if (i + 1 >= list.size() || list.get(i + 1).startsWith("--")) { |
|||
throw new CommandException("Missing value for option " + key + "."); |
|||
} |
|||
options.put(key.substring(2), list.get(++i)); |
|||
} |
|||
return options; |
|||
} |
|||
} |
|||
@ -0,0 +1,55 @@ |
|||
package edu.homework.crawler.command; |
|||
|
|||
import edu.homework.crawler.exception.CommandException; |
|||
import edu.homework.crawler.exception.CrawlException; |
|||
import edu.homework.crawler.model.CrawlRequest; |
|||
import edu.homework.crawler.repository.OutputFormat; |
|||
|
|||
import java.nio.file.Path; |
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.Map; |
|||
|
|||
public class CrawlCommand implements Command { |
|||
private static final DateTimeFormatter FILE_TIME = DateTimeFormatter.ofPattern("yyyyMMdd-HHmmss"); |
|||
|
|||
@Override |
|||
public String name() { |
|||
return "crawl"; |
|||
} |
|||
|
|||
@Override |
|||
public String description() { |
|||
return "Crawl news and save to a JSON or CSV file."; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(CommandContext context, String[] args) throws CrawlException { |
|||
Map<String, String> options = context.commandRegistry().parseOptions(args); |
|||
String site = options.getOrDefault("site", "all"); |
|||
int limit = parseLimit(options.getOrDefault("limit", "10")); |
|||
OutputFormat format = OutputFormat.from(options.getOrDefault("format", "json")); |
|||
Path outputPath = Path.of(options.getOrDefault("out", defaultOutput(format))); |
|||
|
|||
context.view().printInfo("Starting crawl: site=" + site + ", limit=" + limit + ", format=" + format.name().toLowerCase()); |
|||
CrawlRequest request = new CrawlRequest(site, limit, format, outputPath); |
|||
context.view().printSummary(context.controller().crawl(request)); |
|||
} |
|||
|
|||
private int parseLimit(String value) throws CommandException { |
|||
try { |
|||
int limit = Integer.parseInt(value); |
|||
if (limit <= 0 || limit > 100) { |
|||
throw new CommandException("Limit must be between 1 and 100."); |
|||
} |
|||
return limit; |
|||
} catch (NumberFormatException e) { |
|||
throw new CommandException("Limit must be a number: " + value); |
|||
} |
|||
} |
|||
|
|||
private String defaultOutput(OutputFormat format) { |
|||
String suffix = format == OutputFormat.JSON ? ".json" : ".csv"; |
|||
return "data/news-" + LocalDateTime.now().format(FILE_TIME) + suffix; |
|||
} |
|||
} |
|||
@ -0,0 +1,21 @@ |
|||
package edu.homework.crawler.command; |
|||
|
|||
import edu.homework.crawler.exception.CrawlException; |
|||
|
|||
public class ExitCommand implements Command { |
|||
@Override |
|||
public String name() { |
|||
return "exit"; |
|||
} |
|||
|
|||
@Override |
|||
public String description() { |
|||
return "Exit interactive CLI."; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(CommandContext context, String[] args) throws CrawlException { |
|||
context.stop(); |
|||
context.view().printInfo("Bye."); |
|||
} |
|||
} |
|||
@ -0,0 +1,28 @@ |
|||
package edu.homework.crawler.command; |
|||
|
|||
import edu.homework.crawler.exception.CrawlException; |
|||
|
|||
public class HelpCommand implements Command { |
|||
@Override |
|||
public String name() { |
|||
return "help"; |
|||
} |
|||
|
|||
@Override |
|||
public String description() { |
|||
return "Show command usage."; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(CommandContext context, String[] args) throws CrawlException { |
|||
StringBuilder builder = new StringBuilder(); |
|||
builder.append("Commands:\n"); |
|||
for (Command command : context.commandRegistry().commands()) { |
|||
builder.append(" ").append(command.name()).append(" - ").append(command.description()).append('\n'); |
|||
} |
|||
builder.append("\nExamples:\n"); |
|||
builder.append(" crawl --site all --limit 10 --format json --out data/news.json\n"); |
|||
builder.append(" crawl --site hnu --limit 5 --format csv --out data/hnu.csv\n"); |
|||
context.view().printHelp(builder.toString()); |
|||
} |
|||
} |
|||
@ -0,0 +1,22 @@ |
|||
package edu.homework.crawler.command; |
|||
|
|||
import edu.homework.crawler.exception.CrawlException; |
|||
|
|||
public class SitesCommand implements Command { |
|||
@Override |
|||
public String name() { |
|||
return "sites"; |
|||
} |
|||
|
|||
@Override |
|||
public String description() { |
|||
return "List supported websites."; |
|||
} |
|||
|
|||
@Override |
|||
public void execute(CommandContext context, String[] args) throws CrawlException { |
|||
context.view().printInfo("Supported sites:"); |
|||
context.siteRegistry().all().forEach(strategy -> |
|||
context.view().printInfo(" - " + strategy.key() + ": " + strategy.schoolName() + " (" + strategy.baseUrl() + ")")); |
|||
} |
|||
} |
|||
@ -0,0 +1,30 @@ |
|||
package edu.homework.crawler.controller; |
|||
|
|||
import edu.homework.crawler.exception.CrawlException; |
|||
import edu.homework.crawler.model.CrawlRequest; |
|||
import edu.homework.crawler.model.CrawlSummary; |
|||
import edu.homework.crawler.model.NewsItem; |
|||
import edu.homework.crawler.repository.FileNewsRepository; |
|||
import edu.homework.crawler.service.NewsCrawlerService; |
|||
|
|||
import java.util.List; |
|||
import java.util.Map; |
|||
import java.util.stream.Collectors; |
|||
|
|||
public class CrawlerController { |
|||
private final NewsCrawlerService crawlerService; |
|||
private final FileNewsRepository newsRepository; |
|||
|
|||
public CrawlerController(NewsCrawlerService crawlerService, FileNewsRepository newsRepository) { |
|||
this.crawlerService = crawlerService; |
|||
this.newsRepository = newsRepository; |
|||
} |
|||
|
|||
public CrawlSummary crawl(CrawlRequest request) throws CrawlException { |
|||
List<NewsItem> items = crawlerService.crawl(request.siteKey(), request.limitPerSite()); |
|||
newsRepository.save(items, request.outputFormat(), request.outputPath()); |
|||
Map<String, Integer> counts = items.stream() |
|||
.collect(Collectors.groupingBy(NewsItem::getSiteKey, Collectors.collectingAndThen(Collectors.counting(), Long::intValue))); |
|||
return new CrawlSummary(items, counts, request.outputPath()); |
|||
} |
|||
} |
|||
@ -0,0 +1,7 @@ |
|||
package edu.homework.crawler.exception; |
|||
|
|||
public class CommandException extends CrawlException { |
|||
public CommandException(String message) { |
|||
super(message); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package edu.homework.crawler.exception; |
|||
|
|||
/**
 * Checked base type of the crawler's exception hierarchy.
 */
public class CrawlException extends Exception {

    /**
     * @param message description of the failure
     */
    public CrawlException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   the underlying error
     */
    public CrawlException(String message, Throwable cause) {
        super(message, cause);
    }
}
|||
@ -0,0 +1,7 @@ |
|||
package edu.homework.crawler.exception; |
|||
|
|||
public class NetworkException extends CrawlException { |
|||
public NetworkException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package edu.homework.crawler.exception; |
|||
|
|||
public class ParseException extends CrawlException { |
|||
public ParseException(String message) { |
|||
super(message); |
|||
} |
|||
|
|||
public ParseException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,7 @@ |
|||
package edu.homework.crawler.exception; |
|||
|
|||
public class SiteNotFoundException extends CrawlException { |
|||
public SiteNotFoundException(String message) { |
|||
super(message); |
|||
} |
|||
} |
|||
@ -0,0 +1,7 @@ |
|||
package edu.homework.crawler.exception; |
|||
|
|||
public class StorageException extends CrawlException { |
|||
public StorageException(String message, Throwable cause) { |
|||
super(message, cause); |
|||
} |
|||
} |
|||
@ -0,0 +1,8 @@ |
|||
package edu.homework.crawler.model; |
|||
|
|||
import edu.homework.crawler.repository.OutputFormat; |
|||
|
|||
import java.nio.file.Path; |
|||
|
|||
public record CrawlRequest(String siteKey, int limitPerSite, OutputFormat outputFormat, Path outputPath) { |
|||
} |
|||
@ -0,0 +1,11 @@ |
|||
package edu.homework.crawler.model; |
|||
|
|||
import java.nio.file.Path; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
public record CrawlSummary(List<NewsItem> items, Map<String, Integer> siteCounts, Path outputPath) { |
|||
public int totalCount() { |
|||
return items.size(); |
|||
} |
|||
} |
|||
@ -0,0 +1,4 @@ |
|||
package edu.homework.crawler.model; |
|||
|
|||
/**
 * A news link discovered on a listing page, before its detail page is fetched.
 *
 * @param title       title text taken from the link
 * @param url         absolute URL of the detail page
 * @param publishTime date text found near the link
 */
public record NewsCandidate(
        String title,
        String url,
        String publishTime) {
}
|||
@ -0,0 +1,107 @@ |
|||
package edu.homework.crawler.model; |
|||
|
|||
import java.time.LocalDateTime; |
|||
|
|||
/**
 * Mutable bean describing one crawled news article.
 *
 * <p>Has a no-argument constructor plus a getter and setter for every field.
 */
public class NewsItem {
    private String school;
    private String siteKey;
    private String title;
    private String url;
    private String publishTime;
    private String source;
    private String author;
    private String summary;
    private String contentPreview;
    private LocalDateTime crawledAt;

    /** Creates an item with every field unset. */
    public NewsItem() {
    }

    /**
     * Creates an item with its identifying fields and stamps {@code crawledAt}
     * with the current local time; all other fields stay unset.
     *
     * @param school  name of the school the article belongs to
     * @param siteKey key of the site the article was crawled from
     * @param title   article title
     * @param url     article URL
     */
    public NewsItem(String school, String siteKey, String title, String url) {
        this.school = school;
        this.siteKey = siteKey;
        this.title = title;
        this.url = url;
        this.crawledAt = LocalDateTime.now();
    }

    /** @return the school name */
    public String getSchool() {
        return school;
    }

    /** @param school the school name */
    public void setSchool(String school) {
        this.school = school;
    }

    /** @return the site key */
    public String getSiteKey() {
        return siteKey;
    }

    /** @param siteKey the site key */
    public void setSiteKey(String siteKey) {
        this.siteKey = siteKey;
    }

    /** @return the article title */
    public String getTitle() {
        return title;
    }

    /** @param title the article title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the article URL */
    public String getUrl() {
        return url;
    }

    /** @param url the article URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the publish time as text */
    public String getPublishTime() {
        return publishTime;
    }

    /** @param publishTime the publish time as text */
    public void setPublishTime(String publishTime) {
        this.publishTime = publishTime;
    }

    /** @return the article source */
    public String getSource() {
        return source;
    }

    /** @param source the article source */
    public void setSource(String source) {
        this.source = source;
    }

    /** @return the article author */
    public String getAuthor() {
        return author;
    }

    /** @param author the article author */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the article summary */
    public String getSummary() {
        return summary;
    }

    /** @param summary the article summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return a preview of the article content */
    public String getContentPreview() {
        return contentPreview;
    }

    /** @param contentPreview a preview of the article content */
    public void setContentPreview(String contentPreview) {
        this.contentPreview = contentPreview;
    }

    /** @return when the item was crawled */
    public LocalDateTime getCrawledAt() {
        return crawledAt;
    }

    /** @param crawledAt when the item was crawled */
    public void setCrawledAt(LocalDateTime crawledAt) {
        this.crawledAt = crawledAt;
    }
}
|||
@ -0,0 +1,67 @@ |
|||
package edu.homework.crawler.repository; |
|||
|
|||
import com.fasterxml.jackson.databind.ObjectMapper; |
|||
import com.fasterxml.jackson.databind.SerializationFeature; |
|||
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; |
|||
import edu.homework.crawler.exception.StorageException; |
|||
import edu.homework.crawler.model.NewsItem; |
|||
|
|||
import java.io.BufferedWriter; |
|||
import java.io.IOException; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Path; |
|||
import java.util.List; |
|||
|
|||
public class FileNewsRepository { |
|||
private final ObjectMapper objectMapper; |
|||
|
|||
public FileNewsRepository() { |
|||
this.objectMapper = new ObjectMapper() |
|||
.registerModule(new JavaTimeModule()) |
|||
.enable(SerializationFeature.INDENT_OUTPUT) |
|||
.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); |
|||
} |
|||
|
|||
public void save(List<NewsItem> items, OutputFormat format, Path outputPath) throws StorageException { |
|||
try { |
|||
Path parent = outputPath.toAbsolutePath().getParent(); |
|||
if (parent != null) { |
|||
Files.createDirectories(parent); |
|||
} |
|||
if (format == OutputFormat.JSON) { |
|||
objectMapper.writeValue(outputPath.toFile(), items); |
|||
} else { |
|||
writeCsv(items, outputPath); |
|||
} |
|||
} catch (IOException e) { |
|||
throw new StorageException("Failed to save crawler data to " + outputPath, e); |
|||
} |
|||
} |
|||
|
|||
private void writeCsv(List<NewsItem> items, Path outputPath) throws IOException { |
|||
try (BufferedWriter writer = Files.newBufferedWriter(outputPath, StandardCharsets.UTF_8)) { |
|||
writer.write("school,siteKey,title,url,publishTime,source,author,summary,contentPreview,crawledAt"); |
|||
writer.newLine(); |
|||
for (NewsItem item : items) { |
|||
writer.write(String.join(",", |
|||
csv(item.getSchool()), |
|||
csv(item.getSiteKey()), |
|||
csv(item.getTitle()), |
|||
csv(item.getUrl()), |
|||
csv(item.getPublishTime()), |
|||
csv(item.getSource()), |
|||
csv(item.getAuthor()), |
|||
csv(item.getSummary()), |
|||
csv(item.getContentPreview()), |
|||
csv(item.getCrawledAt() == null ? "" : item.getCrawledAt().toString()))); |
|||
writer.newLine(); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private String csv(String value) { |
|||
String safeValue = value == null ? "" : value; |
|||
return "\"" + safeValue.replace("\"", "\"\"") + "\""; |
|||
} |
|||
} |
|||
@ -0,0 +1,17 @@ |
|||
package edu.homework.crawler.repository; |
|||
|
|||
import edu.homework.crawler.exception.CommandException; |
|||
|
|||
public enum OutputFormat { |
|||
JSON, |
|||
CSV; |
|||
|
|||
public static OutputFormat from(String value) throws CommandException { |
|||
for (OutputFormat format : values()) { |
|||
if (format.name().equalsIgnoreCase(value)) { |
|||
return format; |
|||
} |
|||
} |
|||
throw new CommandException("Unsupported output format: " + value + ". Use json or csv."); |
|||
} |
|||
} |
|||
@ -0,0 +1,37 @@ |
|||
package edu.homework.crawler.service; |
|||
|
|||
import edu.homework.crawler.exception.CrawlException; |
|||
import edu.homework.crawler.exception.SiteNotFoundException; |
|||
import edu.homework.crawler.model.NewsItem; |
|||
import edu.homework.crawler.strategy.CrawlStrategy; |
|||
import edu.homework.crawler.strategy.SiteRegistry; |
|||
import edu.homework.crawler.util.HttpFetcher; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class NewsCrawlerService { |
|||
private final SiteRegistry siteRegistry; |
|||
private final HttpFetcher httpFetcher; |
|||
|
|||
public NewsCrawlerService(SiteRegistry siteRegistry) { |
|||
this.siteRegistry = siteRegistry; |
|||
this.httpFetcher = new HttpFetcher(); |
|||
} |
|||
|
|||
public List<NewsItem> crawl(String siteKey, int limitPerSite) throws CrawlException { |
|||
List<CrawlStrategy> strategies = resolveStrategies(siteKey); |
|||
List<NewsItem> items = new ArrayList<>(); |
|||
for (CrawlStrategy strategy : strategies) { |
|||
items.addAll(strategy.crawl(httpFetcher, limitPerSite)); |
|||
} |
|||
return items; |
|||
} |
|||
|
|||
private List<CrawlStrategy> resolveStrategies(String siteKey) throws SiteNotFoundException { |
|||
if ("all".equalsIgnoreCase(siteKey)) { |
|||
return siteRegistry.all(); |
|||
} |
|||
return List.of(siteRegistry.get(siteKey)); |
|||
} |
|||
} |
|||
@ -0,0 +1,177 @@ |
|||
package edu.homework.crawler.strategy; |
|||
|
|||
import edu.homework.crawler.exception.CrawlException; |
|||
import edu.homework.crawler.exception.ParseException; |
|||
import edu.homework.crawler.model.NewsCandidate; |
|||
import edu.homework.crawler.model.NewsItem; |
|||
import edu.homework.crawler.util.HttpFetcher; |
|||
import edu.homework.crawler.util.TextExtractors; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.net.URI; |
|||
import java.time.LocalDateTime; |
|||
import java.util.ArrayList; |
|||
import java.util.LinkedHashMap; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
public abstract class AbstractVisualSiteBuilderStrategy implements CrawlStrategy { |
|||
private static final String NEWS_LINK_SELECTOR = "a[href*=info/][href$=.htm],a[href$=.htm]"; |
|||
|
|||
@Override |
|||
public List<NewsItem> crawl(HttpFetcher fetcher, int limit) throws CrawlException { |
|||
Map<String, NewsCandidate> mergedCandidates = new LinkedHashMap<>(); |
|||
CrawlException lastFailure = null; |
|||
for (String startUrl : startUrls()) { |
|||
try { |
|||
Document page = fetcher.fetch(startUrl); |
|||
for (NewsCandidate candidate : extractCandidates(page, limit * 8)) { |
|||
mergedCandidates.putIfAbsent(candidate.url(), candidate); |
|||
} |
|||
} catch (CrawlException e) { |
|||
lastFailure = e; |
|||
} |
|||
} |
|||
List<NewsCandidate> candidates = sortCandidates(new ArrayList<>(mergedCandidates.values()), limit * 8); |
|||
if (candidates.isEmpty()) { |
|||
if (lastFailure != null) { |
|||
throw lastFailure; |
|||
} |
|||
throw new ParseException("No news links found on " + baseUrl()); |
|||
} |
|||
|
|||
List<NewsItem> items = new ArrayList<>(); |
|||
for (NewsCandidate candidate : candidates) { |
|||
if (items.size() >= limit) { |
|||
break; |
|||
} |
|||
try { |
|||
fetcher.politePause(); |
|||
Document detail = fetcher.fetch(candidate.url()); |
|||
items.add(parseDetail(candidate, detail)); |
|||
} catch (CrawlException e) { |
|||
NewsItem fallback = new NewsItem(schoolName(), key(), candidate.title(), candidate.url()); |
|||
fallback.setPublishTime(candidate.publishTime()); |
|||
fallback.setSummary("Detail page failed: " + e.getMessage()); |
|||
fallback.setCrawledAt(LocalDateTime.now()); |
|||
items.add(fallback); |
|||
} |
|||
} |
|||
return items; |
|||
} |
|||
|
|||
protected List<String> startUrls() { |
|||
return List.of(baseUrl()); |
|||
} |
|||
|
|||
protected List<NewsCandidate> extractCandidates(Document document, int maxCandidates) { |
|||
Map<String, NewsCandidate> candidates = new LinkedHashMap<>(); |
|||
Elements links = document.select(candidateSelector()); |
|||
for (Element link : links) { |
|||
String url = link.absUrl("href"); |
|||
if (!isAcceptableUrl(url)) { |
|||
continue; |
|||
} |
|||
String title = extractCandidateTitle(link); |
|||
if (title.isBlank() || title.length() < 4) { |
|||
continue; |
|||
} |
|||
String date = TextExtractors.findDate(neighborText(link)); |
|||
candidates.putIfAbsent(url, new NewsCandidate(title, url, date)); |
|||
if (candidates.size() >= maxCandidates) { |
|||
break; |
|||
} |
|||
} |
|||
return sortCandidates(new ArrayList<>(candidates.values()), maxCandidates); |
|||
} |
|||
|
|||
private List<NewsCandidate> sortCandidates(List<NewsCandidate> sorted, int maxCandidates) { |
|||
sorted.sort((left, right) -> { |
|||
boolean leftHasDate = !left.publishTime().isBlank(); |
|||
boolean rightHasDate = !right.publishTime().isBlank(); |
|||
if (leftHasDate != rightHasDate) { |
|||
return leftHasDate ? -1 : 1; |
|||
} |
|||
return right.publishTime().compareTo(left.publishTime()); |
|||
}); |
|||
if (sorted.size() > maxCandidates) { |
|||
return sorted.subList(0, maxCandidates); |
|||
} |
|||
return sorted; |
|||
} |
|||
|
|||
protected String candidateSelector() { |
|||
return NEWS_LINK_SELECTOR; |
|||
} |
|||
|
|||
protected NewsItem parseDetail(NewsCandidate candidate, Document detail) { |
|||
NewsItem item = new NewsItem(schoolName(), key(), extractTitle(candidate, detail), candidate.url()); |
|||
String pageText = TextExtractors.clean(detail.text()); |
|||
item.setPublishTime(TextExtractors.firstNonBlank( |
|||
TextExtractors.findPublishTime(pageText), |
|||
candidate.publishTime())); |
|||
item.setSource(TextExtractors.findLabelValue(pageText, "来源")); |
|||
item.setAuthor(TextExtractors.findLabelValue(pageText, "作者")); |
|||
item.setSummary(TextExtractors.clean(detail.select("meta[name=description]").attr("content"))); |
|||
item.setContentPreview(extractContentPreview(detail)); |
|||
item.setCrawledAt(LocalDateTime.now()); |
|||
return item; |
|||
} |
|||
|
|||
protected String extractTitle(NewsCandidate candidate, Document detail) { |
|||
String title = TextExtractors.firstNonBlank( |
|||
detail.select("h1").first() == null ? "" : detail.select("h1").first().text(), |
|||
detail.select(".ar_tit h3").first() == null ? "" : detail.select(".ar_tit h3").first().text(), |
|||
detail.select(".subTitle2 span").first() == null ? "" : detail.select(".subTitle2 span").first().text(), |
|||
detail.select("meta[name=pageTitle]").attr("content"), |
|||
candidate.title()); |
|||
return TextExtractors.limit(TextExtractors.clean(title), 160); |
|||
} |
|||
|
|||
protected String extractContentPreview(Document detail) { |
|||
String content = TextExtractors.firstNonBlank( |
|||
detail.select(".v_news_content").text(), |
|||
detail.select("#vsb_content").text(), |
|||
detail.select("#vsb_content_6").text(), |
|||
detail.body() == null ? "" : detail.body().text()); |
|||
return TextExtractors.limit(TextExtractors.clean(content), 320); |
|||
} |
|||
|
|||
private String extractCandidateTitle(Element link) { |
|||
String nestedHeading = ""; |
|||
Element heading = link.selectFirst("h1,h2,h3,h4,h5,.tit,.title,.pXZCont,.c59665"); |
|||
if (heading != null) { |
|||
nestedHeading = heading.text(); |
|||
} |
|||
return TextExtractors.limit(TextExtractors.clean(TextExtractors.firstNonBlank( |
|||
link.attr("title"), |
|||
nestedHeading, |
|||
link.ownText(), |
|||
link.text())), 160); |
|||
} |
|||
|
|||
private String neighborText(Element link) { |
|||
StringBuilder builder = new StringBuilder(link.text()).append(' '); |
|||
Element node = link; |
|||
for (int i = 0; i < 4 && node != null; i++) { |
|||
builder.append(node.text()).append(' '); |
|||
node = node.parent(); |
|||
} |
|||
return TextExtractors.clean(builder.toString()); |
|||
} |
|||
|
|||
protected boolean isAcceptableUrl(String url) { |
|||
if (url == null || url.isBlank()) { |
|||
return false; |
|||
} |
|||
try { |
|||
URI base = URI.create(baseUrl()); |
|||
URI candidate = URI.create(url); |
|||
return base.getHost().equalsIgnoreCase(candidate.getHost()) && candidate.getPath().contains("/info/"); |
|||
} catch (IllegalArgumentException e) { |
|||
return false; |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,17 @@ |
|||
package edu.homework.crawler.strategy; |
|||
|
|||
import edu.homework.crawler.exception.CrawlException; |
|||
import edu.homework.crawler.model.NewsItem; |
|||
import edu.homework.crawler.util.HttpFetcher; |
|||
|
|||
import java.util.List; |
|||
|
|||
public interface CrawlStrategy { |
|||
String key(); |
|||
|
|||
String schoolName(); |
|||
|
|||
String baseUrl(); |
|||
|
|||
List<NewsItem> crawl(HttpFetcher fetcher, int limit) throws CrawlException; |
|||
} |
|||
@ -0,0 +1,25 @@ |
|||
package edu.homework.crawler.strategy; |
|||
|
|||
import java.util.List; |
|||
|
|||
public class CsuNewsStrategy extends AbstractVisualSiteBuilderStrategy { |
|||
@Override |
|||
public String key() { |
|||
return "csu"; |
|||
} |
|||
|
|||
@Override |
|||
public String schoolName() { |
|||
return "中南大学"; |
|||
} |
|||
|
|||
@Override |
|||
public String baseUrl() { |
|||
return "https://news.csu.edu.cn/"; |
|||
} |
|||
|
|||
@Override |
|||
protected List<String> startUrls() { |
|||
return List.of("https://news.csu.edu.cn/xxyw.htm", baseUrl()); |
|||
} |
|||
} |
|||
@ -0,0 +1,25 @@ |
|||
package edu.homework.crawler.strategy; |
|||
|
|||
import java.util.List; |
|||
|
|||
public class HnuNewsStrategy extends AbstractVisualSiteBuilderStrategy { |
|||
@Override |
|||
public String key() { |
|||
return "hnu"; |
|||
} |
|||
|
|||
@Override |
|||
public String schoolName() { |
|||
return "湖南大学"; |
|||
} |
|||
|
|||
@Override |
|||
public String baseUrl() { |
|||
return "https://news.hnu.edu.cn/"; |
|||
} |
|||
|
|||
@Override |
|||
protected List<String> startUrls() { |
|||
return List.of("https://news.hnu.edu.cn/xw/zhxw.htm", baseUrl()); |
|||
} |
|||
} |
|||
@ -0,0 +1,30 @@ |
|||
package edu.homework.crawler.strategy; |
|||
|
|||
import java.util.List; |
|||
|
|||
public class HunnuNewsStrategy extends AbstractVisualSiteBuilderStrategy { |
|||
@Override |
|||
public String key() { |
|||
return "hunnu"; |
|||
} |
|||
|
|||
@Override |
|||
public String schoolName() { |
|||
return "湖南师范大学"; |
|||
} |
|||
|
|||
@Override |
|||
public String baseUrl() { |
|||
return "https://news.hunnu.edu.cn/"; |
|||
} |
|||
|
|||
@Override |
|||
protected List<String> startUrls() { |
|||
return List.of("https://news.hunnu.edu.cn/sdxw.htm", baseUrl()); |
|||
} |
|||
|
|||
@Override |
|||
protected boolean isAcceptableUrl(String url) { |
|||
return super.isAcceptableUrl(url) && url.contains("/info/1005/"); |
|||
} |
|||
} |
|||
@ -0,0 +1,36 @@ |
|||
package edu.homework.crawler.strategy; |
|||
|
|||
import edu.homework.crawler.exception.SiteNotFoundException; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.LinkedHashMap; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
public class SiteRegistry { |
|||
private final Map<String, CrawlStrategy> strategies = new LinkedHashMap<>(); |
|||
|
|||
public static SiteRegistry defaults() { |
|||
SiteRegistry registry = new SiteRegistry(); |
|||
registry.register(new HnuNewsStrategy()); |
|||
registry.register(new CsuNewsStrategy()); |
|||
registry.register(new HunnuNewsStrategy()); |
|||
return registry; |
|||
} |
|||
|
|||
public void register(CrawlStrategy strategy) { |
|||
strategies.put(strategy.key(), strategy); |
|||
} |
|||
|
|||
public CrawlStrategy get(String key) throws SiteNotFoundException { |
|||
CrawlStrategy strategy = strategies.get(key.toLowerCase()); |
|||
if (strategy == null) { |
|||
throw new SiteNotFoundException("Unsupported site: " + key + ". Use all, hnu, csu, or hunnu."); |
|||
} |
|||
return strategy; |
|||
} |
|||
|
|||
public List<CrawlStrategy> all() { |
|||
return new ArrayList<>(strategies.values()); |
|||
} |
|||
} |
|||
@ -0,0 +1,39 @@ |
|||
package edu.homework.crawler.util; |
|||
|
|||
import edu.homework.crawler.exception.NetworkException; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
|
|||
import java.io.IOException; |
|||
|
|||
public class HttpFetcher { |
|||
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " |
|||
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"; |
|||
|
|||
public Document fetch(String url) throws NetworkException { |
|||
IOException lastFailure = null; |
|||
for (int attempt = 1; attempt <= 3; attempt++) { |
|||
try { |
|||
return Jsoup.connect(url) |
|||
.userAgent(USER_AGENT) |
|||
.referrer("https://www.baidu.com/") |
|||
.timeout(20_000) |
|||
.maxBodySize(5 * 1024 * 1024) |
|||
.followRedirects(true) |
|||
.get(); |
|||
} catch (IOException e) { |
|||
lastFailure = e; |
|||
politePause(); |
|||
} |
|||
} |
|||
throw new NetworkException("Network request failed: " + url, lastFailure); |
|||
} |
|||
|
|||
public void politePause() { |
|||
try { |
|||
Thread.sleep(250); |
|||
} catch (InterruptedException e) { |
|||
Thread.currentThread().interrupt(); |
|||
} |
|||
} |
|||
} |
|||
@ -0,0 +1,98 @@ |
|||
package edu.homework.crawler.util; |
|||
|
|||
import java.time.LocalDate; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
/**
 * Static helpers for normalizing whitespace and for pulling dates and
 * labelled values ("来源", "作者", ...) out of free-form page text.
 */
public final class TextExtractors {
    /** Full date such as {@code 2026-05-28}, {@code 2026年05月28日} or {@code 2026.5.28 17:14}. */
    private static final Pattern DATE = Pattern.compile("(20\\d{2})[-年./](\\d{1,2})[-月./](\\d{1,2})日?(?:\\s+\\d{1,2}:\\d{2})?");

    /**
     * Day printed before the year and month, such as {@code 28 2026-05}.
     * The trailing lookahead rejects matches where the month is followed by a
     * separator and another digit, because that is a full date preceded by an
     * unrelated number (for example {@code 12 2026.05.28}).
     */
    private static final Pattern SPLIT_DAY_YEAR_MONTH = Pattern.compile("(?<!\\d)(\\d{1,2})\\s+(20\\d{2})[-年./](\\d{1,2})(?![-\\d]|[月./]\\d)");

    /** Month and day without a year, such as {@code 05-28} or {@code 5月28日}. */
    private static final Pattern MONTH_DAY = Pattern.compile("(?<!\\d)(\\d{1,2})[-月.](\\d{1,2})日?(?!\\d)");

    /** Regex tail appended to a quoted label: captures the value up to the next known label or end of text. */
    private static final String LABEL_VALUE_TAIL = "[::]\\s*(.*?)(?=\\s*(?:来源|作者|发布时间|点击)[::]|$)";

    /** How many years back to look for a year in which a month/day pair is a real date (covers Feb 29). */
    private static final int MAX_YEAR_LOOKBACK = 8;

    private TextExtractors() {
    }

    /**
     * Collapses whitespace: non-breaking spaces become regular spaces, runs of
     * whitespace become a single space, and the ends are trimmed.
     *
     * @param value the raw text; may be {@code null}
     * @return the normalized text, or an empty string for {@code null}
     */
    public static String clean(String value) {
        if (value == null) {
            return "";
        }
        return value.replace('\u00A0', ' ')
                .replaceAll("\\s+", " ")
                .trim();
    }

    /**
     * Returns the first argument that is non-blank after cleaning.
     *
     * @param values candidates in priority order; elements may be {@code null}
     * @return the cleaned first non-blank value, or an empty string if there is none
     */
    public static String firstNonBlank(String... values) {
        for (String value : values) {
            String cleaned = clean(value);
            if (!cleaned.isBlank()) {
                return cleaned;
            }
        }
        return "";
    }

    /**
     * Cleans the text and truncates it to at most {@code maxLength} characters,
     * appending {@code "..."} when something was cut off.
     *
     * <p>A negative {@code maxLength} is treated as zero, and the cut never
     * falls between the two halves of a surrogate pair.
     *
     * @param value     the raw text; may be {@code null}
     * @param maxLength the maximum number of characters kept before the ellipsis
     * @return the cleaned and possibly truncated text
     */
    public static String limit(String value, int maxLength) {
        String cleaned = clean(value);
        int end = Math.max(0, maxLength);
        if (cleaned.length() <= end) {
            return cleaned;
        }
        // Step back one char rather than emit a lone high surrogate.
        if (end > 0
                && Character.isHighSurrogate(cleaned.charAt(end - 1))
                && Character.isLowSurrogate(cleaned.charAt(end))) {
            end--;
        }
        return cleaned.substring(0, end) + "...";
    }

    /**
     * Finds the publish date in page text.
     *
     * <p>If the text contains "发布时间", the 80 characters starting there are
     * searched first; otherwise, or if that yields nothing, the whole text is
     * searched.
     *
     * @param text the page text; may be {@code null}
     * @return the date as {@code yyyy-MM-dd}, or an empty string if none is found
     */
    public static String findPublishTime(String text) {
        String cleaned = clean(text);
        int index = cleaned.indexOf("发布时间");
        if (index >= 0) {
            String slice = cleaned.substring(index, Math.min(cleaned.length(), index + 80));
            String date = findDate(slice);
            if (!date.isBlank()) {
                return date;
            }
        }
        return findDate(cleaned);
    }

    /**
     * Finds the first date in the text and normalizes it to {@code yyyy-MM-dd}.
     *
     * <p>Three shapes are tried in order: day followed by year-month
     * ({@code 28 2026-05}), a full date ({@code 2026年05月28日}), and a bare
     * month-day ({@code 05-28}). For a bare month-day the current year is
     * assumed unless that would place the date more than seven days in the
     * future, in which case an earlier year is used. Month-day matches that
     * are not real calendar dates (for example {@code 99-99} or {@code 2.30})
     * are skipped instead of raising an exception.
     *
     * @param text the text to search; may be {@code null}
     * @return the normalized date, or an empty string if none is found
     */
    public static String findDate(String text) {
        String cleaned = clean(text);

        Matcher matcher = SPLIT_DAY_YEAR_MONTH.matcher(cleaned);
        if (matcher.find()) {
            return matcher.group(2) + "-" + pad(matcher.group(3)) + "-" + pad(matcher.group(1));
        }

        matcher = DATE.matcher(cleaned);
        if (matcher.find()) {
            return normalizeFullDate(matcher);
        }

        matcher = MONTH_DAY.matcher(cleaned);
        LocalDate today = LocalDate.now();
        while (matcher.find()) {
            int month = Integer.parseInt(matcher.group(1));
            int day = Integer.parseInt(matcher.group(2));
            int year = resolveYear(month, day, today);
            if (year >= 0) {
                return year + "-" + pad(matcher.group(1)) + "-" + pad(matcher.group(2));
            }
        }
        return "";
    }

    /**
     * Extracts the value that follows {@code label} and a colon, stopping at
     * the next known label ("来源", "作者", "发布时间", "点击") or the end of
     * the text. The label is matched literally.
     *
     * @param text  the page text; may be {@code null}
     * @param label the label to look for, without the colon
     * @return the cleaned value, or an empty string if the label is absent,
     *         {@code null}, or the captured value still contains another label
     */
    public static String findLabelValue(String text, String label) {
        if (label == null) {
            return "";
        }
        String cleaned = clean(text);
        // Quote the label so regex metacharacters in it cannot break or alter the pattern.
        Pattern pattern = Pattern.compile(Pattern.quote(label) + LABEL_VALUE_TAIL);
        Matcher matcher = pattern.matcher(cleaned);
        if (matcher.find()) {
            String value = clean(matcher.group(1));
            if (!value.contains("点击") && !value.contains("发布时间")) {
                return value;
            }
        }
        return "";
    }

    /**
     * Chooses the year for a month/day pair: the most recent year, starting
     * with the current one, in which the pair is a real date no more than
     * seven days after {@code today}.
     *
     * @return the year, or {@code -1} if the pair is not a valid calendar date
     */
    private static int resolveYear(int month, int day, LocalDate today) {
        if (month < 1 || month > 12 || day < 1 || day > 31) {
            return -1;
        }
        LocalDate latestPlausible = today.plusDays(7);
        int year = today.getYear();
        for (int attempt = 0; attempt < MAX_YEAR_LOOKBACK; attempt++, year--) {
            if (day > LocalDate.of(year, month, 1).lengthOfMonth()) {
                // Not a date in this year (e.g. Feb 29 outside a leap year); try the year before.
                continue;
            }
            if (!LocalDate.of(year, month, day).isAfter(latestPlausible)) {
                return year;
            }
        }
        return -1;
    }

    /** Formats a {@link #DATE} match as {@code yyyy-MM-dd}. */
    private static String normalizeFullDate(Matcher matcher) {
        return matcher.group(1) + "-" + pad(matcher.group(2)) + "-" + pad(matcher.group(3));
    }

    /** Left-pads a one-digit number with a zero. */
    private static String pad(String value) {
        return value.length() == 1 ? "0" + value : value;
    }
}
|||
@ -0,0 +1,37 @@ |
|||
package edu.homework.crawler.view; |
|||
|
|||
import edu.homework.crawler.model.CrawlSummary; |
|||
|
|||
public class ConsoleView { |
|||
public void printWelcome() { |
|||
println("University News Crawler"); |
|||
println("Type help to see commands."); |
|||
} |
|||
|
|||
public void printPrompt() { |
|||
System.out.print("crawler> "); |
|||
} |
|||
|
|||
public void printHelp(String text) { |
|||
println(text); |
|||
} |
|||
|
|||
public void printInfo(String text) { |
|||
println(text); |
|||
} |
|||
|
|||
public void printError(String text) { |
|||
System.err.println("[ERROR] " + text); |
|||
} |
|||
|
|||
public void printSummary(CrawlSummary summary) { |
|||
println("Crawl finished."); |
|||
println("Total items: " + summary.totalCount()); |
|||
summary.siteCounts().forEach((site, count) -> println(" - " + site + ": " + count)); |
|||
println("Saved to: " + summary.outputPath().toAbsolutePath()); |
|||
} |
|||
|
|||
private void println(String text) { |
|||
System.out.println(text); |
|||
} |
|||
} |
|||
@ -0,0 +1,38 @@ |
|||
package edu.homework.crawler.command; |
|||
|
|||
import edu.homework.crawler.exception.CommandException; |
|||
import org.junit.jupiter.api.Test; |
|||
|
|||
import java.util.List; |
|||
import java.util.Map; |
|||
|
|||
import static org.junit.jupiter.api.Assertions.assertEquals; |
|||
import static org.junit.jupiter.api.Assertions.assertThrows; |
|||
|
|||
class CommandRegistryTest { |
|||
@Test |
|||
void tokenizesQuotedOutputPath() throws Exception { |
|||
CommandRegistry registry = new CommandRegistry(); |
|||
|
|||
List<String> tokens = registry.tokenize("crawl --site all --out \"data/my news.json\""); |
|||
|
|||
assertEquals(List.of("crawl", "--site", "all", "--out", "data/my news.json"), tokens); |
|||
} |
|||
|
|||
@Test |
|||
void parsesOptionsAsKeyValuePairs() throws Exception { |
|||
CommandRegistry registry = new CommandRegistry(); |
|||
|
|||
Map<String, String> options = registry.parseOptions(new String[]{"--site", "hnu", "--limit", "5"}); |
|||
|
|||
assertEquals("hnu", options.get("site")); |
|||
assertEquals("5", options.get("limit")); |
|||
} |
|||
|
|||
@Test |
|||
void rejectsMissingOptionValue() { |
|||
CommandRegistry registry = new CommandRegistry(); |
|||
|
|||
assertThrows(CommandException.class, () -> registry.parseOptions(new String[]{"--site"})); |
|||
} |
|||
} |
|||
@ -0,0 +1,29 @@ |
|||
package edu.homework.crawler.repository; |
|||
|
|||
import edu.homework.crawler.model.NewsItem; |
|||
import org.junit.jupiter.api.Test; |
|||
import org.junit.jupiter.api.io.TempDir; |
|||
|
|||
import java.nio.file.Files; |
|||
import java.nio.file.Path; |
|||
import java.util.List; |
|||
|
|||
import static org.junit.jupiter.api.Assertions.assertTrue; |
|||
|
|||
class FileNewsRepositoryTest { |
|||
@TempDir |
|||
Path tempDir; |
|||
|
|||
@Test |
|||
void savesJsonFile() throws Exception { |
|||
FileNewsRepository repository = new FileNewsRepository(); |
|||
NewsItem item = new NewsItem("湖南大学", "hnu", "测试新闻", "https://example.com/news"); |
|||
Path output = tempDir.resolve("news.json"); |
|||
|
|||
repository.save(List.of(item), OutputFormat.JSON, output); |
|||
|
|||
String json = Files.readString(output); |
|||
assertTrue(json.contains("测试新闻")); |
|||
assertTrue(json.contains("hnu")); |
|||
} |
|||
} |
|||
@ -0,0 +1,29 @@ |
|||
package edu.homework.crawler.util; |
|||
|
|||
import org.junit.jupiter.api.Test; |
|||
|
|||
import static org.junit.jupiter.api.Assertions.assertEquals; |
|||
|
|||
class TextExtractorsTest { |
|||
@Test |
|||
void normalizesChinesePublishDate() { |
|||
String text = "来源:新闻网 作者:张三 发布时间:2026年05月28日 17:14 点击:100次"; |
|||
|
|||
assertEquals("2026-05-28", TextExtractors.findPublishTime(text)); |
|||
} |
|||
|
|||
@Test |
|||
void extractsSimpleLabelValue() { |
|||
String text = "来源:新闻网 作者:李四 发布时间:2026-05-28"; |
|||
|
|||
assertEquals("新闻网", TextExtractors.findLabelValue(text, "来源")); |
|||
assertEquals("李四", TextExtractors.findLabelValue(text, "作者")); |
|||
} |
|||
|
|||
@Test |
|||
void normalizesSplitDayYearMonthDate() { |
|||
String text = "28 2026-05"; |
|||
|
|||
assertEquals("2026-05-28", TextExtractors.findDate(text)); |
|||
} |
|||
} |
|||
Loading…
Reference in new issue