40 changed files with 1443 additions and 1 deletions
@ -0,0 +1,3 @@ |
|||||
|
target/ |
||||
|
*.log |
||||
|
*.tmp |
||||
@ -0,0 +1,55 @@ |
|||||
|
# University News Crawler |
||||
|
|
||||
|
Java homework project for crawling: |
||||
|
|
||||
|
- `https://news.hnu.edu.cn/` |
||||
|
- `https://news.csu.edu.cn/` |
||||
|
- `https://news.hunnu.edu.cn/` |
||||
|
|
||||
|
The code demonstrates the required architecture: |
||||
|
|
||||
|
- Interactive command-line interface (CLI)
||||
|
- MVC: `model`, `view`, `controller` |
||||
|
- Command pattern: `command` package |
||||
|
- Strategy pattern: `strategy` package, one strategy per target website |
||||
|
- Custom exception hierarchy: `exception` package |
||||
|
- File persistence: JSON or CSV output |
||||
|
|
||||
|
## Run |
||||
|
|
||||
|
```powershell |
||||
|
mvn test |
||||
|
mvn exec:java -Dexec.args="crawl --site all --limit 5 --format json --out data/news.json" |
||||
|
``` |
||||
|
|
||||
|
Interactive CLI: |
||||
|
|
||||
|
```powershell |
||||
|
mvn exec:java |
||||
|
``` |
||||
|
|
||||
|
Useful commands: |
||||
|
|
||||
|
```text |
||||
|
help |
||||
|
sites |
||||
|
crawl --site all --limit 10 --format json --out data/news.json |
||||
|
crawl --site hnu --limit 5 --format csv --out data/hnu.csv |
||||
|
exit |
||||
|
``` |
||||
|
|
||||
|
## Output Fields |
||||
|
|
||||
|
Each crawled news item includes: |
||||
|
|
||||
|
- school |
||||
|
- site key |
||||
|
- title |
||||
|
- url |
||||
|
- publish time |
||||
|
- source |
||||
|
- author |
||||
|
- summary |
||||
|
- content preview |
||||
|
- crawled time |
||||
|
|
||||
File diff suppressed because one or more lines are too long
@ -0,0 +1,58 @@ |
|||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" |
||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> |
||||
|
<modelVersion>4.0.0</modelVersion> |
||||
|
|
||||
|
<groupId>edu.homework</groupId> |
||||
|
<artifactId>university-news-crawler</artifactId> |
||||
|
<version>1.0.0</version> |
||||
|
<name>University News Crawler</name> |
||||
|
|
||||
|
<properties> |
||||
|
<maven.compiler.release>17</maven.compiler.release> |
||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
||||
|
<junit.version>5.10.2</junit.version> |
||||
|
</properties> |
||||
|
|
||||
|
<dependencies> |
||||
|
<dependency> |
||||
|
<groupId>org.jsoup</groupId> |
||||
|
<artifactId>jsoup</artifactId> |
||||
|
<version>1.17.2</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>com.fasterxml.jackson.core</groupId> |
||||
|
<artifactId>jackson-databind</artifactId> |
||||
|
<version>2.17.2</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>com.fasterxml.jackson.datatype</groupId> |
||||
|
<artifactId>jackson-datatype-jsr310</artifactId> |
||||
|
<version>2.17.2</version> |
||||
|
</dependency> |
||||
|
<dependency> |
||||
|
<groupId>org.junit.jupiter</groupId> |
||||
|
<artifactId>junit-jupiter</artifactId> |
||||
|
<version>${junit.version}</version> |
||||
|
<scope>test</scope> |
||||
|
</dependency> |
||||
|
</dependencies> |
||||
|
|
||||
|
<build> |
||||
|
<plugins> |
||||
|
<plugin> |
||||
|
<groupId>org.apache.maven.plugins</groupId> |
||||
|
<artifactId>maven-surefire-plugin</artifactId> |
||||
|
<version>3.2.5</version> |
||||
|
</plugin> |
||||
|
<plugin> |
||||
|
<groupId>org.codehaus.mojo</groupId> |
||||
|
<artifactId>exec-maven-plugin</artifactId> |
||||
|
<version>3.3.0</version> |
||||
|
<configuration> |
||||
|
<mainClass>edu.homework.crawler.Main</mainClass> |
||||
|
</configuration> |
||||
|
</plugin> |
||||
|
</plugins> |
||||
|
</build> |
||||
|
</project> |
||||
@ -0,0 +1,12 @@ |
|||||
|
package edu.homework.crawler; |
||||
|
|
||||
|
import edu.homework.crawler.cli.CliApplication; |
||||
|
|
||||
|
public final class Main { |
||||
|
private Main() { |
||||
|
} |
||||
|
|
||||
|
public static void main(String[] args) { |
||||
|
CliApplication.defaultApplication().run(args); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,74 @@ |
|||||
|
package edu.homework.crawler.cli; |
||||
|
|
||||
|
import edu.homework.crawler.command.CommandContext; |
||||
|
import edu.homework.crawler.command.CommandRegistry; |
||||
|
import edu.homework.crawler.command.CrawlCommand; |
||||
|
import edu.homework.crawler.command.ExitCommand; |
||||
|
import edu.homework.crawler.command.HelpCommand; |
||||
|
import edu.homework.crawler.command.SitesCommand; |
||||
|
import edu.homework.crawler.controller.CrawlerController; |
||||
|
import edu.homework.crawler.exception.CrawlException; |
||||
|
import edu.homework.crawler.repository.FileNewsRepository; |
||||
|
import edu.homework.crawler.service.NewsCrawlerService; |
||||
|
import edu.homework.crawler.strategy.SiteRegistry; |
||||
|
import edu.homework.crawler.view.ConsoleView; |
||||
|
|
||||
|
import java.nio.charset.StandardCharsets; |
||||
|
import java.util.Scanner; |
||||
|
|
||||
|
public class CliApplication { |
||||
|
private final CommandContext context; |
||||
|
private final CommandRegistry commandRegistry; |
||||
|
|
||||
|
public CliApplication(CommandContext context, CommandRegistry commandRegistry) { |
||||
|
this.context = context; |
||||
|
this.commandRegistry = commandRegistry; |
||||
|
} |
||||
|
|
||||
|
public static CliApplication defaultApplication() { |
||||
|
ConsoleView view = new ConsoleView(); |
||||
|
SiteRegistry siteRegistry = SiteRegistry.defaults(); |
||||
|
NewsCrawlerService service = new NewsCrawlerService(siteRegistry); |
||||
|
FileNewsRepository repository = new FileNewsRepository(); |
||||
|
CrawlerController controller = new CrawlerController(service, repository); |
||||
|
|
||||
|
CommandRegistry registry = new CommandRegistry(); |
||||
|
CommandContext context = new CommandContext(controller, view, registry, siteRegistry); |
||||
|
registry.register(new HelpCommand()); |
||||
|
registry.register(new SitesCommand()); |
||||
|
registry.register(new CrawlCommand()); |
||||
|
registry.register(new ExitCommand()); |
||||
|
return new CliApplication(context, registry); |
||||
|
} |
||||
|
|
||||
|
public void run(String[] args) { |
||||
|
if (args.length > 0) { |
||||
|
executeLine(String.join(" ", args)); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
context.view().printWelcome(); |
||||
|
try (Scanner scanner = new Scanner(System.in, StandardCharsets.UTF_8)) { |
||||
|
while (context.isRunning()) { |
||||
|
context.view().printPrompt(); |
||||
|
if (!scanner.hasNextLine()) { |
||||
|
break; |
||||
|
} |
||||
|
executeLine(scanner.nextLine()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void executeLine(String line) { |
||||
|
try { |
||||
|
commandRegistry.execute(context, line); |
||||
|
} catch (CrawlException e) { |
||||
|
context.view().printError(e.getMessage()); |
||||
|
if (e.getCause() != null) { |
||||
|
context.view().printError("Cause: " + e.getCause().getMessage()); |
||||
|
} |
||||
|
} catch (RuntimeException e) { |
||||
|
context.view().printError("Unexpected error: " + e.getMessage()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package edu.homework.crawler.command; |
||||
|
|
||||
|
import edu.homework.crawler.exception.CrawlException; |
||||
|
|
||||
|
public interface Command { |
||||
|
String name(); |
||||
|
|
||||
|
String description(); |
||||
|
|
||||
|
void execute(CommandContext context, String[] args) throws CrawlException; |
||||
|
} |
||||
@ -0,0 +1,44 @@ |
|||||
|
package edu.homework.crawler.command; |
||||
|
|
||||
|
import edu.homework.crawler.controller.CrawlerController; |
||||
|
import edu.homework.crawler.strategy.SiteRegistry; |
||||
|
import edu.homework.crawler.view.ConsoleView; |
||||
|
|
||||
|
public class CommandContext { |
||||
|
private final CrawlerController controller; |
||||
|
private final ConsoleView view; |
||||
|
private final CommandRegistry commandRegistry; |
||||
|
private final SiteRegistry siteRegistry; |
||||
|
private boolean running = true; |
||||
|
|
||||
|
public CommandContext(CrawlerController controller, ConsoleView view, CommandRegistry commandRegistry, SiteRegistry siteRegistry) { |
||||
|
this.controller = controller; |
||||
|
this.view = view; |
||||
|
this.commandRegistry = commandRegistry; |
||||
|
this.siteRegistry = siteRegistry; |
||||
|
} |
||||
|
|
||||
|
public CrawlerController controller() { |
||||
|
return controller; |
||||
|
} |
||||
|
|
||||
|
public ConsoleView view() { |
||||
|
return view; |
||||
|
} |
||||
|
|
||||
|
public CommandRegistry commandRegistry() { |
||||
|
return commandRegistry; |
||||
|
} |
||||
|
|
||||
|
public SiteRegistry siteRegistry() { |
||||
|
return siteRegistry; |
||||
|
} |
||||
|
|
||||
|
public boolean isRunning() { |
||||
|
return running; |
||||
|
} |
||||
|
|
||||
|
public void stop() { |
||||
|
this.running = false; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,82 @@ |
|||||
|
package edu.homework.crawler.command; |
||||
|
|
||||
|
import edu.homework.crawler.exception.CommandException; |
||||
|
import edu.homework.crawler.exception.CrawlException; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.Arrays; |
||||
|
import java.util.Collection; |
||||
|
import java.util.LinkedHashMap; |
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class CommandRegistry { |
||||
|
private final Map<String, Command> commands = new LinkedHashMap<>(); |
||||
|
|
||||
|
public void register(Command command) { |
||||
|
commands.put(command.name(), command); |
||||
|
} |
||||
|
|
||||
|
public Collection<Command> commands() { |
||||
|
return commands.values(); |
||||
|
} |
||||
|
|
||||
|
public void execute(CommandContext context, String line) throws CrawlException { |
||||
|
List<String> tokens = tokenize(line); |
||||
|
if (tokens.isEmpty()) { |
||||
|
return; |
||||
|
} |
||||
|
Command command = commands.get(tokens.get(0)); |
||||
|
if (command == null) { |
||||
|
throw new CommandException("Unknown command: " + tokens.get(0) + ". Type help to see commands."); |
||||
|
} |
||||
|
String[] args = tokens.subList(1, tokens.size()).toArray(String[]::new); |
||||
|
command.execute(context, args); |
||||
|
} |
||||
|
|
||||
|
public List<String> tokenize(String line) throws CommandException { |
||||
|
List<String> tokens = new ArrayList<>(); |
||||
|
StringBuilder current = new StringBuilder(); |
||||
|
boolean inQuotes = false; |
||||
|
for (int i = 0; i < line.length(); i++) { |
||||
|
char ch = line.charAt(i); |
||||
|
if (ch == '"') { |
||||
|
inQuotes = !inQuotes; |
||||
|
continue; |
||||
|
} |
||||
|
if (Character.isWhitespace(ch) && !inQuotes) { |
||||
|
addToken(tokens, current); |
||||
|
} else { |
||||
|
current.append(ch); |
||||
|
} |
||||
|
} |
||||
|
if (inQuotes) { |
||||
|
throw new CommandException("Missing closing quote in command line."); |
||||
|
} |
||||
|
addToken(tokens, current); |
||||
|
return tokens; |
||||
|
} |
||||
|
|
||||
|
private void addToken(List<String> tokens, StringBuilder current) { |
||||
|
if (current.length() > 0) { |
||||
|
tokens.add(current.toString()); |
||||
|
current.setLength(0); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
public Map<String, String> parseOptions(String[] args) throws CommandException { |
||||
|
Map<String, String> options = new LinkedHashMap<>(); |
||||
|
List<String> list = Arrays.asList(args); |
||||
|
for (int i = 0; i < list.size(); i++) { |
||||
|
String key = list.get(i); |
||||
|
if (!key.startsWith("--")) { |
||||
|
throw new CommandException("Invalid argument: " + key + ". Options must start with --."); |
||||
|
} |
||||
|
if (i + 1 >= list.size() || list.get(i + 1).startsWith("--")) { |
||||
|
throw new CommandException("Missing value for option " + key + "."); |
||||
|
} |
||||
|
options.put(key.substring(2), list.get(++i)); |
||||
|
} |
||||
|
return options; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,55 @@ |
|||||
|
package edu.homework.crawler.command; |
||||
|
|
||||
|
import edu.homework.crawler.exception.CommandException; |
||||
|
import edu.homework.crawler.exception.CrawlException; |
||||
|
import edu.homework.crawler.model.CrawlRequest; |
||||
|
import edu.homework.crawler.repository.OutputFormat; |
||||
|
|
||||
|
import java.nio.file.Path; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.time.format.DateTimeFormatter; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class CrawlCommand implements Command { |
||||
|
private static final DateTimeFormatter FILE_TIME = DateTimeFormatter.ofPattern("yyyyMMdd-HHmmss"); |
||||
|
|
||||
|
@Override |
||||
|
public String name() { |
||||
|
return "crawl"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String description() { |
||||
|
return "Crawl news and save to a JSON or CSV file."; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(CommandContext context, String[] args) throws CrawlException { |
||||
|
Map<String, String> options = context.commandRegistry().parseOptions(args); |
||||
|
String site = options.getOrDefault("site", "all"); |
||||
|
int limit = parseLimit(options.getOrDefault("limit", "10")); |
||||
|
OutputFormat format = OutputFormat.from(options.getOrDefault("format", "json")); |
||||
|
Path outputPath = Path.of(options.getOrDefault("out", defaultOutput(format))); |
||||
|
|
||||
|
context.view().printInfo("Starting crawl: site=" + site + ", limit=" + limit + ", format=" + format.name().toLowerCase()); |
||||
|
CrawlRequest request = new CrawlRequest(site, limit, format, outputPath); |
||||
|
context.view().printSummary(context.controller().crawl(request)); |
||||
|
} |
||||
|
|
||||
|
private int parseLimit(String value) throws CommandException { |
||||
|
try { |
||||
|
int limit = Integer.parseInt(value); |
||||
|
if (limit <= 0 || limit > 100) { |
||||
|
throw new CommandException("Limit must be between 1 and 100."); |
||||
|
} |
||||
|
return limit; |
||||
|
} catch (NumberFormatException e) { |
||||
|
throw new CommandException("Limit must be a number: " + value); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private String defaultOutput(OutputFormat format) { |
||||
|
String suffix = format == OutputFormat.JSON ? ".json" : ".csv"; |
||||
|
return "data/news-" + LocalDateTime.now().format(FILE_TIME) + suffix; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,21 @@ |
|||||
|
package edu.homework.crawler.command; |
||||
|
|
||||
|
import edu.homework.crawler.exception.CrawlException; |
||||
|
|
||||
|
public class ExitCommand implements Command { |
||||
|
@Override |
||||
|
public String name() { |
||||
|
return "exit"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String description() { |
||||
|
return "Exit interactive CLI."; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(CommandContext context, String[] args) throws CrawlException { |
||||
|
context.stop(); |
||||
|
context.view().printInfo("Bye."); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,28 @@ |
|||||
|
package edu.homework.crawler.command; |
||||
|
|
||||
|
import edu.homework.crawler.exception.CrawlException; |
||||
|
|
||||
|
public class HelpCommand implements Command { |
||||
|
@Override |
||||
|
public String name() { |
||||
|
return "help"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String description() { |
||||
|
return "Show command usage."; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(CommandContext context, String[] args) throws CrawlException { |
||||
|
StringBuilder builder = new StringBuilder(); |
||||
|
builder.append("Commands:\n"); |
||||
|
for (Command command : context.commandRegistry().commands()) { |
||||
|
builder.append(" ").append(command.name()).append(" - ").append(command.description()).append('\n'); |
||||
|
} |
||||
|
builder.append("\nExamples:\n"); |
||||
|
builder.append(" crawl --site all --limit 10 --format json --out data/news.json\n"); |
||||
|
builder.append(" crawl --site hnu --limit 5 --format csv --out data/hnu.csv\n"); |
||||
|
context.view().printHelp(builder.toString()); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,22 @@ |
|||||
|
package edu.homework.crawler.command; |
||||
|
|
||||
|
import edu.homework.crawler.exception.CrawlException; |
||||
|
|
||||
|
public class SitesCommand implements Command { |
||||
|
@Override |
||||
|
public String name() { |
||||
|
return "sites"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String description() { |
||||
|
return "List supported websites."; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void execute(CommandContext context, String[] args) throws CrawlException { |
||||
|
context.view().printInfo("Supported sites:"); |
||||
|
context.siteRegistry().all().forEach(strategy -> |
||||
|
context.view().printInfo(" - " + strategy.key() + ": " + strategy.schoolName() + " (" + strategy.baseUrl() + ")")); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,30 @@ |
|||||
|
package edu.homework.crawler.controller; |
||||
|
|
||||
|
import edu.homework.crawler.exception.CrawlException; |
||||
|
import edu.homework.crawler.model.CrawlRequest; |
||||
|
import edu.homework.crawler.model.CrawlSummary; |
||||
|
import edu.homework.crawler.model.NewsItem; |
||||
|
import edu.homework.crawler.repository.FileNewsRepository; |
||||
|
import edu.homework.crawler.service.NewsCrawlerService; |
||||
|
|
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
public class CrawlerController { |
||||
|
private final NewsCrawlerService crawlerService; |
||||
|
private final FileNewsRepository newsRepository; |
||||
|
|
||||
|
public CrawlerController(NewsCrawlerService crawlerService, FileNewsRepository newsRepository) { |
||||
|
this.crawlerService = crawlerService; |
||||
|
this.newsRepository = newsRepository; |
||||
|
} |
||||
|
|
||||
|
public CrawlSummary crawl(CrawlRequest request) throws CrawlException { |
||||
|
List<NewsItem> items = crawlerService.crawl(request.siteKey(), request.limitPerSite()); |
||||
|
newsRepository.save(items, request.outputFormat(), request.outputPath()); |
||||
|
Map<String, Integer> counts = items.stream() |
||||
|
.collect(Collectors.groupingBy(NewsItem::getSiteKey, Collectors.collectingAndThen(Collectors.counting(), Long::intValue))); |
||||
|
return new CrawlSummary(items, counts, request.outputPath()); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,7 @@ |
|||||
|
package edu.homework.crawler.exception; |
||||
|
|
||||
|
public class CommandException extends CrawlException { |
||||
|
public CommandException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package edu.homework.crawler.exception; |
||||
|
|
||||
|
/**
 * Checked base type for the crawler's exceptions.
 */
public class CrawlException extends Exception {
    /**
     * @param message description of the failure
     */
    public CrawlException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   underlying exception that triggered it
     */
    public CrawlException(String message, Throwable cause) {
        super(message, cause);
    }
}
||||
@ -0,0 +1,7 @@ |
|||||
|
package edu.homework.crawler.exception; |
||||
|
|
||||
|
public class NetworkException extends CrawlException { |
||||
|
public NetworkException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package edu.homework.crawler.exception; |
||||
|
|
||||
|
public class ParseException extends CrawlException { |
||||
|
public ParseException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
|
||||
|
public ParseException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,7 @@ |
|||||
|
package edu.homework.crawler.exception; |
||||
|
|
||||
|
public class SiteNotFoundException extends CrawlException { |
||||
|
public SiteNotFoundException(String message) { |
||||
|
super(message); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,7 @@ |
|||||
|
package edu.homework.crawler.exception; |
||||
|
|
||||
|
public class StorageException extends CrawlException { |
||||
|
public StorageException(String message, Throwable cause) { |
||||
|
super(message, cause); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,8 @@ |
|||||
|
package edu.homework.crawler.model; |
||||
|
|
||||
|
import edu.homework.crawler.repository.OutputFormat; |
||||
|
|
||||
|
import java.nio.file.Path; |
||||
|
|
||||
|
public record CrawlRequest(String siteKey, int limitPerSite, OutputFormat outputFormat, Path outputPath) { |
||||
|
} |
||||
@ -0,0 +1,11 @@ |
|||||
|
package edu.homework.crawler.model; |
||||
|
|
||||
|
import java.nio.file.Path; |
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public record CrawlSummary(List<NewsItem> items, Map<String, Integer> siteCounts, Path outputPath) { |
||||
|
public int totalCount() { |
||||
|
return items.size(); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,4 @@ |
|||||
|
package edu.homework.crawler.model; |
||||
|
|
||||
|
/**
 * A link found on a listing page that may point to a news article.
 *
 * @param title       link text used as the provisional title
 * @param url         absolute URL of the article
 * @param publishTime date text found near the link; never {@code null}, empty when unknown
 */
public record NewsCandidate(String title, String url, String publishTime) {
    /**
     * Normalizes a missing publish time to the empty string, so code that calls
     * {@code publishTime().isBlank()} or compares publish times never hits a
     * {@code NullPointerException}.
     */
    public NewsCandidate {
        if (publishTime == null) {
            publishTime = "";
        }
    }
}
||||
@ -0,0 +1,107 @@ |
|||||
|
package edu.homework.crawler.model; |
||||
|
|
||||
|
import java.time.LocalDateTime; |
||||
|
|
||||
|
public class NewsItem { |
||||
|
private String school; |
||||
|
private String siteKey; |
||||
|
private String title; |
||||
|
private String url; |
||||
|
private String publishTime; |
||||
|
private String source; |
||||
|
private String author; |
||||
|
private String summary; |
||||
|
private String contentPreview; |
||||
|
private LocalDateTime crawledAt; |
||||
|
|
||||
|
public NewsItem() { |
||||
|
} |
||||
|
|
||||
|
public NewsItem(String school, String siteKey, String title, String url) { |
||||
|
this.school = school; |
||||
|
this.siteKey = siteKey; |
||||
|
this.title = title; |
||||
|
this.url = url; |
||||
|
this.crawledAt = LocalDateTime.now(); |
||||
|
} |
||||
|
|
||||
|
public String getSchool() { |
||||
|
return school; |
||||
|
} |
||||
|
|
||||
|
public void setSchool(String school) { |
||||
|
this.school = school; |
||||
|
} |
||||
|
|
||||
|
public String getSiteKey() { |
||||
|
return siteKey; |
||||
|
} |
||||
|
|
||||
|
public void setSiteKey(String siteKey) { |
||||
|
this.siteKey = siteKey; |
||||
|
} |
||||
|
|
||||
|
public String getTitle() { |
||||
|
return title; |
||||
|
} |
||||
|
|
||||
|
public void setTitle(String title) { |
||||
|
this.title = title; |
||||
|
} |
||||
|
|
||||
|
public String getUrl() { |
||||
|
return url; |
||||
|
} |
||||
|
|
||||
|
public void setUrl(String url) { |
||||
|
this.url = url; |
||||
|
} |
||||
|
|
||||
|
public String getPublishTime() { |
||||
|
return publishTime; |
||||
|
} |
||||
|
|
||||
|
public void setPublishTime(String publishTime) { |
||||
|
this.publishTime = publishTime; |
||||
|
} |
||||
|
|
||||
|
public String getSource() { |
||||
|
return source; |
||||
|
} |
||||
|
|
||||
|
public void setSource(String source) { |
||||
|
this.source = source; |
||||
|
} |
||||
|
|
||||
|
public String getAuthor() { |
||||
|
return author; |
||||
|
} |
||||
|
|
||||
|
public void setAuthor(String author) { |
||||
|
this.author = author; |
||||
|
} |
||||
|
|
||||
|
public String getSummary() { |
||||
|
return summary; |
||||
|
} |
||||
|
|
||||
|
public void setSummary(String summary) { |
||||
|
this.summary = summary; |
||||
|
} |
||||
|
|
||||
|
public String getContentPreview() { |
||||
|
return contentPreview; |
||||
|
} |
||||
|
|
||||
|
public void setContentPreview(String contentPreview) { |
||||
|
this.contentPreview = contentPreview; |
||||
|
} |
||||
|
|
||||
|
public LocalDateTime getCrawledAt() { |
||||
|
return crawledAt; |
||||
|
} |
||||
|
|
||||
|
public void setCrawledAt(LocalDateTime crawledAt) { |
||||
|
this.crawledAt = crawledAt; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,67 @@ |
|||||
|
package edu.homework.crawler.repository; |
||||
|
|
||||
|
import com.fasterxml.jackson.databind.ObjectMapper; |
||||
|
import com.fasterxml.jackson.databind.SerializationFeature; |
||||
|
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; |
||||
|
import edu.homework.crawler.exception.StorageException; |
||||
|
import edu.homework.crawler.model.NewsItem; |
||||
|
|
||||
|
import java.io.BufferedWriter; |
||||
|
import java.io.IOException; |
||||
|
import java.nio.charset.StandardCharsets; |
||||
|
import java.nio.file.Files; |
||||
|
import java.nio.file.Path; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class FileNewsRepository { |
||||
|
private final ObjectMapper objectMapper; |
||||
|
|
||||
|
public FileNewsRepository() { |
||||
|
this.objectMapper = new ObjectMapper() |
||||
|
.registerModule(new JavaTimeModule()) |
||||
|
.enable(SerializationFeature.INDENT_OUTPUT) |
||||
|
.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS); |
||||
|
} |
||||
|
|
||||
|
public void save(List<NewsItem> items, OutputFormat format, Path outputPath) throws StorageException { |
||||
|
try { |
||||
|
Path parent = outputPath.toAbsolutePath().getParent(); |
||||
|
if (parent != null) { |
||||
|
Files.createDirectories(parent); |
||||
|
} |
||||
|
if (format == OutputFormat.JSON) { |
||||
|
objectMapper.writeValue(outputPath.toFile(), items); |
||||
|
} else { |
||||
|
writeCsv(items, outputPath); |
||||
|
} |
||||
|
} catch (IOException e) { |
||||
|
throw new StorageException("Failed to save crawler data to " + outputPath, e); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private void writeCsv(List<NewsItem> items, Path outputPath) throws IOException { |
||||
|
try (BufferedWriter writer = Files.newBufferedWriter(outputPath, StandardCharsets.UTF_8)) { |
||||
|
writer.write("school,siteKey,title,url,publishTime,source,author,summary,contentPreview,crawledAt"); |
||||
|
writer.newLine(); |
||||
|
for (NewsItem item : items) { |
||||
|
writer.write(String.join(",", |
||||
|
csv(item.getSchool()), |
||||
|
csv(item.getSiteKey()), |
||||
|
csv(item.getTitle()), |
||||
|
csv(item.getUrl()), |
||||
|
csv(item.getPublishTime()), |
||||
|
csv(item.getSource()), |
||||
|
csv(item.getAuthor()), |
||||
|
csv(item.getSummary()), |
||||
|
csv(item.getContentPreview()), |
||||
|
csv(item.getCrawledAt() == null ? "" : item.getCrawledAt().toString()))); |
||||
|
writer.newLine(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
private String csv(String value) { |
||||
|
String safeValue = value == null ? "" : value; |
||||
|
return "\"" + safeValue.replace("\"", "\"\"") + "\""; |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,17 @@ |
|||||
|
package edu.homework.crawler.repository; |
||||
|
|
||||
|
import edu.homework.crawler.exception.CommandException; |
||||
|
|
||||
|
public enum OutputFormat { |
||||
|
JSON, |
||||
|
CSV; |
||||
|
|
||||
|
public static OutputFormat from(String value) throws CommandException { |
||||
|
for (OutputFormat format : values()) { |
||||
|
if (format.name().equalsIgnoreCase(value)) { |
||||
|
return format; |
||||
|
} |
||||
|
} |
||||
|
throw new CommandException("Unsupported output format: " + value + ". Use json or csv."); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,37 @@ |
|||||
|
package edu.homework.crawler.service; |
||||
|
|
||||
|
import edu.homework.crawler.exception.CrawlException; |
||||
|
import edu.homework.crawler.exception.SiteNotFoundException; |
||||
|
import edu.homework.crawler.model.NewsItem; |
||||
|
import edu.homework.crawler.strategy.CrawlStrategy; |
||||
|
import edu.homework.crawler.strategy.SiteRegistry; |
||||
|
import edu.homework.crawler.util.HttpFetcher; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.List; |
||||
|
|
||||
|
public class NewsCrawlerService { |
||||
|
private final SiteRegistry siteRegistry; |
||||
|
private final HttpFetcher httpFetcher; |
||||
|
|
||||
|
public NewsCrawlerService(SiteRegistry siteRegistry) { |
||||
|
this.siteRegistry = siteRegistry; |
||||
|
this.httpFetcher = new HttpFetcher(); |
||||
|
} |
||||
|
|
||||
|
public List<NewsItem> crawl(String siteKey, int limitPerSite) throws CrawlException { |
||||
|
List<CrawlStrategy> strategies = resolveStrategies(siteKey); |
||||
|
List<NewsItem> items = new ArrayList<>(); |
||||
|
for (CrawlStrategy strategy : strategies) { |
||||
|
items.addAll(strategy.crawl(httpFetcher, limitPerSite)); |
||||
|
} |
||||
|
return items; |
||||
|
} |
||||
|
|
||||
|
private List<CrawlStrategy> resolveStrategies(String siteKey) throws SiteNotFoundException { |
||||
|
if ("all".equalsIgnoreCase(siteKey)) { |
||||
|
return siteRegistry.all(); |
||||
|
} |
||||
|
return List.of(siteRegistry.get(siteKey)); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,177 @@ |
|||||
|
package edu.homework.crawler.strategy; |
||||
|
|
||||
|
import edu.homework.crawler.exception.CrawlException; |
||||
|
import edu.homework.crawler.exception.ParseException; |
||||
|
import edu.homework.crawler.model.NewsCandidate; |
||||
|
import edu.homework.crawler.model.NewsItem; |
||||
|
import edu.homework.crawler.util.HttpFetcher; |
||||
|
import edu.homework.crawler.util.TextExtractors; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
import org.jsoup.nodes.Element; |
||||
|
import org.jsoup.select.Elements; |
||||
|
|
||||
|
import java.net.URI; |
||||
|
import java.time.LocalDateTime; |
||||
|
import java.util.ArrayList; |
||||
|
import java.util.LinkedHashMap; |
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public abstract class AbstractVisualSiteBuilderStrategy implements CrawlStrategy { |
||||
|
private static final String NEWS_LINK_SELECTOR = "a[href*=info/][href$=.htm],a[href$=.htm]"; |
||||
|
|
||||
|
@Override |
||||
|
public List<NewsItem> crawl(HttpFetcher fetcher, int limit) throws CrawlException { |
||||
|
Map<String, NewsCandidate> mergedCandidates = new LinkedHashMap<>(); |
||||
|
CrawlException lastFailure = null; |
||||
|
for (String startUrl : startUrls()) { |
||||
|
try { |
||||
|
Document page = fetcher.fetch(startUrl); |
||||
|
for (NewsCandidate candidate : extractCandidates(page, limit * 8)) { |
||||
|
mergedCandidates.putIfAbsent(candidate.url(), candidate); |
||||
|
} |
||||
|
} catch (CrawlException e) { |
||||
|
lastFailure = e; |
||||
|
} |
||||
|
} |
||||
|
List<NewsCandidate> candidates = sortCandidates(new ArrayList<>(mergedCandidates.values()), limit * 8); |
||||
|
if (candidates.isEmpty()) { |
||||
|
if (lastFailure != null) { |
||||
|
throw lastFailure; |
||||
|
} |
||||
|
throw new ParseException("No news links found on " + baseUrl()); |
||||
|
} |
||||
|
|
||||
|
List<NewsItem> items = new ArrayList<>(); |
||||
|
for (NewsCandidate candidate : candidates) { |
||||
|
if (items.size() >= limit) { |
||||
|
break; |
||||
|
} |
||||
|
try { |
||||
|
fetcher.politePause(); |
||||
|
Document detail = fetcher.fetch(candidate.url()); |
||||
|
items.add(parseDetail(candidate, detail)); |
||||
|
} catch (CrawlException e) { |
||||
|
NewsItem fallback = new NewsItem(schoolName(), key(), candidate.title(), candidate.url()); |
||||
|
fallback.setPublishTime(candidate.publishTime()); |
||||
|
fallback.setSummary("Detail page failed: " + e.getMessage()); |
||||
|
fallback.setCrawledAt(LocalDateTime.now()); |
||||
|
items.add(fallback); |
||||
|
} |
||||
|
} |
||||
|
return items; |
||||
|
} |
||||
|
|
||||
|
protected List<String> startUrls() { |
||||
|
return List.of(baseUrl()); |
||||
|
} |
||||
|
|
||||
|
protected List<NewsCandidate> extractCandidates(Document document, int maxCandidates) { |
||||
|
Map<String, NewsCandidate> candidates = new LinkedHashMap<>(); |
||||
|
Elements links = document.select(candidateSelector()); |
||||
|
for (Element link : links) { |
||||
|
String url = link.absUrl("href"); |
||||
|
if (!isAcceptableUrl(url)) { |
||||
|
continue; |
||||
|
} |
||||
|
String title = extractCandidateTitle(link); |
||||
|
if (title.isBlank() || title.length() < 4) { |
||||
|
continue; |
||||
|
} |
||||
|
String date = TextExtractors.findDate(neighborText(link)); |
||||
|
candidates.putIfAbsent(url, new NewsCandidate(title, url, date)); |
||||
|
if (candidates.size() >= maxCandidates) { |
||||
|
break; |
||||
|
} |
||||
|
} |
||||
|
return sortCandidates(new ArrayList<>(candidates.values()), maxCandidates); |
||||
|
} |
||||
|
|
||||
|
private List<NewsCandidate> sortCandidates(List<NewsCandidate> sorted, int maxCandidates) { |
||||
|
sorted.sort((left, right) -> { |
||||
|
boolean leftHasDate = !left.publishTime().isBlank(); |
||||
|
boolean rightHasDate = !right.publishTime().isBlank(); |
||||
|
if (leftHasDate != rightHasDate) { |
||||
|
return leftHasDate ? -1 : 1; |
||||
|
} |
||||
|
return right.publishTime().compareTo(left.publishTime()); |
||||
|
}); |
||||
|
if (sorted.size() > maxCandidates) { |
||||
|
return sorted.subList(0, maxCandidates); |
||||
|
} |
||||
|
return sorted; |
||||
|
} |
||||
|
|
||||
|
protected String candidateSelector() { |
||||
|
return NEWS_LINK_SELECTOR; |
||||
|
} |
||||
|
|
||||
|
protected NewsItem parseDetail(NewsCandidate candidate, Document detail) { |
||||
|
NewsItem item = new NewsItem(schoolName(), key(), extractTitle(candidate, detail), candidate.url()); |
||||
|
String pageText = TextExtractors.clean(detail.text()); |
||||
|
item.setPublishTime(TextExtractors.firstNonBlank( |
||||
|
TextExtractors.findPublishTime(pageText), |
||||
|
candidate.publishTime())); |
||||
|
item.setSource(TextExtractors.findLabelValue(pageText, "来源")); |
||||
|
item.setAuthor(TextExtractors.findLabelValue(pageText, "作者")); |
||||
|
item.setSummary(TextExtractors.clean(detail.select("meta[name=description]").attr("content"))); |
||||
|
item.setContentPreview(extractContentPreview(detail)); |
||||
|
item.setCrawledAt(LocalDateTime.now()); |
||||
|
return item; |
||||
|
} |
||||
|
|
||||
|
protected String extractTitle(NewsCandidate candidate, Document detail) { |
||||
|
String title = TextExtractors.firstNonBlank( |
||||
|
detail.select("h1").first() == null ? "" : detail.select("h1").first().text(), |
||||
|
detail.select(".ar_tit h3").first() == null ? "" : detail.select(".ar_tit h3").first().text(), |
||||
|
detail.select(".subTitle2 span").first() == null ? "" : detail.select(".subTitle2 span").first().text(), |
||||
|
detail.select("meta[name=pageTitle]").attr("content"), |
||||
|
candidate.title()); |
||||
|
return TextExtractors.limit(TextExtractors.clean(title), 160); |
||||
|
} |
||||
|
|
||||
|
protected String extractContentPreview(Document detail) { |
||||
|
String content = TextExtractors.firstNonBlank( |
||||
|
detail.select(".v_news_content").text(), |
||||
|
detail.select("#vsb_content").text(), |
||||
|
detail.select("#vsb_content_6").text(), |
||||
|
detail.body() == null ? "" : detail.body().text()); |
||||
|
return TextExtractors.limit(TextExtractors.clean(content), 320); |
||||
|
} |
||||
|
|
||||
|
private String extractCandidateTitle(Element link) { |
||||
|
String nestedHeading = ""; |
||||
|
Element heading = link.selectFirst("h1,h2,h3,h4,h5,.tit,.title,.pXZCont,.c59665"); |
||||
|
if (heading != null) { |
||||
|
nestedHeading = heading.text(); |
||||
|
} |
||||
|
return TextExtractors.limit(TextExtractors.clean(TextExtractors.firstNonBlank( |
||||
|
link.attr("title"), |
||||
|
nestedHeading, |
||||
|
link.ownText(), |
||||
|
link.text())), 160); |
||||
|
} |
||||
|
|
||||
|
private String neighborText(Element link) { |
||||
|
StringBuilder builder = new StringBuilder(link.text()).append(' '); |
||||
|
Element node = link; |
||||
|
for (int i = 0; i < 4 && node != null; i++) { |
||||
|
builder.append(node.text()).append(' '); |
||||
|
node = node.parent(); |
||||
|
} |
||||
|
return TextExtractors.clean(builder.toString()); |
||||
|
} |
||||
|
|
||||
|
protected boolean isAcceptableUrl(String url) { |
||||
|
if (url == null || url.isBlank()) { |
||||
|
return false; |
||||
|
} |
||||
|
try { |
||||
|
URI base = URI.create(baseUrl()); |
||||
|
URI candidate = URI.create(url); |
||||
|
return base.getHost().equalsIgnoreCase(candidate.getHost()) && candidate.getPath().contains("/info/"); |
||||
|
} catch (IllegalArgumentException e) { |
||||
|
return false; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,17 @@ |
|||||
|
package edu.homework.crawler.strategy; |
||||
|
|
||||
|
import edu.homework.crawler.exception.CrawlException; |
||||
|
import edu.homework.crawler.model.NewsItem; |
||||
|
import edu.homework.crawler.util.HttpFetcher; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public interface CrawlStrategy { |
||||
|
String key(); |
||||
|
|
||||
|
String schoolName(); |
||||
|
|
||||
|
String baseUrl(); |
||||
|
|
||||
|
List<NewsItem> crawl(HttpFetcher fetcher, int limit) throws CrawlException; |
||||
|
} |
||||
@ -0,0 +1,25 @@ |
|||||
|
package edu.homework.crawler.strategy; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public class CsuNewsStrategy extends AbstractVisualSiteBuilderStrategy { |
||||
|
@Override |
||||
|
public String key() { |
||||
|
return "csu"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String schoolName() { |
||||
|
return "中南大学"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String baseUrl() { |
||||
|
return "https://news.csu.edu.cn/"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<String> startUrls() { |
||||
|
return List.of("https://news.csu.edu.cn/xxyw.htm", baseUrl()); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,25 @@ |
|||||
|
package edu.homework.crawler.strategy; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public class HnuNewsStrategy extends AbstractVisualSiteBuilderStrategy { |
||||
|
@Override |
||||
|
public String key() { |
||||
|
return "hnu"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String schoolName() { |
||||
|
return "湖南大学"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String baseUrl() { |
||||
|
return "https://news.hnu.edu.cn/"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<String> startUrls() { |
||||
|
return List.of("https://news.hnu.edu.cn/xw/zhxw.htm", baseUrl()); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,30 @@ |
|||||
|
package edu.homework.crawler.strategy; |
||||
|
|
||||
|
import java.util.List; |
||||
|
|
||||
|
public class HunnuNewsStrategy extends AbstractVisualSiteBuilderStrategy { |
||||
|
@Override |
||||
|
public String key() { |
||||
|
return "hunnu"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String schoolName() { |
||||
|
return "湖南师范大学"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String baseUrl() { |
||||
|
return "https://news.hunnu.edu.cn/"; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected List<String> startUrls() { |
||||
|
return List.of("https://news.hunnu.edu.cn/sdxw.htm", baseUrl()); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected boolean isAcceptableUrl(String url) { |
||||
|
return super.isAcceptableUrl(url) && url.contains("/info/1005/"); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,36 @@ |
|||||
|
package edu.homework.crawler.strategy; |
||||
|
|
||||
|
import edu.homework.crawler.exception.SiteNotFoundException; |
||||
|
|
||||
|
import java.util.ArrayList; |
||||
|
import java.util.LinkedHashMap; |
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
public class SiteRegistry { |
||||
|
private final Map<String, CrawlStrategy> strategies = new LinkedHashMap<>(); |
||||
|
|
||||
|
public static SiteRegistry defaults() { |
||||
|
SiteRegistry registry = new SiteRegistry(); |
||||
|
registry.register(new HnuNewsStrategy()); |
||||
|
registry.register(new CsuNewsStrategy()); |
||||
|
registry.register(new HunnuNewsStrategy()); |
||||
|
return registry; |
||||
|
} |
||||
|
|
||||
|
public void register(CrawlStrategy strategy) { |
||||
|
strategies.put(strategy.key(), strategy); |
||||
|
} |
||||
|
|
||||
|
public CrawlStrategy get(String key) throws SiteNotFoundException { |
||||
|
CrawlStrategy strategy = strategies.get(key.toLowerCase()); |
||||
|
if (strategy == null) { |
||||
|
throw new SiteNotFoundException("Unsupported site: " + key + ". Use all, hnu, csu, or hunnu."); |
||||
|
} |
||||
|
return strategy; |
||||
|
} |
||||
|
|
||||
|
public List<CrawlStrategy> all() { |
||||
|
return new ArrayList<>(strategies.values()); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,39 @@ |
|||||
|
package edu.homework.crawler.util; |
||||
|
|
||||
|
import edu.homework.crawler.exception.NetworkException; |
||||
|
import org.jsoup.Jsoup; |
||||
|
import org.jsoup.nodes.Document; |
||||
|
|
||||
|
import java.io.IOException; |
||||
|
|
||||
|
public class HttpFetcher { |
||||
|
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " |
||||
|
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"; |
||||
|
|
||||
|
public Document fetch(String url) throws NetworkException { |
||||
|
IOException lastFailure = null; |
||||
|
for (int attempt = 1; attempt <= 3; attempt++) { |
||||
|
try { |
||||
|
return Jsoup.connect(url) |
||||
|
.userAgent(USER_AGENT) |
||||
|
.referrer("https://www.baidu.com/") |
||||
|
.timeout(20_000) |
||||
|
.maxBodySize(5 * 1024 * 1024) |
||||
|
.followRedirects(true) |
||||
|
.get(); |
||||
|
} catch (IOException e) { |
||||
|
lastFailure = e; |
||||
|
politePause(); |
||||
|
} |
||||
|
} |
||||
|
throw new NetworkException("Network request failed: " + url, lastFailure); |
||||
|
} |
||||
|
|
||||
|
public void politePause() { |
||||
|
try { |
||||
|
Thread.sleep(250); |
||||
|
} catch (InterruptedException e) { |
||||
|
Thread.currentThread().interrupt(); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,98 @@ |
|||||
|
package edu.homework.crawler.util; |
||||
|
|
||||
|
import java.time.LocalDate; |
||||
|
import java.util.regex.Matcher; |
||||
|
import java.util.regex.Pattern; |
||||
|
|
||||
|
/**
 * Static helpers for cleaning text and extracting dates and labelled values
 * from the visible text of news pages.
 */
public final class TextExtractors {

    /** Full date such as {@code 2026-05-28}, {@code 2026年05月28日} or {@code 2026/5/8 09:30}. */
    private static final Pattern DATE = Pattern.compile("(20\\d{2})[-年./](\\d{1,2})[-月./](\\d{1,2})日?(?:\\s+\\d{1,2}:\\d{2})?");

    /** Day printed before year and month, such as {@code 28 2026-05}. */
    private static final Pattern SPLIT_DAY_YEAR_MONTH = Pattern.compile("(?<!\\d)(\\d{1,2})\\s+(20\\d{2})[-年./](\\d{1,2})(?![-\\d])");

    /** Month and day without a year, such as {@code 05-28} or {@code 5月28日}. */
    private static final Pattern MONTH_DAY = Pattern.compile("(?<!\\d)(\\d{1,2})[-月.](\\d{1,2})日?(?!\\d)");

    /**
     * Regex character class for the colon after a label: the ASCII colon or the
     * full-width colon U+FF1A. The full-width form is written as a regex escape
     * so it survives any source encoding.
     */
    private static final String LABEL_COLON = "[:\\uFF1A]";

    private TextExtractors() {
    }

    /**
     * Normalizes whitespace: non-breaking spaces become spaces, runs of
     * whitespace collapse to one space, and the result is trimmed.
     *
     * @param value the text to clean; may be {@code null}
     * @return the cleaned text, or an empty string for {@code null}
     */
    public static String clean(String value) {
        if (value == null) {
            return "";
        }
        return value.replace('\u00A0', ' ')
                .replaceAll("\\s+", " ")
                .trim();
    }

    /**
     * Returns the first value that is not blank after cleaning.
     *
     * @param values the candidates in order of preference; entries may be {@code null}
     * @return the cleaned first non-blank value, or an empty string if there is none
     */
    public static String firstNonBlank(String... values) {
        for (String value : values) {
            String cleaned = clean(value);
            if (!cleaned.isBlank()) {
                return cleaned;
            }
        }
        return "";
    }

    /**
     * Cleans a value and truncates it, appending {@code "..."} when it was cut.
     *
     * <p>A truncated result is therefore up to three characters longer than
     * {@code maxLength}. The cut never falls between the two halves of a
     * surrogate pair, and a negative {@code maxLength} is treated as zero.
     *
     * @param value     the text to limit; may be {@code null}
     * @param maxLength the maximum number of characters kept before the ellipsis
     * @return the cleaned, possibly truncated text
     */
    public static String limit(String value, int maxLength) {
        String cleaned = clean(value);
        if (cleaned.length() <= maxLength) {
            return cleaned;
        }
        int end = Math.max(maxLength, 0);
        if (end > 0 && Character.isHighSurrogate(cleaned.charAt(end - 1))) {
            // Do not leave a dangling high surrogate at the end of the cut.
            end--;
        }
        return cleaned.substring(0, end) + "...";
    }

    /**
     * Finds the publish date of a page.
     *
     * <p>A date within 80 characters of the label {@code 发布时间} is preferred.
     * Otherwise the first date anywhere in the text is used.
     *
     * @param text the page text; may be {@code null}
     * @return the date as {@code yyyy-MM-dd}, or an empty string if none was found
     */
    public static String findPublishTime(String text) {
        String cleaned = clean(text);
        int index = cleaned.indexOf("发布时间");
        if (index >= 0) {
            String slice = cleaned.substring(index, Math.min(cleaned.length(), index + 80));
            String date = findDate(slice);
            if (!date.isBlank()) {
                return date;
            }
        }
        return findDate(cleaned);
    }

    /**
     * Finds the first date in a text and normalizes it to {@code yyyy-MM-dd}.
     *
     * <p>Three formats are tried in order: day before year-month
     * ({@code 28 2026-05}), a full date ({@code 2026-05-28}), and month-day
     * without a year ({@code 05-28}). For a month-day the current year is
     * assumed, unless that date lies more than seven days in the future, in
     * which case the previous year is used. Month-day matches that are not a
     * real calendar date, such as {@code 13.45}, are skipped.
     *
     * @param text the text to search; may be {@code null}
     * @return the normalized date, or an empty string if none was found
     */
    public static String findDate(String text) {
        String cleaned = clean(text);

        Matcher splitDate = SPLIT_DAY_YEAR_MONTH.matcher(cleaned);
        if (splitDate.find()) {
            return splitDate.group(2) + "-" + pad(splitDate.group(3)) + "-" + pad(splitDate.group(1));
        }

        Matcher fullDate = DATE.matcher(cleaned);
        if (fullDate.find()) {
            return normalizeFullDate(fullDate);
        }

        Matcher monthDay = MONTH_DAY.matcher(cleaned);
        LocalDate today = LocalDate.now();
        while (monthDay.find()) {
            String resolved = resolveMonthDay(monthDay.group(1), monthDay.group(2), today);
            if (!resolved.isEmpty()) {
                return resolved;
            }
        }
        return "";
    }

    /**
     * Extracts the value that follows a label such as {@code 来源} or {@code 作者}.
     *
     * <p>The value starts after the label's colon (ASCII or full-width) and ends
     * before the next known label or at the end of the text. The label is
     * matched literally; regex metacharacters in it have no special meaning.
     *
     * @param text  the page text; may be {@code null}
     * @param label the label to look for; {@code null} yields an empty result
     * @return the cleaned value, or an empty string if the label is missing or
     *         the value still contains {@code 点击} or {@code 发布时间}
     */
    public static String findLabelValue(String text, String label) {
        if (label == null) {
            return "";
        }
        String cleaned = clean(text);
        Pattern pattern = Pattern.compile(Pattern.quote(label) + LABEL_COLON
                + "\\s*(.*?)(?=\\s*(?:来源|作者|发布时间|点击)" + LABEL_COLON + "|$)");
        Matcher matcher = pattern.matcher(cleaned);
        if (matcher.find()) {
            String value = clean(matcher.group(1));
            if (!value.contains("点击") && !value.contains("发布时间")) {
                return value;
            }
        }
        return "";
    }

    /**
     * Turns a month-day pair into a full date, or returns an empty string when
     * the pair is not a valid calendar date in the current year.
     */
    private static String resolveMonthDay(String monthText, String dayText, LocalDate today) {
        int year = today.getYear();
        try {
            LocalDate assumed = LocalDate.of(year, Integer.parseInt(monthText), Integer.parseInt(dayText));
            if (assumed.isAfter(today.plusDays(7))) {
                year--;
            }
        } catch (java.time.DateTimeException e) {
            // Looks like a date but is not one (for example "13.45"); ignore this match.
            return "";
        }
        return year + "-" + pad(monthText) + "-" + pad(dayText);
    }

    /** Formats the three groups of a {@link #DATE} match as {@code yyyy-MM-dd}. */
    private static String normalizeFullDate(Matcher matcher) {
        return matcher.group(1) + "-" + pad(matcher.group(2)) + "-" + pad(matcher.group(3));
    }

    /** Left-pads a one-digit number with a zero. */
    private static String pad(String value) {
        return value.length() == 1 ? "0" + value : value;
    }
}
||||
@ -0,0 +1,37 @@ |
|||||
|
package edu.homework.crawler.view; |
||||
|
|
||||
|
import edu.homework.crawler.model.CrawlSummary; |
||||
|
|
||||
|
public class ConsoleView { |
||||
|
public void printWelcome() { |
||||
|
println("University News Crawler"); |
||||
|
println("Type help to see commands."); |
||||
|
} |
||||
|
|
||||
|
public void printPrompt() { |
||||
|
System.out.print("crawler> "); |
||||
|
} |
||||
|
|
||||
|
public void printHelp(String text) { |
||||
|
println(text); |
||||
|
} |
||||
|
|
||||
|
public void printInfo(String text) { |
||||
|
println(text); |
||||
|
} |
||||
|
|
||||
|
public void printError(String text) { |
||||
|
System.err.println("[ERROR] " + text); |
||||
|
} |
||||
|
|
||||
|
public void printSummary(CrawlSummary summary) { |
||||
|
println("Crawl finished."); |
||||
|
println("Total items: " + summary.totalCount()); |
||||
|
summary.siteCounts().forEach((site, count) -> println(" - " + site + ": " + count)); |
||||
|
println("Saved to: " + summary.outputPath().toAbsolutePath()); |
||||
|
} |
||||
|
|
||||
|
private void println(String text) { |
||||
|
System.out.println(text); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,38 @@ |
|||||
|
package edu.homework.crawler.command; |
||||
|
|
||||
|
import edu.homework.crawler.exception.CommandException; |
||||
|
import org.junit.jupiter.api.Test; |
||||
|
|
||||
|
import java.util.List; |
||||
|
import java.util.Map; |
||||
|
|
||||
|
import static org.junit.jupiter.api.Assertions.assertEquals; |
||||
|
import static org.junit.jupiter.api.Assertions.assertThrows; |
||||
|
|
||||
|
class CommandRegistryTest { |
||||
|
@Test |
||||
|
void tokenizesQuotedOutputPath() throws Exception { |
||||
|
CommandRegistry registry = new CommandRegistry(); |
||||
|
|
||||
|
List<String> tokens = registry.tokenize("crawl --site all --out \"data/my news.json\""); |
||||
|
|
||||
|
assertEquals(List.of("crawl", "--site", "all", "--out", "data/my news.json"), tokens); |
||||
|
} |
||||
|
|
||||
|
@Test |
||||
|
void parsesOptionsAsKeyValuePairs() throws Exception { |
||||
|
CommandRegistry registry = new CommandRegistry(); |
||||
|
|
||||
|
Map<String, String> options = registry.parseOptions(new String[]{"--site", "hnu", "--limit", "5"}); |
||||
|
|
||||
|
assertEquals("hnu", options.get("site")); |
||||
|
assertEquals("5", options.get("limit")); |
||||
|
} |
||||
|
|
||||
|
@Test |
||||
|
void rejectsMissingOptionValue() { |
||||
|
CommandRegistry registry = new CommandRegistry(); |
||||
|
|
||||
|
assertThrows(CommandException.class, () -> registry.parseOptions(new String[]{"--site"})); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,29 @@ |
|||||
|
package edu.homework.crawler.repository; |
||||
|
|
||||
|
import edu.homework.crawler.model.NewsItem; |
||||
|
import org.junit.jupiter.api.Test; |
||||
|
import org.junit.jupiter.api.io.TempDir; |
||||
|
|
||||
|
import java.nio.file.Files; |
||||
|
import java.nio.file.Path; |
||||
|
import java.util.List; |
||||
|
|
||||
|
import static org.junit.jupiter.api.Assertions.assertTrue; |
||||
|
|
||||
|
class FileNewsRepositoryTest { |
||||
|
@TempDir |
||||
|
Path tempDir; |
||||
|
|
||||
|
@Test |
||||
|
void savesJsonFile() throws Exception { |
||||
|
FileNewsRepository repository = new FileNewsRepository(); |
||||
|
NewsItem item = new NewsItem("湖南大学", "hnu", "测试新闻", "https://example.com/news"); |
||||
|
Path output = tempDir.resolve("news.json"); |
||||
|
|
||||
|
repository.save(List.of(item), OutputFormat.JSON, output); |
||||
|
|
||||
|
String json = Files.readString(output); |
||||
|
assertTrue(json.contains("测试新闻")); |
||||
|
assertTrue(json.contains("hnu")); |
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,29 @@ |
|||||
|
package edu.homework.crawler.util; |
||||
|
|
||||
|
import org.junit.jupiter.api.Test; |
||||
|
|
||||
|
import static org.junit.jupiter.api.Assertions.assertEquals; |
||||
|
|
||||
|
class TextExtractorsTest { |
||||
|
@Test |
||||
|
void normalizesChinesePublishDate() { |
||||
|
String text = "来源:新闻网 作者:张三 发布时间:2026年05月28日 17:14 点击:100次"; |
||||
|
|
||||
|
assertEquals("2026-05-28", TextExtractors.findPublishTime(text)); |
||||
|
} |
||||
|
|
||||
|
@Test |
||||
|
void extractsSimpleLabelValue() { |
||||
|
String text = "来源:新闻网 作者:李四 发布时间:2026-05-28"; |
||||
|
|
||||
|
assertEquals("新闻网", TextExtractors.findLabelValue(text, "来源")); |
||||
|
assertEquals("李四", TextExtractors.findLabelValue(text, "作者")); |
||||
|
} |
||||
|
|
||||
|
@Test |
||||
|
void normalizesSplitDayYearMonthDate() { |
||||
|
String text = "28 2026-05"; |
||||
|
|
||||
|
assertEquals("2026-05-28", TextExtractors.findDate(text)); |
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue