diff --git a/w4/.gitconfig b/w4/.gitconfig
new file mode 100644
index 0000000..e853fd6
--- /dev/null
+++ b/w4/.gitconfig
@@ -0,0 +1,2 @@
+[safe]
+	directory = *
diff --git a/w4/Spider - 副本.java b/w4/Spider - 副本.java
new file mode 100644
index 0000000..9a7f8f4
--- /dev/null
+++ b/w4/Spider - 副本.java
@@ -0,0 +1,230 @@
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Simple paginated web scraper: fetches listing pages one by one, delegates
+ * extraction to {@link #parsePage(Document)}, and can dump the results as CSV.
+ */
+public class Spider {
+    /** Entries per listing page; used to compute the {@code start} query offset. */
+    protected static final int PAGE_SIZE = 25;
+
+    protected String baseUrl;
+    protected String userAgent;
+
+    /**
+     * @param baseUrl URL of the listing; page offsets are appended as {@code ?start=N}
+     */
+    public Spider(String baseUrl) {
+        this.baseUrl = baseUrl;
+        this.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
+    }
+
+    /**
+     * Downloads and parses the page at {@code url} with a 10 second timeout.
+     *
+     * @param url absolute URL to fetch
+     * @return the parsed document, or {@code null} if the request failed (the error is logged)
+     */
+    public Document fetchPage(String url) {
+        try {
+            return Jsoup.connect(url)
+                    .userAgent(userAgent)
+                    .timeout(10000)
+                    .get();
+        } catch (IOException e) {
+            System.err.println("Error fetching " + url + ": " + e.getMessage());
+            return null;
+        }
+    }
+
+    /**
+     * Extracts the movies from one fetched page. Subclasses must override this.
+     *
+     * @param doc a document returned by {@link #fetchPage(String)}
+     * @return the movies found on the page
+     * @throws UnsupportedOperationException always, in this base implementation
+     */
+    public List<Movie> parsePage(Document doc) {
+        throw new UnsupportedOperationException("Subclass must override parsePage method");
+    }
+
+    /**
+     * Writes {@code data} to {@code filename} as UTF-8 CSV, overwriting any existing file.
+     * Does nothing except print a notice when {@code data} is {@code null} or empty.
+     *
+     * @param data movies to write
+     * @param filename path of the CSV file to create
+     */
+    public void saveData(List<Movie> data, String filename) {
+        if (data == null || data.isEmpty()) {
+            System.out.println("No data to save");
+            return;
+        }
+
+        // Explicit UTF-8: the platform default charset can mangle non-ASCII titles.
+        try (PrintWriter writer = new PrintWriter(filename, "UTF-8")) {
+            writer.println("title,rating,link");
+            for (Movie movie : data) {
+                writer.println(csvField(movie.getTitle()) + ","
+                        + csvField(movie.getRating()) + ","
+                        + csvField(movie.getLink()));
+            }
+            // PrintWriter swallows write errors, so check before reporting success.
+            if (writer.checkError()) {
+                System.err.println("Error saving data: failed writing to " + filename);
+                return;
+            }
+            System.out.println("Data saved to " + filename);
+        } catch (IOException e) {
+            System.err.println("Error saving data: " + e.getMessage());
+        }
+    }
+
+    /**
+     * Quotes a CSV field when it contains a comma, double quote, or line break,
+     * doubling embedded quotes; {@code null} becomes an empty field.
+     */
+    private static String csvField(String value) {
+        if (value == null) {
+            return "";
+        }
+        boolean needsQuoting = value.indexOf(',') >= 0 || value.indexOf('"') >= 0
+                || value.indexOf('\n') >= 0 || value.indexOf('\r') >= 0;
+        return needsQuoting ? "\"" + value.replace("\"", "\"\"") + "\"" : value;
+    }
+
+    /**
+     * Builds the URL of a listing page.
+     *
+     * @param page 1-based page number
+     * @return {@code baseUrl} with the matching {@code start} offset appended
+     */
+    protected String buildPageUrl(int page) {
+        return baseUrl + "?start=" + ((page - 1) * PAGE_SIZE);
+    }
+
+    /**
+     * Crawls consecutive pages starting at {@code startPage}, pausing one second
+     * between requests. Stops at {@code endPage}, at the first page that fails to
+     * load or yields no movies, or when the thread is interrupted.
+     *
+     * @param startPage 1-based number of the first page to fetch
+     * @param endPage last page to fetch (inclusive), or {@code null} for no upper bound
+     * @return every movie collected so far; never {@code null}
+     */
+    public List<Movie> run(int startPage, Integer endPage) {
+        List<Movie> allData = new ArrayList<>();
+        int currentPage = startPage;
+
+        while (true) {
+            // Always derive the URL from the page number. Using the bare baseUrl when
+            // endPage is null would re-fetch the first page on every iteration, forever.
+            String url = buildPageUrl(currentPage);
+            System.out.println("Processing page " + currentPage + ": " + url);
+
+            Document doc = fetchPage(url);
+            if (doc == null) {
+                break;
+            }
+
+            List<Movie> pageData = parsePage(doc);
+            if (pageData == null || pageData.isEmpty()) {
+                break;
+            }
+
+            allData.addAll(pageData);
+
+            if (endPage != null && currentPage >= endPage) {
+                break;
+            }
+
+            currentPage++;
+            try {
+                Thread.sleep(1000); // throttle requests to avoid getting blocked
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
+
+        return allData;
+    }
+}
+
+/** Immutable value holder for one scraped movie: title, rating text, and link. */
+class Movie {
+    private final String title;
+    private final String rating;
+    private final String link;
+
+    public Movie(String title, String rating, String link) {
+        this.title = title;
+        this.rating = rating;
+        this.link = link;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getRating() {
+        return rating;
+    }
+
+    public String getLink() {
+        return link;
+    }
+}
+
+/** Spider for the Douban Top 250 movie listing. */
+class DoubanMovieSpider extends Spider {
+    public DoubanMovieSpider() {
+        super("https://movie.douban.com/top250");
+    }
+
+    /**
+     * Reads title, rating and first link from every {@code div.item} on the page.
+     * Items missing any of the three elements are skipped.
+     *
+     * @param doc the listing page; {@code null} yields an empty list
+     * @return the movies found, in page order
+     */
+    @Override
+    public List<Movie> parsePage(Document doc) {
+        List<Movie> movies = new ArrayList<>();
+        if (doc == null) {
+            return movies;
+        }
+
+        Elements items = doc.select("div.item");
+        for (Element item : items) {
+            Element title = item.select("span.title").first();
+            Element rating = item.select("span.rating_num").first();
+            Element link = item.select("a").first();
+            // first() returns null when nothing matched; skip rather than throw an NPE.
+            if (title == null || rating == null || link == null) {
+                continue;
+            }
+            movies.add(new Movie(title.text(), rating.text(), link.attr("href")));
+        }
+
+        return movies;
+    }
+}
+
+class Main {
+    public static void main(String[] args) {
+        DoubanMovieSpider spider = new DoubanMovieSpider();
+        List<Movie> data = spider.run(1, 10); // crawl the first 10 pages
+        spider.saveData(data, "douban_movies.csv");
+        System.out.println("爬取完成,共获取" + data.size() + "条数据");
+    }
+}
\ No newline at end of file
diff --git a/w4/Spider.java b/w4/Spider.java
new file mode 100644
index 0000000..9a7f8f4
--- /dev/null
+++ b/w4/Spider.java
@@ -0,0 +1,230 @@
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Simple paginated web scraper: fetches listing pages one by one, delegates
+ * extraction to {@link #parsePage(Document)}, and can dump the results as CSV.
+ */
+public class Spider {
+    /** Entries per listing page; used to compute the {@code start} query offset. */
+    protected static final int PAGE_SIZE = 25;
+
+    protected String baseUrl;
+    protected String userAgent;
+
+    /**
+     * @param baseUrl URL of the listing; page offsets are appended as {@code ?start=N}
+     */
+    public Spider(String baseUrl) {
+        this.baseUrl = baseUrl;
+        this.userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
+    }
+
+    /**
+     * Downloads and parses the page at {@code url} with a 10 second timeout.
+     *
+     * @param url absolute URL to fetch
+     * @return the parsed document, or {@code null} if the request failed (the error is logged)
+     */
+    public Document fetchPage(String url) {
+        try {
+            return Jsoup.connect(url)
+                    .userAgent(userAgent)
+                    .timeout(10000)
+                    .get();
+        } catch (IOException e) {
+            System.err.println("Error fetching " + url + ": " + e.getMessage());
+            return null;
+        }
+    }
+
+    /**
+     * Extracts the movies from one fetched page. Subclasses must override this.
+     *
+     * @param doc a document returned by {@link #fetchPage(String)}
+     * @return the movies found on the page
+     * @throws UnsupportedOperationException always, in this base implementation
+     */
+    public List<Movie> parsePage(Document doc) {
+        throw new UnsupportedOperationException("Subclass must override parsePage method");
+    }
+
+    /**
+     * Writes {@code data} to {@code filename} as UTF-8 CSV, overwriting any existing file.
+     * Does nothing except print a notice when {@code data} is {@code null} or empty.
+     *
+     * @param data movies to write
+     * @param filename path of the CSV file to create
+     */
+    public void saveData(List<Movie> data, String filename) {
+        if (data == null || data.isEmpty()) {
+            System.out.println("No data to save");
+            return;
+        }
+
+        // Explicit UTF-8: the platform default charset can mangle non-ASCII titles.
+        try (PrintWriter writer = new PrintWriter(filename, "UTF-8")) {
+            writer.println("title,rating,link");
+            for (Movie movie : data) {
+                writer.println(csvField(movie.getTitle()) + ","
+                        + csvField(movie.getRating()) + ","
+                        + csvField(movie.getLink()));
+            }
+            // PrintWriter swallows write errors, so check before reporting success.
+            if (writer.checkError()) {
+                System.err.println("Error saving data: failed writing to " + filename);
+                return;
+            }
+            System.out.println("Data saved to " + filename);
+        } catch (IOException e) {
+            System.err.println("Error saving data: " + e.getMessage());
+        }
+    }
+
+    /**
+     * Quotes a CSV field when it contains a comma, double quote, or line break,
+     * doubling embedded quotes; {@code null} becomes an empty field.
+     */
+    private static String csvField(String value) {
+        if (value == null) {
+            return "";
+        }
+        boolean needsQuoting = value.indexOf(',') >= 0 || value.indexOf('"') >= 0
+                || value.indexOf('\n') >= 0 || value.indexOf('\r') >= 0;
+        return needsQuoting ? "\"" + value.replace("\"", "\"\"") + "\"" : value;
+    }
+
+    /**
+     * Builds the URL of a listing page.
+     *
+     * @param page 1-based page number
+     * @return {@code baseUrl} with the matching {@code start} offset appended
+     */
+    protected String buildPageUrl(int page) {
+        return baseUrl + "?start=" + ((page - 1) * PAGE_SIZE);
+    }
+
+    /**
+     * Crawls consecutive pages starting at {@code startPage}, pausing one second
+     * between requests. Stops at {@code endPage}, at the first page that fails to
+     * load or yields no movies, or when the thread is interrupted.
+     *
+     * @param startPage 1-based number of the first page to fetch
+     * @param endPage last page to fetch (inclusive), or {@code null} for no upper bound
+     * @return every movie collected so far; never {@code null}
+     */
+    public List<Movie> run(int startPage, Integer endPage) {
+        List<Movie> allData = new ArrayList<>();
+        int currentPage = startPage;
+
+        while (true) {
+            // Always derive the URL from the page number. Using the bare baseUrl when
+            // endPage is null would re-fetch the first page on every iteration, forever.
+            String url = buildPageUrl(currentPage);
+            System.out.println("Processing page " + currentPage + ": " + url);
+
+            Document doc = fetchPage(url);
+            if (doc == null) {
+                break;
+            }
+
+            List<Movie> pageData = parsePage(doc);
+            if (pageData == null || pageData.isEmpty()) {
+                break;
+            }
+
+            allData.addAll(pageData);
+
+            if (endPage != null && currentPage >= endPage) {
+                break;
+            }
+
+            currentPage++;
+            try {
+                Thread.sleep(1000); // throttle requests to avoid getting blocked
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
+
+        return allData;
+    }
+}
+
+/** Immutable value holder for one scraped movie: title, rating text, and link. */
+class Movie {
+    private final String title;
+    private final String rating;
+    private final String link;
+
+    public Movie(String title, String rating, String link) {
+        this.title = title;
+        this.rating = rating;
+        this.link = link;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getRating() {
+        return rating;
+    }
+
+    public String getLink() {
+        return link;
+    }
+}
+
+/** Spider for the Douban Top 250 movie listing. */
+class DoubanMovieSpider extends Spider {
+    public DoubanMovieSpider() {
+        super("https://movie.douban.com/top250");
+    }
+
+    /**
+     * Reads title, rating and first link from every {@code div.item} on the page.
+     * Items missing any of the three elements are skipped.
+     *
+     * @param doc the listing page; {@code null} yields an empty list
+     * @return the movies found, in page order
+     */
+    @Override
+    public List<Movie> parsePage(Document doc) {
+        List<Movie> movies = new ArrayList<>();
+        if (doc == null) {
+            return movies;
+        }
+
+        Elements items = doc.select("div.item");
+        for (Element item : items) {
+            Element title = item.select("span.title").first();
+            Element rating = item.select("span.rating_num").first();
+            Element link = item.select("a").first();
+            // first() returns null when nothing matched; skip rather than throw an NPE.
+            if (title == null || rating == null || link == null) {
+                continue;
+            }
+            movies.add(new Movie(title.text(), rating.text(), link.attr("href")));
+        }
+
+        return movies;
+    }
+}
+
+class Main {
+    public static void main(String[] args) {
+        DoubanMovieSpider spider = new DoubanMovieSpider();
+        List<Movie> data = spider.run(1, 10); // crawl the first 10 pages
+        spider.saveData(data, "douban_movies.csv");
+        System.out.println("爬取完成,共获取" + data.size() + "条数据");
+    }
+}
\ No newline at end of file
diff --git a/w4/图形面积计算器重构 完整整合版.docx b/w4/图形面积计算器重构 完整整合版.docx
new file mode 100644
index 0000000..d4b7d56
Binary files /dev/null and b/w4/图形面积计算器重构 完整整合版.docx differ