From 2d46046e7042d32273654b6569d17396da4504cb Mon Sep 17 00:00:00 2001 From: WangJunyue <2768762959@qq.com> Date: Sun, 31 May 2026 00:32:59 +0800 Subject: [PATCH] w11 --- w11/AI架构审计.docx | Bin 0 -> 21815 bytes w11/java-cli-w11/.gitignore | 4 + w11/java-cli-w11/pom.xml | 62 ++++++++++ .../java/com/example/datacollect/Main.java | 41 +++++++ .../datacollect/command/AnalyzeCommand.java | 103 ++++++++++++++++ .../example/datacollect/command/Command.java | 8 ++ .../datacollect/command/CrawlCommand.java | 87 ++++++++++++++ .../datacollect/command/ExitCommand.java | 27 +++++ .../datacollect/command/HelpCommand.java | 26 ++++ .../datacollect/command/ListCommand.java | 26 ++++ .../controller/CrawlerController.java | 64 ++++++++++ .../exception/CrawlerException.java | 10 ++ .../exception/NetworkException.java | 10 ++ .../datacollect/exception/ParseException.java | 10 ++ .../exception/UrlFormatException.java | 10 ++ .../example/datacollect/model/Article.java | 72 +++++++++++ .../repository/ArticleRepository.java | 113 ++++++++++++++++++ .../datacollect/strategy/BlogStrategy.java | 25 ++++ .../datacollect/strategy/CrawlStrategy.java | 11 ++ .../datacollect/strategy/HnuNewsStrategy.java | 77 ++++++++++++ .../datacollect/strategy/NewsStrategy.java | 25 ++++ .../datacollect/strategy/PeopleStrategy.java | 83 +++++++++++++ .../datacollect/strategy/StrategyFactory.java | 36 ++++++ .../datacollect/strategy/YouthStrategy.java | 87 ++++++++++++++ .../example/datacollect/util/RetryUtils.java | 51 ++++++++ .../example/datacollect/view/ConsoleView.java | 46 +++++++ .../src/main/resources/logback.xml | 24 ++++ w11/java-cli-w11/target/classes/logback.xml | 24 ++++ .../target/maven-archiver/pom.properties | 3 + .../compile/default-compile/createdFiles.lst | 22 ++++ .../compile/default-compile/inputFiles.lst | 22 ++++ 31 files changed, 1209 insertions(+) create mode 100644 w11/AI架构审计.docx create mode 100644 w11/java-cli-w11/.gitignore create mode 100644 w11/java-cli-w11/pom.xml create mode 
100644 w11/java-cli-w11/src/main/java/com/example/datacollect/Main.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/command/AnalyzeCommand.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/command/Command.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/command/CrawlCommand.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/command/ExitCommand.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/command/HelpCommand.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/command/ListCommand.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/controller/CrawlerController.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/exception/CrawlerException.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/exception/NetworkException.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/exception/ParseException.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/exception/UrlFormatException.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/model/Article.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/repository/ArticleRepository.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/BlogStrategy.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/NewsStrategy.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/PeopleStrategy.java create mode 100644 
w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/StrategyFactory.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/YouthStrategy.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/util/RetryUtils.java create mode 100644 w11/java-cli-w11/src/main/java/com/example/datacollect/view/ConsoleView.java create mode 100644 w11/java-cli-w11/src/main/resources/logback.xml create mode 100644 w11/java-cli-w11/target/classes/logback.xml create mode 100644 w11/java-cli-w11/target/maven-archiver/pom.properties create mode 100644 w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst create mode 100644 w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst diff --git a/w11/AI架构审计.docx b/w11/AI架构审计.docx new file mode 100644 index 0000000000000000000000000000000000000000..65c0ec358982d01cd6d28dd1587fcfaee0cc2d5f GIT binary patch literal 21815 zcmeFXV{oobuqc{j#kOtRwr$(CZQHhOXC+y&ZLipNa`SzA@4DyIKKI_bzt4H9W@_f? z?&;^9w|ly$yXB>R0V4x|0)PVm01yB$CfKQ&0s;V(fB*m>0e}N(3E9~?o7g(*DSOzP zIO)*3+gRfl0t1of0|5Qt|DWf7;~i*Dp0FIGhZlJc`3jnAU6t%4FZnZ87-xcI{stJy zEGzsdSk(R1d%(_6Ld?KCHjF>B=`{n+5Hz=*X$^tU;zn|g#*-S5q-()OwYKt?#?Kgo zbT65HTVRaU-=3VzX zK2nLK!J|aY7?L5<$jTiyC@uI_`x=E5$kX|!V*rnI`Oir>_id?+ovcCh? 
zT?{4~Pr96HW%Jff!0C2C=szqp&N z%$#cVtnG|n@*vFxG9d*W+`Oinp#D1g?+GO4-mTksM2K3zdZ#O4_5O!@OXR*uE&mMHPuv8|w(B2=?2q9qm8;|53~Tjdk&V z`03S&{Xc36&3_&A88p|Uu-=beBui&ByMew22CgL`jkK|9x$^bNy}Amdb7~+qF~5*B z>*av@xbAy?x0u$K-vHYq#sP*3M4v-SkUBckLY`+hkb@XKVIZh-&F&-49ff6}` z37YT{o<8*(>5x+R>97pSf{;2JdtMC1D$e7T1d(!yebg=>94V(p$QEASZIv10N)$tALfNfu#+)6~TL7Xs$F)l`% zv5BaU?7whKzk==4N*URr|yoG^vQ(}3I4+IX^YI4*Q>?Ib-T+BlB*U{0Ddi`NP# zqBLpmVXf)59HTdW`palkS80t=}!I&z2y=uzqOdZewC}hK4;Kkv6Hu1{BK)v zL9_Vut@uCF_?L!de{DhK;<6~*@2xqLqSbGhnJ{HV8IMVa{eA()iB-R3xD(py1kzLV zf|4T2<(%h>+|+Oe6C-V~0TX0Pk4yq1RS8JoVP%XK$Exmp%Y6DQJ`}xto)vtBX5hb5 ze`F)G(rHFit9Y>q1#*3Ks;W@aE)QTZ0p6y|nvCopxLn3xAY*oOsQz;7#Q~ZoLuxsu zja~>YT$XIHNQc$}kjT_4(P0eG+Quc6L6AoZDDM-P55C-~ZXt%?a6cc!DJTW(U@o|; z00ig)xKT)&RbSK(S3%!9Np&V-cMxo-?C)0=u;14TFM)8`ogQWe2$#8l0@k;@BVGFqh3rJx^v29YulQ@9n7(6nBdQ**@(ytIR_*ulj=EfIzp z8;>VF|BE0g7kwuXl@GlcNsICw$T9Z`u!`~~Mv~mCe%PBA6(cN@8-vHwkU3>qc$!#0 zD*q)kA2?JSGXI-^s3_=tq5DKv>_h3(fi=MQ52>IyNi24l>~4nh1tMqcAlAj)tvIP8 z&^Y{5Lv=aE?B9Z_*TV(I~Xk!`XtLY>T;Dv!z%MhY~d2`+4Um-+q z;-ebcEo^<%bau&UX%|}G^<{YLae2zAtl;!O%n4@B_|M=(6%ocLED`te&~dqO{5Th` z+dXbTdlNmPY6y*wf@t;;>~??!=HgSTpGwOitU_9J(fb@jC5>b+x|a+?0oUoGWZX zZ$srAvDxi#A*bUja)KN6FnnneWiw|O4;<(7?7`_cY)BC^G$%d>UoAvu6%olOa+!~E z@cqk54Lt;Y^R<7p*bb=h8g9B`}`12*q#D&%tYY||Isugh@-_M^U? z``qn^o9@udOJYwkC78aYmuPB|=Xxo3P(w>@ z)1S}N8+I31b@|w7db}ZqXVj>>ZIo_!sd{~(^nPPH1o}#t@{!UNlkqZIji;%un`7m^ z29p@aUxp?GVIEDx(OTGQ+Br7Hor-(9r%O|qcm$*>*;s)F11&mm7SBA&VE+bigLg1i=59M>>KoB;*} z@O=Rj=Dc2*l<+AKR;Yho z#JyCP?qDK3gh_LU`o<0r1qW!sqKgh}t@&mgW$7N7qus?$FvEw`z%BP0Em?H2t+3cL z4pDRtEaZcb8I3;5`WT8Dnws(ni^f#<;0=AROs_Z0dT=1%$t;bcu_&cx^t%o3Gaw

$1@m8;FL{JC*5;u`j&x9qFNM?)j@z(aLW6N4DKW4)5MwF>zGl{nx|C;x=sh>HcO zBc7fKC+5fFV-vQ2JyUND&d5CE0n6x`gj$8E{F`sZZJHzpB}BGc%r2o}5f? zNJ>yp8cRnA7JT4jO*xz*o31%Nd`|k&#MR;Tq^hT+ESH7#;ab#txvf^4!#Czw$;%Ly z`>!UR)4nk~zv79pfVa%5EH^+-YKHwz)O4&~zk`AS0jS!e?rwDzjuD z|2>jiSY_tE8B$TeU&0Y(ffkW&M<09OS3l<3v$$1Vy_qcUAh;ji8)ai+n0pXMB7&VE z^^bkwuW6EvJ@5wPp$6&v`WbA9HLRqefnQ!nx~JoEGWJaqJF3Ph1t6tj>iDw9AS%Sz z+&Rzul^^=1Z8R509e)b$23@?~G=GGcRD)^g(aP1grn_5P*V9%`mh-tZyQFM-LoKBO zOI#dxlahue%y*h5UVK+t2dKc9qwV96%C#*5_Zp;F_b zjw-!*kQxw6lB#x=0Fzv8liiGTYMyLi!SBteeac07yX>L7i;*bc@&HY|LQ!zdOQ>nf z*0j9ecz;~@k>g2Z5a!x`1wkEq!V(mw0$KR+fRI$R-?q2A(mc!^tAfVVB$~29>AdNM z^`cJ{)8$)s$TKr0u<*fh$knPkMwKJ~z2t5#A}%C#=*B;j>^rwF<#T=!*f+c+2#er% z@u9tg$MT3bxAz*2f28UV`w=sFSHsOoZ)LRf%5?s?hQxQS1FR6~tY#96ei;QDaI&@WL5U9A9nL<`zyR=AJZ4qc(XEy!6qI)<`PlKULpbLd1e? z>GG`C(q=Wq^8@nV(j={UQ<(ZiWXHif=a_kfAhrXOs6sk0aMq+R_EhKcnhmegs(ulBG}7`8jWE zSNy#@jHwfk2PT)9JJpSu=^DWI3;JqD!GZE=P6OG9oAKH2gQKa26hkEbfKf+9Ped-( zs9*lDDUeqtH_^Ru#$W1z?-wJA*$@&PzpRX%fOBq4I}6MU+7F>&wEz!efHPBHR}qKE23Y-(EAH8m_{M5IvAKh+hP} z+f4=?p}r8E+Rm3>8;PAV4Zj?J%=NcUlAh3f=eC4VaYXPuraTLedWE|1Rm8ADXa9qx z{#MXS$&90tJ!WLjO(mz2s=ScZ0E= ztm+8DlKl?=+dF_SV4qkZUQ0&y$M!A18wc>|BC{YH--Kc7qE~}_k;kAC z&(W#aY>`gKMkjvZBa}68$2rf(3vY=3^-K(_$Hk_C1f6IIuBv^y?qr6xDu+DmA}Ii? 
zOU^>|ue|ro(lM6{H}q{x?cBoCOerwdEWy~R5tu_At7@`;2}krtb8H_5p|en&Jl>1v zb{2#R5h@v(nBGUGM1e}_?c3%CTqPPBJI5|@u%zN0j(^v4aLg9A2;A>-Fpf(+@gkpt zlo^xGFYNItW3|X%0I~wMaIBA5Z)PK{;_8_!Dk;ogx7I#{pq#HglL=)cmI3lz$>IkC zDx(i-0cfC8^DuPJDe)hLW-((M)TfpzN}kcTlI6BL?@Zf^_c2@6?b6chjz%u;0*C)l z>-es%T2C_J;D-{V8o?F1^1GxcPbO9Rx;OP?N7~@}l&B)qr`V}UTnR;c2Z||$0rThG6$hs?4{DN`trVz@*k%|5q*w$l=#n2f$^3H~5GkQrwvVeb zq=7#45$9U#ue#Uww7ga20G3(mz+x0+q%q~{Fc%e7f%Obd-ZRE_L=&HqJ6~B$fiEwW z&CEt(KBI?bl3U8=Viy6K9Q7F2GOt+MYZmBa)RX8H%=)xR$Pht(TR6;+ja%NxHZU(A zi$);?qbf-107&j{tab`G$`yKi9G`YlqA#{EM&#AMA%@f8!H|e8mo8?AK!qqnhG@D- z7$93r5u3hkmA8XYGSG50D>yH@Cn;1aiWdR z|Chys{>*lS>GmFB0y-zAj4(ppC4ikSWHi!vCTF$x#AaowJzpUjABq=>w1r+GF@GXi zPjLm7b~{E2zVPs3Wd#C!S^{(AY+fI~!t(lwsu=9{v6GE-vq}C=zY6|R`*t1-GPWK_ zNuOOaDM)!{A@g8vrxoRum$VTm(hxubdY;kcQo%VLvmZ0cEY5l04HwG&B>lPvWjkIZ zV?U6&YAaJt!qN^SRMjE~>JND~l1s+Yua?{WStbnmnc}g)J>he83`!Vd3wv!8#9;zr zf+x@J{Xd#_0yY9hi08oM3VQ(P(dMJGQD;&^LIY*Zg%s2`dX?5<-je$w4cX*poH3YJ zUUZQKfJCE`@su&^p#7pU6O2s2!luOd%_CPH!uDUS0DdY@2B@w12Cnl5G6~3@0>tnewyQaoHDUTlWuM zdDx9J4UMqnk#Ih68OnDVH&e(t@-a}~l4mA(N&422az0N4*(92pn!v?&t;Ag;9HXX! 
zP>xSr+yJGi&ss=NZ!?1N*KX{Jk7kPn)(LXGu&;sn1+9|b z0e}+om>8ZB5$ti&04ibVoO7q~Q0}3}9m ziP@7GJ)b0KNg&^~_=3sy^3V1oTh)`{=-@91b55iWlnC)MHH-i=4V|Yo-%Xn2cvxdu zX3(JaL55v>Xm;#ho>l^!lOe~%86cq7OgQU+2w_13wGkh;pT*ZojfLWc+~`9lgAD~U zBca*F6rq3ML~*|Dtxq7To{X(Wu_$Szf*3-C8Cj9ZBT7{IrfynKqC_#MZJ>t{n{Ajl z?J*=%{lBV*nF1?eM>8dQs)ZcHmbW#JXx|s{yg0yLtiaXmoZH4eccpNErTVrCnm35b zLok(5bddoN3kt;DREp(1F8ZMlJF4z3GN63I2c+k^t8W$aS?@^iq)bHQ**RZ0+NISb zi(ng3l)OeC_si$s)Ji@C#fB7ss3rmt%2!b>9FVKfk>O1i|NM5&X=jf_Vuz*Iym1;B z2^&)AA&*5ARy{QDqfuY%rUN+PR|PV8(HfO(i!ACi{iW4j8Ki_;a>~quZ}GBc56;CI zbEx26)VEN^D-xmrpbKvGMxkJ})h&o@WE6?UoY=@K(~;4@(FlS4{1+IO$FuzIN+OX;pc;`gCd5 zB`2=)tNRcA7oqWeuRS77R4BJLMxWH)#+8V8Xhp2>=ZS1<{({#Nn2cF1dlGZt5@^Zi3{gY3ii#Ny5ps@`{CXt82HuT&L z1`kDfQz&chiF4-L@xPSCW^ady!luUt+rz^YxK#}(LU3ufW<`27_#6=ISnTQ;F~~Dg zIZXLmf0M3Lj^z7GL-0z4*=(a>nnTyMUxVhgzgLPLp^Km#_D7mMmOD2<$sX2wJD_Fs zw20!y+Jg6;NI8bu+sJy(Id{8Q zOSzkYfI^xFm`#1h6Ny_63A>|k44ieSG!+xYU4#Vr7xv~;)t%U>Dt+G zOZ5Z)q%=mWU9GB0;ljGv``5vpf(sD?MmP2t+_j(;*Wy8{bIxx9GS0`3Iecz_sfNwx zWe7xe;a4zPrboZeF6VWB>$)5CQ*nH~8&(58*;7tH6{wG%+7X-!|8Z0= zPc^Pa;y54Pdaw~J<5jgP*Q@GWS|D0`N+>A{$5du+ynn4!c(+h1$2NNY zePZWXj$^=~=r`(b6PlRh1Nm5^U4SXIK96llr?>n&z`mnv- zw@lq#e`eFk@m28)HX_A9CDlYAKUI#FuE!A-N`}iWM?x769s1*z4Ez)*af$O@l(d@p zNZVaYL>A0hO&&7wJwMA^#fQF_&m0kjBJwBWNP_?%gCKW;#d9u*y3TWMU? 
z(z}cpxrb>l)m8h{%{Ts||-Qf->+C3`LiA(0FI$;E$sj8y?`=xf2{6{K!)3JP4 z4(1^A>>^|AZei?emfpX}JwoMI%D^GDx0d$jubd%_bOg^sj$MXaw%n%fWa|~ z*PD;W;l%uw+H_BzJv>$&hC&?3Rf*d(!f)y$vK*|q94{(j`Z1vNT5eWGDK(ZRe00dg z$(ntapQx6VX@DW}cs|h3I6JCNT=#Q?qbUjuIlesU?7t!DY6xAFt)Zp0yZ5%w!t#LW zD(RbhmCkwwy%p964&0~43^D@QDPOz424vx>xH?-37yvfxwAJR+E%*%aC;rEluIR7{ zY5tHvD0vjMTOtc?P#ybkT3)XbuOoOj;`I=Tt_49#Fk>fc4>-5gZZ>aa6Jv<()(=aa zJZxH3#~#X^E&J5kBWQfcPtsDWao(S=Gk#a&poJ?^J3aNsr8z?sUT2wFK27QxqXJ}W z8Gd4^F#Y1q(4K8q7J7Fghtr}CK1i`l_)5%`M8{uV*(L`?(diaVnIi|L0;>~ok>^eB z4+5ijlrs~^OI3);U;A-K%g}K=6In`6>rh^Yj<2W16Kvv$3SUd-8|3$vhpj#JN8H{v za(u1#R`%Kvuceu;hb6TYM;5r5G;MZ$!x^TzPlbHkiJQ`g=VbBfC2SsQ(cyO!!kK93 zt!iEfPZ^>jW9lmf!y1*|Tun{0_x{w5FAi?U{pZ6kocr;b+>k2Wtj;XXZ;NFFW)1pi zwu5DrM6(~pD)K*}snsN?r}dASj=HhLId=gjXxWD~{)XkWYK#`y<+&a;qOI##->*Ib z?C?SoNRC^vkzG4oV`1s;TLt!3(O{Xl!Sj5)V^ZE6nKBw@N{MpZ%aapeO^Yg8jyY#Ss5(`R!rXaeDKUja#hiXbcqol$rJIql zc9f79Q*-$t2+^STdudMBu}?q(Y2vA5myhrNWqPuyHwp*j2h#cr000{R9Pr=tbdds*|SvDKiPa4(SbA^`haj5rRl?q6pB|dImK1`V@LJDwfgg z^AV9m3kV1*ystsw%D?Yn0cs=$2`z1OvC{;^8+zIUNhh-tDkZ-{-vx7KR$2B~M$vSNNB3i2$D%z96t)QKwbdrU$Q^=1P+svMk|G8+i{ zQQHT#Y5T$@g4WubfL+T8ej*(n+4L63KDp17b>s#+sh}YgQlk=LWGyquJrd$JBTL{ABbL78P}xvt>V85RufE-EJQ?OT?_^C^JR+U$m{cAsFHeb@{+$` zn494~>0~5M4#ghCnVzDpGnl7JgB!ahJc)gdPZl`D#6~iq(ymArIfk|2Q(d*cvyVC1 zahf0`iekOa?UgM$V&lnk@W&N59n#d>(`b8Nhe4cmhuQmKN6YrP&)OT3@9&o?(S_I$ zk5{qF=vhs;1rRg2D`nX^0ec{7Kyz2hNyCe4lL~IOBAyggxxUGCvC-(V*5glfIiqCKp${HhaJ zGctnG!*k8*pZ#qmFD=2jA)JN1!B1r?qz}I%oy!=pIm+T1b^7cUY?~HLEi$ z1~5A>w~M_dF}OV6ues|TU&8=54o=EJIElazCzw2THVq)CW@-Pp zmnaGB;qYXVP77lB$mD1cp5%sr-d15`4SOEz^;_03YlC0CE$vqDyy35ch)_3{+33H0 zhRx#~&NFK(JH}PjixYetd&v)M37xI8y88AG_Y7eT&YsQ_C{oIUw; zN=p-`8Jh}S5e$!y0%koLB~R95OJJro*hOIklRMhdTaffm;uNm$f+b%$KidStxbgYf z?F2F#!@;vO?zYkdWOU%AIY-vZc%+euMqWA^#>8$DG75UZ8AZy@0vjDKZ?JM_VX@MQ z>qQV$X%Dk*XbYHDSGq=mxH{VyIZz{rj^gfyU+y;bZHjYCC6k6<&wJaLhECpu51KX) zQQ^!)D@(SoH@hOPh!|Gel`Ug2Fb+tzgz)I5ZQktgtr_7vJ?o9`qNN7>`HHh~3NL36 zK%fr`XZLD#h9V4oheBw{X`aJ%@O 
zM&aywm^gK3ie}qXe#*p4Q0FJZ#2@tl8fx?hh=jWx)i6w41@uK@a15pz0Z{j{rK@~2bmmg!G?AEN{IaR3#blj^mDDA#DA6NF6 zTX-~_A#<38H;)jp<5GhCku?8aaaT>}5`YfzyLIBuZUv;&j%6S~qDUWF?R$Y_l7?N8 zhIT;h(z$AJorW54@PwP<$>qpUEB7b#7;w8^(gRtGzeWQNCQcbclJf>t@3w%%>gWL^c!$ zKjE~#C%KlNAn_3_3B(`IH5g`^_E!O=np+lpw-{x|&GY<4l{emg>X<<*B2Kr-Rc78bJLw7{S@FSqq%Z)1%5s*WUCZagNgxxvgWUiU7s=O$YySrG;ER-7Nw=LzTZiK`N=8Y)(2} zDd(;^h;)AV$>k_X7>me8P%VWk#Xn*vVA1Jfe#J>7`_8-?t?9yFeI~iJ?pZSxdnNc6 zE?-keo`PBBWqCls?@^F(0=UyHMdwPwtY2cJ=Q?D|lMnnIr49v_>QQs6s(HY+W*%lR zlMmX*ZWd}Aq-d1=cF_6?!F#Q2;^VOAYR5AmuUhib$tYDhqc*5YQ@!u^^REr44T8e5?nm&w!os#UO#AdCTR_6yI5NU>$M67=>q@6^>>7J~s4dQ3H942SByw zPJ@z-sz<7r<$AQ}4+EcFKz6y%=evi54MUUJD8C=i>AxF9F!|65(4UHB9$0|?yX?!! z#M#-x*39W2W7w>=VY|T&@1v*x-G6h>*=oHe5#1S5ClgsrQ|SCIEkP886^Sf9o`(E> z%|$FSVKYxST}a2kVs>QDo(*+>v{kEL10ya1lGhy6q}v6;jX-gDF7ZC`S?M%qOCjSf znZ8SJaV^lY)AM;B$F|V`RHGo%5+7LQkRZL@v3%IRp&|hbqF`1C9IH=iEWqqL?$U;T zS3t=XK`vQnA&s&0Zritl)nbLoV4;>6XB8-$e! zrm+!+8Q_h(AVy1T#R#p*Z91CGFPSKfMF^$~+oFGO={M}1Ol7(8o z;cutQDx8!jDk^jL00lPY4Nws{!x+&- zQqKz)@foQA8MXG1LW6hq;na~()H8TSv`7#Tp)5QT$2OBISP0Qe8uYo6rx-^OUz5mu zgH~7wsw*b~i|DeT$6BgXN{T+I3C@f4vlv7TyGGRbUDI&4EIc>!EnIP35|bqoyU%;R zbJ!Aq+-sK3NnHr#QRSYDw{EOY%K|hhnuAjj^li4QBZ|5MAviLmU$V5WQMy*uF)wx8 z&k4VQj)IyJd&omG`B;J`gHXBBLA2c316B2GrmzneW}CjjGPCMHB|PLwo??yfL7s4~ ztbA-2Cmbl$Yi`H)ls_a)Bw%^D^UFfolW|8kY9$1A%t_9tp@lM(*Ilr)^4Sv8zqv!` z<3vB?8qBuGo)AfUyaCDyB+QAtJmP+g1MZq0%c*BPzay&h(?`Bwzwl(OacMDVZYx%k zc}yu)kM6W$Ofkl{0}f&K3^f)hTH9pnBz)KUFbwtmRjG@BWXtMTk0L5pMSe0 zwWfECxP;nuE58omxRwQ3+Oa}~R(0Gj=EETBK7mdgI*pzVioR&aM{YlN&spcSubFkz7nt5;z8tx!c{AOoSvdH zm4p+OFan5VO(N}{=^o;<`ZXjmnS3LG=%i&9U{jct&cpqB?;LmdscfdsfSvt=eKPzr6_Bx$ zP-hbMCM?8qLXEBX2g<}Pm z=_O?PSiBzRoA6dj1i>w<7<~$YC;5CYAdnpyCd8?zCpHtWqc?6lJ??Q-a`R=HOWV@? 
z6=re8xm4yoTD#t1Thv1G!)gj(kYq7z5-&blx3GFDOqMH<(SlfiA@p$15p}EfMc(!G zYN+zA&%VBnwv z4A9Ucy)?I=N9@|Gfn|9qp-1=-s#rFE?Ql7P?wZu#`_q2t!!3V6Q{5`s?ZC=LCk(7} zRy4UKI?nwK?+V2V;14zne+7oB*Sa!;Mrt9NA_v?VXkv)Yd#734ftcP$%G`mwylpnY z(|B40s#3o>&XS*>$*A)pE9LSyna0Z_lFqi_=P8_U6I9y`jstj?B4hNrNjphNYkF1c zUIn=bNt0PkW$!7|$ZwKS+;2Zkqt4du_9~W&Fik@CmW7ugfS{U08Xlq*F_wtGAE=~H zCO6ochJ(#f)i)?#o(E70(HNi>|Kytk1d+32Ky-1p_$_&_X@L$lU^>VG461#Yy89cN z*C)KUovyGkb)~T5f|MGnO#?o^+4@@;j?v)l_Oh?>15Pa&)LczD?wfEEPg3|6*~J_! zImRRUlwOnHWGO{SgwODnDSK0L=OX7MqinB}!V>MI*xlMbnKb(G&*Q1|=Nqc6!|yCw z@&YsvU*Q0rW+!rojl9DoVo=CXxo@*HrTz_$x0Z!764yY8W zYy`Z7F*IlHgL@wfJ;nZ;;;)w{9{|k&)jQ`HRog4UbG~NQq|`BtiZl}+iX}0r3Y!(h z@yUZ?SAqmVy!1`v_Y&9|?nqlPQhqu)TrxYuI(bSv z!$TSConp1^KfdA>-4QAOV5n~YX{@Ca0hfZ%jDnX^DFEeXK*tdtb!-AS^HhM9;Q%QH zPaqVIj0bI&L?9Fqm#9D{Pz)CT8HWEe_=qM9&e6LR8CBf+jY7>kJ*OL7t+pP-xo~JU zU-Cm(j8*dwwxS~Cf8hToAzu-=L6(gdLL&ofXN$%WZVMWLR6XMon7VhvlQ-mlBEgua z{KGg=kwWl4z5)K90Y(LQMEgtlbT9aX@2=UE$;YwZi&tyVkBA!m z>$Bvxk)^zv%`YID6%;jr*EC690Fw?$kVR&I;7;3sIEENZ=`Xvy$f8P<%+5-zSLZHB zNGtx1=qmck40}_J-yi5o(c3K2t$J!WODu!IsOd}e<11kRQ+XYg-QPa}rvL6v15q)D zlfVN23>N_a!2OqHaB}vrHu=|?zR;O+JZMMgEu*~tJx{^mZllroQ{?P6@f6LZl!13W zn)Gw&Gd7SxfRu zBv#+|g=u~Rmgo0siwkCFk;#yo{qUA|`3RI5%xyUKC~-ZtG&HS1H|m%uOnd~vbGkKG z{d|bV_q}%97T5SeTdIK%ka#i2&3!s`=Y4DL#s+2PEZaEq5=go~LLqldjHb76zQrM7 zml~B5KZ8bWhJ@PmVe95+XScXOZ-Ir(o6{K2d$i{$o20#VUCN9a&OK&ooseSQW;1K+ zfk~r?J5P;7#uEqC&0!z`vVaGa&jF>Bdu#EP!H%&-2tqRX?sc}ntqX{~3-AQ^o4oki z9$gI6NKHMHoqAv|ni3logWWcU0pKSnZRjW8D)g0b1geU9I0p@;=GaVSBOz-_oK-_4 zYlOW&%TEnNeY6)7A>km)RSs}{lsM?|Q*rMypu`vfk{CYD53HK+fA}4U_bTR7e7)#; ziu!4QIRQ6dr3Zd0%Iz&(W{h#1@sX)p`d0N-`lWqCJQKJfMhu0~g?Nbqmc!)mQC21~ z?75F7+4#Y1^ylL?Vw==2Zw$wZW{{eVHLoH3(e+Tk&O9^xr`l)l{nJqC$)=@X-H~=f z#TpBB!!S`hZNt3r-qgDFy7>9h`lsf0lNlk5^EAQP%)1i$eNc>{i}aNdr9#?Q30!1Y zs>o>%=#5q{QT?jB%17ygg}b%=Tc*WPK(G{3q?7|CqK0ohi6%T_vgkcb>@aObN`g$| zNE=TW5ZRAvRfj)w)Q#{xCW@Haw`Ad%aMPT5dI*k;$A!A#cB*M2`Bs^DJ!$*4VVk|w z=_y$RK_(g}2_x!cg(EecrzT>WC4w!k2JxzvS8W)EC2t8^U2?jOGwAM+i}vV?`j$ zv8D2h9H>dcI3eNbPr;aV 
z70y)@BJ8t+O(0o(_31%tHR3N9MA4N>B6l3R@Rug$H4v0I5;#}vcXDLm5{iyFi9^)! zNE?s!M;u0RPQyiAyW?Vcssyi?D^>6bPHWT}u5KronRE^+?zw+e_B;^PLsoPQ>Hg4b zz|Q8rRhSJ=Na#{&hAc`p8zIs~0ou5=By@WP*8to1&rfDSpBeB7-$-#}r#QcjdRM8s z+O0xXbyZ)PkGm-Swkz0_w zlHCtQ8%r-qTESm-z1@LE?}BJJ^A=2`*Z+aDlXE=^9t!9~__I)jHWNhE8*ftT9F6yz zDfig>3Px!L_-o&a^R*-oG&hovz2n}wwJf(Z40)B~es&C5ock`AI5dOWK=(~PA^Ci>Q#(oQDymSw-XL&`KLob&MNeZzdheGUfJ!;yTHt@OX7)>NM$%D zQ8we&E7RJ4<=wd&${RjlkSm*5&HE+AZ-+|_*wYa|GU1W<%54;Kd>f3Cdwu_i&Ima( zn4CTEt};$&XG~77C;iiye;PJ9eI2lKpU?=vxo=i1;PR(#4$$NR{BMAJ6f@l}KRGoI zXnH+|(%;@g89+a;ASczbVQ|5mIZ=Z2{dA%MPpOWe#r<$(5*J8)a(3yC49Bwyj^CFIBe3vH5Hb?cDk~#wOQIDpLq0I zVa=!@6X0aqV$McEvY0u(@Jt0e&|ulH8;E{l)l3C> z&s;nDwfyQ`j2B+P;w3=Tnwk&h zfj(Ic3ZLte*{IEq47ZC%RKyiII{SjW#>FJe9C6j^Ep;pxCy9CKwz7+ie_Ji+L^b2S ziiGaya)p+?#jpH41D%}&1IE$;^pZ}y#ss4aPpb@zOZdFoD9k*^{{p^-F<}WiCGlrH zrzD(0m&8V){3etO1p76}^@B!~X16g__#qN47j%$S0!=uu3wZMaLGWgIu4v>7c=7@u zC$a)T#mo($KTCDNzm&LZb{G zfJzz652X@=#s&WWMFt*#NJ;O8NGU7;g(8?A0!c6{NQ2Abj7Hf!^24$V9Hp@Rr~L=u ze`ZOdtbawU93J>DuYYJOX;EF_|9{gl50&)_TXaqJ#^pUWZ#V3G&8OHdTHPzyB3aJj zR$8avRDCvqn`Q3+x4_}TD)-F62Fn-@FCn)aHg-l-*+%f|ym-58e+0)xY~Pkhy=j2E zOdUE&`xA23cCSEXU#_&Y__Gd znOF^CceYcAVTH~P%4Dq32gBNqkA&_C;3wVtoCl@)VyZl+0R@-ZGsDAPg3A~{bWkoZ zAd70vUx3@JH4!zJ%*FceFF*#7EyISGuEHLQvLs`qTDldiBh)Jk0h3h%MsqE}k7rnd z|3IXp)hh{rQZ1kVT8r5iV*YMdf+@(Z2$i>09;}6yViEqW2=hbbb`f7gf2Ax$ZdOT< zjJ0HMA69}%s9zBZ{zDvyv;;7m^Rt*VOR!@Rrt+BizYB)@|Ddod!02!OtTwGIR8CsC zL_LnVL>>y`w241S|HoxClz^54D-?Dq(cn^@1_L~H3Gr{`Uu-p+jN&zf^+q!>bEzU6 zz@ah}P`PRx$fW&6m;EfUq!3e;dRG3Q>yS|W7w}&ujY|I` z@Q2BN`+^>L(#b)p{9&iWqkz8WAs&)7YO*ao*%wj?i{XPdX2_4+Gp9w3tL<7EKaz$e z%V5KA?HB1@z%6%e3d(JOD!`@K8DK;OD)li@V9!1J9{Uu_s$Rt^-gA^5Ta7<H)% z-t2$M%n8twuFHHf6RH=~S@jwNx9mwlDywnAPBnO)2fnZA}49P(t7ZZWDC%nQ2?LdL;QAkYekjmWs4>=>DZtsul7sR@!A9gw!(B zS>v*JHop+DeiwTWCUH4iygXhVY0ce~wEum-zpLZnYVtTcUYQGPZ`jhU^^Rn!0@9+U z3Kx6W_g+0dRIQq=%g`ZyXW?N_CK*?qeCB*DY{7x*DW7ahs4!$dQrtze9$V9F0me1w z`av{v%^F=bV9n_gxalrr-+1`XBMva{W#C>Wn|~9NuFcLcACgmoa2f3ZoymEYv8FS_|gbsLB(%_cmLbJ57x=T 
z<`ql*6-)nhVZnj#R*j>0kXu#jD^&F>RQ>BgIoaEmQ8J~_O1iol4K)@v{ z#vF9K_2}D`ik~m`E~^wULF%XH25+tt&_Hz#zR&M|9FfbK=E-}v5XpOR(PB~Edn}Xt zBG)+0HaO&0=LWGxde+-!#ZTrW=B2JCE`xV_xgRT`1UoLU0V^2)S376^4fX!V@fq8+ zSc^n;Lt~3EgJNtqiLw;3OvEIHm@i{2xh*oe7&|kvZ`n;LiX>l*eHpSFl>H(qLLtkH z-0AyuFQ)E)aA(e$b3Q*jo}Y6*pXa>auk(IA&#Q{Il7?HI812$8j8n;Zp?uQe33rjU zXa$bs%43s%zmqp|feN~NDmYXzx1fOHeBz6S!$7YWYnejr)$=tU!GGj;j6qPBiu;Yj zKguZgkQ=NdDfJ)2+t(zE8}SB^=kPG<==g)A0qDWs)8i$+Aq+KJc*NARTVgF@Opi@J z$qU0CU)5CzT~4gwRI@0LHpR*KH4-UZx-ZWo_sdp3MwaDH5>#nmv1yz!Zu1+@Xw@bG z<@(TNUp+_?VK5c?)DB4w;f*hA3FCZ3JuMxXL5^zEJDL`aKR)W)b>)uZfCeEptqA(_ zvu25rbzLkY$^>Bp0QmnpQMlz`XS$$tm}I3OER0 z_lH6?{KAxG;89Dtx%nwCMBO@T1<(C5uvDW%(scGxWs8(o1zXz?uMEt#Nb^Y|53)QX z>FuWiVK=J4q6Z{)38ZS+BZL<64)fCyRZ%}NiiTA-lMb2jc9ohS=CXTVIi0?G*>ZBs z+Jb<%V5Ep{Z*}ruM>j)MW$7^4-26uwTI*66vHWko=rh;vvA4o!X+nhjv*&K~M`fld zs;QaWOxYCZ0oE4?%H!a@e8~wTg((Y}vrVb(!Hw2amd)08qz~n#!9N5y?aw7*%O#cu zi$4gdzrP)rf#UQG_7iUgpyL|kd{zB@C?q^xXi5%_LIL{ zFq?&4?gqMt#!?lONwCDb+T85JS)H+PZx&7(aGYDD!M=weUmyfra`ANPBSg=^PdU5e zFffQ_$wz!Js0}{b_8YeF=8dAv;{}#k)n9cu1at*y^(o6IBp`ON*DsFKl%kiSm4<+? 
zh6+TRqOTZ;3S5PWoJHIvT0CSOw6Qqu?Wt4b?*1f4ZnE`I^?(l9lf0}g!kvE0s*>2+ zU;xgR;4{+@N$`OODDK0YC)D!K)wxZAN<`&|-SrB2f$jqBd?c4a?bu0OGXe3_rB35K zx|h|0^7MPe&7Llau^IZTnuGn$WN+vWp%vc{t~UwIU&urpx$S7Y_kD0_R*KU+m!&s$ zn8G7em>3<%3X7^1I47;*QcwP{5;0VpFfq7BZ>DgHWqn>KLruaYN@Ek;XGCJn94A5_O@X27`Aj3U<0C=Q#)p3+PhGs^w_A)&hvNZ z^ben3UzrBVFLKzm1!IFPaM|TY=d%sik|hITLXsBKv)$J(fRS(S)c5D#6AE`+>~g^^ zle48Z^eVw_%dPSH4_1=o_Aj^`5le&*r6w=8_#j74UK;bT5Vx{?v8H1@!{ zc|uWGkN;ViF(T>TNG0R%AO3)BSO??B-vW<_etGx?t%J`5#{#@E4;;`DQ9_NDGYts)S3%sv`bTMuv3JM0}PM zm``-2*Mq!9zFuiHwi#+`Qz+m>|8T!!>8CB&&Xy;__ZW`tW;m92r(mjA1^u0Dg9pxTlEw~(|Ojok|Xu2WwSc| z2u6Ms$$I3RiJ-P%!2I58kQWr`pj3X2SN1Gee8`?zkZ=Bxis#zRxxt5R#uU7BlU6_$2%~8G`Gm;-zn|^jpUjIiHrOq&dxqUM3IL=rZnw ALI3~& literal 0 HcmV?d00001 diff --git a/w11/java-cli-w11/.gitignore b/w11/java-cli-w11/.gitignore new file mode 100644 index 0000000..0ebcf1a --- /dev/null +++ b/w11/java-cli-w11/.gitignore @@ -0,0 +1,4 @@ +*.jar +*.jar +*.class +*.log \ No newline at end of file diff --git a/w11/java-cli-w11/pom.xml b/w11/java-cli-w11/pom.xml new file mode 100644 index 0000000..9987b1c --- /dev/null +++ b/w11/java-cli-w11/pom.xml @@ -0,0 +1,62 @@ + + 4.0.0 + com.example + datacollect-cli + 0.1.0 + + 11 + 11 + + + + org.jsoup + jsoup + 1.17.2 + + + org.slf4j + slf4j-api + 2.0.9 + + + ch.qos.logback + logback-classic + 1.4.14 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + org.apache.maven.plugins + maven-assembly-plugin + 3.3.0 + + + + com.example.datacollect.Main + + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + + + + diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/Main.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/Main.java new file mode 100644 index 0000000..ea9d151 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/Main.java @@ -0,0 +1,41 @@ +package com.example.datacollect; + +import com.example.datacollect.controller.CrawlerController; +import 
com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +/*- 添加 logger 成员 +- 记录启动日志 +- 添加全局异常处理 */ +public class Main { + private static final Logger logger = LoggerFactory.getLogger(Main.class); + + public static void main(String[] args) { + try { + logger.info("Starting CLI Crawler application"); + + ConsoleView view = new ConsoleView(); + ArticleRepository repository = new ArticleRepository(); + StrategyFactory strategyFactory = new StrategyFactory(); + CrawlerController controller = new CrawlerController(view, repository, strategyFactory); + + view.printSuccess("Welcome to CLI Crawler (w10_3)! Type help for commands."); + logger.info("Application initialized successfully"); + + while (true) { + try { + controller.handle(view.readLine()); + } catch (Exception e) { + view.printError("Error: " + e.getMessage()); + logger.error("Error in main loop: {}", e.getMessage(), e); + } + } + } catch (Exception e) { + logger.error("Fatal error in application: {}", e.getMessage(), e); + System.err.println("Fatal error: " + e.getMessage()); + System.exit(1); + } + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/AnalyzeCommand.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/AnalyzeCommand.java new file mode 100644 index 0000000..ec9bcc3 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/AnalyzeCommand.java @@ -0,0 +1,103 @@ +package com.example.datacollect.command; + +import com.example.datacollect.exception.NetworkException; +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import 
com.example.datacollect.util.RetryUtils; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.List; +import java.util.concurrent.Callable; + +public class AnalyzeCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(AnalyzeCommand.class); + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public AnalyzeCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "analyze"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + view.printError("Usage: analyze "); + logger.warn("Invalid command: missing URL argument"); + return; + } + String url = args[1]; + logger.info("Analyze command executed for URL: {}", url); + + try { + CrawlStrategy strategy = strategyFactory.getStrategy(url); + if (strategy == null) { + view.printError("No strategy found for: " + url); + logger.error("No strategy found for URL: {}", url); + return; + } + + Callable fetchTask = () -> { + logger.debug("Fetching document from: {}", url); + try { + return Jsoup.connect(url) + .userAgent("Mozilla/5.0") + .timeout(5000) + .get(); + } catch (IOException e) { + throw new NetworkException("Failed to connect to " + url + ": " + e.getMessage(), e); + } + }; + + Document doc = RetryUtils.executeWithRetry(fetchTask); + logger.info("Successfully fetched document from: {}", url); + + List

articles = strategy.parse(url, doc); + logger.info("Parsed {} articles for analysis", articles.size()); + + int total = articles.size(); + int totalTitleLen = 0; + int totalContentLen = 0; + + for (Article a : articles) { + totalTitleLen += a.getTitle() == null ? 0 : a.getTitle().length(); + totalContentLen += a.getContent() == null ? 0 : a.getContent().length(); + } + + view.printInfo("===== 分析统计结果 ====="); + view.printInfo("文章总数:" + total + " 篇"); + view.printInfo("标题总长度:" + totalTitleLen); + view.printInfo("内容总长度:" + totalContentLen); + if (total > 0) { + view.printInfo("平均标题长度:" + (totalTitleLen / total)); + view.printInfo("平均内容长度:" + (totalContentLen / total)); + } + view.printInfo("======================"); + view.printSuccess("分析完成(数据未保存)"); + + logger.info("Analysis completed: {} articles analyzed", total); + } catch (NetworkException e) { + view.printError("Network error: " + e.getMessage()); + logger.error("Network error while analyzing {}: {}", url, e.getMessage(), e); + } catch (ParseException e) { + view.printError("Parse error: " + e.getMessage()); + logger.error("Parse error while analyzing {}: {}", url, e.getMessage(), e); + } catch (Exception e) { + view.printError("分析失败:" + e.getMessage()); + logger.error("Unexpected error while analyzing {}: {}", url, e.getMessage(), e); + } + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/Command.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/Command.java new file mode 100644 index 0000000..029cadc --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/Command.java @@ -0,0 +1,8 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; + +public interface Command { + String getName(); + void execute(String[] args, ArticleRepository repository); +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/CrawlCommand.java 
b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/CrawlCommand.java new file mode 100644 index 0000000..dd63594 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/CrawlCommand.java @@ -0,0 +1,87 @@ +package com.example.datacollect.command; + +import com.example.datacollect.exception.NetworkException; +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.CrawlStrategy; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.util.RetryUtils; +import com.example.datacollect.view.ConsoleView; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.concurrent.Callable; + +public class CrawlCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(CrawlCommand.class); + private final ConsoleView view; + private final StrategyFactory strategyFactory; + + public CrawlCommand(ConsoleView view, StrategyFactory strategyFactory) { + this.view = view; + this.strategyFactory = strategyFactory; + } + + @Override + public String getName() { + return "crawl"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + if (args.length < 2) { + view.printError("Usage: crawl "); + logger.warn("Invalid command: missing URL argument"); + return; + } + String url = args[1]; + logger.info("Crawl started for: {}", url); + + CrawlStrategy strategy = strategyFactory.getStrategy(url); + if (strategy == null) { + view.printError("No strategy found for: " + url); + logger.error("No strategy found for URL: {}", url); + return; + } + + try { + view.printInfo("Crawling: " + url); + + Callable fetchTask = () -> { + logger.debug("Fetching document from: {}", url); + try { + return Jsoup.connect(url) + .userAgent("Mozilla/5.0 (Windows NT 10.0; 
Win64; x64) AppleWebKit/537.36") + .timeout(10000) + .get(); + } catch (IOException e) { + throw new NetworkException("Failed to connect to " + url + ": " + e.getMessage(), e); + } + }; + + Document doc = RetryUtils.executeWithRetry(fetchTask); + logger.info("Successfully fetched document from: {}", url); + + var articles = strategy.parse(url, doc); + logger.info("Parsed {} articles", articles.size()); + + repository.addAll(articles); + logger.info("Successfully added {} articles to repository", articles.size()); + + view.printSuccess("Crawled " + articles.size() + " articles."); + logger.info("Successfully crawled {} articles from {}", articles.size(), url); + } catch (NetworkException e) { + view.printError("Network error: " + e.getMessage()); + logger.error("Network error while crawling {}: {}", url, e.getMessage(), e); + } catch (ParseException e) { + view.printError("Parse error: " + e.getMessage()); + logger.error("Parse error while crawling {}: {}", url, e.getMessage(), e); + } catch (Exception e) { + view.printError("Failed to crawl: " + e.getMessage()); + logger.error("Unexpected error while crawling {}: {}", url, e.getMessage(), e); + } + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ExitCommand.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ExitCommand.java new file mode 100644 index 0000000..0f1d7fd --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ExitCommand.java @@ -0,0 +1,27 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ExitCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(ExitCommand.class); + private final ConsoleView view; + + public ExitCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + 
return "exit"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + logger.info("Exit command executed, shutting down"); + view.printSuccess("Bye!"); + System.exit(0);/*退出程序 */ + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/HelpCommand.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/HelpCommand.java new file mode 100644 index 0000000..2087695 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/HelpCommand.java @@ -0,0 +1,26 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class HelpCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(HelpCommand.class); + private final ConsoleView view; + + public HelpCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "help"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + logger.info("Help command executed"); + view.printInfo("Commands: crawl , list, help, exit, analyze"); + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ListCommand.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ListCommand.java new file mode 100644 index 0000000..9261a3d --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/command/ListCommand.java @@ -0,0 +1,26 @@ +package com.example.datacollect.command; + +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ListCommand implements Command { + private static final Logger logger = LoggerFactory.getLogger(ListCommand.class); + private final ConsoleView view; + + public 
ListCommand(ConsoleView view) { + this.view = view; + } + + @Override + public String getName() { + return "list"; + } + + @Override + public void execute(String[] args, ArticleRepository repository) { + logger.info("List command executed, showing {} articles", repository.size()); + view.display(repository.getAll()); + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/controller/CrawlerController.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/controller/CrawlerController.java new file mode 100644 index 0000000..5ef370a --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/controller/CrawlerController.java @@ -0,0 +1,64 @@ +package com.example.datacollect.controller; + +import com.example.datacollect.command.AnalyzeCommand; +import com.example.datacollect.command.Command; +import com.example.datacollect.command.CrawlCommand; +import com.example.datacollect.command.ExitCommand; +import com.example.datacollect.command.HelpCommand; +import com.example.datacollect.command.ListCommand; +import com.example.datacollect.repository.ArticleRepository; +import com.example.datacollect.strategy.StrategyFactory; +import com.example.datacollect.view.ConsoleView; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.HashMap; +import java.util.Map; + +public class CrawlerController {/* maps a command name typed by the user to its Command object and runs it */ + private static final Logger logger = LoggerFactory.getLogger(CrawlerController.class); + private final Map<String, Command> commands = new HashMap<>();/* command name -> command; typed so commands.get() yields a Command */ + private final ConsoleView view; + private final ArticleRepository repository; + + public CrawlerController(ConsoleView view, ArticleRepository repository, StrategyFactory strategyFactory) { + this.view = view; + this.repository = repository; + register(new HelpCommand(view)); + register(new ListCommand(view)); + register(new CrawlCommand(view, strategyFactory)); + register(new ExitCommand(view)); + register(new AnalyzeCommand(view, strategyFactory)); + logger.info("CrawlerController 
initialized with {} commands", commands.size()); + } + + private void register(Command command) {/* a later command with the same name replaces the earlier one */ + commands.put(command.getName(), command); + logger.debug("Registered command: {}", command.getName()); + } + + public void handle(String input) {/* dispatch one line of user input */ + String text = input == null ? "" : input.trim();/* treat null as empty input */ + if (text.isEmpty()) { + return; + } + + String[] args = text.split("\\s+");/* split into whitespace-separated arguments; args[0] is the command name */ + String cmdName = args[0].toLowerCase(java.util.Locale.ROOT);/* lower-case independently of the default locale so lookups match the registered names */ + + logger.debug("Processing command: {}", cmdName); + + Command command = commands.get(cmdName);/* look up the command object */ + if (command == null) { + view.printError("Unknown command: " + cmdName); + logger.warn("Unknown command attempted: {}", cmdName); + return; + } + + try { + command.execute(args, repository);/* run the command */ + } catch (Exception e) {/* boundary catch: report the failure instead of propagating it to the caller */ + view.printError("Command execution failed: " + e.getMessage()); + logger.error("Error executing command {}: {}", cmdName, e.getMessage(), e); + } + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/CrawlerException.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/CrawlerException.java new file mode 100644 index 0000000..230adb3 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/CrawlerException.java @@ -0,0 +1,10 @@ +package com.example.datacollect.exception; + +public class CrawlerException extends Exception {/* checked base type; NetworkException and ParseException extend it */ + public CrawlerException(String message) { + super(message); + } + public CrawlerException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/NetworkException.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/NetworkException.java new file mode 100644 index 0000000..3a24c92 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/NetworkException.java @@ -0,0 +1,10 @@ +package com.example.datacollect.exception; + +public class 
NetworkException extends CrawlerException { + public NetworkException(String message) { + super(message); + } + public NetworkException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/ParseException.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/ParseException.java new file mode 100644 index 0000000..09f9f20 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/ParseException.java @@ -0,0 +1,10 @@ +package com.example.datacollect.exception; + +public class ParseException extends CrawlerException { + public ParseException(String message) { + super(message); + } + public ParseException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/UrlFormatException.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/UrlFormatException.java new file mode 100644 index 0000000..0d6df24 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/exception/UrlFormatException.java @@ -0,0 +1,10 @@ +package com.example.datacollect.exception; + +public class UrlFormatException extends RuntimeException {/* unchecked, unlike the CrawlerException family */ + public UrlFormatException(String message) { + super(message); + } + public UrlFormatException(String message, Throwable cause) { + super(message, cause); + } +} \ No newline at end of file diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/model/Article.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/model/Article.java new file mode 100644 index 0000000..53b138b --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/model/Article.java @@ -0,0 +1,72 @@ +package com.example.datacollect.model; +/*- Article model class +- validates its fields in the setters +- provides toString() (already present) +- TODO: consider adding equals() and hashCode() */ +public class Article { + private String title; + private String url; + private String 
content; + + public Article(String title, String url, String content) {/* goes through the setters, so it throws IllegalArgumentException on an invalid title or URL */ + setTitle(title); + setUrl(url); + setContent(content); + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) {/* stores the trimmed title; rejects null, blank, or more than 500 characters */ + if (title == null) { + throw new IllegalArgumentException("Title cannot be null"); + } + if (title.trim().isEmpty()) { + throw new IllegalArgumentException("Title cannot be empty"); + } + if (title.trim().length() > 500) {/* measure the trimmed value, because that is what gets stored */ + throw new IllegalArgumentException("Title cannot exceed 500 characters"); + } + this.title = title.trim(); + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) {/* stores the trimmed URL; rejects null, blank, or a scheme other than http/https */ + if (url == null) { + throw new IllegalArgumentException("URL cannot be null"); + } + if (url.trim().isEmpty()) { + throw new IllegalArgumentException("URL cannot be empty"); + } + if (!url.trim().startsWith("http://") && !url.trim().startsWith("https://")) {/* check the trimmed value so surrounding whitespace does not cause a rejection */ + throw new IllegalArgumentException("URL must start with http:// or https://"); + } + this.url = url.trim(); + } + + public String getContent() { + return content; + } + + public void setContent(String content) {/* never stores null: null becomes the empty string */ + if (content == null) { + this.content = ""; + } else if (content.length() > 10000) { + this.content = content.substring(0, 10000);/* truncate content to 10000 characters */ + } else { + this.content = content; + } + } + + @Override + public String toString() {/* content is deliberately left out of the text form */ + return "Article{" + + "title='" + title + '\'' + + ", url='" + url + '\'' + + '}'; + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/repository/ArticleRepository.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/repository/ArticleRepository.java new file mode 100644 index 0000000..8994efa --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/repository/ArticleRepository.java @@ -0,0 +1,113 @@ +package com.example.datacollect.repository; + +import com.example.datacollect.model.Article; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import 
java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +/* Article repository +- has a logger member +- add() performs defensive checks +- addAll() performs defensive checks +- covers null checks, duplicate checks and length validation +- logs its operations*/ +public class ArticleRepository { + private static final Logger logger = LoggerFactory.getLogger(ArticleRepository.class); + private static final int MAX_TITLE_LENGTH = 500;/* maximum title length */ + private static final int MAX_CONTENT_LENGTH = 10000;/* maximum content length */ + + private final List<Article>
articles = new ArrayList<>(); + private final Set<String> urlSet = new HashSet<>();/* URLs already stored, used to reject duplicates */ + + public void add(Article article) {/* throws IllegalArgumentException for invalid input; returns silently when the URL is already stored */ + if (article == null) { + logger.error("Attempted to add null article"); + throw new IllegalArgumentException("Article cannot be null"); + } + + String title = article.getTitle(); + String url = article.getUrl(); + String content = article.getContent(); + + if (title == null || title.trim().isEmpty()) { + logger.warn("Attempted to add article with empty title"); + throw new IllegalArgumentException("Article title cannot be null or empty"); + } + + if (url == null || url.trim().isEmpty()) { + logger.warn("Attempted to add article with empty URL"); + throw new IllegalArgumentException("Article URL cannot be null or empty"); + } + + if (title.length() > MAX_TITLE_LENGTH) { + logger.warn("Article title too long: {} characters (max: {})", title.length(), MAX_TITLE_LENGTH); + throw new IllegalArgumentException("Article title exceeds maximum length of " + MAX_TITLE_LENGTH); + } + + if (content != null && content.length() > MAX_CONTENT_LENGTH) {/* over-long content is truncated, not rejected */ + logger.warn("Article content too long: {} characters (max: {})", content.length(), MAX_CONTENT_LENGTH); + content = content.substring(0, MAX_CONTENT_LENGTH); + } + + if (!url.startsWith("http://") && !url.startsWith("https://")) { + logger.warn("Invalid URL format: {}", url); + throw new IllegalArgumentException("Article URL must start with http:// or https://"); + } + + if (urlSet.contains(url)) { + logger.warn("Duplicate article URL detected: {}", url); + return;/* skip the duplicate article */ + } + + Article validatedArticle = new Article(title.trim(), url.trim(), content != null ? content.trim() : "");/* store a fresh, normalized copy rather than the caller's instance */ + articles.add(validatedArticle);/* append the article to the list */ + urlSet.add(url);/* remember the URL for duplicate detection */ + logger.debug("Added article: {}", title);/* log the addition */ + } + + public void addAll(List<Article>
articleList) {/* adds each element through add(); invalid, null and duplicate elements are skipped, not fatal */ + if (articleList == null) { + logger.error("Attempted to add null article list"); + throw new IllegalArgumentException("Article list cannot be null"); + } + + int successCount = 0;/* number of articles actually stored */ + int skipCount = 0;/* number of articles skipped (invalid, null or duplicate) */ + + for (Article article : articleList) { + if (article != null) { + try { + int sizeBefore = articles.size(); add(article); + if (articles.size() > sizeBefore) { successCount++; } else { skipCount++; }/* add() returns without storing a duplicate URL, so count by the size change */ + } catch (IllegalArgumentException e) { + logger.warn("Skipped invalid article: {}", e.getMessage()); + skipCount++; + } + } else { + logger.warn("Skipped null article in list"); + skipCount++; + } + } + + logger.info("Added {} articles, skipped {} invalid articles", successCount, skipCount); + } + + public List<Article>
getAll() { + logger.debug("Retrieving all articles, total: {}", articles.size()); + return Collections.unmodifiableList(articles);/* read-only view of the live list, not a copy */ + } + + public int size() { + return articles.size();/* number of stored articles */ + } + + public void clear() { + int count = articles.size();/* remember the count for the log line below */ + articles.clear(); + urlSet.clear(); + logger.info("Cleared repository, removed {} articles", count); + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/BlogStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/BlogStrategy.java new file mode 100644 index 0000000..1e23b2b --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/BlogStrategy.java @@ -0,0 +1,25 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class BlogStrategy implements CrawlStrategy { + @Override + public boolean supports(String url) { + return url.contains("blog.example.com"); + } + + @Override + public List<Article>
parse(String url, Document doc) {/* one Article per non-blank ".post-title" element */ + List<Article>
articles = new ArrayList<>(); + Elements titles = doc.select(".post-title"); + for (Element e : titles) { + if (e.text().trim().isEmpty()) { continue; } articles.add(new Article(e.text(), url, ""));/* a blank title would make Article throw and abort the whole page. NOTE(review): every article reuses the page URL, and ArticleRepository.add() skips an already stored URL, so only the first one would be kept - confirm intended */ + } + return articles; + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java new file mode 100644 index 0000000..ed69e19 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/CrawlStrategy.java @@ -0,0 +1,11 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import java.util.List; + +public interface CrawlStrategy {/* site-specific parser: supports() picks the strategy for a URL, parse() extracts the articles from the fetched page */ + List<Article>
parse(String url, Document doc) throws ParseException; + boolean supports(String url); +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java new file mode 100644 index 0000000..6892510 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/HnuNewsStrategy.java @@ -0,0 +1,77 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.List; + +/* HNU News strategy +- has a logger member +- handles exceptions +- programs defensively */ +public class HnuNewsStrategy implements CrawlStrategy { + private static final Logger logger = LoggerFactory.getLogger(HnuNewsStrategy.class); + + @Override + public boolean supports(String url) { + return url.contains("news.hnu.edu.cn");/* handles the HNU News site */ + } + + @Override + public List<Article>
parse(String url, Document doc) throws ParseException {/* one Article per "ul.list11 li" item that has a link and a non-empty title */ + logger.info("Starting to parse HNU News: {}", url); + List<Article>
articles = new ArrayList<>();/* holds the parsed articles */ + + try { + Elements listItems = doc.select("ul.list11 li");/* select the article list items */ + logger.debug("Found {} list items", listItems.size());/* log how many list items were found */ + + for (Element li : listItems) { + try { + Element link = li.selectFirst("a");/* first link inside the list item */ + if (link == null) { + logger.warn("No link found in list item");/* log the missing link */ + continue; + } + + String articleUrl = link.attr("href");/* value of the link's href attribute */ + if (!articleUrl.startsWith("http")) { + String path = articleUrl.replace("..", ""); articleUrl = "https://news.hnu.edu.cn" + (path.startsWith("/") ? path : "/" + path);/* complete the relative path; insert the slash when the href has none, so the host and path are not glued together */ + } + + String title = "";/* article title */ + Element titleEl = link.selectFirst("h4.l2.h4s2");/* select the title element */ + if (titleEl != null) { + title = titleEl.text().trim();/* title text without surrounding whitespace */ + } + + String content = "";/* article content */ + Element contentEl = link.selectFirst("p.l3.ps3");/* select the content element */ + if (contentEl != null) { + content = contentEl.text().trim();/* content text without surrounding whitespace */ + } + + if (!title.isEmpty()) { + Article article = new Article(title, articleUrl, content);/* build the article object */ + articles.add(article);/* append the article to the list */ + } else { + logger.warn("Empty title found, skipping article"); + } + } catch (Exception e) {/* a failure in one item must not abort the whole page */ + logger.error("Error parsing individual article: {}", e.getMessage(), e); + } + } + + logger.info("Successfully parsed {} articles from HNU News", articles.size()); + return articles; + } catch (Exception e) { + logger.error("Failed to parse HNU News page: {}", e.getMessage(), e); + throw new ParseException("Failed to parse HNU News: " + e.getMessage(), e); + } + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/NewsStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/NewsStrategy.java new file mode 100644 index 0000000..f6eb4bd --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/NewsStrategy.java @@ -0,0 +1,25 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.model.Article; +import 
org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import java.util.ArrayList; +import java.util.List; + +public class NewsStrategy implements CrawlStrategy { + @Override + public boolean supports(String url) { + return url.contains("news.example.com"); + } + + @Override + public List<Article>
parse(String url, Document doc) {/* one Article per non-blank ".article-headline" element */ + List<Article>
articles = new ArrayList<>(); + Elements items = doc.select(".article-headline"); + for (Element e : items) { + if (e.text().trim().isEmpty()) { continue; } articles.add(new Article(e.text(), url, ""));/* a blank headline would make Article throw and abort the whole page. NOTE(review): every article reuses the page URL, and ArticleRepository.add() skips an already stored URL, so only the first one would be kept - confirm intended */ + } + return articles; + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/PeopleStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/PeopleStrategy.java new file mode 100644 index 0000000..eb25935 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/PeopleStrategy.java @@ -0,0 +1,83 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.List; +/* Strategy for people.com.cn (People's Daily) */ +public class PeopleStrategy implements CrawlStrategy { + private static final Logger logger = LoggerFactory.getLogger(PeopleStrategy.class); + + @Override + public boolean supports(String url) { + return url.contains("people.com.cn");/* true when the URL contains people.com.cn */ + } + + @Override + public List<Article>
parse(String url, Document doc) throws ParseException {/* one Article per news item whose link text is longer than 5 characters */ + logger.info("Starting to parse People's Daily News: {}", url); + List<Article>
articles = new ArrayList<>();/* parsed articles */ + + try { + Elements newsItems = doc.select("div.w1000, div.news-item, li.list_item");/* select the news containers */ + logger.debug("Found {} news containers", newsItems.size()); + + if (newsItems.isEmpty()) { + newsItems = doc.select("a[href*='/n1/']");/* fallback selector: links whose href contains /n1/ */ + logger.debug("Trying alternative selector, found {} items", newsItems.size()); + } + + for (Element item : newsItems) { + try { + Element link = item.selectFirst("a");/* select the link element */ + if (link == null) { + link = item.tagName().equals("a") ? item : null;/* the item may itself be the link */ + } + + if (link == null) { + logger.warn("No link found in news item"); + continue; + } + + String articleUrl = link.attr("href");/* link target */ + if (!articleUrl.startsWith("http")) {/* not an absolute URL yet */ + if (articleUrl.startsWith("//")) { articleUrl = "https:" + articleUrl; /* protocol-relative link: add only the scheme */ } else if (articleUrl.startsWith("/")) { + articleUrl = "https://www.people.com.cn" + articleUrl; + } else { + articleUrl = "https://www.people.com.cn/" + articleUrl; + } + } + + String title = link.text().trim();/* the link text is used as the title */ + + String content = "";/* summary text, empty when none is found */ + Element contentEl = item.selectFirst("p, div.ed, div.summary");/* select the content element */ + if (contentEl != null) { + content = contentEl.text().trim();/* content text */ + } + + if (!title.isEmpty() && title.length() > 5) {/* titles of 5 characters or fewer are skipped */ + Article article = new Article(title, articleUrl, content);/* build the article object */ + articles.add(article);/* append the article to the list */ + logger.debug("Parsed article: {}", title);/* log the parsed article */ + } else { + logger.warn("Invalid title found, skipping article");/* log the invalid title */ + } + } catch (Exception e) {/* a failure in one item must not abort the whole page */ + logger.error("Error parsing individual article: {}", e.getMessage(), e); + } + } + + logger.info("Successfully parsed {} articles from People's Daily News", articles.size()); + return articles; + } catch (Exception e) { + logger.error("Failed to parse People's Daily News page: {}", e.getMessage(), e); + throw new ParseException("Failed to parse People's Daily News: " + e.getMessage(), e); + } + } +} diff --git 
a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/StrategyFactory.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/StrategyFactory.java new file mode 100644 index 0000000..e28aaac --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/StrategyFactory.java @@ -0,0 +1,36 @@ +package com.example.datacollect.strategy; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.List; + +public class StrategyFactory {/* holds the known strategies and returns the first one that supports a URL */ + private static final Logger logger = LoggerFactory.getLogger(StrategyFactory.class); + private final List<CrawlStrategy> strategies = new ArrayList<>();/* checked in insertion order */ + + public StrategyFactory() { + strategies.add(new HnuNewsStrategy()); + strategies.add(new YouthStrategy()); + strategies.add(new PeopleStrategy()); + strategies.add(new BlogStrategy()); + strategies.add(new NewsStrategy()); + logger.info("Initialized StrategyFactory with {} strategies", strategies.size()); + } + + public CrawlStrategy getStrategy(String url) {/* returns null when no strategy supports the URL, including a null URL */ + if (url == null) { logger.warn("No strategy found for URL: {}", url); return null; } for (CrawlStrategy s : strategies) { + if (s.supports(url)) { + logger.debug("Found strategy {} for URL: {}", s.getClass().getSimpleName(), url); + return s; + } + } + logger.warn("No strategy found for URL: {}", url); + return null; + } + + public void register(CrawlStrategy strategy) {/* appended last, so it is consulted only after the strategies added before it */ + java.util.Objects.requireNonNull(strategy, "strategy"); strategies.add(strategy);/* reject null before it is stored */ + logger.info("Registered new strategy: {}", strategy.getClass().getSimpleName()); + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/YouthStrategy.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/YouthStrategy.java new file mode 100644 index 0000000..2bdb8d1 --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/strategy/YouthStrategy.java @@ -0,0 +1,87 @@ +package com.example.datacollect.strategy; + +import com.example.datacollect.exception.ParseException; +import com.example.datacollect.model.Article; +import org.jsoup.nodes.Document; +import 
org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.List; +/* Parsing strategy for youth.cn news pages */ +public class YouthStrategy implements CrawlStrategy { + private static final Logger logger = LoggerFactory.getLogger(YouthStrategy.class); + + @Override + public boolean supports(String url) { + return url.contains("youth.cn");/* true when the URL contains the youth.cn domain */ + } + + @Override + public List<Article>
parse(String url, Document doc) throws ParseException {/* one Article per news item whose link text is longer than 5 characters */ + logger.info("Starting to parse Youth News: {}", url); + List<Article>
articles = new ArrayList<>(); + + try { + Elements newsItems = doc.select("div.news-item, div.article-item, li.news-list-item");/* select the news item elements */ + logger.debug("Found {} news items", newsItems.size()); + + if (newsItems.isEmpty()) { + newsItems = doc.select("a[href*='/n1/']");/* fallback selector. NOTE(review): identical to the PeopleStrategy fallback - confirm youth.cn links really contain /n1/ */ + logger.debug("Trying alternative selector, found {} items", newsItems.size()); + } + + for (Element item : newsItems) { + try { + Element link = item.selectFirst("a");/* select the link element */ + if (link == null) { + link = item.tagName().equals("a") ? item : null;/* the item may itself be the link */ + } + + if (link == null) { + logger.warn("No link found in news item"); + continue; + } + + String articleUrl = link.attr("href");/* link target */ + + if (!articleUrl.startsWith("http")) {/* not an absolute URL yet */ + if (articleUrl.startsWith("//")) { articleUrl = "https:" + articleUrl; /* protocol-relative link: add only the scheme */ } else if (articleUrl.startsWith("/")) { + articleUrl = "https://www.youth.cn" + articleUrl; + } else { + articleUrl = "https://www.youth.cn/" + articleUrl; + } + } + + String title = link.text().trim();/* the link text is used as the title */ + if (title.isEmpty()) {/* links without text are skipped silently */ + continue; + } + + String content = "";/* summary text, empty when none is found */ + Element contentEl = item.selectFirst("p.summary, p.desc, div.brief");/* select the summary element */ + if (contentEl != null) { + content = contentEl.text().trim();/* summary text */ + } + + if (!title.isEmpty() && title.length() > 5) {/* titles of 5 characters or fewer are skipped */ + Article article = new Article(title, articleUrl, content); + articles.add(article); + logger.debug("Parsed article: {}", title); + } else { + logger.warn("Invalid title found, skipping article"); + } + } catch (Exception e) {/* a failure in one item must not abort the whole page */ + logger.error("Error parsing individual article: {}", e.getMessage(), e); + } + } + + logger.info("Successfully parsed {} articles from Youth News", articles.size()); + return articles; + } catch (Exception e) { + logger.error("Failed to parse Youth News page: {}", e.getMessage(), e); + throw new ParseException("Failed to parse Youth News: " + e.getMessage(), e); + } + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/util/RetryUtils.java 
b/w11/java-cli-w11/src/main/java/com/example/datacollect/util/RetryUtils.java new file mode 100644 index 0000000..4cb12fc --- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/util/RetryUtils.java @@ -0,0 +1,51 @@ +package com.example.datacollect.util; + +import com.example.datacollect.exception.NetworkException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.concurrent.Callable; + +public class RetryUtils {/* runs a task and retries it, with doubling delays, only when it fails with NetworkException */ + private static final Logger logger = LoggerFactory.getLogger(RetryUtils.class); + + private static final int DEFAULT_MAX_RETRIES = 3;/* retries after the first attempt, so up to 4 calls in total */ + private static final long DEFAULT_RETRY_BASE_DELAY_MS = 500;/* delay before the first retry; doubled for each later one */ + + public static <T> T executeWithRetry(Callable<T> task) throws Exception {/* same as the three-argument overload with the defaults above */ + return executeWithRetry(task, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_BASE_DELAY_MS); + } + + public static <T> T executeWithRetry(Callable<T> task, int maxRetries, long baseDelayMs) throws Exception {/* returns the task's result; rethrows the last NetworkException once retries are used up, and any other exception immediately */ + if (maxRetries < 0) { throw new IllegalArgumentException("maxRetries must be >= 0: " + maxRetries); } Exception lastException = null;/* with a negative count the loop would never run and a null exception would be thrown */ + + for (int attempt = 0; attempt <= maxRetries; attempt++) { + try { + if (attempt > 0) { + long waitTime = (long) (baseDelayMs * Math.pow(2, attempt - 1));/* baseDelayMs, then 2x, 4x, ... */ + logger.info("Retry attempt {}/{} for task, waiting {} ms", attempt, maxRetries, waitTime); + Thread.sleep(waitTime); + } + + return task.call(); + } catch (Exception e) { + lastException = e; + + if (e instanceof NetworkException) { + logger.warn("Network error on attempt {}: {}", attempt, e.getMessage()); + + if (attempt < maxRetries) { + long nextWaitTime = (long) (baseDelayMs * Math.pow(2, attempt)); + logger.info("Will retry in {} ms...", nextWaitTime); + continue; + } + } else { if (e instanceof InterruptedException) { Thread.currentThread().interrupt(); }/* restore the interrupt flag that was cleared when the exception was thrown */ + logger.error("Non-retryable error: {}", e.getMessage()); + throw e; + } + } + } + + logger.error("All {} retry attempts failed", maxRetries + 1); + throw lastException; + } +} diff --git a/w11/java-cli-w11/src/main/java/com/example/datacollect/view/ConsoleView.java b/w11/java-cli-w11/src/main/java/com/example/datacollect/view/ConsoleView.java new file mode 100644 index 0000000..4665db0 
--- /dev/null +++ b/w11/java-cli-w11/src/main/java/com/example/datacollect/view/ConsoleView.java @@ -0,0 +1,46 @@ +package com.example.datacollect.view; + +import com.example.datacollect.model.Article; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.List; +import java.util.Scanner; + +public class ConsoleView {/* console I/O: reads commands from System.in and prints ANSI-coloured messages to System.out */ + private static final Logger logger = LoggerFactory.getLogger(ConsoleView.class); + private static final String ANSI_RESET = "\u001B[0m"; + private static final String ANSI_GREEN = "\u001B[32m"; + private static final String ANSI_RED = "\u001B[31m"; + private static final String ANSI_BLUE = "\u001B[34m"; + + private final Scanner scanner = new Scanner(System.in); + + public String readLine() {/* prints the "> " prompt and blocks for one line; Scanner.nextLine() throws NoSuchElementException at end of input */ + System.out.print("> "); + String input = scanner.nextLine(); + return input;/* the line the user typed */ + } + + public void printSuccess(String msg) {/* green */ + System.out.println(ANSI_GREEN + msg + ANSI_RESET); + } + + public void printError(String msg) {/* red */ + System.out.println(ANSI_RED + msg + ANSI_RESET); + } + + public void printInfo(String msg) {/* blue */ + System.out.println(ANSI_BLUE + msg + ANSI_RESET); + } + + public void display(List<Article>
articles) { + if (articles.isEmpty()) { + printInfo("暂无文章,请先执行 crawl。"); + return; + } + for (int i = 0; i < articles.size(); i++) { + Article a = articles.get(i); + System.out.println((i + 1) + ". " + a.getTitle() + " | " + a.getUrl()); + } + } +} diff --git a/w11/java-cli-w11/src/main/resources/logback.xml b/w11/java-cli-w11/src/main/resources/logback.xml new file mode 100644 index 0000000..aa0a06b --- /dev/null +++ b/w11/java-cli-w11/src/main/resources/logback.xml @@ -0,0 +1,24 @@ + + + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + logs/crawler.log + + logs/crawler.%d{yyyy-MM-dd}.log + 30 + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + diff --git a/w11/java-cli-w11/target/classes/logback.xml b/w11/java-cli-w11/target/classes/logback.xml new file mode 100644 index 0000000..aa0a06b --- /dev/null +++ b/w11/java-cli-w11/target/classes/logback.xml @@ -0,0 +1,24 @@ + + + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + logs/crawler.log + + logs/crawler.%d{yyyy-MM-dd}.log + 30 + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + diff --git a/w11/java-cli-w11/target/maven-archiver/pom.properties b/w11/java-cli-w11/target/maven-archiver/pom.properties new file mode 100644 index 0000000..5c1de34 --- /dev/null +++ b/w11/java-cli-w11/target/maven-archiver/pom.properties @@ -0,0 +1,3 @@ +artifactId=datacollect-cli +groupId=com.example +version=0.1.0 diff --git a/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst b/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst new file mode 100644 index 0000000..1ead6c5 --- /dev/null +++ b/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst @@ -0,0 +1,22 @@ +com\example\datacollect\command\ListCommand.class 
+com\example\datacollect\strategy\PeopleStrategy.class +com\example\datacollect\command\CrawlCommand.class +com\example\datacollect\strategy\BlogStrategy.class +com\example\datacollect\repository\ArticleRepository.class +com\example\datacollect\Main.class +com\example\datacollect\view\ConsoleView.class +com\example\datacollect\command\ExitCommand.class +com\example\datacollect\command\HelpCommand.class +com\example\datacollect\util\RetryUtils.class +com\example\datacollect\strategy\NewsStrategy.class +com\example\datacollect\command\Command.class +com\example\datacollect\controller\CrawlerController.class +com\example\datacollect\exception\CrawlerException.class +com\example\datacollect\exception\NetworkException.class +com\example\datacollect\command\AnalyzeCommand.class +com\example\datacollect\strategy\StrategyFactory.class +com\example\datacollect\strategy\HnuNewsStrategy.class +com\example\datacollect\strategy\YouthStrategy.class +com\example\datacollect\exception\ParseException.class +com\example\datacollect\strategy\CrawlStrategy.class +com\example\datacollect\model\Article.class diff --git a/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst b/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst new file mode 100644 index 0000000..937e5d7 --- /dev/null +++ b/w11/java-cli-w11/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst @@ -0,0 +1,22 @@ +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\NewsStrategy.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\controller\CrawlerController.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\repository\ArticleRepository.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\HnuNewsStrategy.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\ExitCommand.java 
+C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\Command.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\Main.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\CrawlCommand.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\exception\NetworkException.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\StrategyFactory.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\BlogStrategy.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\util\RetryUtils.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\HelpCommand.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\exception\CrawlerException.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\exception\ParseException.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\model\Article.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\view\ConsoleView.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\AnalyzeCommand.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\YouthStrategy.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\command\ListCommand.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\CrawlStrategy.java +C:\Users\27687\Desktop\java-cli\src\main\java\com\example\datacollect\strategy\PeopleStrategy.java