Quellcode durchsuchen

HtmlUnit爬虫

Signed-off-by: tangsong <86121657@qq.com>
tangsong vor 4 Monaten
Ursprung
Commit
f5abf0beb2
2 geänderte Dateien mit 48 neuen und 0 gelöschten Zeilen
  1. 6 0
      unis-web-app/pom.xml
  2. 42 0
      unis-web-app/src/test/java/htmlUnit/HtmlUnitTest.java

+ 6 - 0
unis-web-app/pom.xml

@@ -74,6 +74,12 @@
74 74
             <version>${dependency-check-maven.version}</version>
75 75
         </dependency>
76 76
 
77
+        <!-- HtmlUnit crawler -->
78
+        <dependency>
79
+            <groupId>org.htmlunit</groupId>
80
+            <artifactId>htmlunit</artifactId>
81
+            <version>4.3.0</version>
82
+        </dependency>
77 83
 
78 84
         <!-- https://mvnrepository.com/artifact/io.netty/netty-common -->
79 85
         <dependency>

+ 42 - 0
unis-web-app/src/test/java/htmlUnit/HtmlUnitTest.java

@@ -0,0 +1,42 @@
1
+package htmlUnit;
2
+
3
+import org.htmlunit.BrowserVersion;
4
+import org.htmlunit.WebClient;
5
+import org.htmlunit.html.HtmlBody;
6
+import org.htmlunit.html.HtmlPage;
7
+import org.junit.Test;
8
+
9
+import java.util.List;
10
+
11
+/**
12
+ * htmlUnit测试
13
+ * @Date: 2024-07-19 09:51
14
+ * @Author: TangSong
15
+ * @Description:
16
+ */
17
+public class HtmlUnitTest {
18
+
19
+
20
+
21
+    @Test
22
+    public  void testHtml() {
23
+        WebClient webClient=null;
24
+        try {
25
+            webClient= new WebClient(BrowserVersion.CHROME);    //定义一个WebClient
26
+            final HtmlPage page=webClient.getPage("http://fgw.qinghai.gov.cn//");    //从指定URL获取HtmlPage
27
+
28
+            List<HtmlBody> divList=page.getByXPath("//body");
29
+            for (HtmlBody htmlBody : divList) {
30
+                System.out.println("***********************************************8");
31
+                System.out.println("输出:"+htmlBody.asXml());
32
+            }
33
+
34
+        } catch (Exception e) {
35
+            // TODO: handle exception
36
+            e.printStackTrace();
37
+        }finally {
38
+            webClient.close();    //关闭客户端
39
+        }
40
+    }
41
+
42
+}