|
@@ -2,10 +2,20 @@ package htmlUnit;
|
2
|
2
|
|
3
|
3
|
import org.htmlunit.BrowserVersion;
|
4
|
4
|
import org.htmlunit.WebClient;
|
5
|
|
-import org.htmlunit.html.HtmlBody;
|
|
5
|
+import org.htmlunit.WebConnection;
|
|
6
|
+import org.htmlunit.WebRequest;
|
|
7
|
+import org.htmlunit.WebResponse;
|
|
8
|
+import org.htmlunit.html.HtmlAcronym;
|
|
9
|
+import org.htmlunit.html.HtmlAnchor;
|
|
10
|
+import org.htmlunit.html.HtmlDivision;
|
|
11
|
+import org.htmlunit.html.HtmlElement;
|
6
|
12
|
import org.htmlunit.html.HtmlPage;
|
|
13
|
+import org.htmlunit.html.HtmlParagraph;
|
|
14
|
+
|
|
15
|
+import org.htmlunit.html.HtmlSpan;
|
7
|
16
|
import org.junit.Test;
|
8
|
17
|
|
|
18
|
+import java.io.IOException;
|
9
|
19
|
import java.util.List;
|
10
|
20
|
|
11
|
21
|
/**
|
|
@@ -23,19 +33,32 @@ public class HtmlUnitTest {
|
23
|
33
|
WebClient webClient=null;
|
24
|
34
|
try {
|
25
|
35
|
webClient= new WebClient(BrowserVersion.CHROME); //定义一个WebClient
|
26
|
|
- final HtmlPage page=webClient.getPage("http://fgw.qinghai.gov.cn//"); //从指定URL获取HtmlPage
|
27
|
|
-
|
28
|
|
- List<HtmlBody> divList=page.getByXPath("//body");
|
29
|
|
- for (HtmlBody htmlBody : divList) {
|
30
|
|
- System.out.println("***********************************************8");
|
31
|
|
- System.out.println("输出:"+htmlBody.asXml());
|
|
36
|
+ // Configure the client before fetching: disable JavaScript and set the connection timeout
|
|
37
|
+ webClient.getOptions().setJavaScriptEnabled(false);
|
|
38
|
+ webClient.getOptions().setTimeout(10000*6); // set the timeout to 60 seconds (10000*6 = 60000 ms)
|
|
39
|
+ final HtmlPage page=webClient.getPage("http://www.qinghai.gov.cn/zwgk/xwdt/qhyw/"); //从指定URL获取HtmlPage
|
|
40
|
+ List<HtmlParagraph> divList=page.getByXPath("//p[@class='item']");
|
|
41
|
+ int i = 0;
|
|
42
|
+ for (HtmlParagraph node : divList) {
|
|
43
|
+ System.out.println("***********************************************"+i);
|
|
44
|
+ HtmlSpan span = node.getFirstByXPath("span[@class='gray']");
|
|
45
|
+ if (span != null){
|
|
46
|
+ System.out.println(span.getTextContent());
|
|
47
|
+ String[] string = span.getTextContent().split("]");
|
|
48
|
+ System.out.println("[[["+string[0].replace("[","").replace("/","-").trim()+"1111");
|
|
49
|
+ }
|
|
50
|
+ HtmlAnchor anchors = node.getFirstByXPath("a[@target='_blank']");
|
|
51
|
+ System.out.println(anchors.getAttribute("href"));
|
|
52
|
+ System.out.println(anchors.getAttribute("title"));
|
|
53
|
+ System.out.println("输出:"+node.asXml());
|
|
54
|
+ i++;
|
32
|
55
|
}
|
33
|
56
|
|
34
|
57
|
} catch (Exception e) {
|
35
|
58
|
// TODO: handle exception
|
36
|
59
|
e.printStackTrace();
|
37
|
60
|
}finally {
|
38
|
|
- webClient.close(); //关闭客户端
|
|
61
|
+ // webClient.close(); //关闭客户端
|
39
|
62
|
}
|
40
|
63
|
}
|
41
|
64
|
|