Quellcode durchsuchen

抓取国务院要文

Signed-off-by: tangsong <86121657@qq.com>
tangsong vor 1 Jahr
Ursprung
Commit
c448b61b4e

+ 5 - 0
src/main/java/com/unis/news/service/BizSinStateCouncilNoticeService.java

@@ -30,4 +30,9 @@ public interface BizSinStateCouncilNoticeService extends IService<BizSinStateCou
30
      * @date  2024/07/23 10:18
30
      * @date  2024/07/23 10:18
31
      */
31
      */
32
     void add(BizSinStateCouncilNotice addParam);
32
     void add(BizSinStateCouncilNotice addParam);
33
+
34
+    /**
35
+     * 使用定时任务抓取数据
36
+     */
37
+    void taskAction();
33
 }
38
 }

+ 5 - 7
src/main/java/com/unis/news/service/impl/BizSinQinghaiNewsServiceImpl.java

@@ -42,8 +42,6 @@ import java.util.List;
42
 public class BizSinQinghaiNewsServiceImpl extends ServiceImpl<BizSinQinghaiNewsMapper, BizSinQinghaiNews> implements BizSinQinghaiNewsService {
42
 public class BizSinQinghaiNewsServiceImpl extends ServiceImpl<BizSinQinghaiNewsMapper, BizSinQinghaiNews> implements BizSinQinghaiNewsService {
43
     @Resource
43
     @Resource
44
     private NewsUrl newsUrl;
44
     private NewsUrl newsUrl;
45
-    @Resource
46
-    private WebClient webClient;
47
     @Transactional(rollbackFor = Exception.class)
45
     @Transactional(rollbackFor = Exception.class)
48
     @Override
46
     @Override
49
     public void add(BizSinQinghaiNews addParam) {
47
     public void add(BizSinQinghaiNews addParam) {
@@ -66,12 +64,14 @@ public class BizSinQinghaiNewsServiceImpl extends ServiceImpl<BizSinQinghaiNewsM
66
             List<HtmlParagraph> divList=page.getByXPath("//p[@class='item']");
64
             List<HtmlParagraph> divList=page.getByXPath("//p[@class='item']");
67
             int i = 0;
65
             int i = 0;
68
             for (HtmlParagraph node : divList) {
66
             for (HtmlParagraph node : divList) {
67
+                if (i==8){ //只抓取最近8条数
68
+                    break;
69
+                }
69
                 HtmlSpan span = node.getFirstByXPath("span[@class='gray']"); //得到发布日期
70
                 HtmlSpan span = node.getFirstByXPath("span[@class='gray']"); //得到发布日期
70
                 HtmlAnchor anchor = node.getFirstByXPath("a[@target='_blank']"); //得到发布标题与地址
71
                 HtmlAnchor anchor = node.getFirstByXPath("a[@target='_blank']"); //得到发布标题与地址
71
                 if (span == null || anchor == null){
72
                 if (span == null || anchor == null){
72
                     continue;
73
                     continue;
73
                 }
74
                 }
74
-                i++;
75
                 //先查询是否有相同标题内容,存在跳过
75
                 //先查询是否有相同标题内容,存在跳过
76
                 QueryWrapper<BizSinQinghaiNews> queryWrapper = new QueryWrapper<>();
76
                 QueryWrapper<BizSinQinghaiNews> queryWrapper = new QueryWrapper<>();
77
                 queryWrapper.lambda().eq(BizSinQinghaiNews::getTitle,anchor.getAttribute("title"));
77
                 queryWrapper.lambda().eq(BizSinQinghaiNews::getTitle,anchor.getAttribute("title"));
@@ -79,10 +79,10 @@ public class BizSinQinghaiNewsServiceImpl extends ServiceImpl<BizSinQinghaiNewsM
79
                 if (!list.isEmpty()){ //表示此内容已经存在
79
                 if (!list.isEmpty()){ //表示此内容已经存在
80
                     continue;
80
                     continue;
81
                 }
81
                 }
82
+                i++;
82
                 BizSinQinghaiNews bizSinQinghaiNews  = new BizSinQinghaiNews();
83
                 BizSinQinghaiNews bizSinQinghaiNews  = new BizSinQinghaiNews();
83
                 //发布日期
84
                 //发布日期
84
                 String[] string = span.getTextContent().split("]");
85
                 String[] string = span.getTextContent().split("]");
85
-                System.out.println("[[["+string[0].replace("[","")+"1111");
86
                 Date publishDate = DateUtils.parseDate(string[0].replace("[",""),"yyyy/MM/dd");
86
                 Date publishDate = DateUtils.parseDate(string[0].replace("[",""),"yyyy/MM/dd");
87
                 bizSinQinghaiNews.setCreateTime(new Date());
87
                 bizSinQinghaiNews.setCreateTime(new Date());
88
                 bizSinQinghaiNews.setUpdateTime(new Date());
88
                 bizSinQinghaiNews.setUpdateTime(new Date());
@@ -92,9 +92,7 @@ public class BizSinQinghaiNewsServiceImpl extends ServiceImpl<BizSinQinghaiNewsM
92
                 bizSinQinghaiNews.setIsRedirect(1);
92
                 bizSinQinghaiNews.setIsRedirect(1);
93
                 bizSinQinghaiNews.setStatus(2); //默认使用提交状态
93
                 bizSinQinghaiNews.setStatus(2); //默认使用提交状态
94
                 super.save(bizSinQinghaiNews);
94
                 super.save(bizSinQinghaiNews);
95
-                if (i==8){ //只抓取最近8条数
96
-                    break;
97
-                }
95
+
98
             }
96
             }
99
 
97
 
100
         } catch (Exception e) {
98
         } catch (Exception e) {

+ 70 - 1
src/main/java/com/unis/news/service/impl/BizSinStateCouncilNoticeServiceImpl.java

@@ -12,13 +12,24 @@
12
  */
12
  */
13
 package com.unis.news.service.impl;
13
 package com.unis.news.service.impl;
14
 
14
 
15
+import com.alibaba.fastjson2.util.DateUtils;
16
+import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
15
 import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
17
 import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
18
+import com.unis.config.NewsUrl;
19
+import com.unis.news.entity.BizSinQinghaiNews;
16
 import com.unis.news.entity.BizSinStateCouncilNotice;
20
 import com.unis.news.entity.BizSinStateCouncilNotice;
17
 import com.unis.news.mapper.BizSinStateCouncilNoticeMapper;
21
 import com.unis.news.mapper.BizSinStateCouncilNoticeMapper;
18
 import com.unis.news.service.BizSinStateCouncilNoticeService;
22
 import com.unis.news.service.BizSinStateCouncilNoticeService;
23
+import org.htmlunit.BrowserVersion;
24
+import org.htmlunit.WebClient;
25
+import org.htmlunit.html.*;
19
 import org.springframework.stereotype.Service;
26
 import org.springframework.stereotype.Service;
20
 import org.springframework.transaction.annotation.Transactional;
27
 import org.springframework.transaction.annotation.Transactional;
21
 
28
 
29
+import javax.annotation.Resource;
30
+import java.util.Date;
31
+import java.util.List;
32
+
22
 /**
33
 /**
23
  * 要闻概况-国务院公告管理表Service接口实现类
34
  * 要闻概况-国务院公告管理表Service接口实现类
24
  *
35
  *
@@ -27,11 +38,69 @@ import org.springframework.transaction.annotation.Transactional;
27
  **/
38
  **/
28
 @Service
39
 @Service
29
 public class BizSinStateCouncilNoticeServiceImpl extends ServiceImpl<BizSinStateCouncilNoticeMapper, BizSinStateCouncilNotice> implements BizSinStateCouncilNoticeService {
40
 public class BizSinStateCouncilNoticeServiceImpl extends ServiceImpl<BizSinStateCouncilNoticeMapper, BizSinStateCouncilNotice> implements BizSinStateCouncilNoticeService {
41
+    @Resource
42
+    private NewsUrl newsUrl;
30
 
43
 
31
     @Transactional(rollbackFor = Exception.class)
44
     @Transactional(rollbackFor = Exception.class)
32
     @Override
45
     @Override
33
     public void add(BizSinStateCouncilNotice addParam) {
46
     public void add(BizSinStateCouncilNotice addParam) {
34
-
47
+        // 在添加
48
+        this.save(addParam);
35
     }
49
     }
36
 
50
 
51
+    /**
52
+     * 使用定时任务抓取数据
53
+     */
54
+    @Override
55
+    public void taskAction() {
56
+        WebClient webClient = null;
57
+        try {
58
+            webClient = new WebClient(BrowserVersion.CHROME);
59
+            // 配置 WebClient 的参数,例如 JavaScript 支持
60
+            webClient.getOptions().setJavaScriptEnabled(false);
61
+            webClient.getOptions().setCssEnabled(false);
62
+            final HtmlPage page=webClient.getPage(newsUrl.getStateCouncil());    //从指定URL获取HtmlPage
63
+
64
+            List<HtmlHeading4> h4List=page.getByXPath("//h4");
65
+            int i = 0;
66
+            for (HtmlHeading4 node : h4List) {
67
+                if (i==8){ //只抓取最近8条数
68
+                    break;
69
+                }
70
+                    HtmlSpan span = node.getFirstByXPath("span[@class='date']"); //得到发布日期
71
+                    HtmlAnchor anchor = node.getFirstByXPath("a[@target='_blank']"); //得到发布标题与地址
72
+                    if (span == null || anchor == null){
73
+                        continue;
74
+                    }
75
+                    i++;
76
+                    //先查询是否有相同标题内容,存在跳过
77
+                    String title = anchor.getTextContent().trim();
78
+                    QueryWrapper<BizSinStateCouncilNotice> queryWrapper = new QueryWrapper<>();
79
+                    queryWrapper.lambda().eq(BizSinStateCouncilNotice::getTitle,title);
80
+                    List<BizSinStateCouncilNotice> list =  this.list(queryWrapper);
81
+                    if (!list.isEmpty()){ //表示此内容已经存在
82
+                        continue;
83
+                    }
84
+                    BizSinStateCouncilNotice notice  = new BizSinStateCouncilNotice();
85
+                    //发布日期
86
+                    Date publishDate = DateUtils.parseDate(span.getTextContent().trim(),"yyyy-MM-dd");
87
+                    notice.setCreateTime(new Date());
88
+                    notice.setUpdateTime(new Date());
89
+                    notice.setTitle(title);
90
+                    notice.setPublishDate(publishDate);
91
+                    notice.setRedirectUrl(anchor.getAttribute("href").trim());
92
+                    notice.setIsRedirect(1);
93
+                    notice.setStatus(2); //默认使用提交状态
94
+                    super.save(notice);
95
+
96
+            }
97
+
98
+
99
+        } catch (Exception e) {
100
+            e.printStackTrace();
101
+        }finally {
102
+            webClient.close();    //关闭客户端
103
+     }
104
+
105
+    }
37
 }
106
 }

+ 13 - 49
src/main/java/com/unis/news/tasksNews/TasksNewsRunning.java

@@ -3,6 +3,7 @@ package com.unis.news.tasksNews;
3
 import com.unis.config.NewsUrl;
3
 import com.unis.config.NewsUrl;
4
 import com.unis.news.entity.BizSinQinghaiNews;
4
 import com.unis.news.entity.BizSinQinghaiNews;
5
 import com.unis.news.service.BizSinQinghaiNewsService;
5
 import com.unis.news.service.BizSinQinghaiNewsService;
6
+import com.unis.news.service.BizSinStateCouncilNoticeService;
6
 import lombok.extern.slf4j.Slf4j;
7
 import lombok.extern.slf4j.Slf4j;
7
 import org.htmlunit.BrowserVersion;
8
 import org.htmlunit.BrowserVersion;
8
 import org.htmlunit.Page;
9
 import org.htmlunit.Page;
@@ -26,67 +27,30 @@ public class TasksNewsRunning {
26
     @Resource
27
     @Resource
27
     private BizSinQinghaiNewsService newsService;
28
     private BizSinQinghaiNewsService newsService;
28
     @Resource
29
     @Resource
30
+    private BizSinStateCouncilNoticeService bizSinStateCouncilNoticeService;
31
+    @Resource
29
     private NewsUrl newsUrl;
32
     private NewsUrl newsUrl;
30
     @Resource
33
     @Resource
31
     private WebClient webClient;
34
     private WebClient webClient;
32
 
35
 
33
-    /**
34
-     * test
35
-     */
36
-    //  n秒 直接在方法上使用注解即可完成扫描
37
-    @Scheduled(cron = "*/20 * * * * ?")
38
-    public void run() throws Exception {
39
-        // 测试查询
40
-        List<BizSinQinghaiNews> list = newsService.list();
41
-        log.info("test get all list for news: {}", list);
42
-
43
-        log.info("newsUrl stateCouncil: {}", newsUrl.getStateCouncil());
44
-        log.info("newsUrl qinghai: {}", newsUrl.getQinghai());
45
 
36
 
46
-        final HtmlPage page = webClient.getPage(newsUrl.getQinghai());
47
-        log.info("page: {}", page);
48
-
49
-        List<HtmlAnchor> divList = page.getByXPath("//a");
50
-        /*for (HtmlAnchor node : divList) {
51
-            log.info("webClient stateCouncil asXml: {}", node.asXml());
52
-            log.info("webClient stateCouncil href: {}", node.getAttribute("href"));
53
-            log.info("webClient stateCouncil href: {}", node.getAttribute("href"));
54
-            log.info("webClient stateCouncil src: {}", node.getAttribute("src"));
55
-            log.info("webClient stateCouncil getLocalName: {}", node.getLocalName());
56
-        }*/
57
-
58
-        log.info("----每天n秒执行---");
59
-    }
60
     /**
37
     /**
61
      * 抓取青海省政务网站数据,晚上3点执行一次
38
      * 抓取青海省政务网站数据,晚上3点执行一次
62
      */
39
      */
63
     @Scheduled(cron = "0 0 3 * * ?")
40
     @Scheduled(cron = "0 0 3 * * ?")
64
-    public void qinghaiNewsrun() throws Exception {
41
+    public void qinghaiNewsun()  {
65
         newsService.taskAction();
42
         newsService.taskAction();
66
-        log.info("----每天n秒执行---");
43
+        log.info("----抓取青海省政务网站数据,晚上3点执行一次---");
44
+    }
45
+    /**
46
+     * 抓取国务院网站数据,晚上3点执行一次
47
+     */
48
+    @Scheduled(cron = "0 0 3 * * ?")
49
+    public void stateCouncilNoticeun()   {
50
+        bizSinStateCouncilNoticeService.taskAction();
51
+        log.info("----抓取国务院网站数据,晚上3点执行一次---");
67
     }
52
     }
68
 
53
 
69
-    public static void main(String[] args) {
70
-        WebClient webClient=null;
71
-        try {
72
-            webClient= new WebClient(BrowserVersion.CHROME);    //定义一个WebClient
73
-            final HtmlPage page=webClient.getPage("http://fgw.qinghai.gov.cn//");    //从指定URL获取HtmlPage
74
-
75
-            List<HtmlAnchor> divList=page.getByXPath("//a");
76
-            for (HtmlAnchor node : divList) {
77
-                System.out.println("***********************************************8");
78
-                System.out.println("输出:"+node.asXml());
79
-                System.out.println("输出1:"+node.getAttribute("href"));
80
-                System.out.println("输出2:"+node.getAttribute("src"));
81
-                System.out.println("输出3:"+node.getLocalName());
82
-            }
83
 
54
 
84
-        } catch (Exception e) {
85
-            // TODO: handle exception
86
-            e.printStackTrace();
87
-        }finally {
88
-            webClient.close();    //关闭客户端
89
-        }
90
-    }
91
 
55
 
92
 }
56
 }