Переглянути джерело

添加来源字段
设计每次抓取成功后删除以前抓取数据,只保留最近抓取数据

tangsong 1 рік тому
батько
коміт
b264a6a048

+ 3 - 12
src/main/java/com/unis/news/entity/BizSinQinghaiNews.java

@@ -1,15 +1,3 @@
1
-/*
2
- * Copyright [2022] [https://www.xiaonuo.vip]
3
- *
4
- * Snowy采用APACHE LICENSE 2.0开源协议,您在使用过程中,需要注意以下几点:
5
- *
6
- * 1.请不要删除和修改根目录下的LICENSE文件。
7
- * 2.请不要删除和修改Snowy源码头部的版权声明。
8
- * 3.本项目代码可免费商业使用,商业使用请保留源码和相关描述文件的项目出处,作者声明等。
9
- * 4.分发源码时候,请注明软件出处 https://www.xiaonuo.vip
10
- * 5.不可二次分发开源参与同类竞品,如有想法可联系团队xiaonuobase@qq.com商议合作。
11
- * 6.若您的项目无法满足以上几点,需要更多功能代码,获取Snowy商业授权许可,请在官网购买授权,地址为 https://www.xiaonuo.vip
12
- */
13
 package com.unis.news.entity;
1
 package com.unis.news.entity;
14
 
2
 
15
 import com.baomidou.mybatisplus.annotation.FieldFill;
3
 import com.baomidou.mybatisplus.annotation.FieldFill;
@@ -64,6 +52,9 @@ public class BizSinQinghaiNews {
64
     /** 跳转地址 */
52
     /** 跳转地址 */
65
     private String redirectUrl;
53
     private String redirectUrl;
66
 
54
 
55
+    /** 来源 0:手动添加;1:爬虫抓取;*/
56
+    private Integer source;
57
+
67
     /** 状态,1:保存;2:提交(启用);3:禁用 */
58
     /** 状态,1:保存;2:提交(启用);3:禁用 */
68
     private Integer status;
59
     private Integer status;
69
 }
60
 }

+ 3 - 0
src/main/java/com/unis/news/entity/BizSinStateCouncilNotice.java

@@ -64,6 +64,9 @@ public class BizSinStateCouncilNotice {
64
     /** 跳转地址 */
64
     /** 跳转地址 */
65
     private String redirectUrl;
65
     private String redirectUrl;
66
 
66
 
67
+    /** 来源 0:手动添加;1:爬虫抓取;*/
68
+    private Integer source;
69
+
67
     /** 状态,1:保存;2:提交(启用);3:禁用; */
70
     /** 状态,1:保存;2:提交(启用);3:禁用; */
68
     private Integer status;
71
     private Integer status;
69
 }
72
 }

+ 21 - 9
src/main/java/com/unis/news/service/impl/BizSinQinghaiNewsServiceImpl.java

@@ -29,6 +29,7 @@ import org.springframework.stereotype.Service;
29
 import org.springframework.transaction.annotation.Transactional;
29
 import org.springframework.transaction.annotation.Transactional;
30
 
30
 
31
 import javax.annotation.Resource;
31
 import javax.annotation.Resource;
32
+import java.util.ArrayList;
32
 import java.util.Date;
33
 import java.util.Date;
33
 import java.util.List;
34
 import java.util.List;
34
 
35
 
@@ -63,6 +64,7 @@ public class BizSinQinghaiNewsServiceImpl extends ServiceImpl<BizSinQinghaiNewsM
63
             final HtmlPage page=webClient.getPage(newsUrl.getQinghai());    //从指定URL获取HtmlPage
64
             final HtmlPage page=webClient.getPage(newsUrl.getQinghai());    //从指定URL获取HtmlPage
64
             List<HtmlParagraph> divList=page.getByXPath("//p[@class='item']");
65
             List<HtmlParagraph> divList=page.getByXPath("//p[@class='item']");
65
             int i = 0;
66
             int i = 0;
67
+            List<BizSinQinghaiNews> list1 = new ArrayList<>();
66
             for (HtmlParagraph node : divList) {
68
             for (HtmlParagraph node : divList) {
67
                 if (i==8){ //只抓取最近8条数据
69
                 if (i==8){ //只抓取最近8条数据
68
                     break;
70
                     break;
@@ -73,13 +75,14 @@ public class BizSinQinghaiNewsServiceImpl extends ServiceImpl<BizSinQinghaiNewsM
73
                     continue;
75
                     continue;
74
                 }
76
                 }
75
                 //先查询是否有相同标题内容,存在跳过
77
                 //先查询是否有相同标题内容,存在跳过
76
-                QueryWrapper<BizSinQinghaiNews> queryWrapper = new QueryWrapper<>();
77
-                queryWrapper.lambda().eq(BizSinQinghaiNews::getTitle,anchor.getAttribute("title"));
78
-                List<BizSinQinghaiNews> list =  this.list(queryWrapper);
79
-                if (!list.isEmpty()){ //表示此内容已经存在
80
-                    continue;
81
-                }
82
-                i++;
78
+//                QueryWrapper<BizSinQinghaiNews> queryWrapper = new QueryWrapper<>();
79
+//                queryWrapper.lambda()
80
+//                        .eq(BizSinQinghaiNews::getTitle,anchor.getAttribute("title"))
81
+//                        .eq(BizSinQinghaiNews::getSource,1); //来源
82
+//                List<BizSinQinghaiNews> list =  this.list(queryWrapper);
83
+//                if (!list.isEmpty()){ //表示此内容已经存在
84
+//                    continue;
85
+//                }
83
                 BizSinQinghaiNews bizSinQinghaiNews  = new BizSinQinghaiNews();
86
                 BizSinQinghaiNews bizSinQinghaiNews  = new BizSinQinghaiNews();
84
                 //发布日期
87
                 //发布日期
85
                 String[] string = span.getTextContent().split("]");
88
                 String[] string = span.getTextContent().split("]");
@@ -90,11 +93,20 @@ public class BizSinQinghaiNewsServiceImpl extends ServiceImpl<BizSinQinghaiNewsM
90
                 bizSinQinghaiNews.setTitle(anchor.getAttribute("title").trim());
93
                 bizSinQinghaiNews.setTitle(anchor.getAttribute("title").trim());
91
                 bizSinQinghaiNews.setRedirectUrl(anchor.getAttribute("href").trim());
94
                 bizSinQinghaiNews.setRedirectUrl(anchor.getAttribute("href").trim());
92
                 bizSinQinghaiNews.setIsRedirect(1);
95
                 bizSinQinghaiNews.setIsRedirect(1);
96
+                bizSinQinghaiNews.setSource(1); //来源
93
                 bizSinQinghaiNews.setStatus(2); //默认使用提交状态
97
                 bizSinQinghaiNews.setStatus(2); //默认使用提交状态
94
-                super.save(bizSinQinghaiNews);
95
-
98
+                list1.add(bizSinQinghaiNews);
99
+                i++;
96
             }
100
             }
101
+            if (list1.size() >0) {
102
+                //本次抓取到数据后,先删除此前所有爬虫来源(source=1)的数据,只保留本次抓取结果;手动添加(source=0)的数据不受影响
103
+                QueryWrapper<BizSinQinghaiNews> queryWrapper = new QueryWrapper<>();
104
+                queryWrapper.lambda()
105
+                        .eq(BizSinQinghaiNews::getSource, 1); //来源
97
 
106
 
107
+                this.remove(queryWrapper);
108
+                this.saveOrUpdateBatch(list1);
109
+            }
98
         } catch (Exception e) {
110
         } catch (Exception e) {
99
             e.printStackTrace();
111
             e.printStackTrace();
100
         }finally {
112
         }finally {

+ 24 - 12
src/main/java/com/unis/news/service/impl/BizSinStateCouncilNoticeServiceImpl.java

@@ -27,6 +27,7 @@ import org.springframework.stereotype.Service;
27
 import org.springframework.transaction.annotation.Transactional;
27
 import org.springframework.transaction.annotation.Transactional;
28
 
28
 
29
 import javax.annotation.Resource;
29
 import javax.annotation.Resource;
30
+import java.util.ArrayList;
30
 import java.util.Date;
31
 import java.util.Date;
31
 import java.util.List;
32
 import java.util.List;
32
 
33
 
@@ -63,6 +64,7 @@ public class BizSinStateCouncilNoticeServiceImpl extends ServiceImpl<BizSinState
63
 
64
 
64
             List<HtmlHeading4> h4List=page.getByXPath("//h4");
65
             List<HtmlHeading4> h4List=page.getByXPath("//h4");
65
             int i = 0;
66
             int i = 0;
67
+            List<BizSinStateCouncilNotice> list1 = new ArrayList<>();
66
             for (HtmlHeading4 node : h4List) {
68
             for (HtmlHeading4 node : h4List) {
67
                 if (i==8){ //只抓取最近8条数据
69
                 if (i==8){ //只抓取最近8条数据
68
                     break;
70
                     break;
@@ -72,15 +74,17 @@ public class BizSinStateCouncilNoticeServiceImpl extends ServiceImpl<BizSinState
72
                     if (span == null || anchor == null){
74
                     if (span == null || anchor == null){
73
                         continue;
75
                         continue;
74
                     }
76
                     }
75
-                    i++;
76
-                    //先查询是否有相同标题内容,存在跳过
77
-                    String title = anchor.getTextContent().trim();
78
-                    QueryWrapper<BizSinStateCouncilNotice> queryWrapper = new QueryWrapper<>();
79
-                    queryWrapper.lambda().eq(BizSinStateCouncilNotice::getTitle,title);
80
-                    List<BizSinStateCouncilNotice> list =  this.list(queryWrapper);
81
-                    if (!list.isEmpty()){ //表示此内容已经存在
82
-                        continue;
83
-                    }
77
+
78
+                String title = anchor.getTextContent().trim();
79
+                //先查询是否有相同标题内容,存在跳过
80
+//                    QueryWrapper<BizSinStateCouncilNotice> queryWrapper = new QueryWrapper<>();
81
+//                    queryWrapper.lambda()
82
+//                            .eq(BizSinStateCouncilNotice::getTitle,title)
83
+//                            .eq(BizSinStateCouncilNotice::getSource,1); //来源
84
+//                    List<BizSinStateCouncilNotice> list =  this.list(queryWrapper);
85
+//                    if (!list.isEmpty()){ //表示此内容已经存在
86
+//                        continue;
87
+//                    }
84
                     BizSinStateCouncilNotice notice  = new BizSinStateCouncilNotice();
88
                     BizSinStateCouncilNotice notice  = new BizSinStateCouncilNotice();
85
                     //发布日期
89
                     //发布日期
86
                     Date publishDate = DateUtils.parseDate(span.getTextContent().trim(),"yyyy-MM-dd");
90
                     Date publishDate = DateUtils.parseDate(span.getTextContent().trim(),"yyyy-MM-dd");
@@ -90,11 +94,19 @@ public class BizSinStateCouncilNoticeServiceImpl extends ServiceImpl<BizSinState
90
                     notice.setPublishDate(publishDate);
94
                     notice.setPublishDate(publishDate);
91
                     notice.setRedirectUrl(anchor.getAttribute("href").trim());
95
                     notice.setRedirectUrl(anchor.getAttribute("href").trim());
92
                     notice.setIsRedirect(1);
96
                     notice.setIsRedirect(1);
97
+                    notice.setSource(1); //来源
93
                     notice.setStatus(2); //默认使用提交状态
98
                     notice.setStatus(2); //默认使用提交状态
94
-                    super.save(notice);
95
-
99
+                    list1.add(notice);
100
+                    i++;
101
+            }
102
+            //本次抓取到数据后,先删除此前所有爬虫来源(source=1)的数据,只保留本次抓取结果;手动添加(source=0)的数据不受影响
103
+            if (list1.size() >0){
104
+                QueryWrapper<BizSinStateCouncilNotice> queryWrapper = new QueryWrapper<>();
105
+                queryWrapper.lambda()
106
+                        .eq(BizSinStateCouncilNotice::getSource,1); //来源
107
+                this.remove(queryWrapper);
108
+                this.saveOrUpdateBatch(list1);
96
             }
109
             }
97
-
98
 
110
 
99
         } catch (Exception e) {
111
         } catch (Exception e) {
100
             e.printStackTrace();
112
             e.printStackTrace();

+ 1 - 0
src/main/java/com/unis/news/tasksNews/TasksNewsRunning.java

@@ -46,6 +46,7 @@ public class TasksNewsRunning {
46
      * 抓取国务院网站数据,每天凌晨3点执行一次
46
      * 抓取国务院网站数据,每天凌晨3点执行一次
47
      */
47
      */
48
     @Scheduled(cron = "0 0 3 * * ?")
48
     @Scheduled(cron = "0 0 3 * * ?")
49
+//    @Scheduled(cron = "0/20 * * * * ?")
49
     public void stateCouncilNoticeun()   {
50
     public void stateCouncilNoticeun()   {
50
         bizSinStateCouncilNoticeService.taskAction();
51
         bizSinStateCouncilNoticeService.taskAction();
51
         log.info("----抓取国务院网站数据,晚上3点执行一次---");
52
         log.info("----抓取国务院网站数据,晚上3点执行一次---");