Explorar el Código

添加来源字段
设计为每次抓取成功后删除此前抓取的数据,只保留最近一次抓取的数据

tangsong hace 1 año
padre
commit
b264a6a048

+ 3 - 12
src/main/java/com/unis/news/entity/BizSinQinghaiNews.java

@@ -1,15 +1,3 @@
1
-/*
2
- * Copyright [2022] [https://www.xiaonuo.vip]
3
- *
4
- * Snowy采用APACHE LICENSE 2.0开源协议,您在使用过程中,需要注意以下几点:
5
- *
6
- * 1.请不要删除和修改根目录下的LICENSE文件。
7
- * 2.请不要删除和修改Snowy源码头部的版权声明。
8
- * 3.本项目代码可免费商业使用,商业使用请保留源码和相关描述文件的项目出处,作者声明等。
9
- * 4.分发源码时候,请注明软件出处 https://www.xiaonuo.vip
10
- * 5.不可二次分发开源参与同类竞品,如有想法可联系团队xiaonuobase@qq.com商议合作。
11
- * 6.若您的项目无法满足以上几点,需要更多功能代码,获取Snowy商业授权许可,请在官网购买授权,地址为 https://www.xiaonuo.vip
12
- */
13 1
 package com.unis.news.entity;
14 2
 
15 3
 import com.baomidou.mybatisplus.annotation.FieldFill;
@@ -64,6 +52,9 @@ public class BizSinQinghaiNews {
64 52
     /** 跳转地址 */
65 53
     private String redirectUrl;
66 54
 
55
+    /** 来源 0:手动添加;1:爬虫抓取;*/
56
+    private Integer source;
57
+
67 58
     /** 状态,1:保存;2:提交(启用);3:禁用 */
68 59
     private Integer status;
69 60
 }

+ 3 - 0
src/main/java/com/unis/news/entity/BizSinStateCouncilNotice.java

@@ -64,6 +64,9 @@ public class BizSinStateCouncilNotice {
64 64
     /** 跳转地址 */
65 65
     private String redirectUrl;
66 66
 
67
+    /** 来源 0:手动添加;1:爬虫抓取;*/
68
+    private Integer source;
69
+
67 70
     /** 状态,1:保存;2:提交(启用);3:禁用; */
68 71
     private Integer status;
69 72
 }

+ 21 - 9
src/main/java/com/unis/news/service/impl/BizSinQinghaiNewsServiceImpl.java

@@ -29,6 +29,7 @@ import org.springframework.stereotype.Service;
29 29
 import org.springframework.transaction.annotation.Transactional;
30 30
 
31 31
 import javax.annotation.Resource;
32
+import java.util.ArrayList;
32 33
 import java.util.Date;
33 34
 import java.util.List;
34 35
 
@@ -63,6 +64,7 @@ public class BizSinQinghaiNewsServiceImpl extends ServiceImpl<BizSinQinghaiNewsM
63 64
             final HtmlPage page=webClient.getPage(newsUrl.getQinghai());    //从指定URL获取HtmlPage
64 65
             List<HtmlParagraph> divList=page.getByXPath("//p[@class='item']");
65 66
             int i = 0;
67
+            List<BizSinQinghaiNews> list1 = new ArrayList<>();
66 68
             for (HtmlParagraph node : divList) {
67 69
                 if (i==8){ //只抓取最近8条数
68 70
                     break;
@@ -73,13 +75,14 @@ public class BizSinQinghaiNewsServiceImpl extends ServiceImpl<BizSinQinghaiNewsM
73 75
                     continue;
74 76
                 }
75 77
                 //先查询是否有相同标题内容,存在跳过
76
-                QueryWrapper<BizSinQinghaiNews> queryWrapper = new QueryWrapper<>();
77
-                queryWrapper.lambda().eq(BizSinQinghaiNews::getTitle,anchor.getAttribute("title"));
78
-                List<BizSinQinghaiNews> list =  this.list(queryWrapper);
79
-                if (!list.isEmpty()){ //表示此内容已经存在
80
-                    continue;
81
-                }
82
-                i++;
78
+//                QueryWrapper<BizSinQinghaiNews> queryWrapper = new QueryWrapper<>();
79
+//                queryWrapper.lambda()
80
+//                        .eq(BizSinQinghaiNews::getTitle,anchor.getAttribute("title"))
81
+//                        .eq(BizSinQinghaiNews::getSource,1); //来源
82
+//                List<BizSinQinghaiNews> list =  this.list(queryWrapper);
83
+//                if (!list.isEmpty()){ //表示此内容已经存在
84
+//                    continue;
85
+//                }
83 86
                 BizSinQinghaiNews bizSinQinghaiNews  = new BizSinQinghaiNews();
84 87
                 //发布日期
85 88
                 String[] string = span.getTextContent().split("]");
@@ -90,11 +93,20 @@ public class BizSinQinghaiNewsServiceImpl extends ServiceImpl<BizSinQinghaiNewsM
90 93
                 bizSinQinghaiNews.setTitle(anchor.getAttribute("title").trim());
91 94
                 bizSinQinghaiNews.setRedirectUrl(anchor.getAttribute("href").trim());
92 95
                 bizSinQinghaiNews.setIsRedirect(1);
96
+                bizSinQinghaiNews.setSource(1); //来源
93 97
                 bizSinQinghaiNews.setStatus(2); //默认使用提交状态
94
-                super.save(bizSinQinghaiNews);
95
-
98
+                list1.add(bizSinQinghaiNews);
99
+                i++;
96 100
             }
101
+            if (list1.size() >0) {
102
+                //清理抓取数据,只保留今天抓取数据
103
+                QueryWrapper<BizSinQinghaiNews> queryWrapper = new QueryWrapper<>();
104
+                queryWrapper.lambda()
105
+                        .eq(BizSinQinghaiNews::getSource, 1); //来源
97 106
 
107
+                this.remove(queryWrapper);
108
+                this.saveOrUpdateBatch(list1);
109
+            }
98 110
         } catch (Exception e) {
99 111
             e.printStackTrace();
100 112
         }finally {

+ 24 - 12
src/main/java/com/unis/news/service/impl/BizSinStateCouncilNoticeServiceImpl.java

@@ -27,6 +27,7 @@ import org.springframework.stereotype.Service;
27 27
 import org.springframework.transaction.annotation.Transactional;
28 28
 
29 29
 import javax.annotation.Resource;
30
+import java.util.ArrayList;
30 31
 import java.util.Date;
31 32
 import java.util.List;
32 33
 
@@ -63,6 +64,7 @@ public class BizSinStateCouncilNoticeServiceImpl extends ServiceImpl<BizSinState
63 64
 
64 65
             List<HtmlHeading4> h4List=page.getByXPath("//h4");
65 66
             int i = 0;
67
+            List<BizSinStateCouncilNotice> list1 = new ArrayList<>();
66 68
             for (HtmlHeading4 node : h4List) {
67 69
                 if (i==8){ //只抓取最近8条数
68 70
                     break;
@@ -72,15 +74,17 @@ public class BizSinStateCouncilNoticeServiceImpl extends ServiceImpl<BizSinState
72 74
                     if (span == null || anchor == null){
73 75
                         continue;
74 76
                     }
75
-                    i++;
76
-                    //先查询是否有相同标题内容,存在跳过
77
-                    String title = anchor.getTextContent().trim();
78
-                    QueryWrapper<BizSinStateCouncilNotice> queryWrapper = new QueryWrapper<>();
79
-                    queryWrapper.lambda().eq(BizSinStateCouncilNotice::getTitle,title);
80
-                    List<BizSinStateCouncilNotice> list =  this.list(queryWrapper);
81
-                    if (!list.isEmpty()){ //表示此内容已经存在
82
-                        continue;
83
-                    }
77
+
78
+                String title = anchor.getTextContent().trim();
79
+                //先查询是否有相同标题内容,存在跳过
80
+//                    QueryWrapper<BizSinStateCouncilNotice> queryWrapper = new QueryWrapper<>();
81
+//                    queryWrapper.lambda()
82
+//                            .eq(BizSinStateCouncilNotice::getTitle,title)
83
+//                            .eq(BizSinStateCouncilNotice::getSource,1); //来源
84
+//                    List<BizSinStateCouncilNotice> list =  this.list(queryWrapper);
85
+//                    if (!list.isEmpty()){ //表示此内容已经存在
86
+//                        continue;
87
+//                    }
84 88
                     BizSinStateCouncilNotice notice  = new BizSinStateCouncilNotice();
85 89
                     //发布日期
86 90
                     Date publishDate = DateUtils.parseDate(span.getTextContent().trim(),"yyyy-MM-dd");
@@ -90,11 +94,19 @@ public class BizSinStateCouncilNoticeServiceImpl extends ServiceImpl<BizSinState
90 94
                     notice.setPublishDate(publishDate);
91 95
                     notice.setRedirectUrl(anchor.getAttribute("href").trim());
92 96
                     notice.setIsRedirect(1);
97
+                    notice.setSource(1); //来源
93 98
                     notice.setStatus(2); //默认使用提交状态
94
-                    super.save(notice);
95
-
99
+                    list1.add(notice);
100
+                    i++;
101
+            }
102
+            //清理抓取数据,只保留今天抓取数据
103
+            if (list1.size() >0){
104
+                QueryWrapper<BizSinStateCouncilNotice> queryWrapper = new QueryWrapper<>();
105
+                queryWrapper.lambda()
106
+                        .eq(BizSinStateCouncilNotice::getSource,1); //来源
107
+                this.remove(queryWrapper);
108
+                this.saveOrUpdateBatch(list1);
96 109
             }
97
-
98 110
 
99 111
         } catch (Exception e) {
100 112
             e.printStackTrace();

+ 1 - 0
src/main/java/com/unis/news/tasksNews/TasksNewsRunning.java

@@ -46,6 +46,7 @@ public class TasksNewsRunning {
46 46
      * 抓取国务院网站数据,每天凌晨3点执行一次
47 47
      */
48 48
     @Scheduled(cron = "0 0 3 * * ?")
49
+//    @Scheduled(cron = "0/20 * * * * ?")
49 50
     public void stateCouncilNoticeun()   {
50 51
         bizSinStateCouncilNoticeService.taskAction();
51 52
         log.info("----抓取国务院网站数据,晚上3点执行一次---");