From 27f08d677282be8b03fc550cf690d52c5daed675 Mon Sep 17 00:00:00 2001
From: 913071727 <913071727@qq.com>
Date: Wed, 18 Sep 2024 13:38:24 +0800
Subject: [PATCH] 1. Create and commit the project.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.idea/.gitignore | 3 +
.idea/inspectionProfiles/Project_Default.xml | 65 +++++
.../inspectionProfiles/profiles_settings.xml | 6 +
.idea/misc.xml | 4 +
.idea/modules.xml | 8 +
.idea/public_sentiment.iml | 8 +
.idea/vcs.xml | 6 +
collector/collector/__init__.py | 0
collector/collector/items.py | 12 +
collector/collector/middlewares.py | 101 +++++++
collector/collector/pipelines.py | 27 ++
collector/collector/settings.py | 117 ++++++++
collector/collector/spiders/__init__.py | 4 +
.../collector/spiders/collector_spider.py | 59 ++++
collector/dbs/default.db | Bin 0 -> 8192 bytes
collector/main.py | 3 +
collector/scrapy.cfg | 11 +
manage.py | 30 ++
public_sentiment/__init__.py | 2 +
public_sentiment/asgi.py | 16 ++
public_sentiment/settings.py | 136 +++++++++
public_sentiment/urls.py | 25 ++
public_sentiment/wsgi.py | 16 ++
scrawl/__init__.py | 0
scrawl/scrapy.cfg | 11 +
scrawl/scrawl/__init__.py | 0
scrawl/scrawl/items.py | 12 +
scrawl/scrawl/middlewares.py | 103 +++++++
scrawl/scrawl/pipelines.py | 13 +
scrawl/scrawl/settings.py | 105 +++++++
scrawl/scrawl/spiders/__init__.py | 4 +
scrawl/scrawl/spiders/weibo_spider.py | 18 ++
script/main.bat | 2 +
script/runserver.bat | 1 +
script/scrapyd-console.bat | 1 +
web/__init__.py | 0
web/admin.py | 3 +
web/apps.py | 6 +
web/constants/__init__.py | 0
web/constants/startup_parameter.py | 12 +
web/controller/__init__.py | 0
web/controller/base_controller.py | 42 +++
web/controller/html_parser_controller.py | 38 +++
web/dao/__init__.py | 0
web/dao/base_dao.py | 157 +++++++++++
web/dao/public_sentiment_comment_dao.py | 13 +
web/dao/training_sensitive_word_dao.py | 20 ++
web/dto/__init__.py | 0
web/dto/api_result.py | 33 +++
web/dto/service_result.py | 29 ++
web/enum/__init__.py | 0
web/enum/api_result_enum.py | 19 ++
web/enum/service_result_enum.py | 43 +++
web/handler/__init__.py | 0
web/handler/base_handler.py | 13 +
web/handler/crawl_data_handler.py | 23 ++
web/handler/html_parser_handler.py | 51 ++++
web/manager/__init__.py | 0
web/manager/gridgraph_manager.py | 258 ++++++++++++++++++
web/manager/log_manager.py | 47 ++++
web/manager/snowflake_manager.py | 54 ++++
web/migrations/__init__.py | 0
web/models.py | 3 +
web/models/__init__.py | 6 +
web/models/public_sentiment_comment.py | 30 ++
web/models/public_sentiment_source.py | 25 ++
web/models/training_sensitive_word.py | 25 ++
web/service/__init__.py | 0
web/service/base_service.py | 15 +
.../public_sentiment_comment_service.py | 31 +++
.../training_sensitive_word_service.py | 24 ++
web/spider/__init__.py | 0
web/spider/base_spider.py | 13 +
web/task/__init__.py | 0
web/task/base_task.py | 14 +
web/task/crawl_data_task.py | 23 ++
web/tests.py | 3 +
web/util/__init__.py | 0
web/util/dto_util.py | 19 ++
web/util/re_util.py | 19 ++
web/views.py | 2 +
web/vo/__init__.py | 0
web/vo/parse_html_vo.py | 13 +
83 files changed, 2055 insertions(+)
create mode 100644 .idea/.gitignore
create mode 100644 .idea/inspectionProfiles/Project_Default.xml
create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
create mode 100644 .idea/misc.xml
create mode 100644 .idea/modules.xml
create mode 100644 .idea/public_sentiment.iml
create mode 100644 .idea/vcs.xml
create mode 100644 collector/collector/__init__.py
create mode 100644 collector/collector/items.py
create mode 100644 collector/collector/middlewares.py
create mode 100644 collector/collector/pipelines.py
create mode 100644 collector/collector/settings.py
create mode 100644 collector/collector/spiders/__init__.py
create mode 100644 collector/collector/spiders/collector_spider.py
create mode 100644 collector/dbs/default.db
create mode 100644 collector/main.py
create mode 100644 collector/scrapy.cfg
create mode 100644 manage.py
create mode 100644 public_sentiment/__init__.py
create mode 100644 public_sentiment/asgi.py
create mode 100644 public_sentiment/settings.py
create mode 100644 public_sentiment/urls.py
create mode 100644 public_sentiment/wsgi.py
create mode 100644 scrawl/__init__.py
create mode 100644 scrawl/scrapy.cfg
create mode 100644 scrawl/scrawl/__init__.py
create mode 100644 scrawl/scrawl/items.py
create mode 100644 scrawl/scrawl/middlewares.py
create mode 100644 scrawl/scrawl/pipelines.py
create mode 100644 scrawl/scrawl/settings.py
create mode 100644 scrawl/scrawl/spiders/__init__.py
create mode 100644 scrawl/scrawl/spiders/weibo_spider.py
create mode 100644 script/main.bat
create mode 100644 script/runserver.bat
create mode 100644 script/scrapyd-console.bat
create mode 100644 web/__init__.py
create mode 100644 web/admin.py
create mode 100644 web/apps.py
create mode 100644 web/constants/__init__.py
create mode 100644 web/constants/startup_parameter.py
create mode 100644 web/controller/__init__.py
create mode 100644 web/controller/base_controller.py
create mode 100644 web/controller/html_parser_controller.py
create mode 100644 web/dao/__init__.py
create mode 100644 web/dao/base_dao.py
create mode 100644 web/dao/public_sentiment_comment_dao.py
create mode 100644 web/dao/training_sensitive_word_dao.py
create mode 100644 web/dto/__init__.py
create mode 100644 web/dto/api_result.py
create mode 100644 web/dto/service_result.py
create mode 100644 web/enum/__init__.py
create mode 100644 web/enum/api_result_enum.py
create mode 100644 web/enum/service_result_enum.py
create mode 100644 web/handler/__init__.py
create mode 100644 web/handler/base_handler.py
create mode 100644 web/handler/crawl_data_handler.py
create mode 100644 web/handler/html_parser_handler.py
create mode 100644 web/manager/__init__.py
create mode 100644 web/manager/gridgraph_manager.py
create mode 100644 web/manager/log_manager.py
create mode 100644 web/manager/snowflake_manager.py
create mode 100644 web/migrations/__init__.py
create mode 100644 web/models.py
create mode 100644 web/models/__init__.py
create mode 100644 web/models/public_sentiment_comment.py
create mode 100644 web/models/public_sentiment_source.py
create mode 100644 web/models/training_sensitive_word.py
create mode 100644 web/service/__init__.py
create mode 100644 web/service/base_service.py
create mode 100644 web/service/public_sentiment_comment_service.py
create mode 100644 web/service/training_sensitive_word_service.py
create mode 100644 web/spider/__init__.py
create mode 100644 web/spider/base_spider.py
create mode 100644 web/task/__init__.py
create mode 100644 web/task/base_task.py
create mode 100644 web/task/crawl_data_task.py
create mode 100644 web/tests.py
create mode 100644 web/util/__init__.py
create mode 100644 web/util/dto_util.py
create mode 100644 web/util/re_util.py
create mode 100644 web/views.py
create mode 100644 web/vo/__init__.py
create mode 100644 web/vo/parse_html_vo.py
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..72d1229
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,65 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..d1e22ec
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..89d928d
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/public_sentiment.iml b/.idea/public_sentiment.iml
new file mode 100644
index 0000000..9b31378
--- /dev/null
+++ b/.idea/public_sentiment.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/collector/collector/__init__.py b/collector/collector/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/collector/collector/items.py b/collector/collector/items.py
new file mode 100644
index 0000000..db16f10
--- /dev/null
+++ b/collector/collector/items.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import scrapy
+
+
+class SensitiveWordItem(scrapy.Item):
+ """
+    Comment
+ """
+
+ sensitive_word = scrapy.Field()
diff --git a/collector/collector/middlewares.py b/collector/collector/middlewares.py
new file mode 100644
index 0000000..7c9dec3
--- /dev/null
+++ b/collector/collector/middlewares.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class CollectorSpiderMiddleware:
+ # Not all methods need to be defined. If a method is not defined,
+ # scrapy acts as if the spider middleware does not modify the
+ # passed objects.
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ # This method is used by Scrapy to create your spiders.
+ s = cls()
+ crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+ return s
+
+ def process_spider_input(self, response, spider):
+ # Called for each response that goes through the spider
+ # middleware and into the spider.
+
+ # Should return None or raise an exception.
+ return None
+
+ def process_spider_output(self, response, result, spider):
+ # Called with the results returned from the Spider, after
+ # it has processed the response.
+
+ # Must return an iterable of Request, or item objects.
+ for i in result:
+ yield i
+
+ def process_spider_exception(self, response, exception, spider):
+ # Called when a spider or process_spider_input() method
+ # (from other spider middleware) raises an exception.
+
+ # Should return either None or an iterable of Request or item objects.
+ pass
+
+ def process_start_requests(self, start_requests, spider):
+ # Called with the start requests of the spider, and works
+ # similarly to the process_spider_output() method, except
+ # that it doesn’t have a response associated.
+
+ # Must return only requests (not items).
+ for r in start_requests:
+ yield r
+
+ def spider_opened(self, spider):
+ spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class CollectorDownloaderMiddleware:
+ # Not all methods need to be defined. If a method is not defined,
+ # scrapy acts as if the downloader middleware does not modify the
+ # passed objects.
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ # This method is used by Scrapy to create your spiders.
+ s = cls()
+ crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+ return s
+
+ def process_request(self, request, spider):
+ # Called for each request that goes through the downloader
+ # middleware.
+
+ # Must either:
+ # - return None: continue processing this request
+ # - or return a Response object
+ # - or return a Request object
+ # - or raise IgnoreRequest: process_exception() methods of
+ # installed downloader middleware will be called
+ return None
+
+ def process_response(self, request, response, spider):
+ # Called with the response returned from the downloader.
+
+ # Must either;
+ # - return a Response object
+ # - return a Request object
+ # - or raise IgnoreRequest
+ return response
+
+ def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+ # (from other downloader middleware) raises an exception.
+
+ # Must either:
+ # - return None: continue processing this exception
+ # - return a Response object: stops process_exception() chain
+ # - return a Request object: stops process_exception() chain
+ pass
+
+ def spider_opened(self, spider):
+ spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/collector/collector/pipelines.py b/collector/collector/pipelines.py
new file mode 100644
index 0000000..ff5522b
--- /dev/null
+++ b/collector/collector/pipelines.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+from web.models import PublicSentimentComment
+from web.manager.log_manager import LogManager
+from web.service.public_sentiment_comment_service import PublicSentimentCommentService
+
+Logger = LogManager.get_logger(__name__)
+
+
+class CollectorPipeline(object):
+
+ def __init__(self):
+ super().__init__()
+
+ def process_item(self, item, spider):
+ """
+        Store the item in the database
+ """
+
+ public_sentiment_comment = PublicSentimentComment()
+ public_sentiment_comment.content = item['sensitive_word']
+
+ public_sentiment_comment_service = PublicSentimentCommentService()
+ public_sentiment_comment_service.save(public_sentiment_comment)
+ return item
diff --git a/collector/collector/settings.py b/collector/collector/settings.py
new file mode 100644
index 0000000..25e2f6e
--- /dev/null
+++ b/collector/collector/settings.py
@@ -0,0 +1,117 @@
+# Scrapy settings for collector project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+# https://docs.scrapy.org/en/latest/topics/settings.html
+# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = "collector"
+
+SPIDER_MODULES = ["collector.spiders"]
+NEWSPIDER_MODULE = "collector.spiders"
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+# USER_AGENT = "collector (+http://www.yourdomain.com)"
+
+# Obey robots.txt rules
+# ROBOTSTXT_OBEY = True
+# Defaults to True; changed to False here
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+# CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+# DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+# COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+# TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+# DEFAULT_REQUEST_HEADERS = {
+# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+# "Accept-Language": "en",
+# }
+
+DEFAULT_REQUEST_HEADERS = {
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+ 'Accept-Language': 'en',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
+    'Cookie': 'adb_isBlock=0; userid=1652710683278_ihrfq92084; prov=cn0731; city=0732; weather_city=hn_xt; region_ip=110.53.149.x; region_ver=1.2; wxIsclose=false; ifengRotator_iis3=6; ifengWindowCookieName_919=1'
+    # Commented out by default, but this matters: without a browser-like User-Agent the request is easily flagged as a bot; even a plain Mozilla/5.0 will do
+}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+# "collector.middlewares.CollectorSpiderMiddleware": 543,
+# }
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# DOWNLOADER_MIDDLEWARES = {
+# "collector.middlewares.CollectorDownloaderMiddleware": 543,
+# }
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
+# "scrapy.extensions.telnet.TelnetConsole": None,
+# }
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+# ITEM_PIPELINES = {
+# "collector.pipelines.CollectorPipeline": 300,
+# }
+# Item pipelines; the smaller the number, the higher the priority
+ITEM_PIPELINES = {
+ 'collector.pipelines.CollectorPipeline': 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+# AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+# AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+# AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = "httpcache"
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEED_EXPORT_ENCODING = "utf-8"
+
+######################################### Everything below is custom ########################################
+
+import os, django
+import sys
+
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(BASE_DIR)
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "public_sentiment.settings")
+os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
+django.setup()
diff --git a/collector/collector/spiders/__init__.py b/collector/collector/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/collector/collector/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/collector/collector/spiders/collector_spider.py b/collector/collector/spiders/collector_spider.py
new file mode 100644
index 0000000..bdc274b
--- /dev/null
+++ b/collector/collector/spiders/collector_spider.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import re
+from typing import Optional, Any
+import scrapy
+
+from collector.items import SensitiveWordItem
+from web.manager.log_manager import LogManager
+from web.spider.base_spider import BaseSpider
+from web.util.re_util import ReUtil
+
+Logger = LogManager.get_logger(__name__)
+
+
+class CollectorSpider(scrapy.Spider, BaseSpider):
+ """
+    Crawl data from Weibo
+ """
+
+ name = "collector-spider"
+ allowed_domains = ["s.weibo.com"]
+ # start_urls = ["https://s.weibo.com/weibo?q=%E5%8C%97%E4%BA%AC%E5%B7%A5%E5%95%86%E5%A4%A7%E5%AD%A6&nodup=1&page=5"]
+ start_urls = ["https://xm.buyiju.com/ceming/129803-zajo.html"]
+ # url = 'https://xm.buyiju.com/ceming/129803-zajo.html'
+
+ def __init__(self, name: Optional[str] = None, **kwargs: Any):
+        scrapy.Spider.__init__(self, name, **kwargs)
+ BaseSpider.__init__(self)
+
+ # def start_requests(self):
+ # yield scrapy.Request(url=self.url, callback=self.parse)
+
+ def parse(self, response):
+
+        Logger.info('Crawling data from Weibo')
+ # 返回的html
+ text = response.text
+
+        # Fetch the sensitive words and join them into a single regex pattern separated by |
+ training_sensitive_word_list = self.training_sensitive_word_service.find_all()
+ temp_training_sensitive_word_list = list(map(lambda training_sensitive_word: str(training_sensitive_word.word), training_sensitive_word_list))
+ match_str = '.+|.+'.join(temp_training_sensitive_word_list)
+
+        # Strip the HTML tags from the response
+ text_without_html = ReUtil.clear_html(text)
+ text_without_html_list = text_without_html.split('\n')
+
+        # Match
+ is_match = False
+ sensitive_word_item = SensitiveWordItem()
+ for item in text_without_html_list:
+ match = re.match(match_str, item)
+ if match:
+ sensitive_word_item['sensitive_word'] = match.group()
+ is_match = True
+ break
+ if is_match:
+ yield sensitive_word_item
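
For reference, the pattern that '.+|.+'.join(...) builds in parse() looks like this; a small illustration with hypothetical sensitive words, run outside the spider:

    words = ['spam', 'scam']             # hypothetical sensitive words
    match_str = '.+|.+'.join(words)      # -> 'spam.+|.+scam'
    # re.match(match_str, line) then matches lines that start with 'spam'
    # followed by at least one more character, or lines that contain 'scam'
    # preceded by at least one character.
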
diff --git a/collector/dbs/default.db b/collector/dbs/default.db
new file mode 100644
index 0000000000000000000000000000000000000000..ff5ee9baf89859216639f3b97020034d6c3af1af
GIT binary patch
literal 8192
zcmeI#JqyAx5C-6j2u`A#>n(zUxYQqDm2NJ6?2_VHf~~D-3U+kycbg)DPVV}??@s<85P$##AOHaf
zKmY;|fB*y_0D(UixT?-zI8?X5={S|LFbA>wOC{CEyS3az;V$A`xL8H}KE)$*U~_ho
zh~>hXOxv1IQcs#Dx5m|MMW@fNOv+Lx!f~3%<7PMaOFseu2tWV=5P$##AOHafKmY;|
JfWQw6JOJh#GJ^mB
literal 0
HcmV?d00001
diff --git a/collector/main.py b/collector/main.py
new file mode 100644
index 0000000..0691656
--- /dev/null
+++ b/collector/main.py
@@ -0,0 +1,3 @@
+from scrapy.cmdline import execute
+
+execute('scrapy crawl collector-spider'.split())
diff --git a/collector/scrapy.cfg b/collector/scrapy.cfg
new file mode 100644
index 0000000..d9ad045
--- /dev/null
+++ b/collector/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = collector.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = collector
diff --git a/manage.py b/manage.py
new file mode 100644
index 0000000..7f75b1e
--- /dev/null
+++ b/manage.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import django
+from web.manager.log_manager import LogManager
+
+sys.path.append(r"web")
+sys.path.append(r"collector")
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'public_sentiment.settings')
+django.setup()
+
+Logger = LogManager.get_logger(__name__)
+
+if __name__ == '__main__':
+
+    Logger.info("Starting the server")
+
+ try:
+ from django.core.management import execute_from_command_line
+ except ImportError as exc:
+ raise ImportError(
+ "Couldn't import Django. Are you sure it's installed and "
+ "available on your PYTHONPATH environment variable? Did you "
+ "forget to activate a virtual environment?"
+ ) from exc
+
+ execute_from_command_line(sys.argv)
diff --git a/public_sentiment/__init__.py b/public_sentiment/__init__.py
new file mode 100644
index 0000000..c45523b
--- /dev/null
+++ b/public_sentiment/__init__.py
@@ -0,0 +1,2 @@
+import pymysql
+pymysql.install_as_MySQLdb()
\ No newline at end of file
diff --git a/public_sentiment/asgi.py b/public_sentiment/asgi.py
new file mode 100644
index 0000000..d5b43f7
--- /dev/null
+++ b/public_sentiment/asgi.py
@@ -0,0 +1,16 @@
+"""
+ASGI config for public_sentiment project.
+
+It exposes the ASGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
+"""
+
+import os
+
+from django.core.asgi import get_asgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'public_sentiment.settings')
+
+application = get_asgi_application()
diff --git a/public_sentiment/settings.py b/public_sentiment/settings.py
new file mode 100644
index 0000000..913d67f
--- /dev/null
+++ b/public_sentiment/settings.py
@@ -0,0 +1,136 @@
+"""
+Django settings for public_sentiment project.
+
+Generated by 'django-admin startproject' using Django 4.2.16.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/4.2/ref/settings/
+"""
+
+from pathlib import Path
+
+# Build paths inside the project like this: BASE_DIR / 'subdir'.
+BASE_DIR = Path(__file__).resolve().parent.parent
+
+# Quick-start development settings - unsuitable for production
+# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = 'django-insecure-!*ar1k^h=h^*azpzf3sabuf4w5m)vo^aev0l6c@6qfcdh73%ze'
+
+# SECURITY WARNING: don't run with debug turned on in production!
+DEBUG = True
+
+ALLOWED_HOSTS = []
+
+# Application definition
+
+INSTALLED_APPS = [
+ 'django.contrib.admin',
+ 'django.contrib.auth',
+ 'django.contrib.contenttypes',
+ 'django.contrib.sessions',
+ 'django.contrib.messages',
+ 'django.contrib.staticfiles',
+ 'web',
+]
+
+MIDDLEWARE = [
+ 'django.middleware.security.SecurityMiddleware',
+ 'django.contrib.sessions.middleware.SessionMiddleware',
+ 'django.middleware.common.CommonMiddleware',
+ 'django.middleware.csrf.CsrfViewMiddleware',
+ 'django.contrib.auth.middleware.AuthenticationMiddleware',
+ 'django.contrib.messages.middleware.MessageMiddleware',
+ 'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+ROOT_URLCONF = 'public_sentiment.urls'
+
+TEMPLATES = [
+ {
+ 'BACKEND': 'django.template.backends.django.DjangoTemplates',
+ 'DIRS': [],
+ 'APP_DIRS': True,
+ 'OPTIONS': {
+ 'context_processors': [
+ 'django.template.context_processors.debug',
+ 'django.template.context_processors.request',
+ 'django.contrib.auth.context_processors.auth',
+ 'django.contrib.messages.context_processors.messages',
+ ],
+ },
+ },
+]
+
+WSGI_APPLICATION = 'public_sentiment.wsgi.application'
+
+# Database
+# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
+
+DATABASES = {
+ 'default': {
+        'ENGINE': 'django.db.backends.mysql',  # database backend
+        'NAME': 'base_platform',  # database to connect to
+        'HOST': '127.0.0.1',  # MySQL host
+        'PORT': 3306,  # MySQL port
+        'USER': 'root',  # MySQL username
+        'PASSWORD': '123456',  # MySQL password
+ }
+}
+
+# Password validation
+# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+ {
+ 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+ },
+ {
+ 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+ },
+ {
+ 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+ },
+ {
+ 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+ },
+]
+
+# Internationalization
+# https://docs.djangoproject.com/en/4.2/topics/i18n/
+
+LANGUAGE_CODE = 'en-us'
+
+# TIME_ZONE = 'UTC'  # superseded by 'Asia/Shanghai' below
+
+USE_I18N = True
+
+USE_L10N = True
+
+# USE_TZ = True
+USE_TZ = False
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/4.2/howto/static-files/
+
+STATIC_URL = 'static/'
+
+# Default primary key field type
+# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
+
+DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
+
+# Time zone
+TIME_ZONE = 'Asia/Shanghai'
+
+# GridGraph (Gremlin server) connection settings
+GRID_GRAPH = {
+ 'url': 'ws://192.168.3.18:8182/gremlin',
+ 'traversal_source': 'gmodern100M',
+ 'username': 'admin',
+ 'password': 'admin'
+}
diff --git a/public_sentiment/urls.py b/public_sentiment/urls.py
new file mode 100644
index 0000000..d5694b0
--- /dev/null
+++ b/public_sentiment/urls.py
@@ -0,0 +1,25 @@
+"""
+URL configuration for public_sentiment project.
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+ https://docs.djangoproject.com/en/4.2/topics/http/urls/
+Examples:
+Function views
+ 1. Add an import: from my_app import views
+ 2. Add a URL to urlpatterns: path('', views.home, name='home')
+Class-based views
+ 1. Add an import: from other_app.views import Home
+ 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
+Including another URLconf
+ 1. Import the include() function: from django.urls import include, path
+ 2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
+"""
+from django.contrib import admin
+from django.urls import path
+
+from web.controller.html_parser_controller import parse_html
+
+urlpatterns = [
+ path('admin/', admin.site.urls),
+ path('api/v1/htmlParser/parseHtml', parse_html),
+]
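
With the route above and the development server from script/runserver.bat listening on port 9000, the parseHtml endpoint can be exercised with a small client. A minimal sketch, assuming ParseHtmlVo carries a single url field (the controller reads parse_html_vo.url) and using a placeholder page:

    import requests

    resp = requests.post(
        'http://127.0.0.1:9000/api/v1/htmlParser/parseHtml',
        json={'url': 'https://example.com/some-page'},  # assumed request body shape
    )
    print(resp.json())  # ApiResult-style dict: success, code, data, message
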
diff --git a/public_sentiment/wsgi.py b/public_sentiment/wsgi.py
new file mode 100644
index 0000000..36bb2a2
--- /dev/null
+++ b/public_sentiment/wsgi.py
@@ -0,0 +1,16 @@
+"""
+WSGI config for public_sentiment project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'public_sentiment.settings')
+
+application = get_wsgi_application()
diff --git a/scrawl/__init__.py b/scrawl/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrawl/scrapy.cfg b/scrawl/scrapy.cfg
new file mode 100644
index 0000000..5bdf38f
--- /dev/null
+++ b/scrawl/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = scrawl.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = scrawl
diff --git a/scrawl/scrawl/__init__.py b/scrawl/scrawl/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrawl/scrawl/items.py b/scrawl/scrawl/items.py
new file mode 100644
index 0000000..3246248
--- /dev/null
+++ b/scrawl/scrawl/items.py
@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class ScrawlItem(scrapy.Item):
+ # define the fields for your item here like:
+ # name = scrapy.Field()
+ pass
diff --git a/scrawl/scrawl/middlewares.py b/scrawl/scrawl/middlewares.py
new file mode 100644
index 0000000..cb087d4
--- /dev/null
+++ b/scrawl/scrawl/middlewares.py
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class ScrawlSpiderMiddleware:
+ # Not all methods need to be defined. If a method is not defined,
+ # scrapy acts as if the spider middleware does not modify the
+ # passed objects.
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ # This method is used by Scrapy to create your spiders.
+ s = cls()
+ crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+ return s
+
+ def process_spider_input(self, response, spider):
+ # Called for each response that goes through the spider
+ # middleware and into the spider.
+
+ # Should return None or raise an exception.
+ return None
+
+ def process_spider_output(self, response, result, spider):
+ # Called with the results returned from the Spider, after
+ # it has processed the response.
+
+ # Must return an iterable of Request, or item objects.
+ for i in result:
+ yield i
+
+ def process_spider_exception(self, response, exception, spider):
+ # Called when a spider or process_spider_input() method
+ # (from other spider middleware) raises an exception.
+
+ # Should return either None or an iterable of Request or item objects.
+ pass
+
+ def process_start_requests(self, start_requests, spider):
+ # Called with the start requests of the spider, and works
+ # similarly to the process_spider_output() method, except
+ # that it doesn’t have a response associated.
+
+ # Must return only requests (not items).
+ for r in start_requests:
+ yield r
+
+ def spider_opened(self, spider):
+ spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class ScrawlDownloaderMiddleware:
+ # Not all methods need to be defined. If a method is not defined,
+ # scrapy acts as if the downloader middleware does not modify the
+ # passed objects.
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ # This method is used by Scrapy to create your spiders.
+ s = cls()
+ crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+ return s
+
+ def process_request(self, request, spider):
+ # Called for each request that goes through the downloader
+ # middleware.
+
+ # Must either:
+ # - return None: continue processing this request
+ # - or return a Response object
+ # - or return a Request object
+ # - or raise IgnoreRequest: process_exception() methods of
+ # installed downloader middleware will be called
+ return None
+
+ def process_response(self, request, response, spider):
+ # Called with the response returned from the downloader.
+
+ # Must either;
+ # - return a Response object
+ # - return a Request object
+ # - or raise IgnoreRequest
+ return response
+
+ def process_exception(self, request, exception, spider):
+ # Called when a download handler or a process_request()
+ # (from other downloader middleware) raises an exception.
+
+ # Must either:
+ # - return None: continue processing this exception
+ # - return a Response object: stops process_exception() chain
+ # - return a Request object: stops process_exception() chain
+ pass
+
+ def spider_opened(self, spider):
+ spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/scrawl/scrawl/pipelines.py b/scrawl/scrawl/pipelines.py
new file mode 100644
index 0000000..b08c3ce
--- /dev/null
+++ b/scrawl/scrawl/pipelines.py
@@ -0,0 +1,13 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class ScrawlPipeline:
+ def process_item(self, item, spider):
+ return item
diff --git a/scrawl/scrawl/settings.py b/scrawl/scrawl/settings.py
new file mode 100644
index 0000000..110a02c
--- /dev/null
+++ b/scrawl/scrawl/settings.py
@@ -0,0 +1,105 @@
+# Scrapy settings for scrawl project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+# https://docs.scrapy.org/en/latest/topics/settings.html
+# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = "scrawl"
+
+SPIDER_MODULES = ["scrawl.spiders"]
+NEWSPIDER_MODULE = "scrawl.spiders"
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+# USER_AGENT = "scrawl (+http://www.yourdomain.com)"
+
+# Obey robots.txt rules
+# ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+# CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+# DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+# COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+# TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+# DEFAULT_REQUEST_HEADERS = {
+# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+# "Accept-Language": "en",
+# }
+DEFAULT_REQUEST_HEADERS = {
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+ 'Accept-Language': 'en',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
+    'Cookie': 'adb_isBlock=0; userid=1652710683278_ihrfq92084; prov=cn0731; city=0732; weather_city=hn_xt; region_ip=110.53.149.x; region_ver=1.2; wxIsclose=false; ifengRotator_iis3=6; ifengWindowCookieName_919=1'
+    # Commented out by default, but this matters: without a browser-like User-Agent the request is easily flagged as a bot; even a plain Mozilla/5.0 will do
+}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+# "scrawl.middlewares.ScrawlSpiderMiddleware": 543,
+# }
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# DOWNLOADER_MIDDLEWARES = {
+# "scrawl.middlewares.ScrawlDownloaderMiddleware": 543,
+# }
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
+# "scrapy.extensions.telnet.TelnetConsole": None,
+# }
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+# ITEM_PIPELINES = {
+# "scrawl.pipelines.ScrawlPipeline": 300,
+# }
+# ITEM_PIPELINES: item pipelines; 300 is the priority, and lower values run first
+ITEM_PIPELINES = {
+ 'scrawl.pipelines.ScrawlPipeline': 300,
+    # 'subeiNews.pipelines.SubeinewsMysqlPipeline': 200,  # pipeline that stores the data
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+# AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+# AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+# AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = "httpcache"
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEED_EXPORT_ENCODING = "utf-8"
diff --git a/scrawl/scrawl/spiders/__init__.py b/scrawl/scrawl/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/scrawl/scrawl/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/scrawl/scrawl/spiders/weibo_spider.py b/scrawl/scrawl/spiders/weibo_spider.py
new file mode 100644
index 0000000..5a69700
--- /dev/null
+++ b/scrawl/scrawl/spiders/weibo_spider.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import scrapy
+
+sys.path.append(r"scrawl")
+from scrawl.items import ScrawlItem
+
+
+class WeiboSpiderSpider(scrapy.Spider):
+ name = "weibo_spider"
+ allowed_domains = ["s.weibo.com"]
+ start_urls = ["https://s.weibo.com/weibo?q=%E5%8C%97%E4%BA%AC%E5%B7%A5%E5%95%86%E5%A4%A7%E5%AD%A6&nodup=1&page=5"]
+
+ def parse(self, response):
+ for con in response.xpath('//*[@id="pl_feedlist_index"]/div/div'):
+ scraw_item = ScrawlItem()
diff --git a/script/main.bat b/script/main.bat
new file mode 100644
index 0000000..e822946
--- /dev/null
+++ b/script/main.bat
@@ -0,0 +1,2 @@
+cd C:/mywork/workspace/public_sentiment/collector
+scrapy crawl collector-spider
\ No newline at end of file
diff --git a/script/runserver.bat b/script/runserver.bat
new file mode 100644
index 0000000..01b5c27
--- /dev/null
+++ b/script/runserver.bat
@@ -0,0 +1 @@
+C:\mywork\dev-env\python\Python38\python.exe C:\mywork\workspace\public_sentiment\manage.py runserver 9000
diff --git a/script/scrapyd-console.bat b/script/scrapyd-console.bat
new file mode 100644
index 0000000..56516c4
--- /dev/null
+++ b/script/scrapyd-console.bat
@@ -0,0 +1 @@
+scrapyd
\ No newline at end of file
diff --git a/web/__init__.py b/web/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/admin.py b/web/admin.py
new file mode 100644
index 0000000..8c38f3f
--- /dev/null
+++ b/web/admin.py
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
diff --git a/web/apps.py b/web/apps.py
new file mode 100644
index 0000000..682e923
--- /dev/null
+++ b/web/apps.py
@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class WebConfig(AppConfig):
+ default_auto_field = 'django.db.models.BigAutoField'
+ name = 'web'
diff --git a/web/constants/__init__.py b/web/constants/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/constants/startup_parameter.py b/web/constants/startup_parameter.py
new file mode 100644
index 0000000..15d2c67
--- /dev/null
+++ b/web/constants/startup_parameter.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+"""
+Startup parameters for the system
+"""
+
+
+class StartupParameter:
+    # Crawl data
+ Crawl_Data = 'crawl_data'
diff --git a/web/controller/__init__.py b/web/controller/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/controller/base_controller.py b/web/controller/base_controller.py
new file mode 100644
index 0000000..8a381f7
--- /dev/null
+++ b/web/controller/base_controller.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import json
+import sys
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+from web.handler.html_parser_handler import HtmlParserHandler
+
+sys.path.append(r"collector")
+
+from collector.settings import ITEM_PIPELINES
+
+
+class BaseController:
+ """
+    Base class for the controller layer
+ """
+
+ def __init__(self):
+ self.html_parser_handler = HtmlParserHandler()
+
+ def to_vo(self, request, clazz):
+ """
+        Convert the JSON request body into a VO object
+ """
+ raw_data = request.body.decode("utf-8")
+ json_data_dict = json.loads(raw_data)
+ obj = clazz(**json_data_dict)
+ return obj
+
+ def start_scrawl(self, spider):
+ """
+        Start the crawler
+ """
+
+        # get_project_settings() does not pick up the configuration from settings.py here, so ITEM_PIPELINES is imported and applied explicitly
+ settings = get_project_settings()
+ settings['ITEM_PIPELINES'] = ITEM_PIPELINES
+ process = CrawlerProcess(settings)
+ process.crawl(spider)
+ process.start()
diff --git a/web/controller/html_parser_controller.py b/web/controller/html_parser_controller.py
new file mode 100644
index 0000000..ddd9529
--- /dev/null
+++ b/web/controller/html_parser_controller.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import json
+from collections import namedtuple
+from django.http import JsonResponse
+from rest_framework.decorators import api_view
+from collector.spiders.collector_spider import CollectorSpider
+from web.controller.base_controller import BaseController
+from web.dto.api_result import ApiResult
+from web.manager.gridgraph_manager import GridGraphManager
+from web.manager.log_manager import LogManager
+from web.util.dto_util import DtoUtil
+from web.vo.parse_html_vo import ParseHtmlVo
+
+Logger = LogManager.get_logger(__name__)
+
+base_controller = BaseController()
+
+
+@api_view(['POST'])
+def parse_html(request):
+ """
+    Parse HTML
+ """
+
+    Logger.info("Start parsing HTML")
+
+ parse_html_vo = base_controller.to_vo(request, ParseHtmlVo)
+ service_result = base_controller.html_parser_handler.parse_html(parse_html_vo.url)
+
+ # grid_graph_manager = GridGraphManager()
+ # list = grid_graph_manager.query_vertex(label='person')
+
+ # base_controller.start_scrawl(CollectorSpider)
+
+ return JsonResponse(DtoUtil.service_result_to_api_result(service_result), safe=False)
diff --git a/web/dao/__init__.py b/web/dao/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/dao/base_dao.py b/web/dao/base_dao.py
new file mode 100644
index 0000000..512acb1
--- /dev/null
+++ b/web/dao/base_dao.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+from django.db.models.query import QuerySet
+from django.db import models
+
+from web.manager.snowflake_manager import SnowflakeManager
+
+
+class BaseDao:
+ """
+    Base class for DAO objects
+ """
+
+    # Subclasses must override this
+ model_class = models.Model
+ save_batch_size = 1000
+
+ snowflake_manager = SnowflakeManager()
+
+ def save(self, obj):
+ """
+        Insert
+ """
+
+ if not obj:
+ return False
+ obj.id = self.snowflake_manager.next_id()
+ obj.create_time = datetime.now()
+ obj.save()
+ return True
+
+ def save_batch(self, objs, *, batch_size=save_batch_size):
+ """
+        Batch insert
+ """
+
+ if not objs:
+ return False
+ for obj in objs:
+            obj.id = self.snowflake_manager.next_id()
+ self.model_class.objects.bulk_create(objs, batch_size=batch_size)
+ return True
+
+ def delete(self, obj):
+ """
+        Delete
+ """
+
+ if not obj:
+ return False
+ obj.delete()
+ return True
+
+ def delete_batch(self, objs):
+ """
+        Batch delete
+ """
+
+ if not objs:
+ return False
+ for obj in objs:
+ self.delete(obj)
+ return True
+
+ def delete_batch_by_query(self, filter_kw: dict, exclude_kw: dict):
+ """
+        Batch delete by conditions
+ """
+
+ self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw).delete()
+ return True
+
+ def delete_by_fake(self, obj):
+ """
+        Soft delete (mark as deleted)
+ """
+
+ if obj is None:
+ return False
+ obj.is_deleted = True
+ obj.save()
+ return True
+
+ def update(self, obj):
+ """
+        Update
+ """
+
+ if not obj:
+ return False
+ obj.save()
+ return True
+
+ def update_batch(self, objs):
+ """
+        Batch update
+ """
+
+ if not objs:
+ return False
+ for obj in objs:
+ self.update(obj)
+ return True
+
+ def update_batch_by_query(self, query_kwargs: dict, exclude_kw: dict, newattrs_kwargs: dict):
+ """
+        Batch update by conditions
+ """
+
+ self.model_class.objects.filter(**query_kwargs).exclude(**exclude_kw).update(**newattrs_kwargs)
+
+ def find_one(self, filter_kw: dict, exclude_kw: dict, order_bys: list):
+ """
+        Return a single record matching the conditions
+ """
+
+ qs = self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw)
+ if order_bys:
+ qs = qs.order_by(*order_bys)
+ return qs.first()
+
+ def find_queryset(self, filter_kw: dict, exclude_kw: dict, order_bys: list) -> QuerySet:
+ """
+        Return a QuerySet matching the conditions
+        """
+        query_set = self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw)
+        if order_bys:
+            # apply all ordering fields in a single order_by call; chained calls
+            # would each reset the previous ordering
+            query_set = query_set.order_by(*order_bys)
+        return query_set
+
+ def find_list(self, filter_kw: dict, exclude_kw: dict, order_bys: list) -> list:
+ """
+        Return a list of objects matching the conditions
+ """
+
+ queryset = self.find_queryset(filter_kw, exclude_kw, order_bys)
+ model_instances = [model for model in queryset]
+ return model_instances
+
+ def is_exists(self, filter_kw: dict, exclude_kw: dict) -> bool:
+ """
+        Whether any record matches the conditions
+ """
+
+ return self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw).exists()
+
+ def get_count(self, filter_kw: dict, exclude_kw: dict) -> int:
+ """
+        Count the records matching the conditions
+ """
+
+ return self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw).count()
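
The filter_kw/exclude_kw/order_bys arguments map straight onto the Django ORM. A minimal usage sketch with the TrainingSensitiveWordDao defined later in this patch, assuming the model exposes the is_deleted and create_time columns that BaseDao touches:

    from web.dao.training_sensitive_word_dao import TrainingSensitiveWordDao

    dao = TrainingSensitiveWordDao()
    # words that are not soft-deleted, newest first (field names assumed)
    words = dao.find_list({'is_deleted': False}, {}, ['-create_time'])
    total = dao.get_count({}, {})
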
diff --git a/web/dao/public_sentiment_comment_dao.py b/web/dao/public_sentiment_comment_dao.py
new file mode 100644
index 0000000..36b936c
--- /dev/null
+++ b/web/dao/public_sentiment_comment_dao.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from web.dao.base_dao import BaseDao
+from web.models import PublicSentimentComment
+
+
+class PublicSentimentCommentDao(BaseDao):
+ """
+    DAO class for PublicSentimentComment
+ """
+
+ model_class = PublicSentimentComment
diff --git a/web/dao/training_sensitive_word_dao.py b/web/dao/training_sensitive_word_dao.py
new file mode 100644
index 0000000..e347357
--- /dev/null
+++ b/web/dao/training_sensitive_word_dao.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from web.dao.base_dao import BaseDao
+from web.models import TrainingSensitiveWord
+
+
+class TrainingSensitiveWordDao(BaseDao):
+ """
+    DAO class for TrainingSensitiveWord
+ """
+
+ model_class = TrainingSensitiveWord
+
+ def find_all(self):
+ """
+        Return all records
+ """
+
+ return self.find_list(dict(), dict(), list())
diff --git a/web/dto/__init__.py b/web/dto/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/dto/api_result.py b/web/dto/api_result.py
new file mode 100644
index 0000000..c9c87d5
--- /dev/null
+++ b/web/dto/api_result.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+class ApiResult:
+ """
+    API response wrapper
+ """
+
+ def __init__(self, success, code, data, message):
+        # success is True as long as the server did not raise an error
+        self.success = success
+        # code varies with the processing result
+        self.code = code
+        # response data
+        self.data = data
+        # message for the caller
+ self.message = message
+
+ @staticmethod
+ def instance(success, code, data, message):
+ return ApiResult(success, code, data, message).__dict__
+
+ @staticmethod
+ def ok(code, data, message):
+ return ApiResult(True, code, data, message).__dict__
+
+ @staticmethod
+ def fail(code, data, message):
+ return ApiResult(False, code, data, message).__dict__
diff --git a/web/dto/service_result.py b/web/dto/service_result.py
new file mode 100644
index 0000000..1e7c02a
--- /dev/null
+++ b/web/dto/service_result.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+class ServiceResult:
+ """
+    Return-value object for the service layer
+ """
+
+ def __init__(self, success, code, data, message):
+        # success is True as long as the server did not raise an error
+        self.success = success
+        # code varies with the processing result
+        self.code = code
+        # response data
+        self.data = data
+        # message for the caller
+ self.message = message
+
+ @staticmethod
+ def ok(code, data, message):
+ return ServiceResult(True, code, data, message)
+
+ @staticmethod
+ def fail(code, data, message):
+ return ServiceResult(False, code, data, message)
diff --git a/web/enum/__init__.py b/web/enum/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/enum/api_result_enum.py b/web/enum/api_result_enum.py
new file mode 100644
index 0000000..37e39fe
--- /dev/null
+++ b/web/enum/api_result_enum.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+from enum import Enum
+
+
+class ApiResultEnum(Enum):
+ """
+    Enum of codes for ApiResult
+ """
+
+    # Success
+    # SUCCESS = 200
+    # SUCCESS_DESCRIPTION = 'Success'
+
+    # Failure
+    FAIL = 4000
+    FAIL_DESCRIPTION = 'Failure'
diff --git a/web/enum/service_result_enum.py b/web/enum/service_result_enum.py
new file mode 100644
index 0000000..689bd75
--- /dev/null
+++ b/web/enum/service_result_enum.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+from enum import Enum
+
+
+class ServiceResultEnum(Enum):
+ """
+    Enum of codes for ServiceResult
+ """
+
+    # Success
+    SUCCESS = 200
+    SUCCESS_DESCRIPTION = 'Success'
+
+    # Failure
+    FAIL = 3000
+    FAIL_DESCRIPTION = 'Failure'
+
+    # Save succeeded
+    SAVE_SUCCESS = 3001
+    SAVE_SUCCESS_DESCRIPTION = 'Save succeeded'
+
+    # Delete succeeded
+    DELETE_SUCCESS = 3002
+    DELETE_SUCCESS_DESCRIPTION = 'Delete succeeded'
+
+    # Update succeeded
+    UPDATE_SUCCESS = 3003
+    UPDATE_SUCCESS_DESCRIPTION = 'Update succeeded'
+
+    # Query succeeded
+    SELECT_SUCCESS = 3004
+    SELECT_SUCCESS_DESCRIPTION = 'Query succeeded'
+
+    # No sensitive word found
+    NOT_EXIST_SENSITIVE_WORD = 3005
+    NOT_EXIST_SENSITIVE_WORD_DESCRIPTION = 'No sensitive word found'
+
+    # Sensitive word found
+    EXIST_SENSITIVE_WORD = 3006
+    EXIST_SENSITIVE_WORD_DESCRIPTION = 'Sensitive word found'
diff --git a/web/handler/__init__.py b/web/handler/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/handler/base_handler.py b/web/handler/base_handler.py
new file mode 100644
index 0000000..9b82925
--- /dev/null
+++ b/web/handler/base_handler.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from web.service.training_sensitive_word_service import TrainingSensitiveWordService
+
+
+class BaseHandler:
+ """
+    Base class for the handler layer
+ """
+
+ def __init__(self):
+ self.training_sensitive_word_service = TrainingSensitiveWordService()
diff --git a/web/handler/crawl_data_handler.py b/web/handler/crawl_data_handler.py
new file mode 100644
index 0000000..5ed159f
--- /dev/null
+++ b/web/handler/crawl_data_handler.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from web.manager.log_manager import LogManager
+from web.handler.base_handler import BaseHandler
+
+Logger = LogManager.get_logger(__name__)
+
+"""
+Handler for collecting data
+"""
+
+
+class CrawlDataHandler(BaseHandler):
+
+ def collect_data_from_weibo(self):
+ """
+        Collect data from Sina Weibo
+        """
+
+        Logger.info("Start collecting data from Sina Weibo")
+
+
diff --git a/web/handler/html_parser_handler.py b/web/handler/html_parser_handler.py
new file mode 100644
index 0000000..81b3a16
--- /dev/null
+++ b/web/handler/html_parser_handler.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import re
+import requests
+
+from web.enum.service_result_enum import ServiceResultEnum
+from web.dto.service_result import ServiceResult
+from web.handler.base_handler import BaseHandler
+from web.manager.log_manager import LogManager
+from web.util.re_util import ReUtil
+
+Logger = LogManager.get_logger(__name__)
+
+
+class HtmlParserHandler(BaseHandler):
+ """
+    HTML parser handler
+ """
+
+ def parse_html(self, url):
+ """
+        Parse an HTML page
+ """
+
+ response = requests.get(url)
+ text = response.text
+
+        # Fetch the sensitive words and join them into a single regex pattern separated by |
+ service_result = self.training_sensitive_word_service.find_all()
+ if service_result is not None and service_result.success is True:
+ training_sensitive_word_list = service_result.data
+ temp_training_sensitive_word_list = list(
+ map(lambda training_sensitive_word: str(training_sensitive_word.word), training_sensitive_word_list))
+ match_str = '.+|.+'.join(temp_training_sensitive_word_list)
+
+            # Strip the HTML tags from the response
+ text_without_html = ReUtil.clear_html(text)
+ text_without_html_list = text_without_html.split('\n')
+
+            # Match
+ for item in text_without_html_list:
+ match = re.match(match_str, item)
+ if match:
+ return ServiceResult.ok(ServiceResultEnum.EXIST_SENSITIVE_WORD.value, match.group(),
+ ServiceResultEnum.EXIST_SENSITIVE_WORD_DESCRIPTION.value)
+ return ServiceResult.ok(ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD.value, None,
+ ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD_DESCRIPTION.value)
+ else:
+ return ServiceResult.fail(ServiceResultEnum.FAIL.value, None,
+ ServiceResultEnum.FAIL_DESCRIPTION.value)
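
A minimal sketch of driving the handler directly, assuming Django has already been set up (as the Scrapy settings above do with django.setup()) and using a placeholder URL:

    from web.handler.html_parser_handler import HtmlParserHandler
    from web.enum.service_result_enum import ServiceResultEnum

    handler = HtmlParserHandler()
    result = handler.parse_html('https://example.com/some-page')  # hypothetical URL
    if result.success and result.code == ServiceResultEnum.EXIST_SENSITIVE_WORD.value:
        print('matched sensitive word:', result.data)
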
diff --git a/web/manager/__init__.py b/web/manager/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/manager/gridgraph_manager.py b/web/manager/gridgraph_manager.py
new file mode 100644
index 0000000..6a79fd7
--- /dev/null
+++ b/web/manager/gridgraph_manager.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from gremlin_python import statics
+from gremlin_python.process.anonymous_traversal import traversal
+from gremlin_python.process.graph_traversal import __
+from gremlin_python.process.strategies import *
+from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
+from gremlin_python.process.traversal import T
+from gremlin_python.process.traversal import Order
+from gremlin_python.process.traversal import Cardinality
+from gremlin_python.process.traversal import Column
+from gremlin_python.process.traversal import Direction
+from gremlin_python.process.traversal import Operator
+from gremlin_python.process.traversal import P
+from gremlin_python.process.traversal import Pop
+from gremlin_python.process.traversal import Scope
+from gremlin_python.process.traversal import Barrier
+from gremlin_python.process.traversal import Bindings
+from gremlin_python.process.traversal import WithOptions
+from gremlin_python.driver import client
+from public_sentiment.settings import GRID_GRAPH
+
+
+class GridGraphManager:
+ """
+    Manager class for GridGraph
+ """
+
+ def __init__(self):
+ self.graph = traversal().withRemote(
+ DriverRemoteConnection(GRID_GRAPH['url'], GRID_GRAPH['traversal_source'], username=GRID_GRAPH['username'],
+ password=GRID_GRAPH['password']))
+
+ def add_vertex(self, label, properties=None):
+ """
+ add vertex
+ :param graph: graph, type: GraphTraversalSource
+ :param label: label, type: str
+ :param properties: property dict, like {'p1': 'value1', 'p2': 'value2'}
+ :return: vertex, Vertex(id, label)
+ """
+ vert = self.graph.addV(label)
+ if properties:
+ for key in properties.keys():
+ vert.property(key, properties.get(key))
+ return vert.next()
+
+ def add_edge(self, label, v_from, v_to, properties=None):
+ """
+ add edge
+ :param graph: graph, type: GraphTraversalSource
+ :param label: label, type: str
+ :param v_from: long vertex id or Vertex(id, label) of from
+ :param v_to: long vertex id or Vertex(id, label) of to
+ :param properties: property dict, like {'p1': 'value1', 'p2': 'value2'}
+ :return: None
+ """
+ if isinstance(v_from, int):
+ v_from = self.graph.V().hasId(v_from).next()
+ if isinstance(v_to, int):
+ v_to = self.graph.V().hasId(v_to).next()
+ edge = self.graph.V(v_from).addE(label).to(v_to)
+ if properties:
+ for key in properties.keys():
+ edge.property(key, properties.get(key))
+ edge.next()
+
+ def drop_vertex(self, v_id=None, label=None, properties=None):
+ """
+ drop all vertex or specific vertex
+ :param graph: graph, type: GraphTraversalSource
+ :param v_id: long vertex id or Vertex(id, label)
+ :param label: label, type: str
+ :param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
+ :return: None
+ """
+ if isinstance(v_id, int):
+ v_id = self.graph.V().hasId(v_id).next()
+ travel = self.graph.V(v_id) if v_id else self.graph.V()
+ if label:
+ travel = travel.hasLabel(label)
+ if properties:
+ for p in properties:
+ if isinstance(p, dict):
+ key = list(p.keys())[0]
+ travel = travel.has(key, p.get(key))
+ else:
+ travel = travel.has(p)
+ travel.drop().iterate()
+
+ def drop_edge(self, e_id=None, label=None, properties=None):
+ """
+ drop all edges or a specific edge
+ :param e_id: edge id, type str
+ :param label: label, type: str
+ :param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
+ :return: None
+ """
+ travel = self.graph.E(e_id) if e_id else self.graph.E()
+ if label:
+ travel = travel.hasLabel(label)
+ if properties:
+ for p in properties:
+ if isinstance(p, dict):
+ key = list(p.keys())[0]
+ travel = travel.has(key, p.get(key))
+ else:
+ travel = travel.has(p)
+ travel.drop().iterate()
+
+ def query_vertex(self, v_id=None, label=None, properties=None):
+ """
+ query graph vertex (value) list
+ :param v_id: long vertex id or Vertex(id, label)
+ :param label: label, type: str
+ :param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
+ :return: vertex list or vertex value list
+ """
+ if isinstance(v_id, int):
+ v_id = self.graph.V().hasId(v_id).next()
+ travel = self.graph.V(v_id) if v_id else self.graph.V()
+ if label:
+ travel = travel.hasLabel(label)
+ if properties:
+ for p in properties:
+ if isinstance(p, dict):
+ key = list(p.keys())[0]
+ travel = travel.has(key, p.get(key))
+ else:
+ travel = travel.has(p)
+ # return travel.valueMap().toList()
+ return travel.toList()
+
+ def query_edge(self, e_id=None, label=None, properties=None):
+ """
+ query graph edge value list
+ :param e_id: edge id, type str
+ :param label: label, type: str
+ :param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
+ :return: valueMap list
+ """
+ travel = self.graph.E(e_id) if e_id else self.graph.E()
+ if label:
+ travel = travel.hasLabel(label)
+ if properties:
+ for p in properties:
+ if isinstance(p, dict):
+ key = list(p.keys())[0]
+ travel = travel.has(key, p.get(key))
+ else:
+ travel = travel.has(p)
+ return travel.valueMap().toList()
+
+ def query_edges_of_vertex(self, v_id):
+ """
+ query all edges of a vertex
+ :param v_id: long vertex id or Vertex(id, label)
+ :return: edge list
+ """
+ if isinstance(v_id, int):
+ v_id = self.graph.V().hasId(v_id).next()
+ result = []
+ in_edges = self.graph.V(v_id).inE().toList()
+ out_edges = self.graph.V(v_id).outE().toList()
+ result.extend(in_edges)
+ result.extend(out_edges)
+ return result
+
+ def query_near_vertex(self, v_id):
+ """
+ query neighboring vertices of a vertex
+ :param v_id: long vertex id or Vertex(id, label)
+ :return: vertex list
+ """
+ if isinstance(v_id, int):
+ v_id = self.graph.V().hasId(v_id).next()
+ result = []
+ out_v = self.graph.V(v_id).out().toList()
+ in_v = self.graph.V(v_id).in_().toList()
+ result.extend(out_v)
+ result.extend(in_v)
+ return result
+
+ def get_edge_id(self, edge):
+ """
+ get edge id
+ :param edge: edge, Edge(id, label, outV, inV)
+ :return: edge id, type str
+ """
+ return edge.id.get('@value').get('relationId')
+
+ def vertex_to_dict(self, vertex):
+ """
+ convert a Vertex's info to a dict
+ :param vertex: vertex, Vertex(id, label)
+ :return: vertex info dict
+ """
+ properties = self.graph.V(vertex).valueMap().toList()[0]
+ for key in properties.keys():
+ properties[key] = properties.get(key)[0]
+ return {
+ 'id': vertex.id,
+ 'label': vertex.label,
+ 'properties': properties
+ }
+
+ def edge_to_dict(self, edge):
+ """
+ convert an Edge's info to a dict
+ :param edge: edge, Edge(id, label, outV, inV)
+ :return: edge info dict
+ """
+ e_id = self.get_edge_id(edge)
+ properties = self.graph.E(e_id).valueMap().toList()[0]
+ return {
+ 'id': e_id,
+ 'label': edge.label,
+ 'properties': properties
+ }
+
+ def judge_vertex_in_graph(self, vertex_dict):
+ """
+ check whether a vertex exists in the graph
+ :param vertex_dict: vertex dict, like {'label': 'value1', 'properties': {'p1': 'v1', ...}}
+ :return: None or Vertex(id, label)
+ """
+ label = vertex_dict.get('label')
+ properties = vertex_dict.get('properties')
+ travel = self.graph.V()
+ if label:
+ travel = travel.hasLabel(label)
+ if properties:
+ for k in properties.keys():
+ travel = travel.has(k, properties.get(k))
+ if travel.hasNext():
+ return travel.next()
+ return None
+
+ def get_sub_graph(self, vertices=None, edges=None, vertex_properties=None):
+ """
+ get sub graph
+ :param vertices: hasLabel('label').has('property').has('age', gt(20))
+ :param edges: hasLabel('label').has('property')
+ :param vertex_properties:
+ :return: sub_graph, type: GraphTraversalSource
+ """
+ strategy = SubgraphStrategy(vertices=vertices, edges=edges, vertex_properties=vertex_properties)
+ return self.graph.withStrategies(strategy)
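A hedged usage sketch of GridGraphManager. It assumes a reachable Gremlin-compatible server configured via GRID_GRAPH in public_sentiment/settings.py; the labels and properties below are made up for illustration.

from web.manager.gridgraph_manager import GridGraphManager

manager = GridGraphManager()

# Create two vertices and connect them (labels and properties are illustrative).
source = manager.add_vertex('source', {'name': 'weibo'})
comment = manager.add_vertex('comment', {'content': 'example comment'})
manager.add_edge('posted_on', comment, source, {'create_time': '2024-09-18'})

# Look the comment up again by label and property, then walk to its neighbours.
found = manager.query_vertex(label='comment', properties=[{'content': 'example comment'}])
neighbours = manager.query_near_vertex(found[0]) if found else []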
diff --git a/web/manager/log_manager.py b/web/manager/log_manager.py
new file mode 100644
index 0000000..27e298d
--- /dev/null
+++ b/web/manager/log_manager.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import logging
+
+
+class LogManager:
+ """
+ Log manager class that writes log records to both the console and a log file
+ """
+
+ # shared logger instance
+ Logger = None
+
+ def __init__(self):
+ super(LogManager, self).__init__()
+
+ @staticmethod
+ def get_logger(param_name, log_file='/mywork/log/public-sentiment/public-sentiment.log', level=logging.INFO):
+ """
+ Get the shared logger instance
+ :param param_name: logger name
+ :param log_file: path of the log file
+ :param level: log level
+ :return: logging.Logger
+ """
+
+ if LogManager.Logger is None:
+ LogManager.Logger = logging.getLogger(param_name)
+ LogManager.Logger.setLevel(level=level)
+
+ formatter = logging.Formatter(
+ '%(asctime)s [%(threadName)s-%(thread)d] [%(levelname)s] %(name)s.%(funcName)s[%(lineno)d] %(message)s')
+
+ file_handler = logging.FileHandler(log_file, encoding="utf-8")
+ file_handler.setLevel(level=level)
+ file_handler.setFormatter(formatter)
+
+ console = logging.StreamHandler()
+ console.setFormatter(formatter)
+ console.setLevel(level)
+
+ LogManager.Logger.addHandler(file_handler)
+ LogManager.Logger.addHandler(console)
+ return LogManager.Logger
+ else:
+ return LogManager.Logger
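A usage sketch for LogManager. The first call fixes the handlers for the shared logger, and the directory of the default log_file must already exist; the relative path below is an assumption for illustration.

from web.manager.log_manager import LogManager

# The first call creates the shared logger; later calls return the same instance.
Logger = LogManager.get_logger(__name__, log_file='./public-sentiment.log')
Logger.info('log manager initialised')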
diff --git a/web/manager/snowflake_manager.py b/web/manager/snowflake_manager.py
new file mode 100644
index 0000000..ec1c6c9
--- /dev/null
+++ b/web/manager/snowflake_manager.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import time
+
+
+class SnowflakeManager(object):
+ """
+ Implementation of Twitter's Snowflake id algorithm (this variant uses second-granularity timestamps)
+ """
+
+ def __init__(self, start_time=1420041600000):
+ self.start_time = start_time / 1000  # in seconds
+ self.last_timestamp = -1
+
+ # the timestamp occupies the bits above the 10-bit machine id and the 12-bit sequence (shift by 22)
+ self.timestamp_shift = 22
+ # the 10-bit machine id sits above the 12-bit sequence (shift by 12)
+ self.machine_id_shift = 12
+ # the 12-bit sequence occupies the lowest bits (shift by 0)
+ self.sequence_shift = 0
+
+ # largest value the 41-bit timestamp can hold, 2^41 - 1
+ self.max_timestamp = -1 ^ (-1 << 41)
+ # largest value the 10-bit machine id can hold, 2^10 - 1
+ self.max_machine_id = -1 ^ (-1 << 10)
+ # largest value the 12-bit sequence can hold, 2^12 - 1
+ self.max_sequence = -1 ^ (-1 << 12)
+
+ # the machine id and sequence are fixed at 0 for now; they could be passed in as parameters
+ self.machine_id = 0
+ self.sequence = 0
+
+ def next_id(self):
+ """
+ generate the next unique id
+ """
+ timestamp = int(time.time())
+ if timestamp < self.last_timestamp:
+ raise ValueError('Current timestamp is less than last timestamp.')
+
+ if timestamp == self.last_timestamp:
+ self.sequence = (self.sequence + 1) & self.max_sequence
+ if self.sequence == 0:
+ timestamp = self.til_next_millis(self.last_timestamp)
+ else:
+ self.sequence = 0
+
+ self.last_timestamp = timestamp
+ return ((timestamp - int(self.start_time)) << self.timestamp_shift) | (
+ self.machine_id << self.machine_id_shift) | self.sequence
+
+ def til_next_millis(self, last_timestamp):
+ # busy-wait until the clock moves past last_timestamp (second granularity, despite the name)
+ timestamp = int(time.time())
+ while timestamp <= last_timestamp:
+ timestamp = int(time.time())
+ return timestamp
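A usage sketch showing how an id produced by SnowflakeManager can be unpacked with the shift constants defined above.

from web.manager.snowflake_manager import SnowflakeManager

snowflake = SnowflakeManager()
new_id = snowflake.next_id()

seconds_since_start = new_id >> 22              # timestamp field, seconds since start_time
machine_id = (new_id >> 12) & ((1 << 10) - 1)   # 10-bit machine id (currently always 0)
sequence = new_id & ((1 << 12) - 1)             # 12-bit per-second sequence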
diff --git a/web/migrations/__init__.py b/web/migrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/models.py b/web/models.py
new file mode 100644
index 0000000..71a8362
--- /dev/null
+++ b/web/models.py
@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.
diff --git a/web/models/__init__.py b/web/models/__init__.py
new file mode 100644
index 0000000..ad16e9c
--- /dev/null
+++ b/web/models/__init__.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from .public_sentiment_comment import PublicSentimentComment
+from .public_sentiment_source import PublicSentimentSource
+from .training_sensitive_word import TrainingSensitiveWord
diff --git a/web/models/public_sentiment_comment.py b/web/models/public_sentiment_comment.py
new file mode 100644
index 0000000..0acf841
--- /dev/null
+++ b/web/models/public_sentiment_comment.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from django.core.validators import MaxValueValidator
+from django.db import models
+
+
+class PublicSentimentComment(models.Model):
+ """
+ Comment table
+ """
+
+ # Primary key
+ id = models.AutoField(primary_key=True)
+
+ # Content
+ content = models.CharField(max_length=2550, null=True, blank=True)
+
+ # Source id
+ source_id = models.BigIntegerField(validators=[MaxValueValidator(9223372036854775807)], db_index=True, null=False,
+ blank=False)
+
+ # Creation time
+ create_time = models.DateTimeField(null=False, blank=False)
+
+ class Meta:
+ managed = True
+ db_table = 'ps_comment'
+ verbose_name = 'Comment table'
+ verbose_name_plural = verbose_name
\ No newline at end of file
diff --git a/web/models/public_sentiment_source.py b/web/models/public_sentiment_source.py
new file mode 100644
index 0000000..95b5edc
--- /dev/null
+++ b/web/models/public_sentiment_source.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from django.db import models
+
+
+class PublicSentimentSource(models.Model):
+ """
+ Source table
+ """
+
+ # Primary key
+ id = models.AutoField(primary_key=True)
+
+ # Domain name
+ domain_name = models.CharField(max_length=255, null=True, blank=True)
+
+ # Name
+ name = models.CharField(max_length=255, null=True, blank=True)
+
+ class Meta:
+ managed = True
+ db_table = 'ps_source'
+ verbose_name = 'Source table'
+ verbose_name_plural = verbose_name
diff --git a/web/models/training_sensitive_word.py b/web/models/training_sensitive_word.py
new file mode 100644
index 0000000..36a8070
--- /dev/null
+++ b/web/models/training_sensitive_word.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from django.db import models
+
+
+class TrainingSensitiveWord(models.Model):
+ """
+ Sensitive word table
+ """
+
+ # Primary key
+ id = models.AutoField(primary_key=True)
+
+ # Type
+ type = models.CharField(max_length=255, null=True, blank=True)
+
+ # Sensitive word
+ word = models.CharField(max_length=255, null=True, blank=True)
+
+ class Meta:
+ managed = True
+ db_table = 'training_sensitive_word'
+ verbose_name = 'Sensitive word table'
+ verbose_name_plural = verbose_name
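A brief sketch of how these models fit together at the ORM level, assuming Django is configured and migrations for ps_comment and ps_source have been applied; the sample values are illustrative.

from django.utils import timezone

from web.models import PublicSentimentComment, PublicSentimentSource

# Register a source, then attach a comment to it via the plain source_id column.
source = PublicSentimentSource.objects.create(domain_name='weibo.com', name='weibo')
PublicSentimentComment.objects.create(content='example comment',
                                      source_id=source.id,
                                      create_time=timezone.now())

latest = PublicSentimentComment.objects.filter(source_id=source.id).order_by('-create_time').first()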
diff --git a/web/service/__init__.py b/web/service/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/service/base_service.py b/web/service/base_service.py
new file mode 100644
index 0000000..cb03b4c
--- /dev/null
+++ b/web/service/base_service.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from web.dao.public_sentiment_comment_dao import PublicSentimentCommentDao
+from web.dao.training_sensitive_word_dao import TrainingSensitiveWordDao
+
+
+class BaseService:
+ """
+ Base class for the service layer
+ """
+
+ def __init__(self):
+ self.public_sentiment_comment_dao = PublicSentimentCommentDao()
+ self.training_sensitive_word_dao = TrainingSensitiveWordDao()
diff --git a/web/service/public_sentiment_comment_service.py b/web/service/public_sentiment_comment_service.py
new file mode 100644
index 0000000..fa751bc
--- /dev/null
+++ b/web/service/public_sentiment_comment_service.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from web.manager.log_manager import LogManager
+from web.service.base_service import BaseService
+
+Logger = LogManager.get_logger(__name__)
+
+
+class PublicSentimentCommentService(BaseService):
+ """
+ Service class for PublicSentimentComment
+ """
+
+ def save(self, public_sentiment_comment):
+ """
+ Save a PublicSentimentComment object
+ """
+
+ Logger.info('Saving a PublicSentimentComment object')
+
+ self.public_sentiment_comment_dao.save(public_sentiment_comment)
+
+ def find_all(self):
+ """
+ Query all records
+ """
+
+ Logger.info('Querying all records')
+
+ return self.public_sentiment_comment_dao.find_list(dict(), dict(), list())
diff --git a/web/service/training_sensitive_word_service.py b/web/service/training_sensitive_word_service.py
new file mode 100644
index 0000000..c6faaf8
--- /dev/null
+++ b/web/service/training_sensitive_word_service.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from web.dto.service_result import ServiceResult
+from web.enum.service_result_enum import ServiceResultEnum
+from web.manager.log_manager import LogManager
+from web.service.base_service import BaseService
+
+Logger = LogManager.get_logger(__name__)
+
+
+class TrainingSensitiveWordService(BaseService):
+ """
+ Service class for TrainingSensitiveWord
+ """
+
+ def find_all(self):
+ """
+ Query all records
+ """
+
+ Logger.info('Querying all records')
+
+ return ServiceResult.ok(ServiceResultEnum.SELECT_SUCCESS.value, self.training_sensitive_word_dao.find_all(),
+ ServiceResultEnum.SELECT_SUCCESS_DESCRIPTION.value)
diff --git a/web/spider/__init__.py b/web/spider/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/spider/base_spider.py b/web/spider/base_spider.py
new file mode 100644
index 0000000..ccd005a
--- /dev/null
+++ b/web/spider/base_spider.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from web.service.training_sensitive_word_service import TrainingSensitiveWordService
+
+
+class BaseSpider:
+ """
+ Base class for the spider layer
+ """
+
+ def __init__(self):
+ self.training_sensitive_word_service = TrainingSensitiveWordService()
diff --git a/web/task/__init__.py b/web/task/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/task/base_task.py b/web/task/base_task.py
new file mode 100644
index 0000000..fc91784
--- /dev/null
+++ b/web/task/base_task.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+"""
+Base class for tasks
+"""
+from web.handler.crawl_data_handler import CrawlDataHandler
+
+
+class BaseTask:
+
+ def __init__(self):
+ self.crawl_data_handler = CrawlDataHandler()
diff --git a/web/task/crawl_data_task.py b/web/task/crawl_data_task.py
new file mode 100644
index 0000000..0538dd3
--- /dev/null
+++ b/web/task/crawl_data_task.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from web.manager.log_manager import LogManager
+from web.task.base_task import BaseTask
+
+Logger = LogManager.get_logger(__name__)
+
+"""
+添加注释
+"""
+
+
+class CrawlDataTask(BaseTask):
+
+ def collect_data_from_weibo(self):
+ """
+ Collect data from Sina Weibo
+ """
+
+ Logger.info("开始从新浪微博采集数据")
+
+ self.crawl_data_handler.collect_data_from_weibo()
diff --git a/web/tests.py b/web/tests.py
new file mode 100644
index 0000000..7ce503c
--- /dev/null
+++ b/web/tests.py
@@ -0,0 +1,3 @@
+from django.test import TestCase
+
+# Create your tests here.
diff --git a/web/util/__init__.py b/web/util/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/util/dto_util.py b/web/util/dto_util.py
new file mode 100644
index 0000000..3249f72
--- /dev/null
+++ b/web/util/dto_util.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from web.dto.api_result import ApiResult
+from web.dto.service_result import ServiceResult
+
+
+class DtoUtil:
+ """
+ Utility class for DTOs
+ """
+
+ @staticmethod
+ def service_result_to_api_result(service_result: ServiceResult) -> ApiResult:
+ """
+ Convert a ServiceResult object into an ApiResult object
+ """
+
+ return ApiResult.instance(service_result.success, service_result.code, service_result.data,
+ service_result.message)
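A usage sketch of the ServiceResult-to-ApiResult conversion, using the ok(code, data, message) argument order seen elsewhere in this patch; the sample data is illustrative.

from web.dto.service_result import ServiceResult
from web.enum.service_result_enum import ServiceResultEnum
from web.util.dto_util import DtoUtil

service_result = ServiceResult.ok(ServiceResultEnum.SELECT_SUCCESS.value, ['word1', 'word2'],
                                  ServiceResultEnum.SELECT_SUCCESS_DESCRIPTION.value)
api_result = DtoUtil.service_result_to_api_result(service_result)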
diff --git a/web/util/re_util.py b/web/util/re_util.py
new file mode 100644
index 0000000..79a4fc5
--- /dev/null
+++ b/web/util/re_util.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from bs4 import BeautifulSoup
+
+
+class ReUtil:
+ """
+ Utility class for regular expressions and text cleaning
+ """
+
+ @staticmethod
+ def clear_html(text_with_html):
+ """
+ Strip HTML tags from text
+ """
+
+ soup = BeautifulSoup(text_with_html, 'html.parser')
+ return soup.get_text()
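A small usage sketch of ReUtil.clear_html, which delegates to BeautifulSoup's get_text().

from web.util.re_util import ReUtil

plain_text = ReUtil.clear_html('<div><p>hello <b>world</b></p></div>')
# plain_text == 'hello world'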
diff --git a/web/views.py b/web/views.py
new file mode 100644
index 0000000..faa18be
--- /dev/null
+++ b/web/views.py
@@ -0,0 +1,2 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
diff --git a/web/vo/__init__.py b/web/vo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/vo/parse_html_vo.py b/web/vo/parse_html_vo.py
new file mode 100644
index 0000000..959b24d
--- /dev/null
+++ b/web/vo/parse_html_vo.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from pydantic import BaseModel
+
+
+class ParseHtmlVo(BaseModel):
+ """
+ VO class for parsing HTML
+ """
+
+ # URL
+ url: str
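A usage sketch of the pydantic VO; an invalid payload (for example a missing url) raises a pydantic ValidationError. The URL below is illustrative.

from web.vo.parse_html_vo import ParseHtmlVo

vo = ParseHtmlVo(url='https://weibo.com/some-post')
print(vo.url)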