From 27f08d677282be8b03fc550cf690d52c5daed675 Mon Sep 17 00:00:00 2001 From: 913071727 <913071727@qq.com> Date: Wed, 18 Sep 2024 13:38:24 +0800 Subject: [PATCH] =?UTF-8?q?1.=E5=88=9B=E5=BB=BA=E3=80=81=E6=8F=90=E4=BA=A4?= =?UTF-8?q?=E9=A1=B9=E7=9B=AE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .idea/.gitignore | 3 + .idea/inspectionProfiles/Project_Default.xml | 65 +++++ .../inspectionProfiles/profiles_settings.xml | 6 + .idea/misc.xml | 4 + .idea/modules.xml | 8 + .idea/public_sentiment.iml | 8 + .idea/vcs.xml | 6 + collector/collector/__init__.py | 0 collector/collector/items.py | 12 + collector/collector/middlewares.py | 101 +++++++ collector/collector/pipelines.py | 27 ++ collector/collector/settings.py | 117 ++++++++ collector/collector/spiders/__init__.py | 4 + .../collector/spiders/collector_spider.py | 59 ++++ collector/dbs/default.db | Bin 0 -> 8192 bytes collector/main.py | 3 + collector/scrapy.cfg | 11 + manage.py | 30 ++ public_sentiment/__init__.py | 2 + public_sentiment/asgi.py | 16 ++ public_sentiment/settings.py | 136 +++++++++ public_sentiment/urls.py | 25 ++ public_sentiment/wsgi.py | 16 ++ scrawl/__init__.py | 0 scrawl/scrapy.cfg | 11 + scrawl/scrawl/__init__.py | 0 scrawl/scrawl/items.py | 12 + scrawl/scrawl/middlewares.py | 103 +++++++ scrawl/scrawl/pipelines.py | 13 + scrawl/scrawl/settings.py | 105 +++++++ scrawl/scrawl/spiders/__init__.py | 4 + scrawl/scrawl/spiders/weibo_spider.py | 18 ++ script/main.bat | 2 + script/runserver.bat | 1 + script/scrapyd-console.bat | 1 + web/__init__.py | 0 web/admin.py | 3 + web/apps.py | 6 + web/constants/__init__.py | 0 web/constants/startup_parameter.py | 12 + web/controller/__init__.py | 0 web/controller/base_controller.py | 42 +++ web/controller/html_parser_controller.py | 38 +++ web/dao/__init__.py | 0 web/dao/base_dao.py | 157 +++++++++++ web/dao/public_sentiment_comment_dao.py | 13 + web/dao/training_sensitive_word_dao.py | 20 ++ web/dto/__init__.py | 0 web/dto/api_result.py | 33 +++ web/dto/service_result.py | 29 ++ web/enum/__init__.py | 0 web/enum/api_result_enum.py | 19 ++ web/enum/service_result_enum.py | 43 +++ web/handler/__init__.py | 0 web/handler/base_handler.py | 13 + web/handler/crawl_data_handler.py | 23 ++ web/handler/html_parser_handler.py | 51 ++++ web/manager/__init__.py | 0 web/manager/gridgraph_manager.py | 258 ++++++++++++++++++ web/manager/log_manager.py | 47 ++++ web/manager/snowflake_manager.py | 54 ++++ web/migrations/__init__.py | 0 web/models.py | 3 + web/models/__init__.py | 6 + web/models/public_sentiment_comment.py | 30 ++ web/models/public_sentiment_source.py | 25 ++ web/models/training_sensitive_word.py | 25 ++ web/service/__init__.py | 0 web/service/base_service.py | 15 + .../public_sentiment_comment_service.py | 31 +++ .../training_sensitive_word_service.py | 24 ++ web/spider/__init__.py | 0 web/spider/base_spider.py | 13 + web/task/__init__.py | 0 web/task/base_task.py | 14 + web/task/crawl_data_task.py | 23 ++ web/tests.py | 3 + web/util/__init__.py | 0 web/util/dto_util.py | 19 ++ web/util/re_util.py | 19 ++ web/views.py | 2 + web/vo/__init__.py | 0 web/vo/parse_html_vo.py | 13 + 83 files changed, 2055 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/public_sentiment.iml create mode 100644 .idea/vcs.xml 
create mode 100644 collector/collector/__init__.py create mode 100644 collector/collector/items.py create mode 100644 collector/collector/middlewares.py create mode 100644 collector/collector/pipelines.py create mode 100644 collector/collector/settings.py create mode 100644 collector/collector/spiders/__init__.py create mode 100644 collector/collector/spiders/collector_spider.py create mode 100644 collector/dbs/default.db create mode 100644 collector/main.py create mode 100644 collector/scrapy.cfg create mode 100644 manage.py create mode 100644 public_sentiment/__init__.py create mode 100644 public_sentiment/asgi.py create mode 100644 public_sentiment/settings.py create mode 100644 public_sentiment/urls.py create mode 100644 public_sentiment/wsgi.py create mode 100644 scrawl/__init__.py create mode 100644 scrawl/scrapy.cfg create mode 100644 scrawl/scrawl/__init__.py create mode 100644 scrawl/scrawl/items.py create mode 100644 scrawl/scrawl/middlewares.py create mode 100644 scrawl/scrawl/pipelines.py create mode 100644 scrawl/scrawl/settings.py create mode 100644 scrawl/scrawl/spiders/__init__.py create mode 100644 scrawl/scrawl/spiders/weibo_spider.py create mode 100644 script/main.bat create mode 100644 script/runserver.bat create mode 100644 script/scrapyd-console.bat create mode 100644 web/__init__.py create mode 100644 web/admin.py create mode 100644 web/apps.py create mode 100644 web/constants/__init__.py create mode 100644 web/constants/startup_parameter.py create mode 100644 web/controller/__init__.py create mode 100644 web/controller/base_controller.py create mode 100644 web/controller/html_parser_controller.py create mode 100644 web/dao/__init__.py create mode 100644 web/dao/base_dao.py create mode 100644 web/dao/public_sentiment_comment_dao.py create mode 100644 web/dao/training_sensitive_word_dao.py create mode 100644 web/dto/__init__.py create mode 100644 web/dto/api_result.py create mode 100644 web/dto/service_result.py create mode 100644 web/enum/__init__.py create mode 100644 web/enum/api_result_enum.py create mode 100644 web/enum/service_result_enum.py create mode 100644 web/handler/__init__.py create mode 100644 web/handler/base_handler.py create mode 100644 web/handler/crawl_data_handler.py create mode 100644 web/handler/html_parser_handler.py create mode 100644 web/manager/__init__.py create mode 100644 web/manager/gridgraph_manager.py create mode 100644 web/manager/log_manager.py create mode 100644 web/manager/snowflake_manager.py create mode 100644 web/migrations/__init__.py create mode 100644 web/models.py create mode 100644 web/models/__init__.py create mode 100644 web/models/public_sentiment_comment.py create mode 100644 web/models/public_sentiment_source.py create mode 100644 web/models/training_sensitive_word.py create mode 100644 web/service/__init__.py create mode 100644 web/service/base_service.py create mode 100644 web/service/public_sentiment_comment_service.py create mode 100644 web/service/training_sensitive_word_service.py create mode 100644 web/spider/__init__.py create mode 100644 web/spider/base_spider.py create mode 100644 web/task/__init__.py create mode 100644 web/task/base_task.py create mode 100644 web/task/crawl_data_task.py create mode 100644 web/tests.py create mode 100644 web/util/__init__.py create mode 100644 web/util/dto_util.py create mode 100644 web/util/re_util.py create mode 100644 web/views.py create mode 100644 web/vo/__init__.py create mode 100644 web/vo/parse_html_vo.py diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 
100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..72d1229 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,65 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..d1e22ec --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..89d928d --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/public_sentiment.iml b/.idea/public_sentiment.iml new file mode 100644 index 0000000..9b31378 --- /dev/null +++ b/.idea/public_sentiment.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/collector/collector/__init__.py b/collector/collector/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/collector/collector/items.py b/collector/collector/items.py new file mode 100644 index 0000000..db16f10 --- /dev/null +++ b/collector/collector/items.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import scrapy + + +class SensitiveWordItem(scrapy.Item): + """ + 评论 + """ + + sensitive_word = scrapy.Field() diff --git a/collector/collector/middlewares.py b/collector/collector/middlewares.py new file mode 100644 index 0000000..7c9dec3 --- /dev/null +++ b/collector/collector/middlewares.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import is_item, ItemAdapter + + +class CollectorSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. 
+ pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class CollectorDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download service or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) diff --git a/collector/collector/pipelines.py b/collector/collector/pipelines.py new file mode 100644 index 0000000..ff5522b --- /dev/null +++ b/collector/collector/pipelines.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + + +from web.models import PublicSentimentComment +from web.manager.log_manager import LogManager +from web.service.public_sentiment_comment_service import PublicSentimentCommentService + +Logger = LogManager.get_logger(__name__) + + +class CollectorPipeline(object): + + def __init__(self): + super().__init__() + + def process_item(self, item, spider): + """ + 将数据存储在数据库中 + """ + + public_sentiment_comment = PublicSentimentComment() + public_sentiment_comment.content = item['sensitive_word'] + + public_sentiment_comment_service = PublicSentimentCommentService() + public_sentiment_comment_service.save(public_sentiment_comment) + return item diff --git a/collector/collector/settings.py b/collector/collector/settings.py new file mode 100644 index 0000000..25e2f6e --- /dev/null +++ b/collector/collector/settings.py @@ -0,0 +1,117 @@ +# Scrapy settings for collector project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "collector" + +SPIDER_MODULES = ["collector.spiders"] +NEWSPIDER_MODULE = "collector.spiders" + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +# USER_AGENT = "collector (+http://www.yourdomain.com)" + +# Obey robots.txt rules +# ROBOTSTXT_OBEY = True +# 默认为True,此处改为False +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +# } + +DEFAULT_REQUEST_HEADERS = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en', + 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36', + 'Cookie': 'adb_isBlock=0; userid=1652710683278_ihrfq92084; prov=cn0731; city=0732; weather_city=hn_xt; region_ip=110.53.149.x; region_ver=1.2; wxIsclose=false; ifengRotator_iis3=6; ifengWindowCookieName_919=1' + # 默认是注释的,这个东西非常重要,如果不写很容易被判断为电脑,简单点写一个Mozilla/5.0即可 +} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# "collector.middlewares.CollectorSpiderMiddleware": 543, +# } + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# "collector.middlewares.CollectorDownloaderMiddleware": 543, +# } + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +# } + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +# ITEM_PIPELINES = { +# "collector.pipelines.CollectorPipeline": 300, +# } +# 项目管道,数字越小优先度越高 +ITEM_PIPELINES = { + 'collector.pipelines.CollectorPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# 
HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = "httpcache" +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +FEED_EXPORT_ENCODING = "utf-8" + +######################################### 下面的都是自定义的 ######################################## + +import os, django +import sys + +BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(BASE_DIR) +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "public_sentiment.settings") +os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true" +django.setup() diff --git a/collector/collector/spiders/__init__.py b/collector/collector/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/collector/collector/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/collector/collector/spiders/collector_spider.py b/collector/collector/spiders/collector_spider.py new file mode 100644 index 0000000..bdc274b --- /dev/null +++ b/collector/collector/spiders/collector_spider.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import re +from typing import Optional, Any +import scrapy + +from collector.items import SensitiveWordItem +from web.manager.log_manager import LogManager +from web.spider.base_spider import BaseSpider +from web.util.re_util import ReUtil + +Logger = LogManager.get_logger(__name__) + + +class CollectorSpider(scrapy.Spider, BaseSpider): + """ + 从微博上爬数据 + """ + + name = "collector-spider" + allowed_domains = ["s.weibo.com"] + # start_urls = ["https://s.weibo.com/weibo?q=%E5%8C%97%E4%BA%AC%E5%B7%A5%E5%95%86%E5%A4%A7%E5%AD%A6&nodup=1&page=5"] + start_urls = ["https://xm.buyiju.com/ceming/129803-zajo.html"] + # url = 'https://xm.buyiju.com/ceming/129803-zajo.html' + + def __init__(self, name: Optional[str] = None, **kwargs: Any): + scrapy.Spider.__init__(self) + BaseSpider.__init__(self) + + # def start_requests(self): + # yield scrapy.Request(url=self.url, callback=self.parse) + + def parse(self, response): + + Logger.info('从微博上爬数据') + # 返回的html + text = response.text + + # 查询敏感词,并将其拼接为字符串,用|分隔 + training_sensitive_word_list = self.training_sensitive_word_service.find_all() + temp_training_sensitive_word_list = list(map(lambda training_sensitive_word: str(training_sensitive_word.word), training_sensitive_word_list)) + match_str = '.+|.+'.join(temp_training_sensitive_word_list) + + # 去除返回值中的html标签 + text_without_html = ReUtil.clear_html(text) + text_without_html_list = text_without_html.split('\n') + + # 匹配 + is_match = False + sensitive_word_item = SensitiveWordItem() + for item in text_without_html_list: + match = re.match(match_str, item) + if match: + sensitive_word_item['sensitive_word'] = match.group() + is_match = True + break + if is_match: + yield sensitive_word_item diff --git a/collector/dbs/default.db b/collector/dbs/default.db new file mode 100644 index 0000000000000000000000000000000000000000..ff5ee9baf89859216639f3b97020034d6c3af1af GIT binary patch literal 8192 zcmeI#JqyAx5C-6j2u`A#>n(zUxYQqDm2NJ6?2_VHf~~D-3U+kycbg)DPVV}??@s<85P$##AOHaf zKmY;|fB*y_0D(UixT?-zI8?X5={S|LFbA>wOC{CEyS3az;V$A`xL8H}KE)$*U~_ho 
zh~>hXOxv1IQcs#Dx5m|MMW@fNOv+Lx!f~3%<7PMaOFseu2tWV=5P$##AOHafKmY;| JfWQw6JOJh#GJ^mB literal 0 HcmV?d00001 diff --git a/collector/main.py b/collector/main.py new file mode 100644 index 0000000..0691656 --- /dev/null +++ b/collector/main.py @@ -0,0 +1,3 @@ +from scrapy.cmdline import execute + +execute('scrapy crawl collector-spider'.split()) diff --git a/collector/scrapy.cfg b/collector/scrapy.cfg new file mode 100644 index 0000000..d9ad045 --- /dev/null +++ b/collector/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = collector.settings + +[deploy] +#url = http://localhost:6800/ +project = collector diff --git a/manage.py b/manage.py new file mode 100644 index 0000000..7f75b1e --- /dev/null +++ b/manage.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import sys +import django +from web.manager.log_manager import LogManager + +sys.path.append(r"web") +sys.path.append(r"collector") + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'public_sentiment.settings') +django.setup() + +Logger = LogManager.get_logger(__name__) + +if __name__ == '__main__': + + LogManager.get_logger("启动服务器") + + try: + from django.core.management import execute_from_command_line + except ImportError as exc: + raise ImportError( + "Couldn't import Django. Are you sure it's installed and " + "available on your PYTHONPATH environment variable? Did you " + "forget to activate a virtual environment?" + ) from exc + + execute_from_command_line(sys.argv) diff --git a/public_sentiment/__init__.py b/public_sentiment/__init__.py new file mode 100644 index 0000000..c45523b --- /dev/null +++ b/public_sentiment/__init__.py @@ -0,0 +1,2 @@ +import pymysql +pymysql.install_as_MySQLdb() \ No newline at end of file diff --git a/public_sentiment/asgi.py b/public_sentiment/asgi.py new file mode 100644 index 0000000..d5b43f7 --- /dev/null +++ b/public_sentiment/asgi.py @@ -0,0 +1,16 @@ +""" +ASGI config for public_sentiment project. + +It exposes the ASGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/ +""" + +import os + +from django.core.asgi import get_asgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'public_sentiment.settings') + +application = get_asgi_application() diff --git a/public_sentiment/settings.py b/public_sentiment/settings.py new file mode 100644 index 0000000..913d67f --- /dev/null +++ b/public_sentiment/settings.py @@ -0,0 +1,136 @@ +""" +Django settings for public_sentiment project. + +Generated by 'django-admin startproject' using Django 4.2.16. + +For more information on this file, see +https://docs.djangoproject.com/en/4.2/topics/settings/ + +For the full list of settings and their values, see +https://docs.djangoproject.com/en/4.2/ref/settings/ +""" + +from pathlib import Path + +# Build paths inside the project like this: BASE_DIR / 'subdir'. +BASE_DIR = Path(__file__).resolve().parent.parent + +# Quick-start development settings - unsuitable for production +# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/ + +# SECURITY WARNING: keep the secret key used in production secret! +SECRET_KEY = 'django-insecure-!*ar1k^h=h^*azpzf3sabuf4w5m)vo^aev0l6c@6qfcdh73%ze' + +# SECURITY WARNING: don't run with debug turned on in production! 
+DEBUG = True + +ALLOWED_HOSTS = [] + +# Application definition + +INSTALLED_APPS = [ + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', + 'web', +] + +MIDDLEWARE = [ + 'django.middleware.security.SecurityMiddleware', + 'django.contrib.sessions.middleware.SessionMiddleware', + 'django.middleware.common.CommonMiddleware', + 'django.middleware.csrf.CsrfViewMiddleware', + 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'django.contrib.messages.middleware.MessageMiddleware', + 'django.middleware.clickjacking.XFrameOptionsMiddleware', +] + +ROOT_URLCONF = 'public_sentiment.urls' + +TEMPLATES = [ + { + 'BACKEND': 'django.template.backends.django.DjangoTemplates', + 'DIRS': [], + 'APP_DIRS': True, + 'OPTIONS': { + 'context_processors': [ + 'django.template.context_processors.debug', + 'django.template.context_processors.request', + 'django.contrib.auth.context_processors.auth', + 'django.contrib.messages.context_processors.messages', + ], + }, + }, +] + +WSGI_APPLICATION = 'public_sentiment.wsgi.application' + +# Database +# https://docs.djangoproject.com/en/4.2/ref/settings/#databases + +DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.mysql', # 默认 + 'NAME': 'base_platform', # 连接的数据库 + 'HOST': '127.0.0.1', # mysql的ip地址 + 'PORT': 3306, # mysql的端口 + 'USER': 'root', # mysql的用户名 + 'PASSWORD': '123456', # mysql的密码 + } +} + +# Password validation +# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators + +AUTH_PASSWORD_VALIDATORS = [ + { + 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', + }, +] + +# Internationalization +# https://docs.djangoproject.com/en/4.2/topics/i18n/ + +LANGUAGE_CODE = 'en-us' + +TIME_ZONE = 'UTC' + +USE_I18N = True + +USE_L10N = True + +# USE_TZ = True +USE_TZ = False + +# Static files (CSS, JavaScript, Images) +# https://docs.djangoproject.com/en/4.2/howto/static-files/ + +STATIC_URL = 'static/' + +# Default primary key field type +# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field + +DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' + +# 时区 +TIME_ZONE = 'Asia/Shanghai' + +# gridgraph的配置 +GRID_GRAPH = { + 'url': 'ws://192.168.3.18:8182/gremlin', + 'traversal_source': 'gmodern100M', + 'username': 'admin', + 'password': 'admin' +} diff --git a/public_sentiment/urls.py b/public_sentiment/urls.py new file mode 100644 index 0000000..d5694b0 --- /dev/null +++ b/public_sentiment/urls.py @@ -0,0 +1,25 @@ +""" +URL configuration for public_sentiment project. + +The `urlpatterns` list routes URLs to views. For more information please see: + https://docs.djangoproject.com/en/4.2/topics/http/urls/ +Examples: +Function views + 1. Add an import: from my_app import views + 2. Add a URL to urlpatterns: path('', views.home, name='home') +Class-based views + 1. Add an import: from other_app.views import Home + 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') +Including another URLconf + 1. Import the include() function: from django.urls import include, path + 2. 
Add a URL to urlpatterns: path('blog/', include('blog.urls')) +""" +from django.contrib import admin +from django.urls import path + +from web.controller.html_parser_controller import parse_html + +urlpatterns = [ + path('admin/', admin.site.urls), + path('api/v1/htmlParser/parseHtml', parse_html), +] diff --git a/public_sentiment/wsgi.py b/public_sentiment/wsgi.py new file mode 100644 index 0000000..36bb2a2 --- /dev/null +++ b/public_sentiment/wsgi.py @@ -0,0 +1,16 @@ +""" +WSGI config for public_sentiment project. + +It exposes the WSGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/ +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'public_sentiment.settings') + +application = get_wsgi_application() diff --git a/scrawl/__init__.py b/scrawl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scrawl/scrapy.cfg b/scrawl/scrapy.cfg new file mode 100644 index 0000000..5bdf38f --- /dev/null +++ b/scrawl/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = scrawl.settings + +[deploy] +#url = http://localhost:6800/ +project = scrawl diff --git a/scrawl/scrawl/__init__.py b/scrawl/scrawl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scrawl/scrawl/items.py b/scrawl/scrawl/items.py new file mode 100644 index 0000000..3246248 --- /dev/null +++ b/scrawl/scrawl/items.py @@ -0,0 +1,12 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class ScrawlItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass diff --git a/scrawl/scrawl/middlewares.py b/scrawl/scrawl/middlewares.py new file mode 100644 index 0000000..cb087d4 --- /dev/null +++ b/scrawl/scrawl/middlewares.py @@ -0,0 +1,103 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import is_item, ItemAdapter + + +class ScrawlSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. 
+ pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class ScrawlDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) diff --git a/scrawl/scrawl/pipelines.py b/scrawl/scrawl/pipelines.py new file mode 100644 index 0000000..b08c3ce --- /dev/null +++ b/scrawl/scrawl/pipelines.py @@ -0,0 +1,13 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + + +# useful for handling different item types with a single interface +from itemadapter import ItemAdapter + + +class ScrawlPipeline: + def process_item(self, item, spider): + return item diff --git a/scrawl/scrawl/settings.py b/scrawl/scrawl/settings.py new file mode 100644 index 0000000..110a02c --- /dev/null +++ b/scrawl/scrawl/settings.py @@ -0,0 +1,105 @@ +# Scrapy settings for scrawl project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "scrawl" + +SPIDER_MODULES = ["scrawl.spiders"] +NEWSPIDER_MODULE = "scrawl.spiders" + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +# USER_AGENT = "scrawl (+http://www.yourdomain.com)" + +# Obey robots.txt rules +# ROBOTSTXT_OBEY = True +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +# } +DEFAULT_REQUEST_HEADERS = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en', + 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36', + 'Cookie': 'adb_isBlock=0; userid=1652710683278_ihrfq92084; prov=cn0731; city=0732; weather_city=hn_xt; region_ip=110.53.149.x; region_ver=1.2; wxIsclose=false; ifengRotator_iis3=6; ifengWindowCookieName_919=1' + # 默认是注释的,这个东西非常重要,如果不写很容易被判断为电脑,简单点写一个Mozilla/5.0即可 +} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# "scrawl.middlewares.ScrawlSpiderMiddleware": 543, +# } + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# "scrawl.middlewares.ScrawlDownloaderMiddleware": 543, +# } + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +# } + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +# ITEM_PIPELINES = { +# "scrawl.pipelines.ScrawlPipeline": 300, +# } +# ITEM_PIPELINES:项目管道,300为优先级,越低越爬取的优先度越高 +ITEM_PIPELINES = { + 'scrawl.pipelines.ScrawlPipeline': 300, + # 'subeiNews.pipelines.SubeinewsMysqlPipeline': 200, # 存数据的管道 +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# 
HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = "httpcache" +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +FEED_EXPORT_ENCODING = "utf-8" diff --git a/scrawl/scrawl/spiders/__init__.py b/scrawl/scrawl/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/scrawl/scrawl/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/scrawl/scrawl/spiders/weibo_spider.py b/scrawl/scrawl/spiders/weibo_spider.py new file mode 100644 index 0000000..5a69700 --- /dev/null +++ b/scrawl/scrawl/spiders/weibo_spider.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import scrapy + +sys.path.append(r"scrawl") +from scrawl.items import ScrawlItem + + +class WeiboSpiderSpider(scrapy.Spider): + name = "weibo_spider" + allowed_domains = ["s.weibo.com"] + start_urls = ["https://s.weibo.com/weibo?q=%E5%8C%97%E4%BA%AC%E5%B7%A5%E5%95%86%E5%A4%A7%E5%AD%A6&nodup=1&page=5"] + + def parse(self, response): + for con in response.xpath('//*[@id="pl_feedlist_index"]/div/div'): + scraw_item = ScrawlItem() diff --git a/script/main.bat b/script/main.bat new file mode 100644 index 0000000..e822946 --- /dev/null +++ b/script/main.bat @@ -0,0 +1,2 @@ +cd C:/mywork/workspace/public_sentiment/collector +scrapy crawl collector-spider \ No newline at end of file diff --git a/script/runserver.bat b/script/runserver.bat new file mode 100644 index 0000000..01b5c27 --- /dev/null +++ b/script/runserver.bat @@ -0,0 +1 @@ +C:\mywork\dev-env\python\Python38\python.exe C:\mywork\workspace\public_sentiment\manage.py runserver 9000 diff --git a/script/scrapyd-console.bat b/script/scrapyd-console.bat new file mode 100644 index 0000000..56516c4 --- /dev/null +++ b/script/scrapyd-console.bat @@ -0,0 +1 @@ +scrapyd \ No newline at end of file diff --git a/web/__init__.py b/web/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/admin.py b/web/admin.py new file mode 100644 index 0000000..8c38f3f --- /dev/null +++ b/web/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. 
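Note on scrawl/scrawl/spiders/weibo_spider.py above: parse() builds a ScrawlItem for every result block but never fills or yields it, and ScrawlItem declares no fields, so the spider currently produces nothing. A minimal sketch of how the loop could be completed, assuming a hypothetical `content` field on ScrawlItem and an illustrative text-extraction XPath (the real selectors for s.weibo.com are not verified here):

# scrawl/scrawl/items.py -- illustrative field, not part of the patch
class ScrawlItem(scrapy.Item):
    content = scrapy.Field()  # raw text of one post block

# scrawl/scrawl/spiders/weibo_spider.py -- sketch of a completed parse()
def parse(self, response):
    for con in response.xpath('//*[@id="pl_feedlist_index"]/div/div'):
        scraw_item = ScrawlItem()
        # join all text nodes under the block; this XPath is an assumption,
        # not a verified selector for the current Weibo markup
        scraw_item['content'] = ''.join(con.xpath('.//text()').getall()).strip()
        if scraw_item['content']:
            yield scraw_item
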
diff --git a/web/apps.py b/web/apps.py new file mode 100644 index 0000000..682e923 --- /dev/null +++ b/web/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class WebConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'web' diff --git a/web/constants/__init__.py b/web/constants/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/constants/startup_parameter.py b/web/constants/startup_parameter.py new file mode 100644 index 0000000..15d2c67 --- /dev/null +++ b/web/constants/startup_parameter.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + + +""" +启动系统时的参数 +""" + + +class StartupParameter: + # 采集数据 + Crawl_Data = 'crawl_data' diff --git a/web/controller/__init__.py b/web/controller/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/controller/base_controller.py b/web/controller/base_controller.py new file mode 100644 index 0000000..8a381f7 --- /dev/null +++ b/web/controller/base_controller.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import json +import sys +from scrapy.crawler import CrawlerProcess +from scrapy.utils.project import get_project_settings +from web.handler.html_parser_handler import HtmlParserHandler + +sys.path.append(r"collector") + +from collector.settings import ITEM_PIPELINES + + +class BaseController: + """ + controller层的基类 + """ + + def __init__(self): + self.html_parser_handler = HtmlParserHandler() + + def to_vo(self, request, clazz): + """ + 将json参数转换为vo对象 + """ + raw_data = request.body.decode("utf-8") + json_data_dict = json.loads(raw_data) + obj = clazz(**json_data_dict) + return obj + + def start_scrawl(self, spider): + """ + 开始执行爬虫 + """ + + # get_project_settings方法并不能导入settings.py中的配置,因此此处还要硬编码导入 + settings = get_project_settings() + settings['ITEM_PIPELINES'] = ITEM_PIPELINES + process = CrawlerProcess(settings) + process.crawl(spider) + process.start() diff --git a/web/controller/html_parser_controller.py b/web/controller/html_parser_controller.py new file mode 100644 index 0000000..ddd9529 --- /dev/null +++ b/web/controller/html_parser_controller.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import json +from collections import namedtuple +from django.http import JsonResponse +from rest_framework.decorators import api_view +from twisted.protocols.amp import Box +from collector.spiders.collector_spider import CollectorSpider +from web.controller.base_controller import BaseController +from web.dto.api_result import ApiResult +from web.manager.gridgraph_manager import GridGraphManager +from web.manager.log_manager import LogManager +from web.util.dto_util import DtoUtil +from web.vo.parse_html_vo import ParseHtmlVo + +Logger = LogManager.get_logger(__name__) + +base_controller = BaseController() + + +@api_view(['POST']) +def parse_html(request): + """ + 解析html + """ + + Logger.info("开始解析html") + + parse_html_vo = base_controller.to_vo(request, ParseHtmlVo) + service_result = base_controller.html_parser_handler.parse_html(parse_html_vo.url) + + # grid_graph_manager = GridGraphManager() + # list = grid_graph_manager.query_vertex(label='person') + + # base_controller.start_scrawl(CollectorSpider) + + return JsonResponse(DtoUtil.service_result_to_api_result(service_result), safe=False) diff --git a/web/dao/__init__.py b/web/dao/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/dao/base_dao.py b/web/dao/base_dao.py new file mode 100644 index 0000000..512acb1 --- /dev/null +++ 
b/web/dao/base_dao.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from datetime import datetime +from django.db.models.query import QuerySet +from django.db import models + +from web.manager.snowflake_manager import SnowflakeManager + + +class BaseDao: + """ + dao基类 + """ + + # 子类必须覆盖这个 + model_class = models.Model + save_batch_size = 1000 + + snowflake_manager = SnowflakeManager() + + def save(self, obj): + """ + 添加 + """ + + if not obj: + return False + obj.id = self.snowflake_manager.next_id() + obj.create_time = datetime.now() + obj.save() + return True + + def save_batch(self, objs, *, batch_size=save_batch_size): + """ + 批量添加 + """ + + if not objs: + return False + for obj in objs: + obj.id = snowflake.next_id() + self.model_class.objects.bulk_create(objs, batch_size=batch_size) + return True + + def delete(self, obj): + """ + 删除 + """ + + if not obj: + return False + obj.delete() + return True + + def delete_batch(self, objs): + """ + 批量删除 + """ + + if not objs: + return False + for obj in objs: + self.delete(obj) + return True + + def delete_batch_by_query(self, filter_kw: dict, exclude_kw: dict): + """ + 根据条件,批量删除 + """ + + self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw).delete() + return True + + def delete_by_fake(self, obj): + """ + 假删除/伪删除 + """ + + if obj is None: + return False + obj.is_deleted = True + obj.save() + return True + + def update(self, obj): + """ + 更新 + """ + + if not obj: + return False + obj.save() + return True + + def update_batch(self, objs): + """ + 批量更新 + """ + + if not objs: + return False + for obj in objs: + self.update(obj) + return True + + def update_batch_by_query(self, query_kwargs: dict, exclude_kw: dict, newattrs_kwargs: dict): + """ + 根据条件,批量更新 + """ + + self.model_class.objects.filter(**query_kwargs).exclude(**exclude_kw).update(**newattrs_kwargs) + + def find_one(self, filter_kw: dict, exclude_kw: dict, order_bys: list): + """ + 根据条件,返回一条记录 + """ + + qs = self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw) + if order_bys: + qs = qs.order_by(*order_bys) + return qs.first() + + def find_queryset(self, filter_kw: dict, exclude_kw: dict, order_bys: list) -> QuerySet: + """ + 根据条件,返回QuerySet + """ + if order_bys != None and len(order_bys) != 0: + query_set = self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw) + for by in order_bys: + query_set = query_set.order_by(by) + return query_set + else: + return self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw) + + def find_list(self, filter_kw: dict, exclude_kw: dict, order_bys: list) -> list: + """ + 根据条件,返回对象列表 + """ + + queryset = self.find_queryset(filter_kw, exclude_kw, order_bys) + model_instances = [model for model in queryset] + return model_instances + + def is_exists(self, filter_kw: dict, exclude_kw: dict) -> bool: + """ + 根据条件,判断记录是否存在 + """ + + return self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw).exists() + + def get_count(self, filter_kw: dict, exclude_kw: dict) -> int: + """ + 根据条件,计数 + """ + + return self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw).count() diff --git a/web/dao/public_sentiment_comment_dao.py b/web/dao/public_sentiment_comment_dao.py new file mode 100644 index 0000000..36b936c --- /dev/null +++ b/web/dao/public_sentiment_comment_dao.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from web.dao.base_dao import BaseDao +from web.models import PublicSentimentComment + + +class PublicSentimentCommentDao(BaseDao): + """ + 
Comment的dao类 + """ + + model_class = PublicSentimentComment diff --git a/web/dao/training_sensitive_word_dao.py b/web/dao/training_sensitive_word_dao.py new file mode 100644 index 0000000..e347357 --- /dev/null +++ b/web/dao/training_sensitive_word_dao.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from web.dao.base_dao import BaseDao +from web.models import TrainingSensitiveWord + + +class TrainingSensitiveWordDao(BaseDao): + """ + TrainingSensitiveWord的dao类 + """ + + model_class = TrainingSensitiveWord + + def find_all(self): + """ + 查询所有记录 + """ + + return self.find_list(dict(), dict(), list()) diff --git a/web/dto/__init__.py b/web/dto/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/dto/api_result.py b/web/dto/api_result.py new file mode 100644 index 0000000..c9c87d5 --- /dev/null +++ b/web/dto/api_result.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + + +class ApiResult: + """ + 接口返回类 + """ + + def __init__(self): + super().__init__() + + def __init__(self, success, code, data, message): + # 只要服务端没报错,success都是True + self.success = success + # 根据处理结果不同,返回不同的值 + self.code = code + # 返回数据 + self.data = data + # 提示信息 + self.message = message + + @staticmethod + def instance(success, code, data, message): + return ApiResult(success, code, data, message).__dict__ + + @staticmethod + def ok(code, data, message): + return ApiResult(True, code, data, message).__dict__ + + @staticmethod + def fail(code, data, message): + return ApiResult(False, code, data, message).__dict__ diff --git a/web/dto/service_result.py b/web/dto/service_result.py new file mode 100644 index 0000000..1e7c02a --- /dev/null +++ b/web/dto/service_result.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + + +class ServiceResult: + """ + service层返回值对象 + """ + + def __init__(self): + super().__init__() + + def __init__(self, success, code, data, message): + # 只要服务端没报错,success都是True + self.success = success + # 根据处理结果不同,返回不同的值 + self.code = code + # 返回数据 + self.data = data + # 提示信息 + self.message = message + + @staticmethod + def ok(code, data, message): + return ServiceResult(True, code, data, message) + + @staticmethod + def fail(code, data, message): + return ServiceResult(False, code, data, message) diff --git a/web/enum/__init__.py b/web/enum/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/enum/api_result_enum.py b/web/enum/api_result_enum.py new file mode 100644 index 0000000..37e39fe --- /dev/null +++ b/web/enum/api_result_enum.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + + +from enum import Enum + + +class ApiResultEnum(Enum): + """ + ApiResult类的的枚举类型 + """ + + # 成功 + # SUCCESS = 200 + # SUCCESS_DESCRIPTION = '成功' + + # 失败 + FAIL = 4000 + FAIL_DESCRIPTION = '失败' diff --git a/web/enum/service_result_enum.py b/web/enum/service_result_enum.py new file mode 100644 index 0000000..689bd75 --- /dev/null +++ b/web/enum/service_result_enum.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + + +from enum import Enum + + +class ServiceResultEnum(Enum): + """ + ServiceResult类的的枚举类型 + """ + + # 成功 + SUCCESS = 200 + SUCCESS_DESCRIPTION = '成功' + + # 失败 + FAIL = 3000 + FAIL_DESCRIPTION = '失败' + + # 添加成功 + SAVE_SUCCESS = 3001 + SAVE_SUCCESS_DESCRIPTION = '添加成功' + + # 删除成功 + DELETE_SUCCESS = 3002 + DELETE_SUCCESS_DESCRIPTION = '删除成功' + + # 修改成功 + UPDATE_SUCCESS = 3003 + UPDATE_SUCCESS_DESCRIPTION = '修改成功' + + # 查询成功 + SELECT_SUCCESS = 3004 + SELECT_SUCCESS_DESCRIPTION = '查询成功' + + # 
不存在敏感词 + NOT_EXIST_SENSITIVE_WORD = 3005 + NOT_EXIST_SENSITIVE_WORD_DESCRIPTION = '不存在敏感词' + + # 存在敏感词 + EXIST_SENSITIVE_WORD = 3006 + EXIST_SENSITIVE_WORD_DESCRIPTION = '存在敏感词' diff --git a/web/handler/__init__.py b/web/handler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/handler/base_handler.py b/web/handler/base_handler.py new file mode 100644 index 0000000..9b82925 --- /dev/null +++ b/web/handler/base_handler.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from web.service.training_sensitive_word_service import TrainingSensitiveWordService + + +class BaseHandler: + """ + handler层的基类 + """ + + def __init__(self): + self.training_sensitive_word_service = TrainingSensitiveWordService() diff --git a/web/handler/crawl_data_handler.py b/web/handler/crawl_data_handler.py new file mode 100644 index 0000000..5ed159f --- /dev/null +++ b/web/handler/crawl_data_handler.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from web.manager.log_manager import LogManager +from web.handler.base_handler import BaseHandler + +Logger = LogManager.get_logger(__name__) + +""" +采集数据的handler +""" + + +class CrawlDataHandler(BaseHandler): + + def collect_data_from_weibo(self): + """ + 从新浪微博采集数据 + """ + + Logger.info("开始从新浪微博采集数据") + + diff --git a/web/handler/html_parser_handler.py b/web/handler/html_parser_handler.py new file mode 100644 index 0000000..81b3a16 --- /dev/null +++ b/web/handler/html_parser_handler.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import re +import requests + +from web.enum.service_result_enum import ServiceResultEnum +from web.dto.service_result import ServiceResult +from web.handler.base_handler import BaseHandler +from web.manager.log_manager import LogManager +from web.util.re_util import ReUtil + +Logger = LogManager.get_logger(__name__) + + +class HtmlParserHandler(BaseHandler): + """ + html解析器类 + """ + + def parse_html(self, url): + """ + 解析html网页 + """ + + response = requests.get(url) + text = response.text + + # 查询敏感词,并将其拼接为字符串,用|分隔 + service_result = self.training_sensitive_word_service.find_all() + if service_result is not None and service_result.success is True: + training_sensitive_word_list = service_result.data + temp_training_sensitive_word_list = list( + map(lambda training_sensitive_word: str(training_sensitive_word.word), training_sensitive_word_list)) + match_str = '.+|.+'.join(temp_training_sensitive_word_list) + + # 去除返回值中的html标签 + text_without_html = ReUtil.clear_html(text) + text_without_html_list = text_without_html.split('\n') + + # 匹配 + for item in text_without_html_list: + match = re.match(match_str, item) + if match: + return ServiceResult.ok(ServiceResultEnum.EXIST_SENSITIVE_WORD.value, match.group(), + ServiceResultEnum.EXIST_SENSITIVE_WORD_DESCRIPTION.value) + return ServiceResult.ok(ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD.value, None, + ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD_DESCRIPTION.value) + else: + return ServiceResult.fail(ServiceResultEnum.FAIL.value, None, + ServiceResultEnum.FAIL_DESCRIPTION.value) diff --git a/web/manager/__init__.py b/web/manager/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/manager/gridgraph_manager.py b/web/manager/gridgraph_manager.py new file mode 100644 index 0000000..6a79fd7 --- /dev/null +++ b/web/manager/gridgraph_manager.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from gremlin_python import statics +from gremlin_python.process.anonymous_traversal import 
traversal +from gremlin_python.process.graph_traversal import __ +from gremlin_python.process.strategies import * +from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection +from gremlin_python.process.traversal import T +from gremlin_python.process.traversal import Order +from gremlin_python.process.traversal import Cardinality +from gremlin_python.process.traversal import Column +from gremlin_python.process.traversal import Direction +from gremlin_python.process.traversal import Operator +from gremlin_python.process.traversal import P +from gremlin_python.process.traversal import Pop +from gremlin_python.process.traversal import Scope +from gremlin_python.process.traversal import Barrier +from gremlin_python.process.traversal import Bindings +from gremlin_python.process.traversal import WithOptions +from gremlin_python.driver import client +from public_sentiment.settings import GRID_GRAPH + + +class GridGraphManager: + """ + gridgraph的管理器类 + """ + + def __init__(self): + self.graph = traversal().withRemote( + DriverRemoteConnection(GRID_GRAPH['url'], GRID_GRAPH['traversal_source'], username=GRID_GRAPH['username'], + password=GRID_GRAPH['password'])) + + def add_vertex(self, label, properties=None): + """ + add vertex + :param graph: graph, type: GraphTraversalSource + :param label: label, type: str + :param properties: property dict, like {'p1': 'value1', 'p2': 'value2'} + :return: vertex, Vertex(id, label) + """ + vert = self.graph.addV(label) + if properties: + for key in properties.keys(): + vert.property(key, properties.get(key)) + return vert.next() + + def add_edge(self, label, v_from, v_to, properties=None): + """ + add edge + :param graph: graph, type: GraphTraversalSource + :param label: label, type: str + :param v_from: long vertex id or Vertex(id, label) of from + :param v_to: long vertex id or Vertex(id, label) of to + :param properties: property dict, like {'p1': 'value1', 'p2': 'value2'} + :return: None + """ + if isinstance(v_from, int): + v_from = self.graph.V().hasId(v_from).next() + if isinstance(v_to, int): + v_to = self.graph.V().hasId(v_to).next() + edge = self.graph.V(v_from).addE(label).to(v_to) + if properties: + for key in properties.keys(): + edge.property(key, properties.get(key)) + edge.next() + + def drop_vertex(self, v_id=None, label=None, properties=None): + """ + drop all vertex or specific vertex + :param graph: graph, type: GraphTraversalSource + :param v_id: long vertex id or Vertex(id, label) + :param label: label, type: str + :param properties: property list, like ['p1', 'p2', {'p3': 'value'}] + :return: None + """ + if isinstance(v_id, int): + v_id = self.graph.V().hasId(v_id).next() + travel = self.graph.V(v_id) if v_id else self.graph.V() + if label: + travel = travel.hasLabel(label) + if properties: + for p in properties: + if isinstance(p, dict): + key = list(p.keys())[0] + travel = travel.has(key, p.get(key)) + else: + travel = travel.has(p) + travel.drop().iterate() + + def drop_edge(self, e_id=None, label=None, properties=None): + """ + drop all edges or specific edge + :param graph: graph, type: GraphTraversalSource + :param e_id: edge id, type str + :param label: label, type: str + :param properties: property list, like ['p1', 'p2', {'p3': 'value'}] + :return: None + """ + travel = self.graph.E(e_id) if e_id else self.graph.E() + if label: + travel = travel.hasLabel(label) + if properties: + for p in properties: + if isinstance(p, dict): + key = list(p.keys())[0] + travel = travel.has(key, p.get(key)) + else: + travel 
+        travel.drop().iterate()
+
+    def query_vertex(self, v_id=None, label=None, properties=None):
+        """
+        query graph vertex (value) list
+        :param graph: graph, type: GraphTraversalSource
+        :param v_id: long vertex id or Vertex(id, label)
+        :param label: label, type: str
+        :param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
+        :return: vertex list or vertex value list
+        """
+        if isinstance(v_id, int):
+            v_id = self.graph.V().hasId(v_id).next()
+        travel = self.graph.V(v_id) if v_id else self.graph.V()
+        if label:
+            travel = travel.hasLabel(label)
+        if properties:
+            for p in properties:
+                if isinstance(p, dict):
+                    key = list(p.keys())[0]
+                    travel = travel.has(key, p.get(key))
+                else:
+                    travel = travel.has(p)
+        # return travel.valueMap().toList()
+        return travel.toList()
+
+    def query_edge(self, e_id=None, label=None, properties=None):
+        """
+        query graph edge value list
+        :param graph: graph, type: GraphTraversalSource
+        :param e_id: edge id, type: str
+        :param label: label, type: str
+        :param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
+        :return: valueMap list
+        """
+        travel = self.graph.E(e_id) if e_id else self.graph.E()
+        if label:
+            travel = travel.hasLabel(label)
+        if properties:
+            for p in properties:
+                if isinstance(p, dict):
+                    key = list(p.keys())[0]
+                    travel = travel.has(key, p.get(key))
+                else:
+                    travel = travel.has(p)
+        return travel.valueMap().toList()
+
+    def query_edges_of_vertex(self, v_id):
+        """
+        query all edges of a vertex
+        :param graph: graph, type: GraphTraversalSource
+        :param v_id: long vertex id or Vertex(id, label)
+        :return: edge list
+        """
+        if isinstance(v_id, int):
+            v_id = self.graph.V().hasId(v_id).next()
+        result = []
+        in_edges = self.graph.V(v_id).inE().toList()
+        out_edges = self.graph.V(v_id).outE().toList()
+        result.extend(in_edges)
+        result.extend(out_edges)
+        return result
+
+    def query_near_vertex(self, v_id):
+        """
+        query the neighbouring vertices of a vertex
+        :param graph: graph, type: GraphTraversalSource
+        :param v_id: long vertex id or Vertex(id, label)
+        :return: vertex list
+        """
+        if isinstance(v_id, int):
+            v_id = self.graph.V().hasId(v_id).next()
+        result = []
+        out_v = self.graph.V(v_id).out().toList()
+        in_v = self.graph.V(v_id).in_().toList()
+        result.extend(out_v)
+        result.extend(in_v)
+        return result
+
+    def get_edge_id(self, edge):
+        """
+        get edge id
+        :param edge: Edge(id, label, outV, inV)
+        :return: edge id, type: str
+        """
+        return edge.id.get('@value').get('relationId')
+
+    def vertex_to_dict(self, vertex):
+        """
+        transfer a Vertex's info to a dict
+        :param graph: graph, type: GraphTraversalSource
+        :param vertex: vertex, Vertex(id, label)
+        :return: vertex info dict
+        """
+        properties = self.graph.V(vertex).valueMap().toList()[0]
+        for key in properties.keys():
+            properties[key] = properties.get(key)[0]
+        return {
+            'id': vertex.id,
+            'label': vertex.label,
+            'properties': properties
+        }
+
+    def edge_to_dict(self, edge):
+        """
+        transfer an Edge's info to a dict
+        :param graph: graph, type: GraphTraversalSource
+        :param edge: edge, Edge(id, label, outV, inV)
+        :return: edge info dict
+        """
+        e_id = self.get_edge_id(edge)
+        properties = self.graph.E(e_id).valueMap().toList()[0]
+        return {
+            'id': e_id,
+            'label': edge.label,
+            'properties': properties
+        }
+
+    def judge_vertex_in_graph(self, vertex_dict):
+        """
+        judge whether a vertex is in the graph
+        :param graph: graph, type: GraphTraversalSource
+        :param vertex_dict: vertex dict, like {'label': 'value1', 'properties': {'p1': 'v1', ...}}
+        :return: None or Vertex(id, label)
+        """
+        label = vertex_dict.get('label')
+        properties = vertex_dict.get('properties')
+        travel = self.graph.V()
+        if label:
+            travel = travel.hasLabel(label)
+        if properties:
+            for k in properties.keys():
+                travel = travel.has(k, properties.get(k))
+        if travel.hasNext():
+            return travel.next()
+        return None
+
+    def get_sub_graph(self, vertices=None, edges=None, vertex_properties=None):
+        """
+        get sub graph
+        :param graph: graph, type: GraphTraversalSource
+        :param vertices: hasLabel('label').has('property').has('age', gt(20))
+        :param edges: hasLabel('label').has('property')
+        :param vertex_properties:
+        :return: sub_graph, type: GraphTraversalSource
+        """
+        strategy = SubgraphStrategy(vertices=vertices, edges=edges, vertex_properties=vertex_properties)
+        return self.graph.withStrategies(strategy)
diff --git a/web/manager/log_manager.py b/web/manager/log_manager.py
new file mode 100644
index 0000000..27e298d
--- /dev/null
+++ b/web/manager/log_manager.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import logging
+
+
+class LogManager:
+    """
+    Logging helper class that writes log records to both the console and a log file.
+    """
+
+    # cached logger instance
+    Logger = None
+
+    def __init__(self):
+        super(LogManager, self).__init__()
+
+    @staticmethod
+    def get_logger(param_name, log_file='/mywork/log/public-sentiment/public-sentiment.log', level=logging.INFO):
+        """
+        Get the logger instance.
+        :param param_name: logger name
+        :param log_file: path of the log file
+        :param level: log level
+        :return: logging.Logger instance
+        """
+
+        if LogManager.Logger is None:
+            LogManager.Logger = logging.getLogger(param_name)
+            LogManager.Logger.setLevel(level=level)
+
+            formatter = logging.Formatter(
+                '%(asctime)s [%(threadName)s-%(thread)d] [%(levelname)s] %(name)s.%(funcName)s[%(lineno)d] %(message)s')
+
+            file_handler = logging.FileHandler(log_file, encoding="utf-8")
+            file_handler.setLevel(level=level)
+            file_handler.setFormatter(formatter)
+
+            console = logging.StreamHandler()
+            console.setFormatter(formatter)
+            console.setLevel(level)
+
+            LogManager.Logger.addHandler(file_handler)
+            LogManager.Logger.addHandler(console)
+            return LogManager.Logger
+        else:
+            return LogManager.Logger
diff --git a/web/manager/snowflake_manager.py b/web/manager/snowflake_manager.py
new file mode 100644
index 0000000..ec1c6c9
--- /dev/null
+++ b/web/manager/snowflake_manager.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import time
+
+
+class SnowflakeManager(object):
+    """
+    Implementation of Twitter's Snowflake id algorithm (this variant works at second resolution).
+    """
+
+    def __init__(self, start_time=1420041600000):
+        self.start_time = start_time / 1000  # epoch offset in seconds
+        self.last_timestamp = -1
+
+        # 41-bit timestamp: how far to shift the timestamp when composing an id
+        self.timestamp_shift = 22
+        # 10-bit machine id: how far to shift the machine id
+        self.machine_id_shift = 12
+        # 12-bit sequence number
+        self.sequence_shift = 0
+
+        # maximum value that fits in 41 bits, 2^41 - 1
+        self.max_timestamp = -1 ^ (-1 << 41)
+        # maximum value that fits in 10 bits, 2^10 - 1
+        self.max_machine_id = -1 ^ (-1 << 10)
+        # maximum value that fits in 12 bits, 2^12 - 1
+        self.max_sequence = -1 ^ (-1 << 12)
+
+        # the machine id and initial sequence are fixed for now; they could be made constructor parameters
+        self.machine_id = 0
+        self.sequence = 0
+
+    def next_id(self):
+        timestamp = int(time.time())
+        if timestamp < self.last_timestamp:
+            raise ValueError('Current timestamp is less than last timestamp.')
+
+        if timestamp == self.last_timestamp:
+            self.sequence = (self.sequence + 1) & self.max_sequence
+            if self.sequence == 0:
+                timestamp = self.til_next_millis(self.last_timestamp)
+        else:
+            self.sequence = 0
+
+        self.last_timestamp = timestamp
+        return ((timestamp - int(self.start_time)) << self.timestamp_shift) | (
+                self.machine_id << self.machine_id_shift) | self.sequence
+
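+    # Note (illustrative values only): next_id() above packs the id as
+    #   (timestamp_offset << 22) | (machine_id << 12) | sequence,
+    # e.g. an offset of 100 seconds with machine_id 0 and sequence 5 gives
+    # (100 << 22) | (0 << 12) | 5 = 419430405. til_next_millis() below waits
+    # until the clock passes last_timestamp once the per-timestamp sequence overflows.
+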
+    def til_next_millis(self, last_timestamp):
+        timestamp = int(time.time())
+        while timestamp <= last_timestamp:
+            timestamp = int(time.time())
+        return timestamp
diff --git a/web/migrations/__init__.py b/web/migrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/models.py b/web/models.py
new file mode 100644
index 0000000..71a8362
--- /dev/null
+++ b/web/models.py
@@ -0,0 +1,3 @@
+from django.db import models
+
+# Create your models here.
diff --git a/web/models/__init__.py b/web/models/__init__.py
new file mode 100644
index 0000000..ad16e9c
--- /dev/null
+++ b/web/models/__init__.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from .public_sentiment_comment import PublicSentimentComment
+from .public_sentiment_source import PublicSentimentSource
+from .training_sensitive_word import TrainingSensitiveWord
diff --git a/web/models/public_sentiment_comment.py b/web/models/public_sentiment_comment.py
new file mode 100644
index 0000000..0acf841
--- /dev/null
+++ b/web/models/public_sentiment_comment.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from django.core.validators import MaxValueValidator
+from django.db import models
+
+
+class PublicSentimentComment(models.Model):
+    """
+    Comment table.
+    """
+
+    # primary key
+    id = models.AutoField(primary_key=True)
+
+    # content
+    content = models.CharField(max_length=2550, null=True, blank=True)
+
+    # source id
+    source_id = models.BigIntegerField(validators=[MaxValueValidator(9223372036854775807)], db_index=True, null=False,
+                                       blank=False)
+
+    # creation time
+    create_time = models.DateTimeField(null=False, blank=False)
+
+    class Meta:
+        managed = True
+        db_table = 'ps_comment'
+        verbose_name = '评论表'
+        verbose_name_plural = verbose_name
\ No newline at end of file
diff --git a/web/models/public_sentiment_source.py b/web/models/public_sentiment_source.py
new file mode 100644
index 0000000..95b5edc
--- /dev/null
+++ b/web/models/public_sentiment_source.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from django.db import models
+
+
+class PublicSentimentSource(models.Model):
+    """
+    Source table.
+    """
+
+    # primary key
+    id = models.AutoField(primary_key=True)
+
+    # domain name
+    domain_name = models.CharField(max_length=255, null=True, blank=True)
+
+    # name
+    name = models.CharField(max_length=255, null=True, blank=True)
+
+    class Meta:
+        managed = True
+        db_table = 'ps_source'
+        verbose_name = '来源表'
+        verbose_name_plural = verbose_name
diff --git a/web/models/training_sensitive_word.py b/web/models/training_sensitive_word.py
new file mode 100644
index 0000000..36a8070
--- /dev/null
+++ b/web/models/training_sensitive_word.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from django.db import models
+
+
+class TrainingSensitiveWord(models.Model):
+    """
+    Sensitive-word table.
+    """
+
+    # primary key
+    id = models.AutoField(primary_key=True)
+
+    # type
+    type = models.CharField(max_length=255, null=True, blank=True)
+
+    # sensitive word
+    word = models.CharField(max_length=255, null=True, blank=True)
+
+    class Meta:
+        managed = True
+        db_table = 'training_sensitive_word'
+        verbose_name = '敏感词表'
+        verbose_name_plural = verbose_name
diff --git a/web/service/__init__.py b/web/service/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/service/base_service.py b/web/service/base_service.py
new file mode 100644
index 0000000..cb03b4c
--- /dev/null
+++ b/web/service/base_service.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from web.dao.public_sentiment_comment_dao import PublicSentimentCommentDao
+from web.dao.training_sensitive_word_dao import TrainingSensitiveWordDao
+
+
+class BaseService:
+    """
+    Base class for the service layer.
+    """
+
+    def __init__(self):
+        self.public_sentiment_comment_dao = PublicSentimentCommentDao()
+        self.training_sensitive_word_dao = TrainingSensitiveWordDao()
diff --git a/web/service/public_sentiment_comment_service.py b/web/service/public_sentiment_comment_service.py
new file mode 100644
index 0000000..fa751bc
--- /dev/null
+++ b/web/service/public_sentiment_comment_service.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from web.manager.log_manager import LogManager
+from web.service.base_service import BaseService
+
+Logger = LogManager.get_logger(__name__)
+
+
+class PublicSentimentCommentService(BaseService):
+    """
+    Service class for PublicSentimentComment.
+    """
+
+    def save(self, public_sentiment_comment):
+        """
+        Save a record.
+        """
+
+        Logger.info('保存PublicSentimentComment对象')
+
+        self.public_sentiment_comment_dao.save(public_sentiment_comment)
+
+    def find_all(self):
+        """
+        Query all records.
+        """
+
+        Logger.info('查询所有记录')
+
+        return self.public_sentiment_comment_dao.find_list(dict(), dict(), list())
diff --git a/web/service/training_sensitive_word_service.py b/web/service/training_sensitive_word_service.py
new file mode 100644
index 0000000..c6faaf8
--- /dev/null
+++ b/web/service/training_sensitive_word_service.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from web.dto.service_result import ServiceResult
+from web.enum.service_result_enum import ServiceResultEnum
+from web.manager.log_manager import LogManager
+from web.service.base_service import BaseService
+
+Logger = LogManager.get_logger(__name__)
+
+
+class TrainingSensitiveWordService(BaseService):
+    """
+    Service class for TrainingSensitiveWord.
+    """
+
+    def find_all(self):
+        """
+        Query all records.
+        """
+
+        Logger.info('查询所有记录')
+
+        return ServiceResult.ok(ServiceResultEnum.SELECT_SUCCESS, self.training_sensitive_word_dao.find_all(),
+                                ServiceResultEnum.SELECT_SUCCESS_DESCRIPTION)
diff --git a/web/spider/__init__.py b/web/spider/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/spider/base_spider.py b/web/spider/base_spider.py
new file mode 100644
index 0000000..ccd005a
--- /dev/null
+++ b/web/spider/base_spider.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from web.service.training_sensitive_word_service import TrainingSensitiveWordService
+
+
+class BaseSpider:
+    """
+    Base class for the spider layer.
+    """
+
+    def __init__(self):
+        self.training_sensitive_word_service = TrainingSensitiveWordService()
diff --git a/web/task/__init__.py b/web/task/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/web/task/base_task.py b/web/task/base_task.py
new file mode 100644
index 0000000..fc91784
--- /dev/null
+++ b/web/task/base_task.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+
+"""
+Base class for task classes.
+"""
+from web.handler.crawl_data_handler import CrawlDataHandler
+
+
+class BaseTask:
+
+    def __init__(self):
+        self.crawl_data_handler = CrawlDataHandler()
diff --git a/web/task/crawl_data_task.py b/web/task/crawl_data_task.py
new file mode 100644
index 0000000..0538dd3
--- /dev/null
+++ b/web/task/crawl_data_task.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from web.manager.log_manager import LogManager
+from web.task.base_task import BaseTask
+
+Logger = LogManager.get_logger(__name__)
+
+"""
+Task for crawling data from external sources.
+"""
+
+
+class CrawlDataTask(BaseTask):
+
+    def collect_data_from_weibo(self):
+        """
+        Collect data from Sina Weibo.
""" + + Logger.info("开始从新浪微博采集数据") + + self.crawl_data_handler.collect_data_from_weibo() diff --git a/web/tests.py b/web/tests.py new file mode 100644 index 0000000..7ce503c --- /dev/null +++ b/web/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/web/util/__init__.py b/web/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/util/dto_util.py b/web/util/dto_util.py new file mode 100644 index 0000000..3249f72 --- /dev/null +++ b/web/util/dto_util.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from web.dto.api_result import ApiResult +from web.dto.service_result import ServiceResult + + +class DtoUtil: + """ + dto的工具类 + """ + + @staticmethod + def service_result_to_api_result(service_result: ServiceResult) -> ApiResult: + """ + 将ServiceResult对象转换为ApiResult对象 + """ + + return ApiResult.instance(service_result.success, service_result.code, service_result.data, + service_result.message) diff --git a/web/util/re_util.py b/web/util/re_util.py new file mode 100644 index 0000000..79a4fc5 --- /dev/null +++ b/web/util/re_util.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from bs4 import BeautifulSoup + + +class ReUtil: + """ + 正则表达式的工具类 + """ + + @staticmethod + def clear_html(text_with_html): + """ + 清除html + """ + + soup = BeautifulSoup(text_with_html, 'html.parser') + return soup.get_text() diff --git a/web/views.py b/web/views.py new file mode 100644 index 0000000..faa18be --- /dev/null +++ b/web/views.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- diff --git a/web/vo/__init__.py b/web/vo/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/vo/parse_html_vo.py b/web/vo/parse_html_vo.py new file mode 100644 index 0000000..959b24d --- /dev/null +++ b/web/vo/parse_html_vo.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from pydantic import BaseModel + + +class ParseHtmlVo(BaseModel): + """ + 解析html的vo类 + """ + + # 地址 + url: str