Commit 27f08d6772: 1. Create and commit the project.

.idea/.gitignore (vendored, new file, 3 additions)
@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

.idea/inspectionProfiles/Project_Default.xml (new file, 65 additions)
@@ -0,0 +1,65 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <value>
          <list size="52">
            <item index="0" class="java.lang.String" itemvalue="mysqlclient" />
            <item index="1" class="java.lang.String" itemvalue="tushare" />
            <item index="2" class="java.lang.String" itemvalue="bs4" />
            <item index="3" class="java.lang.String" itemvalue="tzlocal" />
            <item index="4" class="java.lang.String" itemvalue="html5lib" />
            <item index="5" class="java.lang.String" itemvalue="tabulate" />
            <item index="6" class="java.lang.String" itemvalue="python-dateutil" />
            <item index="7" class="java.lang.String" itemvalue="cycler" />
            <item index="8" class="java.lang.String" itemvalue="backports.zoneinfo" />
            <item index="9" class="java.lang.String" itemvalue="certifi" />
            <item index="10" class="java.lang.String" itemvalue="lxml" />
            <item index="11" class="java.lang.String" itemvalue="soupsieve" />
            <item index="12" class="java.lang.String" itemvalue="pyparsing" />
            <item index="13" class="java.lang.String" itemvalue="pypinyin" />
            <item index="14" class="java.lang.String" itemvalue="xlrd" />
            <item index="15" class="java.lang.String" itemvalue="beautifulsoup4" />
            <item index="16" class="java.lang.String" itemvalue="asgiref" />
            <item index="17" class="java.lang.String" itemvalue="kiwisolver" />
            <item index="18" class="java.lang.String" itemvalue="typing-extensions" />
            <item index="19" class="java.lang.String" itemvalue="akshare" />
            <item index="20" class="java.lang.String" itemvalue="APScheduler" />
            <item index="21" class="java.lang.String" itemvalue="simplejson" />
            <item index="22" class="java.lang.String" itemvalue="fonttools" />
            <item index="23" class="java.lang.String" itemvalue="matplotlib" />
            <item index="24" class="java.lang.String" itemvalue="charset-normalizer" />
            <item index="25" class="java.lang.String" itemvalue="PyMySQL" />
            <item index="26" class="java.lang.String" itemvalue="addcomments" />
            <item index="27" class="java.lang.String" itemvalue="idna" />
            <item index="28" class="java.lang.String" itemvalue="decorator" />
            <item index="29" class="java.lang.String" itemvalue="cx-Oracle" />
            <item index="30" class="java.lang.String" itemvalue="numpy" />
            <item index="31" class="java.lang.String" itemvalue="requests" />
            <item index="32" class="java.lang.String" itemvalue="importlib-metadata" />
            <item index="33" class="java.lang.String" itemvalue="py-mini-racer" />
            <item index="34" class="java.lang.String" itemvalue="websocket-client" />
            <item index="35" class="java.lang.String" itemvalue="sqlparse" />
            <item index="36" class="java.lang.String" itemvalue="zipp" />
            <item index="37" class="java.lang.String" itemvalue="jsonpath" />
            <item index="38" class="java.lang.String" itemvalue="urllib3" />
            <item index="39" class="java.lang.String" itemvalue="baostock" />
            <item index="40" class="java.lang.String" itemvalue="six" />
            <item index="41" class="java.lang.String" itemvalue="tzdata" />
            <item index="42" class="java.lang.String" itemvalue="packaging" />
            <item index="43" class="java.lang.String" itemvalue="et-xmlfile" />
            <item index="44" class="java.lang.String" itemvalue="pandas" />
            <item index="45" class="java.lang.String" itemvalue="tqdm" />
            <item index="46" class="java.lang.String" itemvalue="django" />
            <item index="47" class="java.lang.String" itemvalue="colorama" />
            <item index="48" class="java.lang.String" itemvalue="pytz" />
            <item index="49" class="java.lang.String" itemvalue="webencodings" />
            <item index="50" class="java.lang.String" itemvalue="openpyxl" />
            <item index="51" class="java.lang.String" itemvalue="Pillow" />
          </list>
        </value>
      </option>
    </inspection_tool>
  </profile>
</component>

.idea/inspectionProfiles/profiles_settings.xml (new file, 6 additions)
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

.idea/misc.xml (new file, 4 additions)
@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
</project>

.idea/modules.xml (new file, 8 additions)
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/public_sentiment.iml" filepath="$PROJECT_DIR$/.idea/public_sentiment.iml" />
    </modules>
  </component>
</project>

.idea/public_sentiment.iml (new file, 8 additions)
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

.idea/vcs.xml (new file, 6 additions)
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
</project>

collector/collector/__init__.py (new file, empty)

collector/collector/items.py (new file, 12 additions)
@@ -0,0 +1,12 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import scrapy


class SensitiveWordItem(scrapy.Item):
    """
    Comment
    """

    sensitive_word = scrapy.Field()

collector/collector/middlewares.py (new file, 101 additions)
@@ -0,0 +1,101 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class CollectorSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class CollectorDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download service or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)

collector/collector/pipelines.py (new file, 27 additions)
@@ -0,0 +1,27 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-


from web.models import PublicSentimentComment
from web.manager.log_manager import LogManager
from web.service.public_sentiment_comment_service import PublicSentimentCommentService

Logger = LogManager.get_logger(__name__)


class CollectorPipeline(object):

    def __init__(self):
        super().__init__()

    def process_item(self, item, spider):
        """
        Store the scraped data in the database.
        """

        public_sentiment_comment = PublicSentimentComment()
        public_sentiment_comment.content = item['sensitive_word']

        public_sentiment_comment_service = PublicSentimentCommentService()
        public_sentiment_comment_service.save(public_sentiment_comment)
        return item

collector/collector/settings.py (new file, 117 additions)
@@ -0,0 +1,117 @@
# Scrapy settings for collector project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "collector"

SPIDER_MODULES = ["collector.spiders"]
NEWSPIDER_MODULE = "collector.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "collector (+http://www.yourdomain.com)"

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Defaults to True; changed to False here
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
# }

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': 'adb_isBlock=0; userid=1652710683278_ihrfq92084; prov=cn0731; city=0732; weather_city=hn_xt; region_ip=110.53.149.x; region_ver=1.2; wxIsclose=false; ifengRotator_iis3=6; ifengWindowCookieName_919=1'
    # The user agent is commented out by default but very important: without it the request is easily identified as automated; even a simple Mozilla/5.0 will do
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    "collector.middlewares.CollectorSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    "collector.middlewares.CollectorDownloaderMiddleware": 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#    "collector.pipelines.CollectorPipeline": 300,
# }
# Item pipelines; the smaller the number, the higher the priority
ITEM_PIPELINES = {
    'collector.pipelines.CollectorPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

######################################### Everything below is custom ########################################

import os, django
import sys

BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(BASE_DIR)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "public_sentiment.settings")
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
django.setup()

collector/collector/spiders/__init__.py (new file, 4 additions)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

collector/collector/spiders/collector_spider.py (new file, 59 additions)
@@ -0,0 +1,59 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
from typing import Optional, Any
import scrapy

from collector.items import SensitiveWordItem
from web.manager.log_manager import LogManager
from web.spider.base_spider import BaseSpider
from web.util.re_util import ReUtil

Logger = LogManager.get_logger(__name__)


class CollectorSpider(scrapy.Spider, BaseSpider):
    """
    Crawl data from Weibo.
    """

    name = "collector-spider"
    allowed_domains = ["s.weibo.com"]
    # start_urls = ["https://s.weibo.com/weibo?q=%E5%8C%97%E4%BA%AC%E5%B7%A5%E5%95%86%E5%A4%A7%E5%AD%A6&nodup=1&page=5"]
    start_urls = ["https://xm.buyiju.com/ceming/129803-zajo.html"]
    # url = 'https://xm.buyiju.com/ceming/129803-zajo.html'

    def __init__(self, name: Optional[str] = None, **kwargs: Any):
        scrapy.Spider.__init__(self)
        BaseSpider.__init__(self)

    # def start_requests(self):
    #     yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):

        Logger.info('Crawling data from Weibo')
        # The HTML returned by the request
        text = response.text

        # Look up the sensitive words and join them into a single pattern separated by |
        training_sensitive_word_list = self.training_sensitive_word_service.find_all()
        temp_training_sensitive_word_list = list(map(lambda training_sensitive_word: str(training_sensitive_word.word), training_sensitive_word_list))
        match_str = '.+|.+'.join(temp_training_sensitive_word_list)

        # Strip the HTML tags from the response
        text_without_html = ReUtil.clear_html(text)
        text_without_html_list = text_without_html.split('\n')

        # Match
        is_match = False
        sensitive_word_item = SensitiveWordItem()
        for item in text_without_html_list:
            match = re.match(match_str, item)
            if match:
                sensitive_word_item['sensitive_word'] = match.group()
                is_match = True
                break
        if is_match:
            yield sensitive_word_item
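
Note on the matching step above (illustrative sketch only, not part of this commit): joining the sensitive words with '.+|.+' builds an unusual regex; a conventional way to test a line against a word list is a plain alternation built with re.escape, as in the sketch below. The word list here is made up for illustration.

    import re

    # Hypothetical word list; in the project these would come from training_sensitive_word_service.find_all()
    words = ["example-word-a", "example-word-b"]
    pattern = re.compile("|".join(re.escape(word) for word in words))

    match = pattern.search("a line of text containing example-word-b")
    if match:
        print(match.group())  # example-word-b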

collector/dbs/default.db (new binary file; binary content not shown)

collector/main.py (new file, 3 additions)
@@ -0,0 +1,3 @@
from scrapy.cmdline import execute

execute('scrapy crawl collector-spider'.split())

collector/scrapy.cfg (new file, 11 additions)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = collector.settings

[deploy]
#url = http://localhost:6800/
project = collector

manage.py (new file, 30 additions)
@@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sys
import django
from web.manager.log_manager import LogManager

sys.path.append(r"web")
sys.path.append(r"collector")

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'public_sentiment.settings')
django.setup()

Logger = LogManager.get_logger(__name__)

if __name__ == '__main__':

    LogManager.get_logger("启动服务器")

    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        ) from exc

    execute_from_command_line(sys.argv)

public_sentiment/__init__.py (new file, 2 additions)
@@ -0,0 +1,2 @@
import pymysql
pymysql.install_as_MySQLdb()

public_sentiment/asgi.py (new file, 16 additions)
@@ -0,0 +1,16 @@
"""
ASGI config for public_sentiment project.

It exposes the ASGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
"""

import os

from django.core.asgi import get_asgi_application

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'public_sentiment.settings')

application = get_asgi_application()

public_sentiment/settings.py (new file, 136 additions)
@@ -0,0 +1,136 @@
"""
Django settings for public_sentiment project.

Generated by 'django-admin startproject' using Django 4.2.16.

For more information on this file, see
https://docs.djangoproject.com/en/4.2/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.2/ref/settings/
"""

from pathlib import Path

# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-!*ar1k^h=h^*azpzf3sabuf4w5m)vo^aev0l6c@6qfcdh73%ze'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = []

# Application definition

INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'web',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'public_sentiment.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'public_sentiment.wsgi.application'

# Database
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',  # database backend
        'NAME': 'base_platform',  # database to connect to
        'HOST': '127.0.0.1',  # MySQL host address
        'PORT': 3306,  # MySQL port
        'USER': 'root',  # MySQL user
        'PASSWORD': '123456',  # MySQL password
    }
}

# Password validation
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]

# Internationalization
# https://docs.djangoproject.com/en/4.2/topics/i18n/

LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_L10N = True

# USE_TZ = True
USE_TZ = False

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/4.2/howto/static-files/

STATIC_URL = 'static/'

# Default primary key field type
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

# Time zone
TIME_ZONE = 'Asia/Shanghai'

# GridGraph configuration
GRID_GRAPH = {
    'url': 'ws://192.168.3.18:8182/gremlin',
    'traversal_source': 'gmodern100M',
    'username': 'admin',
    'password': 'admin'
}

public_sentiment/urls.py (new file, 25 additions)
@@ -0,0 +1,25 @@
"""
URL configuration for public_sentiment project.

The `urlpatterns` list routes URLs to views. For more information please see:
    https://docs.djangoproject.com/en/4.2/topics/http/urls/
Examples:
Function views
    1. Add an import:  from my_app import views
    2. Add a URL to urlpatterns:  path('', views.home, name='home')
Class-based views
    1. Add an import:  from other_app.views import Home
    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
Including another URLconf
    1. Import the include() function: from django.urls import include, path
    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path

from web.controller.html_parser_controller import parse_html

urlpatterns = [
    path('admin/', admin.site.urls),
    path('api/v1/htmlParser/parseHtml', parse_html),
]
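
The new route can be exercised with a plain POST; a minimal sketch, assuming the dev server started by script/runserver.bat (port 9000) and a JSON body whose url field matches what ParseHtmlVo expects:

    import requests

    # Host, port and target page are assumptions for illustration only
    response = requests.post(
        "http://127.0.0.1:9000/api/v1/htmlParser/parseHtml",
        json={"url": "https://example.com/some-page.html"},
    )
    print(response.json())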

public_sentiment/wsgi.py (new file, 16 additions)
@@ -0,0 +1,16 @@
"""
WSGI config for public_sentiment project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
"""

import os

from django.core.wsgi import get_wsgi_application

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'public_sentiment.settings')

application = get_wsgi_application()

scrawl/__init__.py (new file, empty)

scrawl/scrapy.cfg (new file, 11 additions)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = scrawl.settings

[deploy]
#url = http://localhost:6800/
project = scrawl

scrawl/scrawl/__init__.py (new file, empty)

scrawl/scrawl/items.py (new file, 12 additions)
@@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrawlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

scrawl/scrawl/middlewares.py (new file, 103 additions)
@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class ScrawlSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class ScrawlDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)

scrawl/scrawl/pipelines.py (new file, 13 additions)
@@ -0,0 +1,13 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class ScrawlPipeline:
    def process_item(self, item, spider):
        return item

scrawl/scrawl/settings.py (new file, 105 additions)
@@ -0,0 +1,105 @@
# Scrapy settings for scrawl project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "scrawl"

SPIDER_MODULES = ["scrawl.spiders"]
NEWSPIDER_MODULE = "scrawl.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "scrawl (+http://www.yourdomain.com)"

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
# }
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': 'adb_isBlock=0; userid=1652710683278_ihrfq92084; prov=cn0731; city=0732; weather_city=hn_xt; region_ip=110.53.149.x; region_ver=1.2; wxIsclose=false; ifengRotator_iis3=6; ifengWindowCookieName_919=1'
    # The user agent is commented out by default but very important: without it the request is easily identified as automated; even a simple Mozilla/5.0 will do
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    "scrawl.middlewares.ScrawlSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    "scrawl.middlewares.ScrawlDownloaderMiddleware": 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#    "scrawl.pipelines.ScrawlPipeline": 300,
# }
# ITEM_PIPELINES: item pipelines; 300 is the priority, and lower values run earlier
ITEM_PIPELINES = {
    'scrawl.pipelines.ScrawlPipeline': 300,
    # 'subeiNews.pipelines.SubeinewsMysqlPipeline': 200,  # pipeline that stores the data
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

scrawl/scrawl/spiders/__init__.py (new file, 4 additions)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

scrawl/scrawl/spiders/weibo_spider.py (new file, 18 additions)
@@ -0,0 +1,18 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import scrapy

sys.path.append(r"scrawl")
from scrawl.items import ScrawlItem


class WeiboSpiderSpider(scrapy.Spider):
    name = "weibo_spider"
    allowed_domains = ["s.weibo.com"]
    start_urls = ["https://s.weibo.com/weibo?q=%E5%8C%97%E4%BA%AC%E5%B7%A5%E5%95%86%E5%A4%A7%E5%AD%A6&nodup=1&page=5"]

    def parse(self, response):
        for con in response.xpath('//*[@id="pl_feedlist_index"]/div/div'):
            scraw_item = ScrawlItem()

script/main.bat (new file, 2 additions)
@@ -0,0 +1,2 @@
cd C:/mywork/workspace/public_sentiment/collector
scrapy crawl collector-spider

script/runserver.bat (new file, 1 addition)
@@ -0,0 +1 @@
C:\mywork\dev-env\python\Python38\python.exe C:\mywork\workspace\public_sentiment\manage.py runserver 9000

script/scrapyd-console.bat (new file, 1 addition)
@@ -0,0 +1 @@
scrapyd

web/__init__.py (new file, empty)

web/admin.py (new file, 3 additions)
@@ -0,0 +1,3 @@
from django.contrib import admin

# Register your models here.

web/apps.py (new file, 6 additions)
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class WebConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'web'

web/constants/__init__.py (new file, empty)

web/constants/startup_parameter.py (new file, 12 additions)
@@ -0,0 +1,12 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-


"""
Parameters used when starting the system.
"""


class StartupParameter:
    # Crawl data
    Crawl_Data = 'crawl_data'

web/controller/__init__.py (new file, empty)

web/controller/base_controller.py (new file, 42 additions)
@@ -0,0 +1,42 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import sys
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from web.handler.html_parser_handler import HtmlParserHandler

sys.path.append(r"collector")

from collector.settings import ITEM_PIPELINES


class BaseController:
    """
    Base class for the controller layer.
    """

    def __init__(self):
        self.html_parser_handler = HtmlParserHandler()

    def to_vo(self, request, clazz):
        """
        Convert the JSON request body into a VO object.
        """
        raw_data = request.body.decode("utf-8")
        json_data_dict = json.loads(raw_data)
        obj = clazz(**json_data_dict)
        return obj

    def start_scrawl(self, spider):
        """
        Start the crawler.
        """

        # get_project_settings() does not pick up the configuration in settings.py here, so ITEM_PIPELINES is imported and set explicitly
        settings = get_project_settings()
        settings['ITEM_PIPELINES'] = ITEM_PIPELINES
        process = CrawlerProcess(settings)
        process.crawl(spider)
        process.start()
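
A minimal sketch of what BaseController.to_vo does with a request body; the ParseHtmlVo shape below is assumed for illustration only (the real class lives in web/vo/parse_html_vo.py, which is not part of this diff):

    import json

    class ParseHtmlVo:  # assumed shape, illustration only
        def __init__(self, url=None):
            self.url = url

    raw_data = '{"url": "https://example.com"}'  # what request.body.decode("utf-8") would yield
    vo = ParseHtmlVo(**json.loads(raw_data))
    print(vo.url)  # https://example.com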

web/controller/html_parser_controller.py (new file, 38 additions)
@@ -0,0 +1,38 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
from collections import namedtuple
from django.http import JsonResponse
from rest_framework.decorators import api_view
from twisted.protocols.amp import Box
from collector.spiders.collector_spider import CollectorSpider
from web.controller.base_controller import BaseController
from web.dto.api_result import ApiResult
from web.manager.gridgraph_manager import GridGraphManager
from web.manager.log_manager import LogManager
from web.util.dto_util import DtoUtil
from web.vo.parse_html_vo import ParseHtmlVo

Logger = LogManager.get_logger(__name__)

base_controller = BaseController()


@api_view(['POST'])
def parse_html(request):
    """
    Parse an HTML page.
    """

    Logger.info("Start parsing the HTML")

    parse_html_vo = base_controller.to_vo(request, ParseHtmlVo)
    service_result = base_controller.html_parser_handler.parse_html(parse_html_vo.url)

    # grid_graph_manager = GridGraphManager()
    # list = grid_graph_manager.query_vertex(label='person')

    # base_controller.start_scrawl(CollectorSpider)

    return JsonResponse(DtoUtil.service_result_to_api_result(service_result), safe=False)

web/dao/__init__.py (new file, empty)

web/dao/base_dao.py (new file, 157 additions)
@@ -0,0 +1,157 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from datetime import datetime
from django.db.models.query import QuerySet
from django.db import models

from web.manager.snowflake_manager import SnowflakeManager


class BaseDao:
    """
    Base dao class.
    """

    # Subclasses must override this
    model_class = models.Model
    save_batch_size = 1000

    snowflake_manager = SnowflakeManager()

    def save(self, obj):
        """
        Insert a record.
        """

        if not obj:
            return False
        obj.id = self.snowflake_manager.next_id()
        obj.create_time = datetime.now()
        obj.save()
        return True

    def save_batch(self, objs, *, batch_size=save_batch_size):
        """
        Insert records in bulk.
        """

        if not objs:
            return False
        for obj in objs:
            obj.id = self.snowflake_manager.next_id()
        self.model_class.objects.bulk_create(objs, batch_size=batch_size)
        return True

    def delete(self, obj):
        """
        Delete a record.
        """

        if not obj:
            return False
        obj.delete()
        return True

    def delete_batch(self, objs):
        """
        Delete records in bulk.
        """

        if not objs:
            return False
        for obj in objs:
            self.delete(obj)
        return True

    def delete_batch_by_query(self, filter_kw: dict, exclude_kw: dict):
        """
        Delete records in bulk by query conditions.
        """

        self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw).delete()
        return True

    def delete_by_fake(self, obj):
        """
        Soft delete (mark the record as deleted).
        """

        if obj is None:
            return False
        obj.is_deleted = True
        obj.save()
        return True

    def update(self, obj):
        """
        Update a record.
        """

        if not obj:
            return False
        obj.save()
        return True

    def update_batch(self, objs):
        """
        Update records in bulk.
        """

        if not objs:
            return False
        for obj in objs:
            self.update(obj)
        return True

    def update_batch_by_query(self, query_kwargs: dict, exclude_kw: dict, newattrs_kwargs: dict):
        """
        Update records in bulk by query conditions.
        """

        self.model_class.objects.filter(**query_kwargs).exclude(**exclude_kw).update(**newattrs_kwargs)

    def find_one(self, filter_kw: dict, exclude_kw: dict, order_bys: list):
        """
        Return a single record matching the conditions.
        """

        qs = self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw)
        if order_bys:
            qs = qs.order_by(*order_bys)
        return qs.first()

    def find_queryset(self, filter_kw: dict, exclude_kw: dict, order_bys: list) -> QuerySet:
        """
        Return a QuerySet matching the conditions.
        """
        if order_bys != None and len(order_bys) != 0:
            query_set = self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw)
            for by in order_bys:
                query_set = query_set.order_by(by)
            return query_set
        else:
            return self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw)

    def find_list(self, filter_kw: dict, exclude_kw: dict, order_bys: list) -> list:
        """
        Return a list of model instances matching the conditions.
        """

        queryset = self.find_queryset(filter_kw, exclude_kw, order_bys)
        model_instances = [model for model in queryset]
        return model_instances

    def is_exists(self, filter_kw: dict, exclude_kw: dict) -> bool:
        """
        Check whether a record matching the conditions exists.
        """

        return self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw).exists()

    def get_count(self, filter_kw: dict, exclude_kw: dict) -> int:
        """
        Count the records matching the conditions.
        """

        return self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw).count()
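
A usage sketch for the query helpers, through the PublicSentimentCommentDao subclass added in the next file; the filter field names below are hypothetical and depend on the model definitions in web/models.py, which are not part of this diff:

    from web.dao.public_sentiment_comment_dao import PublicSentimentCommentDao

    dao = PublicSentimentCommentDao()

    # Hypothetical field names (is_deleted, create_time, content) for illustration only
    recent = dao.find_list({"is_deleted": False}, {}, ["-create_time"])
    total = dao.get_count({}, {})
    exists = dao.is_exists({"content__contains": "keyword"}, {})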

web/dao/public_sentiment_comment_dao.py (new file, 13 additions)
@@ -0,0 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from web.dao.base_dao import BaseDao
from web.models import PublicSentimentComment


class PublicSentimentCommentDao(BaseDao):
    """
    Dao class for PublicSentimentComment.
    """

    model_class = PublicSentimentComment

web/dao/training_sensitive_word_dao.py (new file, 20 additions)
@@ -0,0 +1,20 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from web.dao.base_dao import BaseDao
from web.models import TrainingSensitiveWord


class TrainingSensitiveWordDao(BaseDao):
    """
    Dao class for TrainingSensitiveWord.
    """

    model_class = TrainingSensitiveWord

    def find_all(self):
        """
        Return all records.
        """

        return self.find_list(dict(), dict(), list())

web/dto/__init__.py (new file, empty)

web/dto/api_result.py (new file, 33 additions)
@@ -0,0 +1,33 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-


class ApiResult:
    """
    API response wrapper.
    """

    def __init__(self):
        super().__init__()

    def __init__(self, success, code, data, message):
        # success is True as long as the server did not raise an error
        self.success = success
        # code varies with the processing result
        self.code = code
        # returned data
        self.data = data
        # message for the caller
        self.message = message

    @staticmethod
    def instance(success, code, data, message):
        return ApiResult(success, code, data, message).__dict__

    @staticmethod
    def ok(code, data, message):
        return ApiResult(True, code, data, message).__dict__

    @staticmethod
    def fail(code, data, message):
        return ApiResult(False, code, data, message).__dict__
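
A minimal sketch of how ApiResult.ok plugs into a Django view; the view name and payload are illustrative only:

    from django.http import JsonResponse
    from web.dto.api_result import ApiResult

    def demo_view(request):  # hypothetical view
        data = {"items": []}
        # ok() already returns a plain dict (via __dict__), so safe=False is all JsonResponse needs
        return JsonResponse(ApiResult.ok(200, data, "ok"), safe=False)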

web/dto/service_result.py (new file, 29 additions)
@@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-


class ServiceResult:
    """
    Return-value object for the service layer.
    """

    def __init__(self):
        super().__init__()

    def __init__(self, success, code, data, message):
        # success is True as long as the server did not raise an error
        self.success = success
        # code varies with the processing result
        self.code = code
        # returned data
        self.data = data
        # message for the caller
        self.message = message

    @staticmethod
    def ok(code, data, message):
        return ServiceResult(True, code, data, message)

    @staticmethod
    def fail(code, data, message):
        return ServiceResult(False, code, data, message)

web/enum/__init__.py (new file, empty)

web/enum/api_result_enum.py (new file, 19 additions)
@@ -0,0 +1,19 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-


from enum import Enum


class ApiResultEnum(Enum):
    """
    Enum of codes for ApiResult.
    """

    # Success
    # SUCCESS = 200
    # SUCCESS_DESCRIPTION = '成功'

    # Failure
    FAIL = 4000
    FAIL_DESCRIPTION = '失败'

web/enum/service_result_enum.py (new file, 43 additions)
@@ -0,0 +1,43 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-


from enum import Enum


class ServiceResultEnum(Enum):
    """
    Enum of codes for ServiceResult.
    """

    # Success
    SUCCESS = 200
    SUCCESS_DESCRIPTION = '成功'

    # Failure
    FAIL = 3000
    FAIL_DESCRIPTION = '失败'

    # Saved successfully
    SAVE_SUCCESS = 3001
    SAVE_SUCCESS_DESCRIPTION = '添加成功'

    # Deleted successfully
    DELETE_SUCCESS = 3002
    DELETE_SUCCESS_DESCRIPTION = '删除成功'

    # Updated successfully
    UPDATE_SUCCESS = 3003
    UPDATE_SUCCESS_DESCRIPTION = '修改成功'

    # Queried successfully
    SELECT_SUCCESS = 3004
    SELECT_SUCCESS_DESCRIPTION = '查询成功'

    # No sensitive word found
    NOT_EXIST_SENSITIVE_WORD = 3005
    NOT_EXIST_SENSITIVE_WORD_DESCRIPTION = '不存在敏感词'

    # Sensitive word found
    EXIST_SENSITIVE_WORD = 3006
    EXIST_SENSITIVE_WORD_DESCRIPTION = '存在敏感词'
0  web/handler/__init__.py  Normal file
13  web/handler/base_handler.py  Normal file
@@ -0,0 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from web.service.training_sensitive_word_service import TrainingSensitiveWordService


class BaseHandler:
    """
    Base class for the handler layer.
    """

    def __init__(self):
        self.training_sensitive_word_service = TrainingSensitiveWordService()
23  web/handler/crawl_data_handler.py  Normal file
@@ -0,0 +1,23 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from web.manager.log_manager import LogManager
from web.handler.base_handler import BaseHandler

Logger = LogManager.get_logger(__name__)

"""
Handler for collecting data.
"""


class CrawlDataHandler(BaseHandler):

    def collect_data_from_weibo(self):
        """
        Collect data from Sina Weibo.
        """

        Logger.info("开始从新浪微博采集数据")
51  web/handler/html_parser_handler.py  Normal file
@@ -0,0 +1,51 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import requests

from web.enum.service_result_enum import ServiceResultEnum
from web.dto.service_result import ServiceResult
from web.handler.base_handler import BaseHandler
from web.manager.log_manager import LogManager
from web.util.re_util import ReUtil

Logger = LogManager.get_logger(__name__)


class HtmlParserHandler(BaseHandler):
    """
    HTML parser class.
    """

    def parse_html(self, url):
        """
        Parse an HTML page and check it for sensitive words.
        """

        response = requests.get(url)
        text = response.text

        # Query the sensitive words and join them into a single pattern separated by |
        service_result = self.training_sensitive_word_service.find_all()
        if service_result is not None and service_result.success is True:
            training_sensitive_word_list = service_result.data
            temp_training_sensitive_word_list = list(
                map(lambda training_sensitive_word: str(training_sensitive_word.word), training_sensitive_word_list))
            match_str = '|'.join(temp_training_sensitive_word_list)

            # Strip the HTML tags from the response body
            text_without_html = ReUtil.clear_html(text)
            text_without_html_list = text_without_html.split('\n')

            # Check line by line; re.search finds a sensitive word anywhere in the line
            for item in text_without_html_list:
                match = re.search(match_str, item)
                if match:
                    return ServiceResult.ok(ServiceResultEnum.EXIST_SENSITIVE_WORD.value, match.group(),
                                            ServiceResultEnum.EXIST_SENSITIVE_WORD_DESCRIPTION.value)
            return ServiceResult.ok(ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD.value, None,
                                    ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD_DESCRIPTION.value)
        else:
            return ServiceResult.fail(ServiceResultEnum.FAIL.value, None,
                                      ServiceResultEnum.FAIL_DESCRIPTION.value)
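One caveat about the pattern built in parse_html: the sensitive words are joined into a regular expression as raw strings, so a word containing regex metacharacters would change the match semantics. A small illustrative guard (not part of this commit) would escape each word before joining:

import re

words = ['word1', 'c++']  # example values; in the project the list comes from the sensitive word table
pattern = '|'.join(re.escape(word) for word in words)
assert re.search(pattern, 'learning c++ basics') is not None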
0  web/manager/__init__.py  Normal file
258  web/manager/gridgraph_manager.py  Normal file
@@ -0,0 +1,258 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from gremlin_python import statics
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.traversal import T
from gremlin_python.process.traversal import Order
from gremlin_python.process.traversal import Cardinality
from gremlin_python.process.traversal import Column
from gremlin_python.process.traversal import Direction
from gremlin_python.process.traversal import Operator
from gremlin_python.process.traversal import P
from gremlin_python.process.traversal import Pop
from gremlin_python.process.traversal import Scope
from gremlin_python.process.traversal import Barrier
from gremlin_python.process.traversal import Bindings
from gremlin_python.process.traversal import WithOptions
from gremlin_python.driver import client
from public_sentiment.settings import GRID_GRAPH


class GridGraphManager:
    """
    Manager class for GridGraph.
    """

    def __init__(self):
        self.graph = traversal().withRemote(
            DriverRemoteConnection(GRID_GRAPH['url'], GRID_GRAPH['traversal_source'], username=GRID_GRAPH['username'],
                                   password=GRID_GRAPH['password']))

    def add_vertex(self, label, properties=None):
        """
        add vertex
        :param label: label, type: str
        :param properties: property dict, like {'p1': 'value1', 'p2': 'value2'}
        :return: vertex, Vertex(id, label)
        """
        vert = self.graph.addV(label)
        if properties:
            for key in properties.keys():
                vert.property(key, properties.get(key))
        return vert.next()

    def add_edge(self, label, v_from, v_to, properties=None):
        """
        add edge
        :param label: label, type: str
        :param v_from: long vertex id or Vertex(id, label) of from
        :param v_to: long vertex id or Vertex(id, label) of to
        :param properties: property dict, like {'p1': 'value1', 'p2': 'value2'}
        :return: None
        """
        if isinstance(v_from, int):
            v_from = self.graph.V().hasId(v_from).next()
        if isinstance(v_to, int):
            v_to = self.graph.V().hasId(v_to).next()
        edge = self.graph.V(v_from).addE(label).to(v_to)
        if properties:
            for key in properties.keys():
                edge.property(key, properties.get(key))
        edge.next()

    def drop_vertex(self, v_id=None, label=None, properties=None):
        """
        drop all vertices or a specific vertex
        :param v_id: long vertex id or Vertex(id, label)
        :param label: label, type: str
        :param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
        :return: None
        """
        if isinstance(v_id, int):
            v_id = self.graph.V().hasId(v_id).next()
        travel = self.graph.V(v_id) if v_id else self.graph.V()
        if label:
            travel = travel.hasLabel(label)
        if properties:
            for p in properties:
                if isinstance(p, dict):
                    key = list(p.keys())[0]
                    travel = travel.has(key, p.get(key))
                else:
                    travel = travel.has(p)
        travel.drop().iterate()

    def drop_edge(self, e_id=None, label=None, properties=None):
        """
        drop all edges or a specific edge
        :param e_id: edge id, type str
        :param label: label, type: str
        :param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
        :return: None
        """
        travel = self.graph.E(e_id) if e_id else self.graph.E()
        if label:
            travel = travel.hasLabel(label)
        if properties:
            for p in properties:
                if isinstance(p, dict):
                    key = list(p.keys())[0]
                    travel = travel.has(key, p.get(key))
                else:
                    travel = travel.has(p)
        travel.drop().iterate()

    def query_vertex(self, v_id=None, label=None, properties=None):
        """
        query graph vertex (value) list
        :param v_id: long vertex id or Vertex(id, label)
        :param label: label, type: str
        :param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
        :return: vertex list or vertex value list
        """
        if isinstance(v_id, int):
            v_id = self.graph.V().hasId(v_id).next()
        travel = self.graph.V(v_id) if v_id else self.graph.V()
        if label:
            travel = travel.hasLabel(label)
        if properties:
            for p in properties:
                if isinstance(p, dict):
                    key = list(p.keys())[0]
                    travel = travel.has(key, p.get(key))
                else:
                    travel = travel.has(p)
        # return travel.valueMap().toList()
        return travel.toList()

    def query_edge(self, e_id=None, label=None, properties=None):
        """
        query graph edge value list
        :param e_id: edge id, type str
        :param label: label, type: str
        :param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
        :return: valueMap list
        """
        travel = self.graph.E(e_id) if e_id else self.graph.E()
        if label:
            travel = travel.hasLabel(label)
        if properties:
            for p in properties:
                if isinstance(p, dict):
                    key = list(p.keys())[0]
                    travel = travel.has(key, p.get(key))
                else:
                    travel = travel.has(p)
        return travel.valueMap().toList()

    def query_edges_of_vertex(self, v_id):
        """
        query all edges of a vertex
        :param v_id: long vertex id or Vertex(id, label)
        :return: edge list
        """
        if isinstance(v_id, int):
            v_id = self.graph.V().hasId(v_id).next()
        result = []
        in_edges = self.graph.V(v_id).inE().toList()
        out_edges = self.graph.V(v_id).outE().toList()
        result.extend(in_edges)
        result.extend(out_edges)
        return result

    def query_near_vertex(self, v_id):
        """
        query the neighbouring vertices of a vertex
        :param v_id: long vertex id or Vertex(id, label)
        :return: vertex list
        """
        if isinstance(v_id, int):
            v_id = self.graph.V().hasId(v_id).next()
        result = []
        out_v = self.graph.V(v_id).out().toList()
        in_v = self.graph.V(v_id).in_().toList()
        result.extend(out_v)
        result.extend(in_v)
        return result

    def get_edge_id(self, edge):
        """
        get edge id
        :param edge: Edge(id, label, outV, inV)
        :return: edge id, type str
        """
        return edge.id.get('@value').get('relationId')

    def vertex_to_dict(self, vertex):
        """
        transfer a Vertex's info to a dict
        :param vertex: vertex, Vertex(id, label)
        :return: vertex info dict
        """
        properties = self.graph.V(vertex).valueMap().toList()[0]
        for key in properties.keys():
            properties[key] = properties.get(key)[0]
        return {
            'id': vertex.id,
            'label': vertex.label,
            'properties': properties
        }

    def edge_to_dict(self, edge):
        """
        transfer an Edge's info to a dict
        :param edge: edge, Edge(id, label, outV, inV)
        :return: edge info dict
        """
        e_id = self.get_edge_id(edge)
        properties = self.graph.E(e_id).valueMap().toList()[0]
        return {
            'id': e_id,
            'label': edge.label,
            'properties': properties
        }

    def judge_vertex_in_graph(self, vertex_dict):
        """
        judge whether a vertex is already in the graph
        :param vertex_dict: vertex dict, like {'label': 'value1', 'properties': {'p1': 'v1', ...}}
        :return: None or Vertex(id, label)
        """
        label = vertex_dict.get('label')
        properties = vertex_dict.get('properties')
        travel = self.graph.V()
        if label:
            travel = travel.hasLabel(label)
        if properties:
            for k in properties.keys():
                travel = travel.has(k, properties.get(k))
        if travel.hasNext():
            return travel.next()
        return None

    def get_sub_graph(self, vertices=None, edges=None, vertex_properties=None):
        """
        get sub graph
        :param vertices: hasLabel('label').has('property').has('age', gt(20))
        :param edges: hasLabel('label').has('property')
        :param vertex_properties:
        :return: sub_graph, type: GraphTraversalSource
        """
        strategy = SubgraphStrategy(vertices=vertices, edges=edges, vertex_properties=vertex_properties)
        return self.graph.withStrategies(strategy)
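A minimal usage sketch of GridGraphManager, assuming the GRID_GRAPH entry in public_sentiment.settings points at a reachable Gremlin server; the labels and properties are invented for illustration:

from web.manager.gridgraph_manager import GridGraphManager

manager = GridGraphManager()

# create two vertices and connect them
alice = manager.add_vertex('person', {'name': 'alice'})
post = manager.add_vertex('post', {'title': 'hello'})
manager.add_edge('wrote', alice, post, {'ts': '2022-01-01'})

# look the vertex up again and list its neighbours
found = manager.judge_vertex_in_graph({'label': 'person', 'properties': {'name': 'alice'}})
print(manager.query_near_vertex(found))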
47  web/manager/log_manager.py  Normal file
@@ -0,0 +1,47 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import logging


class LogManager:
    """
    Log handler class; writes log output to both the console and a log file.
    """

    # logger object
    Logger = None

    def __init__(self):
        super(LogManager, self).__init__()

    @staticmethod
    def get_logger(param_name, log_file='/mywork/log/public-sentiment/public-sentiment.log', level=logging.INFO):
        """
        Get the shared logger object.
        :param param_name: logger name, usually __name__
        :param log_file: path of the log file
        :param level: log level
        :return: the shared logger
        """

        if LogManager.Logger is None:
            LogManager.Logger = logging.getLogger(param_name)
            LogManager.Logger.setLevel(level=level)

            formatter = logging.Formatter(
                '%(asctime)s [%(threadName)s-%(thread)d] [%(levelname)s] %(name)s.%(funcName)s[%(lineno)d] %(message)s')

            file_handler = logging.FileHandler(log_file, encoding="utf-8")
            file_handler.setLevel(level=level)
            file_handler.setFormatter(formatter)

            console = logging.StreamHandler()
            console.setFormatter(formatter)
            console.setLevel(level)

            LogManager.Logger.addHandler(file_handler)
            LogManager.Logger.addHandler(console)
            return LogManager.Logger
        else:
            return LogManager.Logger
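Because the logger instance is cached on the class, the first call to get_logger fixes the logger name, file and level for the whole process; later calls simply reuse it. A short usage sketch with an example path:

from web.manager.log_manager import LogManager

Logger = LogManager.get_logger(__name__, log_file='/tmp/public-sentiment.log')
Logger.info('service started')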
54  web/manager/snowflake_manager.py  Normal file
@@ -0,0 +1,54 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import time


class SnowflakeManager(object):
    """
    Implementation of Twitter's snowflake id algorithm.
    """

    def __init__(self, start_time=1420041600000):
        self.start_time = start_time / 1000  # in seconds
        self.last_timestamp = -1

        # bit layout: timestamp | 10-bit machine id | 12-bit sequence
        self.timestamp_shift = 22
        self.machine_id_shift = 12
        self.sequence_shift = 0

        # maximum value of each part
        self.max_timestamp = -1 ^ (-1 << 41)   # 2^41 - 1
        self.max_machine_id = -1 ^ (-1 << 10)  # 2^10 - 1
        self.max_sequence = -1 ^ (-1 << 12)    # 2^12 - 1

        # machine id and sequence are fixed for now; they could be passed in as parameters
        self.machine_id = 0
        self.sequence = 0

    def next_id(self):
        timestamp = int(time.time())
        if timestamp < self.last_timestamp:
            raise ValueError('Current timestamp is less than last timestamp.')

        if timestamp == self.last_timestamp:
            self.sequence = (self.sequence + 1) & self.max_sequence
            if self.sequence == 0:
                timestamp = self.til_next_millis(self.last_timestamp)
        else:
            self.sequence = 0

        self.last_timestamp = timestamp
        return ((timestamp - int(self.start_time)) << self.timestamp_shift) | (
                self.machine_id << self.machine_id_shift) | self.sequence

    def til_next_millis(self, last_timestamp):
        timestamp = int(time.time())
        while timestamp <= last_timestamp:
            timestamp = int(time.time())
        return timestamp
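A quick sketch of how the generated ids decompose, assuming the defaults above (second-resolution timestamps, machine id 0); whether two consecutive calls share a timestamp depends on timing:

from web.manager.snowflake_manager import SnowflakeManager

manager = SnowflakeManager()
first = manager.next_id()
second = manager.next_id()

# ids generated within the same second differ only in the 12-bit sequence part
print(first >> 22, first & 0xFFF)
print(second >> 22, second & 0xFFF)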
0  web/migrations/__init__.py  Normal file
3  web/models.py  Normal file
@@ -0,0 +1,3 @@
from django.db import models

# Create your models here.
6  web/models/__init__.py  Normal file
@@ -0,0 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from .public_sentiment_comment import PublicSentimentComment
from .public_sentiment_source import PublicSentimentSource
from .training_sensitive_word import TrainingSensitiveWord
30  web/models/public_sentiment_comment.py  Normal file
@@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from django.core.validators import MaxValueValidator
from django.db import models


class PublicSentimentComment(models.Model):
    """
    Comment table.
    """

    # primary key
    id = models.AutoField(primary_key=True)

    # content
    content = models.CharField(max_length=2550, null=True, blank=True)

    # source id
    source_id = models.BigIntegerField(validators=[MaxValueValidator(9223372036854775807)], db_index=True, null=False,
                                       blank=False)

    # creation time
    create_time = models.DateTimeField(null=False, blank=False)

    class Meta:
        managed = True
        db_table = 'ps_comment'
        verbose_name = '评论表'
        verbose_name_plural = verbose_name
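A minimal sketch of writing one row through this model; the field values are invented, and in the project the data would come from the crawler rather than being hard-coded:

from django.utils import timezone

from web.models import PublicSentimentComment

comment = PublicSentimentComment(content='example text', source_id=1, create_time=timezone.now())
comment.save()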
25  web/models/public_sentiment_source.py  Normal file
@@ -0,0 +1,25 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from django.db import models


class PublicSentimentSource(models.Model):
    """
    Source table.
    """

    # primary key
    id = models.AutoField(primary_key=True)

    # domain name
    domain_name = models.CharField(max_length=255, null=True, blank=True)

    # name
    name = models.CharField(max_length=255, null=True, blank=True)

    class Meta:
        managed = True
        db_table = 'ps_source'
        verbose_name = '来源表'
        verbose_name_plural = verbose_name
25  web/models/training_sensitive_word.py  Normal file
@@ -0,0 +1,25 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from django.db import models


class TrainingSensitiveWord(models.Model):
    """
    Sensitive word table.
    """

    # primary key
    id = models.AutoField(primary_key=True)

    # type
    type = models.CharField(max_length=255, null=True, blank=True)

    # sensitive word
    word = models.CharField(max_length=255, null=True, blank=True)

    class Meta:
        managed = True
        db_table = 'training_sensitive_word'
        verbose_name = '敏感词表'
        verbose_name_plural = verbose_name
0  web/service/__init__.py  Normal file
15  web/service/base_service.py  Normal file
@@ -0,0 +1,15 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from web.dao.public_sentiment_comment_dao import PublicSentimentCommentDao
from web.dao.training_sensitive_word_dao import TrainingSensitiveWordDao


class BaseService:
    """
    Base class for the service layer.
    """

    def __init__(self):
        self.public_sentiment_comment_dao = PublicSentimentCommentDao()
        self.training_sensitive_word_dao = TrainingSensitiveWordDao()
31  web/service/public_sentiment_comment_service.py  Normal file
@@ -0,0 +1,31 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from web.manager.log_manager import LogManager
from web.service.base_service import BaseService

Logger = LogManager.get_logger(__name__)


class PublicSentimentCommentService(BaseService):
    """
    Service class for PublicSentimentComment.
    """

    def save(self, public_sentiment_comment):
        """
        Save a record.
        """

        Logger.info('保存PublicSentimentComment对象')

        self.public_sentiment_comment_dao.save(public_sentiment_comment)

    def find_all(self):
        """
        Query all records.
        """

        Logger.info('查询所有记录')

        return self.public_sentiment_comment_dao.find_list(dict(), dict(), list())
24  web/service/training_sensitive_word_service.py  Normal file
@@ -0,0 +1,24 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from web.dto.service_result import ServiceResult
from web.enum.service_result_enum import ServiceResultEnum
from web.manager.log_manager import LogManager
from web.service.base_service import BaseService

Logger = LogManager.get_logger(__name__)


class TrainingSensitiveWordService(BaseService):
    """
    Service class for TrainingSensitiveWord.
    """

    def find_all(self):
        """
        Query all records.
        """

        Logger.info('查询所有记录')

        return ServiceResult.ok(ServiceResultEnum.SELECT_SUCCESS.value, self.training_sensitive_word_dao.find_all(),
                                ServiceResultEnum.SELECT_SUCCESS_DESCRIPTION.value)
0  web/spider/__init__.py  Normal file
13  web/spider/base_spider.py  Normal file
@@ -0,0 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from web.service.training_sensitive_word_service import TrainingSensitiveWordService


class BaseSpider:
    """
    Base class for the spider layer.
    """

    def __init__(self):
        self.training_sensitive_word_service = TrainingSensitiveWordService()
0  web/task/__init__.py  Normal file
14  web/task/base_task.py  Normal file
@@ -0,0 +1,14 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-


"""
Base class for tasks.
"""

from web.handler.crawl_data_handler import CrawlDataHandler


class BaseTask:

    def __init__(self):
        self.crawl_data_handler = CrawlDataHandler()
23  web/task/crawl_data_task.py  Normal file
@@ -0,0 +1,23 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from web.manager.log_manager import LogManager
from web.task.base_task import BaseTask

Logger = LogManager.get_logger(__name__)

"""
Add comments
"""


class CrawlDataTask(BaseTask):

    def collect_data_from_weibo(self):
        """
        Collect data from Sina Weibo.
        """

        Logger.info("开始从新浪微博采集数据")

        self.crawl_data_handler.collect_data_from_weibo()
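APScheduler is among the project's dependencies, so a task like this would typically be registered on a scheduler; the interval and wiring below are illustrative only and not part of this commit:

from apscheduler.schedulers.background import BackgroundScheduler

from web.task.crawl_data_task import CrawlDataTask

scheduler = BackgroundScheduler()
scheduler.add_job(CrawlDataTask().collect_data_from_weibo, 'interval', minutes=30)
scheduler.start()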
3  web/tests.py  Normal file
@@ -0,0 +1,3 @@
from django.test import TestCase

# Create your tests here.
0  web/util/__init__.py  Normal file
19  web/util/dto_util.py  Normal file
@@ -0,0 +1,19 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from web.dto.api_result import ApiResult
from web.dto.service_result import ServiceResult


class DtoUtil:
    """
    Utility class for dto objects.
    """

    @staticmethod
    def service_result_to_api_result(service_result: ServiceResult) -> dict:
        """
        Convert a ServiceResult object into an ApiResult dict.
        """

        return ApiResult.instance(service_result.success, service_result.code, service_result.data,
                                  service_result.message)
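A self-contained sketch of the conversion, using made-up data; it shows that the result is the plain dict produced by ApiResult.instance:

from web.dto.service_result import ServiceResult
from web.enum.service_result_enum import ServiceResultEnum
from web.util.dto_util import DtoUtil

service_result = ServiceResult.ok(ServiceResultEnum.SELECT_SUCCESS.value, ['word1', 'word2'],
                                  ServiceResultEnum.SELECT_SUCCESS_DESCRIPTION.value)
print(DtoUtil.service_result_to_api_result(service_result))
# {'success': True, 'code': 3004, 'data': ['word1', 'word2'], 'message': '查询成功'}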
19  web/util/re_util.py  Normal file
@@ -0,0 +1,19 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup


class ReUtil:
    """
    Utility class for text cleanup (HTML is stripped with BeautifulSoup rather than a regular expression).
    """

    @staticmethod
    def clear_html(text_with_html):
        """
        Strip HTML tags and return the plain text.
        """

        soup = BeautifulSoup(text_with_html, 'html.parser')
        return soup.get_text()
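A quick check of what clear_html does with markup:

from web.util.re_util import ReUtil

print(ReUtil.clear_html('<p>hello <b>world</b></p>'))  # -> hello world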
2  web/views.py  Normal file
@@ -0,0 +1,2 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
0  web/vo/__init__.py  Normal file
13  web/vo/parse_html_vo.py  Normal file
@@ -0,0 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from pydantic import BaseModel


class ParseHtmlVo(BaseModel):
    """
    VO class for the parse-html request.
    """

    # url of the page to parse
    url: str
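A small sketch of how this VO would validate an incoming request body; parse_raw is the pydantic v1 spelling, under v2 it would be model_validate_json:

from web.vo.parse_html_vo import ParseHtmlVo

vo = ParseHtmlVo.parse_raw('{"url": "https://example.com"}')
print(vo.url)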