1. Create and commit the project.

This commit is contained in:
913071727 2024-09-18 13:38:24 +08:00
commit 27f08d6772
83 changed files with 2055 additions and 0 deletions

3
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

View File

@ -0,0 +1,65 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="52">
<item index="0" class="java.lang.String" itemvalue="mysqlclient" />
<item index="1" class="java.lang.String" itemvalue="tushare" />
<item index="2" class="java.lang.String" itemvalue="bs4" />
<item index="3" class="java.lang.String" itemvalue="tzlocal" />
<item index="4" class="java.lang.String" itemvalue="html5lib" />
<item index="5" class="java.lang.String" itemvalue="tabulate" />
<item index="6" class="java.lang.String" itemvalue="python-dateutil" />
<item index="7" class="java.lang.String" itemvalue="cycler" />
<item index="8" class="java.lang.String" itemvalue="backports.zoneinfo" />
<item index="9" class="java.lang.String" itemvalue="certifi" />
<item index="10" class="java.lang.String" itemvalue="lxml" />
<item index="11" class="java.lang.String" itemvalue="soupsieve" />
<item index="12" class="java.lang.String" itemvalue="pyparsing" />
<item index="13" class="java.lang.String" itemvalue="pypinyin" />
<item index="14" class="java.lang.String" itemvalue="xlrd" />
<item index="15" class="java.lang.String" itemvalue="beautifulsoup4" />
<item index="16" class="java.lang.String" itemvalue="asgiref" />
<item index="17" class="java.lang.String" itemvalue="kiwisolver" />
<item index="18" class="java.lang.String" itemvalue="typing-extensions" />
<item index="19" class="java.lang.String" itemvalue="akshare" />
<item index="20" class="java.lang.String" itemvalue="APScheduler" />
<item index="21" class="java.lang.String" itemvalue="simplejson" />
<item index="22" class="java.lang.String" itemvalue="fonttools" />
<item index="23" class="java.lang.String" itemvalue="matplotlib" />
<item index="24" class="java.lang.String" itemvalue="charset-normalizer" />
<item index="25" class="java.lang.String" itemvalue="PyMySQL" />
<item index="26" class="java.lang.String" itemvalue="addcomments" />
<item index="27" class="java.lang.String" itemvalue="idna" />
<item index="28" class="java.lang.String" itemvalue="decorator" />
<item index="29" class="java.lang.String" itemvalue="cx-Oracle" />
<item index="30" class="java.lang.String" itemvalue="numpy" />
<item index="31" class="java.lang.String" itemvalue="requests" />
<item index="32" class="java.lang.String" itemvalue="importlib-metadata" />
<item index="33" class="java.lang.String" itemvalue="py-mini-racer" />
<item index="34" class="java.lang.String" itemvalue="websocket-client" />
<item index="35" class="java.lang.String" itemvalue="sqlparse" />
<item index="36" class="java.lang.String" itemvalue="zipp" />
<item index="37" class="java.lang.String" itemvalue="jsonpath" />
<item index="38" class="java.lang.String" itemvalue="urllib3" />
<item index="39" class="java.lang.String" itemvalue="baostock" />
<item index="40" class="java.lang.String" itemvalue="six" />
<item index="41" class="java.lang.String" itemvalue="tzdata" />
<item index="42" class="java.lang.String" itemvalue="packaging" />
<item index="43" class="java.lang.String" itemvalue="et-xmlfile" />
<item index="44" class="java.lang.String" itemvalue="pandas" />
<item index="45" class="java.lang.String" itemvalue="tqdm" />
<item index="46" class="java.lang.String" itemvalue="django" />
<item index="47" class="java.lang.String" itemvalue="colorama" />
<item index="48" class="java.lang.String" itemvalue="pytz" />
<item index="49" class="java.lang.String" itemvalue="webencodings" />
<item index="50" class="java.lang.String" itemvalue="openpyxl" />
<item index="51" class="java.lang.String" itemvalue="Pillow" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/public_sentiment.iml" filepath="$PROJECT_DIR$/.idea/public_sentiment.iml" />
</modules>
</component>
</project>

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

View File

View File

@ -0,0 +1,12 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import scrapy
class SensitiveWordItem(scrapy.Item):
"""
Comment
"""
sensitive_word = scrapy.Field()

View File

@ -0,0 +1,101 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class CollectorSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class CollectorDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)

View File

@ -0,0 +1,27 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from web.models import PublicSentimentComment
from web.manager.log_manager import LogManager
from web.service.public_sentiment_comment_service import PublicSentimentCommentService
Logger = LogManager.get_logger(__name__)
class CollectorPipeline(object):
def __init__(self):
super().__init__()
def process_item(self, item, spider):
"""
Store the scraped data in the database
"""
public_sentiment_comment = PublicSentimentComment()
public_sentiment_comment.content = item['sensitive_word']
public_sentiment_comment_service = PublicSentimentCommentService()
public_sentiment_comment_service.save(public_sentiment_comment)
return item

View File

@ -0,0 +1,117 @@
# Scrapy settings for collector project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "collector"
SPIDER_MODULES = ["collector.spiders"]
NEWSPIDER_MODULE = "collector.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "collector (+http://www.yourdomain.com)"
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Defaults to True; set to False here
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
# }
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
'Cookie': 'adb_isBlock=0; userid=1652710683278_ihrfq92084; prov=cn0731; city=0732; weather_city=hn_xt; region_ip=110.53.149.x; region_ver=1.2; wxIsclose=false; ifengRotator_iis3=6; ifengWindowCookieName_919=1'
# Commented out by default, but very important: without a User-Agent the request is easily flagged as a bot; even a bare Mozilla/5.0 is enough
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# "collector.middlewares.CollectorSpiderMiddleware": 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# "collector.middlewares.CollectorDownloaderMiddleware": 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
# "collector.pipelines.CollectorPipeline": 300,
# }
# Item pipelines; the smaller the number, the higher the priority
ITEM_PIPELINES = {
'collector.pipelines.CollectorPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
######################################### Everything below is custom configuration ########################################
import os, django
import sys
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(BASE_DIR)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "public_sentiment.settings")
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
django.setup()

View File

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -0,0 +1,59 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
from typing import Optional, Any
import scrapy
from collector.items import SensitiveWordItem
from web.manager.log_manager import LogManager
from web.spider.base_spider import BaseSpider
from web.util.re_util import ReUtil
Logger = LogManager.get_logger(__name__)
class CollectorSpider(scrapy.Spider, BaseSpider):
"""
Crawl data from Weibo
"""
name = "collector-spider"
allowed_domains = ["s.weibo.com"]
# start_urls = ["https://s.weibo.com/weibo?q=%E5%8C%97%E4%BA%AC%E5%B7%A5%E5%95%86%E5%A4%A7%E5%AD%A6&nodup=1&page=5"]
start_urls = ["https://xm.buyiju.com/ceming/129803-zajo.html"]
# url = 'https://xm.buyiju.com/ceming/129803-zajo.html'
def __init__(self, name: Optional[str] = None, **kwargs: Any):
scrapy.Spider.__init__(self)
BaseSpider.__init__(self)
# def start_requests(self):
# yield scrapy.Request(url=self.url, callback=self.parse)
def parse(self, response):
Logger.info('从微博上爬数据')
# The returned HTML
text = response.text
# Query the sensitive words and join them into a single pattern string separated by |
training_sensitive_word_list = self.training_sensitive_word_service.find_all()
temp_training_sensitive_word_list = list(map(lambda training_sensitive_word: str(training_sensitive_word.word), training_sensitive_word_list))
match_str = '|'.join(temp_training_sensitive_word_list)
# Strip the HTML tags from the response
text_without_html = ReUtil.clear_html(text)
text_without_html_list = text_without_html.split('\n')
# Match
is_match = False
sensitive_word_item = SensitiveWordItem()
for item in text_without_html_list:
match = re.match(match_str, item)
if match:
sensitive_word_item['sensitive_word'] = match.group()
is_match = True
break
if is_match:
yield sensitive_word_item

BIN
collector/dbs/default.db Normal file

Binary file not shown.

3
collector/main.py Normal file
View File

@ -0,0 +1,3 @@
from scrapy.cmdline import execute
execute('scrapy crawl collector-spider'.split())

11
collector/scrapy.cfg Normal file
View File

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = collector.settings
[deploy]
#url = http://localhost:6800/
project = collector

30
manage.py Normal file
View File

@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import django
from web.manager.log_manager import LogManager
sys.path.append(r"web")
sys.path.append(r"collector")
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'public_sentiment.settings')
django.setup()
Logger = LogManager.get_logger(__name__)
if __name__ == '__main__':
LogManager.get_logger("启动服务器")
try:
from django.core.management import execute_from_command_line
except ImportError as exc:
raise ImportError(
"Couldn't import Django. Are you sure it's installed and "
"available on your PYTHONPATH environment variable? Did you "
"forget to activate a virtual environment?"
) from exc
execute_from_command_line(sys.argv)

View File

@ -0,0 +1,2 @@
import pymysql
pymysql.install_as_MySQLdb()

16
public_sentiment/asgi.py Normal file
View File

@ -0,0 +1,16 @@
"""
ASGI config for public_sentiment project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'public_sentiment.settings')
application = get_asgi_application()

View File

@ -0,0 +1,136 @@
"""
Django settings for public_sentiment project.
Generated by 'django-admin startproject' using Django 4.2.16.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.2/ref/settings/
"""
from pathlib import Path
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-!*ar1k^h=h^*azpzf3sabuf4w5m)vo^aev0l6c@6qfcdh73%ze'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
ALLOWED_HOSTS = []
# Application definition
INSTALLED_APPS = [
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'web',
]
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'public_sentiment.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'public_sentiment.wsgi.application'
# Database
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.mysql', # 默认
'NAME': 'base_platform', # 连接的数据库
'HOST': '127.0.0.1', # mysql的ip地址
'PORT': 3306, # mysql的端口
'USER': 'root', # mysql的用户名
'PASSWORD': '123456', # mysql的密码
}
}
# Password validation
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
# https://docs.djangoproject.com/en/4.2/topics/i18n/
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
USE_I18N = True
USE_L10N = True
# USE_TZ = True
USE_TZ = False
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/4.2/howto/static-files/
STATIC_URL = 'static/'
# Default primary key field type
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
# 时区
TIME_ZONE = 'Asia/Shanghai'
# gridgraph的配置
GRID_GRAPH = {
'url': 'ws://192.168.3.18:8182/gremlin',
'traversal_source': 'gmodern100M',
'username': 'admin',
'password': 'admin'
}

25
public_sentiment/urls.py Normal file
View File

@ -0,0 +1,25 @@
"""
URL configuration for public_sentiment project.
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/4.2/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path
from web.controller.html_parser_controller import parse_html
urlpatterns = [
path('admin/', admin.site.urls),
path('api/v1/htmlParser/parseHtml', parse_html),
]

16
public_sentiment/wsgi.py Normal file
View File

@ -0,0 +1,16 @@
"""
WSGI config for public_sentiment project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'public_sentiment.settings')
application = get_wsgi_application()

0
scrawl/__init__.py Normal file
View File

11
scrawl/scrapy.cfg Normal file
View File

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = scrawl.settings
[deploy]
#url = http://localhost:6800/
project = scrawl

View File

12
scrawl/scrawl/items.py Normal file
View File

@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ScrawlItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass

View File

@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class ScrawlSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class ScrawlDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)

View File

@ -0,0 +1,13 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class ScrawlPipeline:
def process_item(self, item, spider):
return item

105
scrawl/scrawl/settings.py Normal file
View File

@ -0,0 +1,105 @@
# Scrapy settings for scrawl project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "scrawl"
SPIDER_MODULES = ["scrawl.spiders"]
NEWSPIDER_MODULE = "scrawl.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "scrawl (+http://www.yourdomain.com)"
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
# }
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
'Cookie': 'adb_isBlock=0; userid=1652710683278_ihrfq92084; prov=cn0731; city=0732; weather_city=hn_xt; region_ip=110.53.149.x; region_ver=1.2; wxIsclose=false; ifengRotator_iis3=6; ifengWindowCookieName_919=1'
# Commented out by default, but very important: without a User-Agent the request is easily flagged as a bot; even a bare Mozilla/5.0 is enough
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# "scrawl.middlewares.ScrawlSpiderMiddleware": 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# "scrawl.middlewares.ScrawlDownloaderMiddleware": 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
# "scrawl.pipelines.ScrawlPipeline": 300,
# }
# ITEM_PIPELINES: item pipelines; 300 is the priority value, and lower numbers take precedence when crawling
ITEM_PIPELINES = {
'scrawl.pipelines.ScrawlPipeline': 300,
# 'subeiNews.pipelines.SubeinewsMysqlPipeline': 200, # pipeline that stores the data
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

View File

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -0,0 +1,18 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import scrapy
sys.path.append(r"scrawl")
from scrawl.items import ScrawlItem
class WeiboSpiderSpider(scrapy.Spider):
name = "weibo_spider"
allowed_domains = ["s.weibo.com"]
start_urls = ["https://s.weibo.com/weibo?q=%E5%8C%97%E4%BA%AC%E5%B7%A5%E5%95%86%E5%A4%A7%E5%AD%A6&nodup=1&page=5"]
def parse(self, response):
for con in response.xpath('//*[@id="pl_feedlist_index"]/div/div'):
scraw_item = ScrawlItem()

2
script/main.bat Normal file
View File

@ -0,0 +1,2 @@
cd C:/mywork/workspace/public_sentiment/collector
scrapy crawl collector-spider

1
script/runserver.bat Normal file
View File

@ -0,0 +1 @@
C:\mywork\dev-env\python\Python38\python.exe C:\mywork\workspace\public_sentiment\manage.py runserver 9000

View File

@ -0,0 +1 @@
scrapyd

0
web/__init__.py Normal file
View File

3
web/admin.py Normal file
View File

@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.

6
web/apps.py Normal file
View File

@ -0,0 +1,6 @@
from django.apps import AppConfig
class WebConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'web'

View File

View File

@ -0,0 +1,12 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Parameters passed when starting the system
"""
class StartupParameter:
# Crawl data
Crawl_Data = 'crawl_data'

View File

View File

@ -0,0 +1,42 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import sys
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from web.handler.html_parser_handler import HtmlParserHandler
sys.path.append(r"collector")
from collector.settings import ITEM_PIPELINES
class BaseController:
"""
Base class for the controller layer
"""
def __init__(self):
self.html_parser_handler = HtmlParserHandler()
def to_vo(self, request, clazz):
"""
Convert the JSON request body into a VO object
"""
raw_data = request.body.decode("utf-8")
json_data_dict = json.loads(raw_data)
obj = clazz(**json_data_dict)
return obj
def start_scrawl(self, spider):
"""
Start running the spider
"""
# get_project_settings() does not pick up the configuration from settings.py here, so ITEM_PIPELINES is imported and set explicitly
settings = get_project_settings()
settings['ITEM_PIPELINES'] = ITEM_PIPELINES
process = CrawlerProcess(settings)
process.crawl(spider)
process.start()
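For reference, a minimal standalone sketch of what to_vo does with a request body; the HTTP request is replaced by a literal byte string, and ParseHtmlVo from this commit is used as the target class:

import json

from web.vo.parse_html_vo import ParseHtmlVo

# Stand-in for request.body: the raw JSON payload sent by the client.
raw_body = b'{"url": "https://example.com"}'
json_data_dict = json.loads(raw_body.decode("utf-8"))

# clazz(**json_data_dict) inside to_vo: keyword arguments populate the pydantic model.
vo = ParseHtmlVo(**json_data_dict)
print(vo.url)  # -> https://example.com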

View File

@ -0,0 +1,38 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
from collections import namedtuple
from django.http import JsonResponse
from rest_framework.decorators import api_view
from collector.spiders.collector_spider import CollectorSpider
from web.controller.base_controller import BaseController
from web.dto.api_result import ApiResult
from web.manager.gridgraph_manager import GridGraphManager
from web.manager.log_manager import LogManager
from web.util.dto_util import DtoUtil
from web.vo.parse_html_vo import ParseHtmlVo
Logger = LogManager.get_logger(__name__)
base_controller = BaseController()
@api_view(['POST'])
def parse_html(request):
"""
Parse HTML
"""
Logger.info("开始解析html")
parse_html_vo = base_controller.to_vo(request, ParseHtmlVo)
service_result = base_controller.html_parser_handler.parse_html(parse_html_vo.url)
# grid_graph_manager = GridGraphManager()
# list = grid_graph_manager.query_vertex(label='person')
# base_controller.start_scrawl(CollectorSpider)
return JsonResponse(DtoUtil.service_result_to_api_result(service_result), safe=False)
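For reference, a hedged example of calling this endpoint; it assumes the development server started by script/runserver.bat (port 9000) and uses the url field defined on ParseHtmlVo — the target page is made up:

import requests

# POST a page URL to the parse endpoint and inspect the ApiResult fields.
resp = requests.post(
    "http://127.0.0.1:9000/api/v1/htmlParser/parseHtml",
    json={"url": "https://example.com"},
)
result = resp.json()
print(result["success"], result["code"], result["data"], result["message"])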

0
web/dao/__init__.py Normal file
View File

157
web/dao/base_dao.py Normal file
View File

@ -0,0 +1,157 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from datetime import datetime
from django.db.models.query import QuerySet
from django.db import models
from web.manager.snowflake_manager import SnowflakeManager
class BaseDao:
"""
Base DAO class
"""
# Subclasses must override this
model_class = models.Model
save_batch_size = 1000
snowflake_manager = SnowflakeManager()
def save(self, obj):
"""
Add
"""
if not obj:
return False
obj.id = self.snowflake_manager.next_id()
obj.create_time = datetime.now()
obj.save()
return True
def save_batch(self, objs, *, batch_size=save_batch_size):
"""
Add in batch
"""
if not objs:
return False
for obj in objs:
obj.id = self.snowflake_manager.next_id()
self.model_class.objects.bulk_create(objs, batch_size=batch_size)
return True
def delete(self, obj):
"""
Delete
"""
if not obj:
return False
obj.delete()
return True
def delete_batch(self, objs):
"""
Delete in batch
"""
if not objs:
return False
for obj in objs:
self.delete(obj)
return True
def delete_batch_by_query(self, filter_kw: dict, exclude_kw: dict):
"""
Delete in batch by query conditions
"""
self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw).delete()
return True
def delete_by_fake(self, obj):
"""
Soft delete (fake delete)
"""
if obj is None:
return False
obj.is_deleted = True
obj.save()
return True
def update(self, obj):
"""
Update
"""
if not obj:
return False
obj.save()
return True
def update_batch(self, objs):
"""
Update in batch
"""
if not objs:
return False
for obj in objs:
self.update(obj)
return True
def update_batch_by_query(self, query_kwargs: dict, exclude_kw: dict, newattrs_kwargs: dict):
"""
Update in batch by query conditions
"""
self.model_class.objects.filter(**query_kwargs).exclude(**exclude_kw).update(**newattrs_kwargs)
def find_one(self, filter_kw: dict, exclude_kw: dict, order_bys: list):
"""
Return a single record matching the conditions
"""
qs = self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw)
if order_bys:
qs = qs.order_by(*order_bys)
return qs.first()
def find_queryset(self, filter_kw: dict, exclude_kw: dict, order_bys: list) -> QuerySet:
"""
根据条件返回QuerySet
"""
if order_bys != None and len(order_bys) != 0:
query_set = self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw)
for by in order_bys:
query_set = query_set.order_by(by)
return query_set
else:
return self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw)
def find_list(self, filter_kw: dict, exclude_kw: dict, order_bys: list) -> list:
"""
Return a list of objects matching the conditions
"""
queryset = self.find_queryset(filter_kw, exclude_kw, order_bys)
model_instances = [model for model in queryset]
return model_instances
def is_exists(self, filter_kw: dict, exclude_kw: dict) -> bool:
"""
Check whether a matching record exists
"""
return self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw).exists()
def get_count(self, filter_kw: dict, exclude_kw: dict) -> int:
"""
Count records matching the conditions
"""
return self.model_class.objects.filter(**filter_kw).exclude(**exclude_kw).count()
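A rough usage sketch of this DAO API through one of its subclasses; the field values are invented, and it assumes the MySQL database configured in settings.py is reachable:

from web.dao.public_sentiment_comment_dao import PublicSentimentCommentDao
from web.models import PublicSentimentComment

dao = PublicSentimentCommentDao()

# save() assigns a snowflake id and create_time before persisting the row.
comment = PublicSentimentComment(content="example comment", source_id=1)
dao.save(comment)

# Filters, exclusions and orderings are passed as plain dicts and lists.
recent = dao.find_list({"source_id": 1}, {"content": ""}, ["-create_time"])
count = dao.get_count({"source_id": 1}, {})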

View File

@ -0,0 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from web.dao.base_dao import BaseDao
from web.models import PublicSentimentComment
class PublicSentimentCommentDao(BaseDao):
"""
DAO class for PublicSentimentComment
"""
model_class = PublicSentimentComment

View File

@ -0,0 +1,20 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from web.dao.base_dao import BaseDao
from web.models import TrainingSensitiveWord
class TrainingSensitiveWordDao(BaseDao):
"""
DAO class for TrainingSensitiveWord
"""
model_class = TrainingSensitiveWord
def find_all(self):
"""
Query all records
"""
return self.find_list(dict(), dict(), list())

0
web/dto/__init__.py Normal file
View File

33
web/dto/api_result.py Normal file
View File

@ -0,0 +1,33 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
class ApiResult:
"""
API response wrapper
"""
def __init__(self, success, code, data, message):
# success is True as long as the server did not raise an error
self.success = success
# code varies with the processing result
self.code = code
# the returned data
self.data = data
# message for the caller
self.message = message
@staticmethod
def instance(success, code, data, message):
return ApiResult(success, code, data, message).__dict__
@staticmethod
def ok(code, data, message):
return ApiResult(True, code, data, message).__dict__
@staticmethod
def fail(code, data, message):
return ApiResult(False, code, data, message).__dict__
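A small illustration of how these factory methods are consumed; the code and payload are made up, but the returned shape is the plain dict that JsonResponse(..., safe=False) receives in the controller:

from web.dto.api_result import ApiResult

payload = ApiResult.ok(200, {"matched": None}, "ok")
# payload == {'success': True, 'code': 200, 'data': {'matched': None}, 'message': 'ok'}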

29
web/dto/service_result.py Normal file
View File

@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
class ServiceResult:
"""
Return-value object for the service layer
"""
def __init__(self, success, code, data, message):
# success is True as long as the server did not raise an error
self.success = success
# code varies with the processing result
self.code = code
# the returned data
self.data = data
# message for the caller
self.message = message
@staticmethod
def ok(code, data, message):
return ServiceResult(True, code, data, message)
@staticmethod
def fail(code, data, message):
return ServiceResult(False, code, data, message)

0
web/enum/__init__.py Normal file
View File

View File

@ -0,0 +1,19 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from enum import Enum
class ApiResultEnum(Enum):
"""
Enum type for the ApiResult class
"""
# Success
# SUCCESS = 200
# SUCCESS_DESCRIPTION = '成功'
# Failure
FAIL = 4000
FAIL_DESCRIPTION = '失败'

View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from enum import Enum
class ServiceResultEnum(Enum):
"""
Enum type for the ServiceResult class
"""
# Success
SUCCESS = 200
SUCCESS_DESCRIPTION = '成功'
# Failure
FAIL = 3000
FAIL_DESCRIPTION = '失败'
# Save succeeded
SAVE_SUCCESS = 3001
SAVE_SUCCESS_DESCRIPTION = '添加成功'
# Delete succeeded
DELETE_SUCCESS = 3002
DELETE_SUCCESS_DESCRIPTION = '删除成功'
# Update succeeded
UPDATE_SUCCESS = 3003
UPDATE_SUCCESS_DESCRIPTION = '修改成功'
# Query succeeded
SELECT_SUCCESS = 3004
SELECT_SUCCESS_DESCRIPTION = '查询成功'
# No sensitive word found
NOT_EXIST_SENSITIVE_WORD = 3005
NOT_EXIST_SENSITIVE_WORD_DESCRIPTION = '不存在敏感词'
# Sensitive word found
EXIST_SENSITIVE_WORD = 3006
EXIST_SENSITIVE_WORD_DESCRIPTION = '存在敏感词'

0
web/handler/__init__.py Normal file
View File

View File

@ -0,0 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from web.service.training_sensitive_word_service import TrainingSensitiveWordService
class BaseHandler:
"""
Base class for the handler layer
"""
def __init__(self):
self.training_sensitive_word_service = TrainingSensitiveWordService()

View File

@ -0,0 +1,23 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from web.manager.log_manager import LogManager
from web.handler.base_handler import BaseHandler
Logger = LogManager.get_logger(__name__)
"""
Handler for crawling data
"""
class CrawlDataHandler(BaseHandler):
def collect_data_from_weibo(self):
"""
Collect data from Sina Weibo
"""
Logger.info("开始从新浪微博采集数据")

View File

@ -0,0 +1,51 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import requests
from web.enum.service_result_enum import ServiceResultEnum
from web.dto.service_result import ServiceResult
from web.handler.base_handler import BaseHandler
from web.manager.log_manager import LogManager
from web.util.re_util import ReUtil
Logger = LogManager.get_logger(__name__)
class HtmlParserHandler(BaseHandler):
"""
HTML parser class
"""
def parse_html(self, url):
"""
Parse an HTML page
"""
response = requests.get(url)
text = response.text
# Query the sensitive words and join them into a single pattern string separated by |
service_result = self.training_sensitive_word_service.find_all()
if service_result is not None and service_result.success is True:
training_sensitive_word_list = service_result.data
temp_training_sensitive_word_list = list(
map(lambda training_sensitive_word: str(training_sensitive_word.word), training_sensitive_word_list))
match_str = '|'.join(temp_training_sensitive_word_list)
# Strip the HTML tags from the response
text_without_html = ReUtil.clear_html(text)
text_without_html_list = text_without_html.split('\n')
# Match
for item in text_without_html_list:
match = re.match(match_str, item)
if match:
return ServiceResult.ok(ServiceResultEnum.EXIST_SENSITIVE_WORD.value, match.group(),
ServiceResultEnum.EXIST_SENSITIVE_WORD_DESCRIPTION.value)
return ServiceResult.ok(ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD.value, None,
ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD_DESCRIPTION.value)
else:
return ServiceResult.fail(ServiceResultEnum.FAIL.value, None,
ServiceResultEnum.FAIL_DESCRIPTION.value)
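A self-contained sketch of the matching step above, with a hypothetical word list standing in for the TrainingSensitiveWord rows and a plain '|'-joined pattern:

import re

# Hypothetical sensitive words (normally loaded through the service/DAO layer).
words = ["foo", "bar"]
match_str = '|'.join(words)  # "foo|bar"

text_without_html_list = ["nothing here", "foo appears at line start", "ends with bar"]
for item in text_without_html_list:
    match = re.match(match_str, item)  # re.match only matches at the start of the line
    if match:
        print(match.group())  # -> "foo"
        break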

0
web/manager/__init__.py Normal file
View File

View File

@ -0,0 +1,258 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from gremlin_python import statics
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.traversal import T
from gremlin_python.process.traversal import Order
from gremlin_python.process.traversal import Cardinality
from gremlin_python.process.traversal import Column
from gremlin_python.process.traversal import Direction
from gremlin_python.process.traversal import Operator
from gremlin_python.process.traversal import P
from gremlin_python.process.traversal import Pop
from gremlin_python.process.traversal import Scope
from gremlin_python.process.traversal import Barrier
from gremlin_python.process.traversal import Bindings
from gremlin_python.process.traversal import WithOptions
from gremlin_python.driver import client
from public_sentiment.settings import GRID_GRAPH
class GridGraphManager:
"""
Manager class for GridGraph
"""
def __init__(self):
self.graph = traversal().withRemote(
DriverRemoteConnection(GRID_GRAPH['url'], GRID_GRAPH['traversal_source'], username=GRID_GRAPH['username'],
password=GRID_GRAPH['password']))
def add_vertex(self, label, properties=None):
"""
add vertex
:param graph: graph, type: GraphTraversalSource
:param label: label, type: str
:param properties: property dict, like {'p1': 'value1', 'p2': 'value2'}
:return: vertex, Vertex(id, label)
"""
vert = self.graph.addV(label)
if properties:
for key in properties.keys():
vert.property(key, properties.get(key))
return vert.next()
def add_edge(self, label, v_from, v_to, properties=None):
"""
add edge
:param graph: graph, type: GraphTraversalSource
:param label: label, type: str
:param v_from: long vertex id or Vertex(id, label) of from
:param v_to: long vertex id or Vertex(id, label) of to
:param properties: property dict, like {'p1': 'value1', 'p2': 'value2'}
:return: None
"""
if isinstance(v_from, int):
v_from = self.graph.V().hasId(v_from).next()
if isinstance(v_to, int):
v_to = self.graph.V().hasId(v_to).next()
edge = self.graph.V(v_from).addE(label).to(v_to)
if properties:
for key in properties.keys():
edge.property(key, properties.get(key))
edge.next()
def drop_vertex(self, v_id=None, label=None, properties=None):
"""
drop all vertex or specific vertex
:param graph: graph, type: GraphTraversalSource
:param v_id: long vertex id or Vertex(id, label)
:param label: label, type: str
:param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
:return: None
"""
if isinstance(v_id, int):
v_id = self.graph.V().hasId(v_id).next()
travel = self.graph.V(v_id) if v_id else self.graph.V()
if label:
travel = travel.hasLabel(label)
if properties:
for p in properties:
if isinstance(p, dict):
key = list(p.keys())[0]
travel = travel.has(key, p.get(key))
else:
travel = travel.has(p)
travel.drop().iterate()
def drop_edge(self, e_id=None, label=None, properties=None):
"""
drop all edges or specific edge
:param graph: graph, type: GraphTraversalSource
:param e_id: edge id, type str
:param label: label, type: str
:param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
:return: None
"""
travel = self.graph.E(e_id) if e_id else self.graph.E()
if label:
travel = travel.hasLabel(label)
if properties:
for p in properties:
if isinstance(p, dict):
key = list(p.keys())[0]
travel = travel.has(key, p.get(key))
else:
travel = travel.has(p)
travel.drop().iterate()
def query_vertex(self, v_id=None, label=None, properties=None):
"""
query graph vertex (value) list
:param graph: graph, type: GraphTraversalSource
:param v_id: long vertex id or Vertex(id, label)
:param label: label, type: str
:param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
:return: vertex list or vertex value list
"""
if isinstance(v_id, int):
v_id = self.graph.V().hasId(v_id).next()
travel = self.graph.V(v_id) if v_id else self.graph.V()
if label:
travel = travel.hasLabel(label)
if properties:
for p in properties:
if isinstance(p, dict):
key = list(p.keys())[0]
travel = travel.has(key, p.get(key))
else:
travel = travel.has(p)
# return travel.valueMap().toList()
return travel.toList()
def query_edge(self, e_id=None, label=None, properties=None):
"""
query graph edge value list
:param graph: graph, type: GraphTraversalSource
:param e_id: edge id, type str
:param label: label, type: str
:param properties: property list, like ['p1', 'p2', {'p3': 'value'}]
:return: valueMap list
"""
travel = self.graph.E(e_id) if e_id else self.graph.E()
if label:
travel = travel.hasLabel(label)
if properties:
for p in properties:
if isinstance(p, dict):
key = list(p.keys())[0]
travel = travel.has(key, p.get(key))
else:
travel = travel.has(p)
return travel.valueMap().toList()
def query_edges_of_vertex(self, v_id):
"""
query all edges of vertex
:param graph: graph, type: GraphTraversalSource
:param v_id: v_id: long vertex id or Vertex(id, label)
:return: edge list
"""
if isinstance(v_id, int):
v_id = self.graph.V().hasId(v_id).next()
result = []
in_edges = self.graph.V(v_id).inE().toList()
out_edges = self.graph.V(v_id).outE().toList()
result.extend(in_edges)
result.extend(out_edges)
return result
def query_near_vertex(self, v_id):
"""
query near vertices of vertex
:param graph: graph, type: GraphTraversalSource
:param v_id: v_id: long vertex id or Vertex(id, label)
:return: vertex list
"""
if isinstance(v_id, int):
v_id = self.graph.V().hasId(v_id).next()
result = []
out_v = self.graph.V(v_id).out().toList()
in_v = self.graph.V(v_id).in_().toList()
result.extend(out_v)
result.extend(in_v)
return result
def get_edge_id(self, edge):
"""
get edge id
:param edge: Edge(id, label, outV, inV)
:return: edge id, type str
"""
return edge.id.get('@value').get('relationId')
def vertex_to_dict(self, vertex):
"""
transfer Vertex's info to dict
:param graph: graph, type: GraphTraversalSource
:param vertex: vertex, Vertex(id, label)
:return: vertex info dict
"""
properties = self.graph.V(vertex).valueMap().toList()[0]
for key in properties.keys():
properties[key] = properties.get(key)[0]
return {
'id': vertex.id,
'label': vertex.label,
'properties': properties
}
def edge_to_dict(self, edge):
"""
transfer Edge's info to dict
:param graph: graph, type: GraphTraversalSource
:param edge: edge, Edge(id, label, outV, inV)
:return: edge info dict
"""
e_id = self.get_edge_id(edge)
properties = self.graph.E(e_id).valueMap().toList()[0]
return {
'id': e_id,
'label': edge.label,
'properties': properties
}
def judge_vertex_in_graph(self, vertex_dict):
"""
judge a vertex whether in graph
:param graph: graph, type: GraphTraversalSource
:param vertex_dict: vertex dict, like {'label': 'value1', 'properties': {'p1': 'v1', ...}}
:return: None or Vertex(id,label)
"""
label = vertex_dict.get('label')
properties = vertex_dict.get('properties')
travel = self.graph.V()
if label:
travel = travel.hasLabel(label)
if properties:
for k in properties.keys():
travel = travel.has(k, properties.get(k))
if travel.hasNext():
return travel.next()
return None
def get_sub_graph(self, vertices=None, edges=None, vertex_properties=None):
"""
get sub graph
:param graph: graph, type: GraphTraversalSource
:param vertices: hasLabel('label').has('property').has('age', gt(20))
:param edges: hasLabel('label').has('property')
:param vertex_properties:
:return: sub_graph, type: GraphTraversalSource
"""
strategy = SubgraphStrategy(vertices=vertices, edges=edges, vertex_properties=vertex_properties)
return self.graph.withStrategies(strategy)
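A hedged usage sketch; it assumes a Gremlin server is reachable at the GRID_GRAPH address from settings.py, and the labels and properties are invented for illustration:

from web.manager.gridgraph_manager import GridGraphManager

manager = GridGraphManager()

# Create two vertices and connect them.
alice = manager.add_vertex('person', {'name': 'alice'})
bob = manager.add_vertex('person', {'name': 'bob'})
manager.add_edge('knows', alice, bob, {'since': 2024})

# Query vertices by label, then inspect one as a dict.
people = manager.query_vertex(label='person')
print(manager.vertex_to_dict(people[0]))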

View File

@ -0,0 +1,47 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
class LogManager:
"""
Log handler class; writes log output to both the console and a log file
"""
# Logger instance
Logger = None
def __init__(self):
super(LogManager, self).__init__()
@staticmethod
def get_logger(param_name, log_file='/mywork/log/public-sentiment/public-sentiment.log', level=logging.INFO):
"""
Get the logger instance
:param param_name:
:param log_file:
:param level:
:return:
"""
if LogManager.Logger is None:
LogManager.Logger = logging.getLogger(param_name)
LogManager.Logger.setLevel(level=level)
formatter = logging.Formatter(
'%(asctime)s [%(threadName)s-%(thread)d] [%(levelname)s] %(name)s.%(funcName)s[%(lineno)d] %(message)s')
file_handler = logging.FileHandler(log_file, encoding="utf-8")
file_handler.setLevel(level=level)
file_handler.setFormatter(formatter)
console = logging.StreamHandler()
console.setFormatter(formatter)
console.setLevel(level)
LogManager.Logger.addHandler(file_handler)
LogManager.Logger.addHandler(console)
return LogManager.Logger
else:
return LogManager.Logger

View File

@ -0,0 +1,54 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
class SnowflakeManager(object):
"""
Implementation of Twitter's snowflake id algorithm
"""
def __init__(self, start_time=1420041600000):
self.start_time = start_time / 1000  # in seconds
self.last_timestamp = -1
# bit layout: 41-bit timestamp | 10-bit machine id | 12-bit sequence
self.machine_id_bits = 10
self.sequence_bits = 12
# shift amounts derived from the bit layout
self.timestamp_shift = self.machine_id_bits + self.sequence_bits  # 22
self.machine_id_shift = self.sequence_bits  # 12
# largest value each field can hold: 2^10 - 1 and 2^12 - 1
self.max_machine_id = -1 ^ (-1 << self.machine_id_bits)
self.max_sequence = -1 ^ (-1 << self.sequence_bits)
# machine id and sequence are fixed for now; they could be passed in as parameters
self.machine_id = 0
self.sequence = 0
def next_id(self):
timestamp = int(time.time())
if timestamp < self.last_timestamp:
raise ValueError('Current timestamp is less than last timestamp.')
if timestamp == self.last_timestamp:
self.sequence = (self.sequence + 1) & self.max_sequence
if self.sequence == 0:
timestamp = self.til_next_millis(self.last_timestamp)
else:
self.sequence = 0
self.last_timestamp = timestamp
return ((timestamp - int(self.start_time)) << self.timestamp_shift) | (
self.machine_id << self.machine_id_shift) | self.sequence
def til_next_millis(self, last_timestamp):
timestamp = int(time.time())
while timestamp <= last_timestamp:
timestamp = int(time.time())
return timestamp
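A brief sketch of how a generated id decomposes under the 41/10/12-bit layout described above; the printed values depend on the clock and are illustrative only:

from web.manager.snowflake_manager import SnowflakeManager

manager = SnowflakeManager()
new_id = manager.next_id()

# Split the id back into its fields using the same shifts.
sequence = new_id & manager.max_sequence
machine_id = (new_id >> manager.machine_id_shift) & manager.max_machine_id
seconds_offset = new_id >> manager.timestamp_shift
print(new_id, seconds_offset, machine_id, sequence)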

View File

3
web/models.py Normal file
View File

@ -0,0 +1,3 @@
from django.db import models
# Create your models here.

6
web/models/__init__.py Normal file
View File

@ -0,0 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from .public_sentiment_comment import PublicSentimentComment
from .public_sentiment_source import PublicSentimentSource
from .training_sensitive_word import TrainingSensitiveWord

View File

@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from django.core.validators import MaxValueValidator
from django.db import models
class PublicSentimentComment(models.Model):
"""
Comment table
"""
# Primary key
id = models.AutoField(primary_key=True)
# Content
content = models.CharField(max_length=2550, null=True, blank=True)
# Source id
source_id = models.BigIntegerField(validators=[MaxValueValidator(9223372036854775807)], db_index=True, null=False,
blank=False)
# Create time
create_time = models.DateTimeField(null=False, blank=False)
class Meta:
managed = True
db_table = 'ps_comment'
verbose_name = '评论表'
verbose_name_plural = verbose_name

View File

@ -0,0 +1,25 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from django.db import models
class PublicSentimentSource(models.Model):
"""
Source table
"""
# Primary key
id = models.AutoField(primary_key=True)
# Domain name
domain_name = models.CharField(max_length=255, null=True, blank=True)
# Name
name = models.CharField(max_length=255, null=True, blank=True)
class Meta:
managed = True
db_table = 'ps_source'
verbose_name = '来源表'
verbose_name_plural = verbose_name

View File

@ -0,0 +1,25 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from django.db import models
class TrainingSensitiveWord(models.Model):
"""
Sensitive word table
"""
# Primary key
id = models.AutoField(primary_key=True)
# Type
type = models.CharField(max_length=255, null=True, blank=True)
# Sensitive word
word = models.CharField(max_length=255, null=True, blank=True)
class Meta:
managed = True
db_table = 'training_sensitive_word'
verbose_name = '敏感词表'
verbose_name_plural = verbose_name

0
web/service/__init__.py Normal file
View File

View File

@ -0,0 +1,15 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from web.dao.public_sentiment_comment_dao import PublicSentimentCommentDao
from web.dao.training_sensitive_word_dao import TrainingSensitiveWordDao
class BaseService:
"""
Base class for the service layer
"""
def __init__(self):
self.public_sentiment_comment_dao = PublicSentimentCommentDao()
self.training_sensitive_word_dao = TrainingSensitiveWordDao()

View File

@ -0,0 +1,31 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from web.manager.log_manager import LogManager
from web.service.base_service import BaseService
Logger = LogManager.get_logger(__name__)
class PublicSentimentCommentService(BaseService):
"""
Service class for PublicSentimentComment
"""
def save(self, public_sentiment_comment):
"""
Save
"""
Logger.info('保存PublicSentimentComment对象')
self.public_sentiment_comment_dao.save(public_sentiment_comment)
def find_all(self):
"""
Query all records
"""
Logger.info('查询所有记录')
return self.public_sentiment_comment_dao.find_list(dict(), dict(), list())

View File

@ -0,0 +1,24 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from web.dto.service_result import ServiceResult
from web.enum.service_result_enum import ServiceResultEnum
from web.manager.log_manager import LogManager
from web.service.base_service import BaseService
Logger = LogManager.get_logger(__name__)
class TrainingSensitiveWordService(BaseService):
"""
Service class for TrainingSensitiveWord
"""
def find_all(self):
"""
Query all records
"""
Logger.info('查询所有记录')
return ServiceResult.ok(ServiceResultEnum.SELECT_SUCCESS.value, self.training_sensitive_word_dao.find_all(),
ServiceResultEnum.SELECT_SUCCESS_DESCRIPTION.value)

0
web/spider/__init__.py Normal file
View File

13
web/spider/base_spider.py Normal file
View File

@ -0,0 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from web.service.training_sensitive_word_service import TrainingSensitiveWordService
class BaseSpider:
"""
Base class for the spider layer
"""
def __init__(self):
self.training_sensitive_word_service = TrainingSensitiveWordService()

0
web/task/__init__.py Normal file
View File

14
web/task/base_task.py Normal file
View File

@ -0,0 +1,14 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Base task class
"""
from web.handler.crawl_data_handler import CrawlDataHandler
class BaseTask:
def __init__(self):
self.crawl_data_handler = CrawlDataHandler()

View File

@ -0,0 +1,23 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from web.manager.log_manager import LogManager
from web.task.base_task import BaseTask
Logger = LogManager.get_logger(__name__)
"""
Task for crawling data
"""
class CrawlDataTask(BaseTask):
def collect_data_from_weibo(self):
"""
Collect data from Sina Weibo
"""
Logger.info("开始从新浪微博采集数据")
self.crawl_data_handler.collect_data_from_weibo()

3
web/tests.py Normal file
View File

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

0
web/util/__init__.py Normal file
View File

19
web/util/dto_util.py Normal file
View File

@ -0,0 +1,19 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from web.dto.api_result import ApiResult
from web.dto.service_result import ServiceResult
class DtoUtil:
"""
Utility class for DTOs
"""
@staticmethod
def service_result_to_api_result(service_result: ServiceResult) -> ApiResult:
"""
Convert a ServiceResult object into an ApiResult object
"""
return ApiResult.instance(service_result.success, service_result.code, service_result.data,
service_result.message)

19
web/util/re_util.py Normal file
View File

@ -0,0 +1,19 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
class ReUtil:
"""
Utility class for regular expressions
"""
@staticmethod
def clear_html(text_with_html):
"""
Strip HTML
"""
soup = BeautifulSoup(text_with_html, 'html.parser')
return soup.get_text()
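A quick illustrative call with a made-up HTML snippet:

from web.util.re_util import ReUtil

html = "<div><p>hello</p><p>world</p></div>"
print(ReUtil.clear_html(html))  # -> "helloworld"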

2
web/views.py Normal file
View File

@ -0,0 +1,2 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

0
web/vo/__init__.py Normal file
View File

13
web/vo/parse_html_vo.py Normal file
View File

@ -0,0 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pydantic import BaseModel
class ParseHtmlVo(BaseModel):
"""
VO class for the parse-html request
"""
# URL
url: str