2024-09-18 13:41:28 +08:00
|
|
|
|
#!/usr/bin/env python
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
import sys
|
|
|
|
|
from scrapy.crawler import CrawlerProcess
|
|
|
|
|
from scrapy.utils.project import get_project_settings
|
2024-09-19 16:58:49 +08:00
|
|
|
|
|
|
|
|
|
from web.handler.apscheduler_handler import ApschedulerHandler
|
2024-09-18 13:41:28 +08:00
|
|
|
|
from web.handler.html_parser_handler import HtmlParserHandler
|
|
|
|
|
|
|
|
|
|
# Make the "collector" scrapy package importable before the import below.
# NOTE(review): relative path — assumes the process is started from the
# project root; confirm against the deployment entry point.
sys.path.append(r"collector")

# Imported directly because get_project_settings() does not load this
# value from the crawler's settings.py (it is injected into the crawl
# settings manually inside BaseController.start_scrawl).
from collector.settings import ITEM_PIPELINES
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BaseController:
    """Base class for the controller layer.

    Provides the plumbing shared by concrete controllers: handler
    instances, JSON request-body deserialization into value objects,
    and launching a blocking scrapy crawl.
    """

    def __init__(self):
        # Shared handlers available to all controller subclasses.
        self.html_parser_handler = HtmlParserHandler()
        self.apscheduler_handler = ApschedulerHandler()

    def to_vo(self, request, clazz):
        """Deserialize the JSON request body into an instance of *clazz*.

        The raw body is decoded as UTF-8, parsed as JSON, and the
        resulting mapping is expanded into *clazz* as keyword arguments.
        """
        payload = json.loads(request.body.decode("utf-8"))
        return clazz(**payload)

    def start_scrawl(self, spider):
        """Run *spider* inside a CrawlerProcess (blocks until done).

        get_project_settings() does not pick up the pipeline mapping
        from the crawler package's settings.py, so ITEM_PIPELINES is
        injected into the settings explicitly before the process is
        created.
        """
        crawl_settings = get_project_settings()
        crawl_settings['ITEM_PIPELINES'] = ITEM_PIPELINES
        runner = CrawlerProcess(crawl_settings)
        runner.crawl(spider)
        # Starts the reactor; returns only after the crawl has finished.
        runner.start()
|