#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re

import requests

from web.enum.service_result_enum import ServiceResultEnum
from web.dto.service_result import ServiceResult
from web.handler.base_handler import BaseHandler
from web.manager.log_manager import LogManager
from web.util.re_util import ReUtil

Logger = LogManager.get_logger(__name__)


class HtmlParserHandler(BaseHandler):
    """
    HTML parser handler: fetches a page and scans its text for sensitive words.
    """

    def parse_html(self, url, timeout=10):
        """
        Fetch ``url`` and check whether its text contains a sensitive word.

        The page is downloaded, stripped of HTML tags via
        ``ReUtil.clear_html`` and scanned line by line. The sensitive words
        come from ``self.training_sensitive_word_service.find_all()`` and are
        treated as literal text (they are regex-escaped before matching).

        :param url: address of the page to download and scan.
        :param timeout: seconds passed to ``requests.get`` so a stalled
            server cannot block the caller forever.
        :return: ``ServiceResult.ok`` with the EXIST_SENSITIVE_WORD code and
            the first line containing a sensitive word as data;
            ``ServiceResult.ok`` with the NOT_EXIST_SENSITIVE_WORD code and
            ``None`` as data when no line matches (or no words are
            configured); ``ServiceResult.fail`` when the word lookup did not
            succeed.
        :raises requests.exceptions.RequestException: propagated unchanged
            from ``requests.get`` (including timeouts).
        """
        response = requests.get(url, timeout=timeout)
        # NOTE(review): response.text uses the encoding requests infers from
        # the HTTP headers; pages that declare no charset may decode
        # incorrectly -- confirm against the pages this is used on.
        text = response.text

        # Load the configured sensitive words.
        service_result = self.training_sensitive_word_service.find_all()
        if service_result is None or service_result.success is not True:
            return ServiceResult.fail(ServiceResultEnum.FAIL.value, None,
                                      ServiceResultEnum.FAIL_DESCRIPTION.value)

        # Drop empty words: an empty alternative would match every line.
        words = [str(item.word) for item in (service_result.data or [])]
        words = [word for word in words if word]

        if words:
            # Escape each word so regex metacharacters are matched literally,
            # and join with '|' so any single word is enough for a hit.
            pattern = re.compile('|'.join(re.escape(word) for word in words))

            # Strip the HTML tags from the response and scan line by line.
            text_without_html = ReUtil.clear_html(text)
            for line in text_without_html.split('\n'):
                # search() finds the word anywhere in the line, including at
                # its very start or end (match() only anchors at the start).
                if pattern.search(line):
                    return ServiceResult.ok(
                        ServiceResultEnum.EXIST_SENSITIVE_WORD.value,
                        line,
                        ServiceResultEnum.EXIST_SENSITIVE_WORD_DESCRIPTION.value)

        return ServiceResult.ok(
            ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD.value,
            None,
            ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD_DESCRIPTION.value)