#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re

import requests

from web.enum.service_result_enum import ServiceResultEnum
from web.dto.service_result import ServiceResult
from web.handler.base_handler import BaseHandler
from web.manager.log_manager import LogManager
from web.util.re_util import ReUtil

Logger = LogManager.get_logger(__name__)


class HtmlParserHandler(BaseHandler):
    """
    HTML parser handler.
    """

    def parse_html(self, url):
        """
        Parse an HTML web page and check it for sensitive words.
        """

        response = requests.get(url)
        text = response.text

        # Query all sensitive words and join them into a single regex
        # pattern string separated by |.
        service_result = self.training_sensitive_word_service.find_all()
        if service_result is not None and service_result.success is True:
            training_sensitive_word_list = service_result.data
            temp_training_sensitive_word_list = list(
                map(lambda training_sensitive_word: str(training_sensitive_word.word), training_sensitive_word_list))
            # Escape each word so regex metacharacters are matched literally.
            match_str = '|'.join(map(re.escape, temp_training_sensitive_word_list))

            # Strip the HTML tags from the response text.
            text_without_html = ReUtil.clear_html(text)
            text_without_html_list = text_without_html.split('\n')

            # Match: re.search finds a sensitive word anywhere in the line,
            # not only at its start.
            for item in text_without_html_list:
                match = re.search(match_str, item)
                if match:
                    return ServiceResult.ok(ServiceResultEnum.EXIST_SENSITIVE_WORD.value, match.group(),
                                            ServiceResultEnum.EXIST_SENSITIVE_WORD_DESCRIPTION.value)
            return ServiceResult.ok(ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD.value, None,
                                    ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD_DESCRIPTION.value)
        else:
            return ServiceResult.fail(ServiceResultEnum.FAIL.value, None,
                                      ServiceResultEnum.FAIL_DESCRIPTION.value)
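

# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the original handler: it shows how
# the sensitive-word pattern above is built and matched against plain text.
# `sample_words` and `sample_lines` are hypothetical stand-ins; in the real
# handler the words come from training_sensitive_word_service and the lines
# come from the fetched, tag-stripped HTML.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    sample_words = ['forbidden', 'banned']
    sample_pattern = '|'.join(map(re.escape, sample_words))
    sample_lines = ['a harmless line', 'this line contains a banned phrase']
    for sample_line in sample_lines:
        hit = re.search(sample_pattern, sample_line)
        print(sample_line, '->', hit.group() if hit else 'no sensitive word')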