public_sentiment/web/handler/html_parser_handler.py

52 lines
1.9 KiB
Python
Raw Normal View History

2024-09-18 13:38:24 +08:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import requests
from web.enum.service_result_enum import ServiceResultEnum
from web.dto.service_result import ServiceResult
from web.handler.base_handler import BaseHandler
from web.manager.log_manager import LogManager
from web.util.re_util import ReUtil
# Module-level logger obtained from the project's LogManager, requested
# under this module's name (__name__).
Logger = LogManager.get_logger(__name__)
class HtmlParserHandler(BaseHandler):
    """
    HTML parser handler.

    Downloads a web page and checks its text, line by line, against the
    sensitive words returned by ``self.training_sensitive_word_service``.
    """

    # Default number of seconds to wait on the remote server. Without a
    # timeout ``requests.get`` can block forever on an unresponsive host.
    DEFAULT_TIMEOUT = 10

    def parse_html(self, url, timeout=DEFAULT_TIMEOUT):
        """
        Fetch ``url`` and report whether its text contains a sensitive word.

        :param url: address of the page to download.
        :param timeout: seconds handed to ``requests.get`` as its timeout;
            defaults to ``DEFAULT_TIMEOUT``.
        :return: ``ServiceResult.ok`` with ``EXIST_SENSITIVE_WORD`` and the
            first line of page text containing a sensitive word,
            ``ServiceResult.ok`` with ``NOT_EXIST_SENSITIVE_WORD`` and
            ``None`` when no line contains one (or no words are
            configured), or ``ServiceResult.fail`` when the sensitive-word
            lookup did not succeed.
        :raises requests.RequestException: propagated unchanged when the
            page cannot be downloaded (including on timeout).
        """
        response = requests.get(url, timeout=timeout)
        text = response.text

        # Load the sensitive words.
        # NOTE(review): ``training_sensitive_word_service`` is not defined in
        # this class; presumably it is provided by BaseHandler - confirm.
        service_result = self.training_sensitive_word_service.find_all()
        if service_result is None or service_result.success is not True:
            return ServiceResult.fail(ServiceResultEnum.FAIL.value, None,
                                      ServiceResultEnum.FAIL_DESCRIPTION.value)

        # Drop empty words: an empty alternative would match every line.
        words = [str(entry.word) for entry in (service_result.data or [])]
        words = [word for word in words if word]
        if not words:
            return self._not_exist_result()

        # Escape each word so regex metacharacters are matched literally,
        # and compile the alternation once instead of once per line.
        pattern = re.compile('|'.join(re.escape(word) for word in words))

        # Strip the HTML tags from the response, then scan line by line.
        text_without_html = ReUtil.clear_html(text)
        for line in text_without_html.split('\n'):
            # ``search`` finds a word anywhere in the line; ``match`` would
            # only look at the start of the line.
            if pattern.search(line):
                return ServiceResult.ok(ServiceResultEnum.EXIST_SENSITIVE_WORD.value, line,
                                        ServiceResultEnum.EXIST_SENSITIVE_WORD_DESCRIPTION.value)
        return self._not_exist_result()

    @staticmethod
    def _not_exist_result():
        """Build the result returned when no sensitive word was found."""
        return ServiceResult.ok(ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD.value, None,
                                ServiceResultEnum.NOT_EXIST_SENSITIVE_WORD_DESCRIPTION.value)