- 新增三个核心子图:人工审核、意图理解、格式化输出 - 实现完整的审核 API 端点(/api/review/*) - 前端添加审核确认界面(右下角固定框) - 为每个子图创建分步测试代码 - 添加功能实现文档
428 lines
12 KiB
Python
428 lines
12 KiB
Python
"""
|
||
意图理解工具模块
|
||
提供标准化的意图分类和信息提取能力
|
||
|
||
功能:
|
||
1. Intent - 意图数据类
|
||
2. IntentClassifier - 意图分类器
|
||
3. EntityExtractor - 实体提取器
|
||
4. IntentParser - 完整的意图解析器
|
||
5. IntentRegistry - 意图注册器
|
||
"""
|
||
|
||
import re
|
||
from typing import Dict, List, Any, Optional, Set, Tuple, Callable
|
||
from dataclasses import dataclass, field
|
||
from enum import Enum, auto
|
||
from abc import ABC, abstractmethod
|
||
|
||
|
||
class IntentType(Enum):
|
||
"""意图类型枚举"""
|
||
UNKNOWN = auto()
|
||
GREETING = auto() # 问候
|
||
QUESTION = auto() # 提问
|
||
REQUEST = auto() # 请求
|
||
COMMAND = auto() # 命令
|
||
INFORM = auto() # 告知信息
|
||
CONFIRM = auto() # 确认
|
||
DENY = auto() # 否认
|
||
THANKS = auto() # 感谢
|
||
GOODBYE = auto() # 告别
|
||
COMPLAINT = auto() # 投诉
|
||
PRAISE = auto() # 表扬
|
||
CLARIFY = auto() # 澄清
|
||
SUGGEST = auto() # 建议
|
||
|
||
|
||
@dataclass
|
||
class Entity:
|
||
"""实体数据类"""
|
||
entity_type: str # 实体类型
|
||
value: str # 实体值
|
||
start_pos: int = 0 # 起始位置
|
||
end_pos: int = 0 # 结束位置
|
||
confidence: float = 1.0 # 置信度
|
||
metadata: Dict[str, Any] = field(default_factory=dict) # 元数据
|
||
|
||
|
||
@dataclass
|
||
class Intent:
|
||
"""意图数据类"""
|
||
intent_type: IntentType # 意图类型
|
||
confidence: float = 1.0 # 置信度
|
||
entities: List[Entity] = field(default_factory=list) # 提取的实体
|
||
parameters: Dict[str, Any] = field(default_factory=dict) # 参数
|
||
original_text: str = "" # 原始文本
|
||
normalized_text: str = "" # 标准化后的文本
|
||
metadata: Dict[str, Any] = field(default_factory=dict) # 元数据
|
||
|
||
|
||
class BaseIntentClassifier(ABC):
|
||
"""意图分类器基类"""
|
||
|
||
@abstractmethod
|
||
def classify(self, text: str) -> Tuple[IntentType, float]:
|
||
"""
|
||
分类意图
|
||
|
||
Args:
|
||
text: 输入文本
|
||
|
||
Returns:
|
||
(意图类型, 置信度)
|
||
"""
|
||
pass
|
||
|
||
@abstractmethod
|
||
def classify_with_scores(self, text: str) -> Dict[IntentType, float]:
|
||
"""
|
||
分类意图,返回所有类型的置信度
|
||
|
||
Args:
|
||
text: 输入文本
|
||
|
||
Returns:
|
||
{意图类型: 置信度}
|
||
"""
|
||
pass
|
||
|
||
|
||
class RuleBasedIntentClassifier(BaseIntentClassifier):
|
||
"""基于规则的意图分类器"""
|
||
|
||
def __init__(self):
|
||
self._rules: Dict[IntentType, Set[str]] = {}
|
||
self._initialize_default_rules()
|
||
|
||
def _initialize_default_rules(self) -> None:
|
||
"""初始化默认规则"""
|
||
# 问候
|
||
self.add_rule(IntentType.GREETING, {
|
||
"你好", "您好", "hi", "hello", "hey", "早上好", "下午好", "晚上好", "哈喽"
|
||
})
|
||
# 告别
|
||
self.add_rule(IntentType.GOODBYE, {
|
||
"再见", "拜拜", "bye", "goodbye", "回见", "下次见", "再见了"
|
||
})
|
||
# 感谢
|
||
self.add_rule(IntentType.THANKS, {
|
||
"谢谢", "感谢", "多谢", "thanks", "thank you", "3q", "谢谢了"
|
||
})
|
||
# 确认
|
||
self.add_rule(IntentType.CONFIRM, {
|
||
"是的", "对", "没错", "好的", "可以", "行", "同意", "确认", "yes", "yep"
|
||
})
|
||
# 否认
|
||
self.add_rule(IntentType.DENY, {
|
||
"不", "不是", "不对", "不行", "不要", "拒绝", "no", "nope", "没有"
|
||
})
|
||
# 提问
|
||
self.add_rule(IntentType.QUESTION, {
|
||
"?", "?", "什么", "怎么", "如何", "为什么", "哪", "谁", "多少", "吗", "呢"
|
||
})
|
||
|
||
def add_rule(self, intent_type: IntentType, keywords: Set[str]) -> None:
|
||
"""
|
||
添加规则
|
||
|
||
Args:
|
||
intent_type: 意图类型
|
||
keywords: 关键词集合
|
||
"""
|
||
if intent_type not in self._rules:
|
||
self._rules[intent_type] = set()
|
||
self._rules[intent_type].update(keywords)
|
||
|
||
def classify(self, text: str) -> Tuple[IntentType, float]:
|
||
"""
|
||
分类意图
|
||
|
||
Args:
|
||
text: 输入文本
|
||
|
||
Returns:
|
||
(意图类型, 置信度)
|
||
"""
|
||
scores = self.classify_with_scores(text)
|
||
if not scores:
|
||
return IntentType.UNKNOWN, 0.0
|
||
|
||
best_intent = max(scores.items(), key=lambda x: x[1])
|
||
return best_intent[0], best_intent[1]
|
||
|
||
def classify_with_scores(self, text: str) -> Dict[IntentType, float]:
|
||
"""
|
||
分类意图,返回所有类型的置信度
|
||
|
||
Args:
|
||
text: 输入文本
|
||
|
||
Returns:
|
||
{意图类型: 置信度}
|
||
"""
|
||
scores: Dict[IntentType, float] = {}
|
||
normalized_text = text.lower()
|
||
|
||
for intent_type, keywords in self._rules.items():
|
||
match_count = 0
|
||
for keyword in keywords:
|
||
if keyword.lower() in normalized_text:
|
||
match_count += 1
|
||
|
||
if match_count > 0:
|
||
scores[intent_type] = min(1.0, match_count / 3.0)
|
||
|
||
# 如果没有匹配,返回UNKNOWN
|
||
if not scores:
|
||
scores[IntentType.UNKNOWN] = 0.5
|
||
|
||
return scores
|
||
|
||
|
||
class BaseEntityExtractor(ABC):
|
||
"""实体提取器基类"""
|
||
|
||
@abstractmethod
|
||
def extract(self, text: str) -> List[Entity]:
|
||
"""
|
||
提取实体
|
||
|
||
Args:
|
||
text: 输入文本
|
||
|
||
Returns:
|
||
实体列表
|
||
"""
|
||
pass
|
||
|
||
|
||
class RuleBasedEntityExtractor(BaseEntityExtractor):
|
||
"""基于规则的实体提取器"""
|
||
|
||
def __init__(self):
|
||
self._patterns: Dict[str, re.Pattern] = {} # 正则模式
|
||
self._keywords: Dict[str, Set[str]] = {} # 关键词列表
|
||
self._initialize_default_patterns()
|
||
|
||
def _initialize_default_patterns(self) -> None:
|
||
"""初始化默认模式"""
|
||
# 邮箱
|
||
self.add_regex_pattern(
|
||
"email",
|
||
r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
|
||
)
|
||
# 电话号码
|
||
self.add_regex_pattern(
|
||
"phone",
|
||
r'1[3-9]\d{9}'
|
||
)
|
||
# 日期(简单模式)
|
||
self.add_regex_pattern(
|
||
"date",
|
||
r'\d{4}[-/年]\d{1,2}[-/月]\d{1,2}[日号]?|\d{1,2}[-/月]\d{1,2}[日号]?'
|
||
)
|
||
# 数字
|
||
self.add_regex_pattern(
|
||
"number",
|
||
r'\d+\.?\d*'
|
||
)
|
||
|
||
def add_regex_pattern(self, entity_type: str, pattern: str) -> None:
|
||
"""
|
||
添加正则匹配规则
|
||
|
||
Args:
|
||
entity_type: 实体类型
|
||
pattern: 正则表达式
|
||
"""
|
||
try:
|
||
self._patterns[entity_type] = re.compile(pattern, re.IGNORECASE)
|
||
except re.error:
|
||
pass
|
||
|
||
def add_keywords(self, entity_type: str, keywords: Set[str]) -> None:
|
||
"""
|
||
添加关键词匹配规则
|
||
|
||
Args:
|
||
entity_type: 实体类型
|
||
keywords: 关键词集合
|
||
"""
|
||
if entity_type not in self._keywords:
|
||
self._keywords[entity_type] = set()
|
||
self._keywords[entity_type].update(keywords)
|
||
|
||
def extract(self, text: str) -> List[Entity]:
|
||
"""
|
||
提取实体
|
||
|
||
Args:
|
||
text: 输入文本
|
||
|
||
Returns:
|
||
实体列表
|
||
"""
|
||
entities: List[Entity] = []
|
||
|
||
# 正则匹配
|
||
for entity_type, pattern in self._patterns.items():
|
||
for match in pattern.finditer(text):
|
||
entity = Entity(
|
||
entity_type=entity_type,
|
||
value=match.group(),
|
||
start_pos=match.start(),
|
||
end_pos=match.end(),
|
||
confidence=0.95
|
||
)
|
||
entities.append(entity)
|
||
|
||
# 关键词匹配
|
||
for entity_type, keywords in self._keywords.items():
|
||
for keyword in keywords:
|
||
start_idx = 0
|
||
while True:
|
||
pos = text.lower().find(keyword.lower(), start_idx)
|
||
if pos == -1:
|
||
break
|
||
entity = Entity(
|
||
entity_type=entity_type,
|
||
value=text[pos:pos + len(keyword)],
|
||
start_pos=pos,
|
||
end_pos=pos + len(keyword),
|
||
confidence=0.9
|
||
)
|
||
entities.append(entity)
|
||
start_idx = pos + len(keyword)
|
||
|
||
# 按位置排序
|
||
entities.sort(key=lambda e: e.start_pos)
|
||
return entities
|
||
|
||
|
||
class IntentRegistry:
|
||
"""意图注册器"""
|
||
|
||
def __init__(self):
|
||
self._intent_handlers: Dict[IntentType, Callable] = {}
|
||
|
||
def register(self, intent_type: IntentType, handler: Callable) -> None:
|
||
"""
|
||
注册意图处理器
|
||
|
||
Args:
|
||
intent_type: 意图类型
|
||
handler: 处理器
|
||
"""
|
||
self._intent_handlers[intent_type] = handler
|
||
|
||
def get_handler(self, intent_type: IntentType) -> Optional[Callable]:
|
||
"""
|
||
获取意图处理器
|
||
|
||
Args:
|
||
intent_type: 意图类型
|
||
|
||
Returns:
|
||
处理器,如果不存在返回 None
|
||
"""
|
||
return self._intent_handlers.get(intent_type)
|
||
|
||
|
||
class IntentParser:
|
||
"""完整的意图解析器"""
|
||
|
||
def __init__(
|
||
self,
|
||
classifier: Optional[BaseIntentClassifier] = None,
|
||
extractor: Optional[BaseEntityExtractor] = None,
|
||
registry: Optional[IntentRegistry] = None
|
||
):
|
||
"""
|
||
初始化意图解析器
|
||
|
||
Args:
|
||
classifier: 意图分类器
|
||
extractor: 实体提取器
|
||
registry: 意图注册器
|
||
"""
|
||
self.classifier = classifier or RuleBasedIntentClassifier()
|
||
self.extractor = extractor or RuleBasedEntityExtractor()
|
||
self.registry = registry or IntentRegistry()
|
||
|
||
def parse(self, text: str) -> Intent:
|
||
"""
|
||
解析文本,返回完整的意图对象
|
||
|
||
Args:
|
||
text: 输入文本
|
||
|
||
Returns:
|
||
意图对象
|
||
"""
|
||
# 分类意图
|
||
intent_type, confidence = self.classifier.classify(text)
|
||
|
||
# 提取实体
|
||
entities = self.extractor.extract(text)
|
||
|
||
# 构建意图对象
|
||
intent = Intent(
|
||
intent_type=intent_type,
|
||
confidence=confidence,
|
||
entities=entities,
|
||
original_text=text,
|
||
normalized_text=text.lower().strip()
|
||
)
|
||
|
||
# 从实体中提取参数
|
||
for entity in entities:
|
||
intent.parameters[entity.entity_type] = entity.value
|
||
|
||
return intent
|
||
|
||
def parse_and_execute(self, text: str, context: Optional[Dict[str, Any]] = None) -> Any:
|
||
"""
|
||
解析文本并执行对应的处理器
|
||
|
||
Args:
|
||
text: 输入文本
|
||
context: 上下文
|
||
|
||
Returns:
|
||
执行结果
|
||
"""
|
||
intent = self.parse(text)
|
||
handler = self.registry.get_handler(intent.intent_type)
|
||
|
||
if handler:
|
||
return handler(intent, context or {})
|
||
|
||
return None
|
||
|
||
|
||
def create_default_intent_parser() -> IntentParser:
|
||
"""
|
||
创建默认配置的意图解析器
|
||
|
||
Returns:
|
||
配置好的意图解析器
|
||
"""
|
||
parser = IntentParser()
|
||
|
||
# 注册默认处理器
|
||
def greeting_handler(intent: Intent, context: Dict) -> str:
|
||
return "你好!很高兴为你服务。"
|
||
|
||
def thanks_handler(intent: Intent, context: Dict) -> str:
|
||
return "不客气!"
|
||
|
||
def goodbye_handler(intent: Intent, context: Dict) -> str:
|
||
return "再见!有需要随时找我。"
|
||
|
||
parser.registry.register(IntentType.GREETING, greeting_handler)
|
||
parser.registry.register(IntentType.THANKS, thanks_handler)
|
||
parser.registry.register(IntentType.GOODBYE, goodbye_handler)
|
||
|
||
return parser
|