INFO · info-20251219-004
Facts/Preferences 提取方案
[INFO] Facts/Preferences 提取方案
- 时间: 2025-12-19
- 类型: 方案
- 来源: 技术设计
- 置信度: 8/10
- 标签: #Memory #Facts #Preferences #LLM提取
概述
从对话中提取 facts(用户事实)和 preferences(用户偏好)的完整实现方案。
一、数据定义
Facts(事实)
@dataclass
class Fact:
    """A user fact extracted from conversation, stored in the third person."""
    fact: str                   # fact statement, third person ("the user ...")
    category: str               # one of FACT_CATEGORIES
    confidence: float           # extraction confidence, 0-1
    source_session: str         # session the fact was first extracted from
    first_seen: datetime        # when the fact was first observed
    last_confirmed: datetime    # most recent confirmation
    times_mentioned: int        # how many times the user has mentioned it
    is_valid: bool              # False once superseded/corrected by a newer fact


# Closed set of fact categories (must match the extraction prompt).
FACT_CATEGORIES = [
    "personal_info",   # personal info: name, age, gender
    "location",        # places: home address, workplace
    "relationship",    # relations: family, colleagues, boss
    "work",            # work: company, title, industry
    "habit",           # habits: routine, diet
    "preference",      # likes/dislikes stated as facts
    "other"            # everything else
]
Preferences(偏好)
@dataclass
class Preference:
    """A user preference, either stated explicitly or inferred from behavior."""
    preference: str             # preference description
    category: str               # one of PREF_CATEGORIES
    strength: float             # preference strength, 0-1
    evidence: List[str]         # supporting evidence snippets
    first_seen: datetime        # when the preference was first observed
    last_confirmed: datetime    # most recent confirmation
    times_observed: int         # number of observations


# Closed set of preference categories (must match the extraction prompt).
PREF_CATEGORIES = [
    "communication_style",  # communication style: concise/detailed/formal/casual
    "time",                 # time preferences: working hours, do-not-disturb windows
    "content",              # content preferences: favored topics
    "format",               # format preferences: lists/paragraphs/code
    "tool",                 # tool preferences: which software/platforms
    "language",             # language preferences: Chinese/English, jargon
    "other"
]
二、检测规则
信号词检测
class SignalDetector:
    """Cheap keyword screen over an incoming message.

    Decides whether a message *might* contain a fact or a preference, so the
    pipeline only spends an LLM call on messages with at least one signal.
    """

    # Keyword groups hinting at a factual statement.
    FACT_SIGNALS = {
        # identity
        "identity": ["我是", "我叫", "我姓", "我的名字"],
        # location
        "location": ["我在", "我住", "我工作在", "我们公司在", "坐标"],
        # relationships
        "relationship": ["我老板", "我同事", "我老婆", "我老公", "我女朋友",
                         "我男朋友", "我爸", "我妈", "我儿子", "我女儿"],
        # work
        "work": ["我们公司", "我负责", "我的工作", "我是做", "我的职位"],
        # possessions
        "possession": ["我的车", "我的房", "我的电脑", "我的手机"],
        # states/abilities
        "state": ["我有", "我没有", "我会", "我不会"],
    }

    # Keyword groups hinting at a preference.
    PREF_SIGNALS = {
        # explicit likes
        "explicit": ["我喜欢", "我偏好", "我习惯", "我倾向于"],
        # explicit dislikes
        "dislike": ["我不喜欢", "我讨厌", "我不想", "别给我", "不要"],
        # requested response style
        "style_request": ["简短点", "详细点", "直接说", "说重点",
                          "不用解释", "解释一下"],
        # time-related
        "time": ["早上", "晚上", "周末", "工作日", "以后", "每次", "总是"],
        # frequency/intensity
        "frequency": ["经常", "很少", "从不", "一直", "偶尔"],
    }

    def detect(self, text: str) -> dict:
        """Return which signal categories fire for *text*.

        The result dict carries two booleans plus the matching category
        names, in the declaration order of the signal tables.
        """
        def fires(words):
            return any(word in text for word in words)

        fact_cats = [cat for cat, words in self.FACT_SIGNALS.items() if fires(words)]
        pref_cats = [cat for cat, words in self.PREF_SIGNALS.items() if fires(words)]
        return {
            "has_fact_signal": bool(fact_cats),
            "has_pref_signal": bool(pref_cats),
            "fact_categories": fact_cats,
            "pref_categories": pref_cats,
        }
三、LLM 提取器
Prompt 模板
class ExtractionPrompts:
    """Prompt templates for fact/preference extraction and conflict checking.

    All templates are filled via ``str.format``; literal JSON braces are
    doubled (``{{ }}``).  The prompt text itself is a runtime string and is
    intentionally kept in Chinese.
    NOTE(review): FACT_PROMPT opens a ```json fence without closing it —
    harmless here since callers slice from '[' to ']', but worth confirming
    against the target model's behavior.
    """

    # Fact extraction from a single message, no conversation context.
    FACT_PROMPT = """从用户的话中提取事实信息。
用户说:"{user_message}"
规则:
1. 只提取**明确陈述**的事实,不要推测
2. 用第三人称描述("用户..."而不是"我...")
3. 如果没有明确事实,返回空数组 []
分类说明:
- personal_info: 姓名、年龄、性别等个人信息
- location: 居住地、工作地点
- relationship: 家人、同事、朋友关系
- work: 公司、职位、行业、职责
- habit: 日常习惯、作息
- preference: 明确表达的喜好
- other: 其他事实
返回 JSON 数组:
```json
[
{{
"fact": "用户的老板姓张",
"category": "relationship",
"confidence": 0.9
}}
]
只返回 JSON,不要其他内容。"""

    # Fact extraction with recent conversation and already-known facts
    # (enables dedup and conflict hints inside the prompt).
    FACT_PROMPT_WITH_CONTEXT = """从用户的话中提取事实信息。
当前对话: {conversation}
用户最新说:"{user_message}"
已知事实(避免重复): {existing_facts}
规则:
- 只提取明确陈述的事实,不要推测
- 用第三人称描述
- 不要重复已知事实
- 如果新信息与已知事实冲突,标注 confidence 并说明
返回 JSON 数组:
[
{{
"fact": "事实描述",
"category": "分类",
"confidence": 0.9,
"conflicts_with": null 或 "冲突的旧事实"
}}
]
只返回 JSON。"""

    # Preference extraction: explicit statements or behavior-based inference,
    # with a strength scale documented inside the prompt.
    PREF_PROMPT = """从用户的话或行为中识别偏好。
用户说:"{user_message}" 上下文:{context}
偏好类型:
- communication_style: 沟通风格(简洁/详细/正式/随意/直接)
- time: 时间偏好(什么时候联系、勿扰时间)
- content: 内容偏好(喜欢什么话题/不喜欢什么)
- format: 格式偏好(列表/段落/代码/图表)
- tool: 工具偏好(使用什么软件/平台)
- language: 语言偏好(中英文/术语使用)
- other: 其他偏好
规则:
- 可以从明确表达中提取("我喜欢简短的回复")
- 也可以从行为推断(用户多次要求简短 → 偏好简洁)
- strength 表示偏好强度:
- 0.9-1.0: 明确强烈表达
- 0.7-0.9: 明确表达
- 0.5-0.7: 行为推断
- 0.3-0.5: 弱信号
返回 JSON 数组:
[
{{
"preference": "偏好简洁直接的回复风格",
"category": "communication_style",
"strength": 0.8,
"evidence": "用户说'直接说重点'"
}}
]
只返回 JSON。"""

    # Classify the relation between a stored fact and a new one; expected
    # answer is a single keyword: update / conflict / different / duplicate.
    CONFLICT_CHECK_PROMPT = """判断这两个事实是否冲突。
旧事实:{old_fact} 新事实:{new_fact}
分析:
- 如果新事实是对旧事实的更新(如地址变了),返回 "update"
- 如果新事实与旧事实矛盾(不可能同时为真),返回 "conflict"
- 如果是不同的事实(可以共存),返回 "different"
- 如果是重复(表达同一件事),返回 "duplicate"
只返回一个词:update / conflict / different / duplicate"""
### 提取器实现
```python
import json
from typing import List, Optional, Tuple
from dataclasses import dataclass
from datetime import datetime
@dataclass
class ExtractedFact:
    """One fact as returned by the LLM extractor (pre-storage form)."""
    fact: str                             # third-person fact statement
    category: str                         # one of FACT_CATEGORIES
    confidence: float                     # 0-1 extraction confidence
    conflicts_with: Optional[str] = None  # older fact this contradicts, if any
@dataclass
class ExtractedPreference:
    """One preference as returned by the LLM extractor (pre-storage form)."""
    preference: str   # preference description
    category: str     # one of PREF_CATEGORIES
    strength: float   # 0-1 preference strength
    evidence: str     # supporting quote/observation
class FactPreferenceExtractor:
    """Extract facts and preferences from user messages.

    Flow: a cheap keyword screen (SignalDetector) runs first; only messages
    that show at least one signal trigger an LLM extraction call.
    """

    def __init__(self, llm_client, memory_store):
        self.llm = llm_client        # must expose .generate(prompt) -> str
        self.store = memory_store    # must expose .recall_by_type(...)
        self.detector = SignalDetector()
        self.prompts = ExtractionPrompts()

    def process_message(
        self,
        user_id: str,
        message: str,
        context: Optional[List[dict]] = None
    ) -> Tuple[List[ExtractedFact], List[ExtractedPreference]]:
        """Process one message and return ``(facts, preferences)``.

        Messages with no signal words skip the LLM entirely and yield two
        empty lists.
        """
        # 1. Cheap signal screening.
        signals = self.detector.detect(message)
        facts: List[ExtractedFact] = []
        preferences: List[ExtractedPreference] = []
        # 2. Extract facts only when a fact signal fired.
        if signals["has_fact_signal"]:
            facts = self._extract_facts(user_id, message, context)
        # 3. Extract preferences only when a preference signal fired.
        if signals["has_pref_signal"]:
            preferences = self._extract_preferences(user_id, message, context)
        return facts, preferences

    def _extract_facts(
        self,
        user_id: str,
        message: str,
        context: Optional[List[dict]] = None
    ) -> List[ExtractedFact]:
        """Run LLM fact extraction for one message."""
        # Feed existing facts back into the prompt for dedup/conflict hints.
        existing_facts = self.store.recall_by_type(
            user_id, MemoryType.FACTS, limit=20
        )
        existing_text = "\n".join(f"- {f.content}" for f in existing_facts) or "无"
        # Build the prompt; include up to the last 5 turns when context exists.
        if context:
            conv_text = "\n".join(
                f"{turn['role']}: {turn['content']}"
                for turn in context[-5:]
            )
            prompt = self.prompts.FACT_PROMPT_WITH_CONTEXT.format(
                conversation=conv_text,
                user_message=message,
                existing_facts=existing_text
            )
        else:
            prompt = self.prompts.FACT_PROMPT.format(user_message=message)
        # Call the LLM and parse its JSON answer.
        response = self.llm.generate(prompt)
        raw_facts = self._parse_json_array(response)
        # Convert to dataclasses, dropping entries without a fact text.
        return [
            ExtractedFact(
                fact=f.get("fact", ""),
                category=f.get("category", "other"),
                confidence=f.get("confidence", 0.5),
                conflicts_with=f.get("conflicts_with")
            )
            for f in raw_facts
            if f.get("fact")
        ]

    def _extract_preferences(
        self,
        user_id: str,
        message: str,
        context: Optional[List[dict]] = None
    ) -> List[ExtractedPreference]:
        """Run LLM preference extraction for one message."""
        context_text = ""
        if context:
            # Last 3 turns serve as lightweight context for the prompt.
            context_text = "\n".join(
                f"{turn['role']}: {turn['content']}"
                for turn in context[-3:]
            )
        prompt = self.prompts.PREF_PROMPT.format(
            user_message=message,
            context=context_text or "无"
        )
        response = self.llm.generate(prompt)
        raw_prefs = self._parse_json_array(response)
        return [
            ExtractedPreference(
                preference=p.get("preference", ""),
                category=p.get("category", "other"),
                strength=p.get("strength", 0.5),
                evidence=p.get("evidence", "")
            )
            for p in raw_prefs
            if p.get("preference")
        ]

    def _parse_json_array(self, text: str) -> list:
        """Best-effort parse of a JSON array out of raw LLM output.

        Tolerates surrounding prose / markdown fences by slicing from the
        first '[' to the last ']'.  Returns only dict items: parse failures,
        non-array JSON, and scalar array entries are dropped, so callers'
        ``item.get(...)`` accesses are always safe.  (The original version
        returned scalar entries as-is, which made callers raise
        AttributeError on e.g. ``["some string"]``.)
        """
        start = text.find('[')
        end = text.rfind(']') + 1
        if start < 0 or end <= start:
            return []
        try:
            parsed = json.loads(text[start:end])
        except json.JSONDecodeError:
            return []
        if not isinstance(parsed, list):
            return []
        return [item for item in parsed if isinstance(item, dict)]
```
四、存储管理器
class FactManager:
    """Fact storage and update management.

    Dedup/conflict pipeline: semantic search for similar stored facts, then
    an LLM call classifies the relation (duplicate / update / conflict /
    different) and the matching mutation is applied.
    """

    def __init__(self, memory_store, llm_client):
        self.store = memory_store   # assumes a psycopg-style .conn with JSONB metadata — confirm
        self.llm = llm_client
        self.prompts = ExtractionPrompts()

    def save_fact(self, user_id: str, fact: ExtractedFact, session_id: str = None):
        """Persist *fact*, handling dedup and conflicts against stored facts."""
        # 1. Semantic similarity search over existing FACTS memories.
        similar = self.store.recall(
            user_id=user_id,
            query=fact.fact,
            types=[MemoryType.FACTS],
            top_k=3,
            min_similarity=0.75
        )
        if not similar:
            # 2a. Nothing similar: brand-new fact.
            return self._create_fact(user_id, fact, session_id)
        # 2b. A similar fact exists: ask the LLM how the two relate.
        # Only the single best match is compared — lower-ranked hits are ignored.
        existing = similar[0].memory
        relation = self._check_relation(existing.content, fact.fact)
        if relation == "duplicate":
            # Same fact restated: just bump the mention counter.
            return self._increment_mention(existing.id)
        elif relation == "update":
            # Newer version of the same fact: replace content in place.
            return self._update_fact(existing.id, fact, session_id)
        elif relation == "conflict":
            # Contradiction: retire the old fact, then store the new one.
            self._invalidate_fact(existing.id, reason=f"被新事实替代: {fact.fact}")
            return self._create_fact(user_id, fact, session_id)
        else:  # different
            # Merely similar-looking, can coexist: store separately.
            return self._create_fact(user_id, fact, session_id)

    def _check_relation(self, old_fact: str, new_fact: str) -> str:
        """Ask the LLM how two facts relate.

        Returns one of "update" / "conflict" / "duplicate" / "different".
        NOTE(review): substring matching means a verbose LLM answer that
        contains several keywords resolves in this fixed priority order.
        """
        prompt = self.prompts.CONFLICT_CHECK_PROMPT.format(
            old_fact=old_fact,
            new_fact=new_fact
        )
        response = self.llm.generate(prompt).strip().lower()
        if "update" in response:
            return "update"
        elif "conflict" in response:
            return "conflict"
        elif "duplicate" in response:
            return "duplicate"
        else:
            return "different"

    def _create_fact(self, user_id: str, fact: ExtractedFact, session_id: str) -> int:
        """Insert a new FACTS memory and return its id."""
        memory = Memory(
            id=None,
            user_id=user_id,
            type=MemoryType.FACTS,
            content=fact.fact,
            embedding=None,  # presumably computed by the store on save — confirm
            metadata={
                "category": fact.category,
                "confidence": fact.confidence,
                "source_session": session_id,
                "first_seen": datetime.now().isoformat(),
                "last_confirmed": datetime.now().isoformat(),
                "times_mentioned": 1,
                "conflicts_with": fact.conflicts_with
            },
            importance=fact.confidence,  # initial importance mirrors confidence
            session_id=session_id
        )
        # dedupe=False: dedup was already decided by save_fact's relation check.
        return self.store.save(memory, dedupe=False)

    def _update_fact(self, fact_id: int, new_fact: ExtractedFact, session_id: str):
        """Overwrite an existing fact's content with an updated statement.

        ``metadata || %s`` merges the patch keys into the stored JSONB;
        importance only ever ratchets upward (GREATEST).
        """
        with self.store.conn.cursor() as cur:
            cur.execute("""
UPDATE memories
SET content = %s,
metadata = metadata || %s,
importance = GREATEST(importance, %s)
WHERE id = %s
""", (
                new_fact.fact,
                json.dumps({
                    "last_confirmed": datetime.now().isoformat(),
                    "updated_from_session": session_id
                }),
                new_fact.confidence,
                fact_id
            ))
        self.store.conn.commit()
        return fact_id

    def _increment_mention(self, fact_id: int):
        """Register a repeat mention: bump the counter, refresh the
        last-confirmed timestamp, and nudge importance up (capped at 1.0)."""
        with self.store.conn.cursor() as cur:
            cur.execute("""
UPDATE memories
SET metadata = jsonb_set(
jsonb_set(metadata, '{times_mentioned}',
(COALESCE((metadata->>'times_mentioned')::int, 0) + 1)::text::jsonb),
'{last_confirmed}', %s::jsonb
),
importance = LEAST(1.0, importance + 0.05)
WHERE id = %s
""", (json.dumps(datetime.now().isoformat()), fact_id))
        self.store.conn.commit()
        return fact_id

    def _invalidate_fact(self, fact_id: int, reason: str):
        """Soft-delete a fact (is_active = false) and record why."""
        with self.store.conn.cursor() as cur:
            cur.execute("""
UPDATE memories
SET is_active = false,
metadata = metadata || %s
WHERE id = %s
""", (
                json.dumps({
                    "invalidated_at": datetime.now().isoformat(),
                    "invalidation_reason": reason
                }),
                fact_id
            ))
        self.store.conn.commit()
class PreferenceManager:
    """Preference storage and update management.

    New observations are merged into an existing same-category preference
    when one is semantically similar; otherwise a new record is created.
    """

    def __init__(self, memory_store):
        self.store = memory_store  # assumes a psycopg-style .conn with JSONB metadata — confirm

    def save_preference(self, user_id: str, pref: ExtractedPreference, session_id: str = None):
        """Persist *pref*, merging into an existing similar preference if any."""
        # 1. Look for an existing preference in the same category.
        existing = self._find_similar_preference(user_id, pref)
        if existing:
            # 2a. Merge: reinforce (or soften) the stored preference.
            return self._update_preference(existing, pref, session_id)
        else:
            # 2b. No match: create a new preference record.
            return self._create_preference(user_id, pref, session_id)

    def _find_similar_preference(self, user_id: str, pref: ExtractedPreference):
        """Semantic search, then require the same category; None if no match."""
        similar = self.store.recall(
            user_id=user_id,
            query=pref.preference,
            types=[MemoryType.PREFERENCES],
            top_k=3,
            min_similarity=0.7
        )
        # Among the semantic hits, only a same-category one counts as a match.
        for s in similar:
            if s.memory.metadata.get("category") == pref.category:
                return s.memory
        return None

    def _create_preference(self, user_id: str, pref: ExtractedPreference, session_id: str) -> int:
        """Insert a new PREFERENCES memory and return its id."""
        memory = Memory(
            id=None,
            user_id=user_id,
            type=MemoryType.PREFERENCES,
            content=pref.preference,
            embedding=None,  # presumably computed by the store on save — confirm
            metadata={
                "category": pref.category,
                "strength": pref.strength,
                "evidence": [pref.evidence] if pref.evidence else [],
                "first_seen": datetime.now().isoformat(),
                "last_confirmed": datetime.now().isoformat(),
                "times_observed": 1
            },
            importance=pref.strength,  # initial importance mirrors strength
            session_id=session_id
        )
        return self.store.save(memory, dedupe=False)

    def _update_preference(self, existing: Memory, new_pref: ExtractedPreference, session_id: str):
        """Merge a new observation into an existing preference.

        Strength becomes a weighted average in which the fresh observation
        carries 1.5x the weight of one historical observation, capped at 1.0.
        """
        old_strength = existing.metadata.get("strength", 0.5)
        old_evidence = existing.metadata.get("evidence", [])
        old_times = existing.metadata.get("times_observed", 1)
        # Weighted average; the newest observation is weighted more heavily.
        new_strength = (old_strength * old_times + new_pref.strength * 1.5) / (old_times + 1.5)
        new_strength = min(1.0, new_strength)
        # Append the new evidence snippet unless it is already recorded.
        if new_pref.evidence and new_pref.evidence not in old_evidence:
            new_evidence = old_evidence + [new_pref.evidence]
        else:
            new_evidence = old_evidence
        with self.store.conn.cursor() as cur:
            cur.execute("""
UPDATE memories
SET metadata = metadata || %s,
importance = %s
WHERE id = %s
""", (
                json.dumps({
                    "strength": new_strength,
                    "evidence": new_evidence[-5:],  # keep only the 5 most recent
                    "last_confirmed": datetime.now().isoformat(),
                    "times_observed": old_times + 1
                }),
                new_strength,
                existing.id
            ))
        self.store.conn.commit()
        return existing.id
五、集成到 Pipeline
class EnhancedMemoryPipeline:
    """Memory pipeline with realtime fact/preference extraction."""

    def __init__(self, memory_store, llm_client):
        self.store = memory_store
        self.extractor = FactPreferenceExtractor(llm_client, memory_store)
        self.fact_manager = FactManager(memory_store, llm_client)
        self.pref_manager = PreferenceManager(memory_store)

    def on_user_message(
        self,
        user_id: str,
        message: str,
        session_id: str,
        context: Optional[List[dict]] = None
    ) -> dict:
        """Extract and persist facts/preferences for one incoming message.

        Returns a small summary dict (counts plus extracted texts), useful
        for logging or debugging.
        """
        # 1. Extract facts and preferences from the raw message.
        facts, preferences = self.extractor.process_message(
            user_id, message, context
        )
        # 2. Persist facts (dedup/conflict handling lives in FactManager).
        for fact in facts:
            self.fact_manager.save_fact(user_id, fact, session_id)
        # 3. Persist preferences (merging lives in PreferenceManager).
        for pref in preferences:
            self.pref_manager.save_preference(user_id, pref, session_id)
        # 4. Summarize what was extracted.
        return {
            "facts_extracted": len(facts),
            "preferences_extracted": len(preferences),
            "facts": [f.fact for f in facts],
            "preferences": [p.preference for p in preferences]
        }

    def get_user_profile(self, user_id: str) -> dict:
        """Build a category-grouped profile from stored facts and preferences."""
        facts = self.store.recall_by_type(user_id, MemoryType.FACTS, limit=50)
        prefs = self.store.recall_by_type(user_id, MemoryType.PREFERENCES, limit=20)
        # Group by metadata category (setdefault replaces the manual
        # "if cat not in dict" dance of the original).
        facts_by_category: dict = {}
        for f in facts:
            cat = f.metadata.get("category", "other")
            facts_by_category.setdefault(cat, []).append({
                "fact": f.content,
                "confidence": f.metadata.get("confidence", 0.5),
                "times_mentioned": f.metadata.get("times_mentioned", 1)
            })
        prefs_by_category: dict = {}
        for p in prefs:
            cat = p.metadata.get("category", "other")
            prefs_by_category.setdefault(cat, []).append({
                "preference": p.content,
                "strength": p.metadata.get("strength", 0.5)
            })
        return {
            "user_id": user_id,
            "facts": facts_by_category,
            "preferences": prefs_by_category,
            "facts_count": len(facts),
            "preferences_count": len(prefs)
        }
六、使用示例
# Initialization
pipeline = EnhancedMemoryPipeline(memory_store, llm_client)
# Process one user message
result = pipeline.on_user_message(
    user_id="user_123",
    message="我在杭州工作,老板姓张,平时喜欢简短的回复",
    session_id="session_456",
    context=[
        {"role": "assistant", "content": "你好,有什么可以帮你的?"},
    ]
)
# Expected result:
# {
#     "facts_extracted": 2,
#     "preferences_extracted": 1,
#     "facts": ["用户在杭州工作", "用户的老板姓张"],
#     "preferences": ["偏好简短的回复风格"]
# }
# Fetch the user profile
profile = pipeline.get_user_profile("user_123")
# {
#     "user_id": "user_123",
#     "facts": {
#         "location": [{"fact": "用户在杭州工作", ...}],
#         "relationship": [{"fact": "用户的老板姓张", ...}]
#     },
#     "preferences": {
#         "communication_style": [{"preference": "偏好简短的回复风格", ...}]
#     },
#     ...
# }
七、验收标准
| 场景 | 输入 | 预期输出 |
|---|---|---|
| Fact 提取 | "我在杭州" | fact: "用户在杭州", category: "location" |
| Fact 去重 | 再说 "我在杭州" | times_mentioned +1,不创建新记录 |
| Fact 更新 | "我搬到上海了" | 旧 fact 失效,创建新 fact |
| Pref 提取 | "简短点" | preference: "偏好简短回复", category: "communication_style" |
| Pref 增强 | 多次说 "直接说重点" | strength 逐渐增加 |
| 无信号 | "今天天气真好" | 不调用 LLM,返回空 |
关联
- 相关: INFO-20251219-003(Memory 系统技术方案)
- 触发规则: -