meeting_memory/extractor.py

157 lines
4.6 KiB
Python
Raw Normal View History

2026-05-15 08:39:57 +00:00
import json
import logging
import re
from typing import List, Optional
from pydantic import BaseModel
from openai import OpenAI
from config import config
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Shared OpenAI client used by every LLM call in this module.
# Falsy settings (e.g. an empty string) are passed to the SDK as None.
# NOTE(review): presumably None lets the SDK fall back to its own defaults
# (environment variables / public endpoint) - confirm against the SDK version in use.
client = OpenAI(
    api_key=config.llm.api_key or None,
    base_url=config.llm.base_url or None,
)
# One entity mentioned in a meeting record.
class Entity(BaseModel):
    # Entity name (required).
    name: str

    # Free-form category label (required). The extraction prompt suggests
    # person, organisation/department, project/task, metric/KPI,
    # concept/policy and location, but nothing here enforces that set.
    entity_type: str

    # Optional description; defaults to an empty string.
    description: str = ""
# One factual subject-predicate-object relation extracted from a meeting record.
class Relation(BaseModel):
    # Subject of the relation and its category label (both required).
    subject: str
    subject_type: str

    # Relation predicate linking subject to object (required).
    predicate: str

    # Object of the relation and its category label (both required).
    # The field is named "object" because that is the JSON key the model is asked to emit.
    object: str
    object_type: str

    # Optional description; defaults to an empty string.
    description: str = ""
# One action item (follow-up task) recorded in a meeting.
class ActionItem(BaseModel):
    # What has to be done (required).
    task: str

    # Person responsible; empty string when not given.
    assignee: str = ""

    # Due date kept as free text; empty string when not given.
    deadline: str = ""

    # Task state; defaults to "待办" (to do).
    status: str = "待办"

    # Priority as free text; empty string when not given.
    priority: str = ""
# One decision or conclusion reached in a meeting.
class Decision(BaseModel):
    # Text of the decision (required).
    content: str

    # Who proposed it; empty string when not given.
    proposer: str = ""

    # Decision state; defaults to "已决" (decided).
    status: str = "已决"
# One numeric indicator reported in a meeting. Every field is kept as text.
class MeetingMetric(BaseModel):
    # Name of the metric (required).
    metric_name: str

    # Current value as stated in the record (required).
    value: str

    # Target value; empty string when not given.
    target: str = ""

    # Person responsible for the metric; empty string when not given.
    owner: str = ""

    # Trend as free text; the extraction prompt asks for 向好/持平/恶化
    # (improving / flat / worsening). Empty string when not given.
    trend: str = ""
# Complete structured result extracted from one meeting record.
# Only `title` is required; every other field has an empty default.
# NOTE(review): the list fields use literal [] defaults, which relies on
# pydantic giving each instance its own copy - confirm for the installed version.
class MeetingExtraction(BaseModel):
    # Meeting title (required).
    title: str

    # Meeting date kept as free text.
    date: str = ""

    # Names of the participants.
    participants: List[str] = []

    # Agenda items.
    agenda: List[str] = []

    # Entities mentioned in the record.
    entities: List[Entity] = []

    # Subject-predicate-object relations between entities.
    relations: List[Relation] = []

    # Follow-up tasks.
    action_items: List[ActionItem] = []

    # Decisions and conclusions.
    decisions: List[Decision] = []

    # Numeric indicators reported in the meeting.
    metrics: List[MeetingMetric] = []

    # Short summary of the meeting.
    summary: str = ""
# System prompt (in Chinese) passed as the system message of the extraction
# request. It tells the model to return JSON and describes the five things to
# extract (entities, relations, action items, decisions, metrics) plus the
# filtering rules.
#
# The last rule names the five JSON keys that must default to empty arrays;
# they are separated with "、" so the model sees five distinct keys rather
# than one fused token.
EXTRACTION_SYSTEM_PROMPT = """
你是一个专业的会议纪要信息抽取专家你的任务是从中文会议记录中抽取结构化信息并严格按照要求的JSON格式返回
## 抽取内容
### 1. 实体
- 人物参会人员提及的人员
- 组织/部门公司部门团队
- 项目/任务正在进行的项目任务
- 指标/KPI关键绩效指标如转化率退单率等
- 概念/制度管理概念制度要求
- 地点会议地点项目地点
### 2. 关系 (主体-关系谓词-客体)
抽取事实性关系例如
- {"subject": "建维部", "subject_type": "组织", "predicate": "负责", "object": "网络运维", "object_type": "任务", "description": ""}
- {"subject": "弱光指标", "subject_type": "指标", "predicate": "目标值", "object": "0.5以下", "object_type": "数值", "description": ""}
### 3. 行动项
谁负责什么任务截止时间优先级
### 4. 决策
做出的决定和结论
### 5. 指标数据
具体的数字指标当前值目标值负责人趋势(向好/持平/恶化)
## 规则
- 只提取事实性信息
- 过滤比喻假设主观评价
- 数字指标要精确提取
- entities、relations、action_items、decisions、metrics 如果没有则返回空数组
"""
def _call_llm(system: str, user: str) -> str:
    """Send one system/user message pair to the chat model and return its reply.

    The model name, ``max_tokens`` and ``temperature`` are read from
    ``config.llm``; the request goes through the module-level ``client``.

    Args:
        system: Content of the system message.
        user: Content of the user message.

    Returns:
        The message content of the first choice in the completion.

    Raises:
        ValueError: If the completion carries no choices, or the first
            choice's message content is ``None``.
    """
    response = client.chat.completions.create(
        model=config.llm.model,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        max_tokens=config.llm.max_tokens,
        temperature=config.llm.temperature,
    )

    # Check before indexing: an empty or missing choices list would otherwise
    # surface as an IndexError/TypeError instead of the ValueError used for
    # the empty-content case below.
    if not response.choices:
        raise ValueError("LLM returned no choices")

    content = response.choices[0].message.content
    if content is None:
        raise ValueError("LLM returned empty response")
    return content
def extract_meeting_info(text: str) -> MeetingExtraction:
    """Extract structured meeting information from a raw meeting record.

    Builds a user prompt that lists the expected JSON fields followed by the
    record itself, sends it together with ``EXTRACTION_SYSTEM_PROMPT`` via
    ``_call_llm``, parses the reply with ``_try_parse_json`` and passes the
    resulting mapping to ``MeetingExtraction`` as keyword arguments.

    Args:
        text: The meeting record to analyse; interpolated verbatim at the
            end of the prompt.

    Returns:
        A ``MeetingExtraction`` built from the model's JSON reply.
    """
    request_text = f"""
从以下会议记录中抽取结构化信息
JSON字段说明
- title: 会议标题
- date: 会议日期
- participants: 参会人列表
- agenda: 议程列表
- entities: 实体列表每个实体包含 name(名称), entity_type(类型), description(描述)
- relations: 关系列表每个关系包含 subject(主体), subject_type(主体类型), predicate(关系谓词), object(客体), object_type(客体类型), description(描述)
- action_items: 行动项列表每条包含 task(任务), assignee(负责人), deadline(截止时间), status(状态), priority(优先级)
- decisions: 决策列表每条包含 content(决策内容), proposer(提出人), status(状态)
- metrics: 指标列表每条包含 metric_name(指标名), value(当前值), target(目标值), owner(负责人), trend(趋势)
- summary: 会议摘要
请直接返回JSON对象不要包含任何额外说明文字
会议记录
{text}
"""
    raw_reply = _call_llm(EXTRACTION_SYSTEM_PROMPT, request_text)
    fields = _try_parse_json(raw_reply)
    return MeetingExtraction(**fields)
def _try_parse_json(content: str) -> dict:
    """Parse an LLM reply into a JSON object, tolerating surrounding text.

    The reply is first parsed as-is. If that fails, the span from the first
    ``{`` to the last ``}`` is extracted and parsed instead, which recovers
    replies wrapped in a Markdown code fence or in explanatory text.

    Args:
        content: Raw text returned by the model.

    Returns:
        The decoded JSON object as a ``dict``.

    Raises:
        json.JSONDecodeError: If no JSON object can be recovered, or if the
            reply is valid JSON whose top-level value is not an object.
    """
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        logger.warning("JSON解析失败尝试修复...")
        # Greedy match with DOTALL: spans from the first "{" to the last "}".
        match = re.search(r'\{.*\}', content, re.DOTALL)
        if match is None:
            # Nothing to repair: re-raise the original decode error so the
            # function never falls through and returns None.
            raise
        try:
            parsed = json.loads(match.group())
        except json.JSONDecodeError as e:
            logger.error("修复后的JSON仍无法解析: %s", e)
            raise

    # A list, string or number is valid JSON but cannot be unpacked into the
    # model with **; report it with the same exception type as malformed JSON.
    if not isinstance(parsed, dict):
        raise json.JSONDecodeError(
            "Expected a JSON object at the top level", content, 0
        )
    return parsed