meeting_memory/extractor.py

157 lines
4.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import json
import logging
import re
from typing import List, Optional
from pydantic import BaseModel
from openai import OpenAI
from config import config
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Shared OpenAI client, constructed once at import time from `config.llm`.
# Falsy settings (e.g. an empty string) are normalised to None before being
# handed to the SDK -- presumably so the SDK applies its own defaults; confirm
# against the OpenAI client documentation.
client = OpenAI(
    api_key=config.llm.api_key or None,
    base_url=config.llm.base_url or None,
)
class Entity(BaseModel):
    """A single entity extracted from a meeting record."""

    # Entity name as it appears in the record.
    name: str
    # Entity category. The system prompt in this module lists person,
    # organisation/department, project/task, metric/KPI, concept/policy and
    # location; the value itself is a free-form string and is not validated.
    entity_type: str
    # Optional free-text description; empty when the model supplies none.
    description: str = ""
class Relation(BaseModel):
    """A factual (subject, predicate, object) triple extracted from a meeting record."""

    # Name of the entity the relation starts from.
    subject: str
    # Category of the subject entity (free-form string).
    subject_type: str
    # Relation verb/phrase linking subject to object.
    predicate: str
    # Name or value the relation points at. NOTE: the field name shadows the
    # builtin `object` inside this class body; it is part of the JSON schema
    # requested from the model, so it must keep this name.
    object: str
    # Category of the object (free-form string).
    object_type: str
    # Optional free-text description; empty when the model supplies none.
    description: str = ""
class ActionItem(BaseModel):
    """A follow-up task recorded in a meeting."""

    # What has to be done.
    task: str
    # Who is responsible; empty when not stated.
    assignee: str = ""
    # Due date as free text (no date parsing is done here); empty when not stated.
    deadline: str = ""
    # Task state; defaults to "待办" ("to do").
    status: str = "待办"
    # Priority as free text; empty when not stated.
    priority: str = ""
class Decision(BaseModel):
    """A decision or conclusion reached in a meeting."""

    # Text of the decision.
    content: str
    # Who proposed it; empty when not stated.
    proposer: str = ""
    # Decision state; defaults to "已决" ("decided").
    status: str = "已决"
class MeetingMetric(BaseModel):
    """A numeric indicator mentioned in a meeting."""

    # Name of the metric.
    metric_name: str
    # Current value, kept as text exactly as extracted.
    value: str
    # Target value as text; empty when not stated.
    target: str = ""
    # Person/team responsible for the metric; empty when not stated.
    owner: str = ""
    # Trend as free text; the system prompt in this module asks for one of
    # 向好 / 持平 / 恶化 (improving / flat / worsening), but nothing enforces it.
    trend: str = ""
class MeetingExtraction(BaseModel):
    """Top-level container for everything extracted from one meeting record.

    Only `title` is required; every other field falls back to an empty
    string or an empty list.
    """

    # NOTE: the bare `[]` defaults below are safe here because pydantic copies
    # field defaults per instance (see the pydantic field-defaults docs), unlike
    # mutable defaults on plain classes or function parameters.

    # Meeting title (required).
    title: str
    # Meeting date as free text; empty when not stated.
    date: str = ""
    # Names of the attendees.
    participants: List[str] = []
    # Agenda items.
    agenda: List[str] = []
    # Extracted entities.
    entities: List[Entity] = []
    # Extracted subject-predicate-object relations.
    relations: List[Relation] = []
    # Follow-up tasks.
    action_items: List[ActionItem] = []
    # Decisions and conclusions.
    decisions: List[Decision] = []
    # Numeric indicators.
    metrics: List[MeetingMetric] = []
    # Short summary of the meeting; empty when the model supplies none.
    summary: str = ""
# System prompt (in Chinese) sent with every extraction request. It tells the
# model to act as a meeting-minutes information extractor and to answer in
# JSON, enumerates what to extract (entities, relations, action items,
# decisions, metrics) and sets the rules: facts only, no metaphors /
# hypotheticals / subjective remarks, exact numbers, empty arrays when a
# category has nothing.
# NOTE: this text is runtime input to the model -- any edit, including
# punctuation or whitespace, can change extraction results.
EXTRACTION_SYSTEM_PROMPT = """
你是一个专业的会议纪要信息抽取专家。你的任务是从中文会议记录中抽取结构化信息并严格按照要求的JSON格式返回。
## 抽取内容
### 1. 实体
- 人物:参会人员、提及的人员
- 组织/部门:公司、部门、团队
- 项目/任务:正在进行的项目、任务
- 指标/KPI关键绩效指标如转化率、退单率等
- 概念/制度:管理概念、制度要求
- 地点:会议地点、项目地点
### 2. 关系 (主体-关系谓词-客体)
抽取事实性关系,例如:
- {"subject": "建维部", "subject_type": "组织", "predicate": "负责", "object": "网络运维", "object_type": "任务", "description": ""}
- {"subject": "弱光指标", "subject_type": "指标", "predicate": "目标值", "object": "0.5以下", "object_type": "数值", "description": ""}
### 3. 行动项
谁负责什么任务,截止时间,优先级
### 4. 决策
做出的决定和结论
### 5. 指标数据
具体的数字指标:当前值、目标值、负责人、趋势(向好/持平/恶化)
## 规则
- 只提取事实性信息
- 过滤比喻、假设、主观评价
- 数字指标要精确提取
- entities、relations、action_items、decisions、metrics 如果没有则返回空数组
"""
def _call_llm(system: str, user: str) -> str:
    """Send one system + user message pair to the chat model and return the reply text.

    The model name, token limit and temperature are read from `config.llm`;
    the request goes through the module-level `client`.

    Args:
        system: Content of the system message.
        user: Content of the user message.

    Returns:
        The text content of the first choice of the completion.

    Raises:
        ValueError: If the completion carries no choices, or the first
            choice has no content.
    """
    response = client.chat.completions.create(
        model=config.llm.model,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        max_tokens=config.llm.max_tokens,
        temperature=config.llm.temperature,
    )
    # Guard the indexing below: a missing or empty `choices` would otherwise
    # surface as an opaque IndexError/TypeError instead of the "empty
    # response" error this function already uses.
    choices = response.choices
    if not choices:
        raise ValueError("LLM returned empty response")
    content = choices[0].message.content
    if content is None:
        raise ValueError("LLM returned empty response")
    return content
def extract_meeting_info(text: str) -> MeetingExtraction:
    """Extract structured meeting information from a raw meeting record.

    Builds the user prompt (field-by-field description of the expected JSON
    followed by the record itself), sends it to the model together with
    `EXTRACTION_SYSTEM_PROMPT`, parses the reply as JSON and validates it
    into a `MeetingExtraction`.

    Args:
        text: The meeting record to analyse.

    Returns:
        The validated extraction result.

    Raises:
        Whatever `_call_llm`, `_try_parse_json` or the `MeetingExtraction`
        constructor raise; nothing is caught here.
    """
    prompt = f"""
从以下会议记录中抽取结构化信息。
JSON字段说明
- title: 会议标题
- date: 会议日期
- participants: 参会人列表
- agenda: 议程列表
- entities: 实体列表,每个实体包含 name(名称), entity_type(类型), description(描述)
- relations: 关系列表,每个关系包含 subject(主体), subject_type(主体类型), predicate(关系谓词), object(客体), object_type(客体类型), description(描述)
- action_items: 行动项列表,每条包含 task(任务), assignee(负责人), deadline(截止时间), status(状态), priority(优先级)
- decisions: 决策列表,每条包含 content(决策内容), proposer(提出人), status(状态)
- metrics: 指标列表,每条包含 metric_name(指标名), value(当前值), target(目标值), owner(负责人), trend(趋势)
- summary: 会议摘要
请直接返回JSON对象。不要包含任何额外说明文字。
会议记录:
{text}
"""
    raw_reply = _call_llm(EXTRACTION_SYSTEM_PROMPT, prompt)
    payload = _try_parse_json(raw_reply)
    # Keyword-expand the parsed object so pydantic validates each field.
    return MeetingExtraction(**payload)
def _try_parse_json(content: str) -> dict:
    """Parse a model reply as JSON, tolerating text wrapped around the object.

    The reply is first parsed as-is. If that fails, the first ``{`` is
    located and a single JSON value is decoded starting there, ignoring
    anything that follows it. This accepts replies wrapped in prose or
    Markdown code fences, including trailing text that itself contains
    braces (which a first-``{``-to-last-``}`` slice cannot handle).

    Only the first ``{`` is tried on purpose: scanning later braces could
    return a nested fragment of a truncated reply as if it were the result.

    Args:
        content: Raw text returned by the model.

    Returns:
        The decoded JSON value (a dict whenever the fallback path is used).

    Raises:
        json.JSONDecodeError: The error from the initial parse, when no
            JSON object can be recovered from the text.
    """
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        logger.warning("JSON解析失败尝试修复...")
        start = content.find("{")
        if start != -1:
            try:
                # raw_decode stops at the end of the first complete value,
                # so trailing commentary does not break the parse.
                parsed, _end = json.JSONDecoder().raw_decode(content, start)
            except json.JSONDecodeError as e:
                logger.error("修复后的JSON仍无法解析: %s", e)
            else:
                return parsed
        # Never fall through and return None: re-raise the original error.
        raise