157 lines
4.6 KiB
Python
157 lines
4.6 KiB
Python
|
|
import json
|
|||
|
|
import logging
|
|||
|
|
import re
|
|||
|
|
from typing import List, Optional
|
|||
|
|
from pydantic import BaseModel
|
|||
|
|
|
|||
|
|
from openai import OpenAI
|
|||
|
|
|
|||
|
|
from config import config
|
|||
|
|
|
|||
|
|
# Module-level logger shared by every helper in this file.
logger = logging.getLogger(__name__)

# Single OpenAI SDK client reused for every request. Falsy settings
# (e.g. an empty string) are passed as None instead of being forwarded as-is.
client = OpenAI(
    api_key=config.llm.api_key or None,
    base_url=config.llm.base_url or None,
)
|
|||
|
|
|
|||
|
|
|
|||
|
|
class Entity(BaseModel):
    # One entity extracted from a meeting record (person, organisation,
    # project, KPI, concept or place, per the extraction prompt).

    # Entity name (required).
    name: str
    # Entity category label (required).
    entity_type: str
    # Optional free-text description; empty string when not supplied.
    description: str = ""
|
|||
|
|
|
|||
|
|
|
|||
|
|
class Relation(BaseModel):
    # A factual subject-predicate-object triple extracted from the meeting.

    # Subject of the relation and its type label (both required).
    subject: str
    subject_type: str
    # Predicate linking subject to object (required).
    predicate: str
    # Object of the relation and its type label (both required).
    object: str
    object_type: str
    # Optional free-text description; empty string when not supplied.
    description: str = ""
|
|||
|
|
|
|||
|
|
|
|||
|
|
class ActionItem(BaseModel):
    # A follow-up task recorded in the meeting.

    # What has to be done (required).
    task: str
    # Person responsible; empty string when not supplied.
    assignee: str = ""
    # Due date as free text; empty string when not supplied.
    deadline: str = ""
    # Status label; defaults to "待办".
    status: str = "待办"
    # Priority label; defaults to "中".
    priority: str = "中"
|
|||
|
|
|
|||
|
|
|
|||
|
|
class Decision(BaseModel):
    # A decision or conclusion reached in the meeting.

    # Text of the decision (required).
    content: str
    # Who proposed it; empty string when not supplied.
    proposer: str = ""
    # Status label; defaults to "已决".
    status: str = "已决"
|
|||
|
|
|
|||
|
|
|
|||
|
|
class MeetingMetric(BaseModel):
    # A numeric indicator reported in the meeting. Values are kept as strings.

    # Indicator name (required).
    metric_name: str
    # Current value (required).
    value: str
    # Target value; empty string when not supplied.
    target: str = ""
    # Person responsible for the indicator; empty string when not supplied.
    owner: str = ""
    # Trend label; empty string when not supplied.
    trend: str = ""
|
|||
|
|
|
|||
|
|
|
|||
|
|
class MeetingExtraction(BaseModel):
    # Top-level result of one extraction run: meeting metadata plus every
    # structured item pulled out of the record. Only `title` is required.

    # Meeting title (required).
    title: str
    # Meeting date as free text; empty string when not supplied.
    date: str = ""
    # Attendee names.
    participants: List[str] = []
    # Agenda entries.
    agenda: List[str] = []
    # Extracted entities.
    entities: List[Entity] = []
    # Extracted subject-predicate-object relations.
    relations: List[Relation] = []
    # Extracted follow-up tasks.
    action_items: List[ActionItem] = []
    # Extracted decisions.
    decisions: List[Decision] = []
    # Extracted numeric indicators.
    metrics: List[MeetingMetric] = []
    # Meeting summary; empty string when not supplied.
    summary: str = ""
|
|||
|
|
|
|||
|
|
|
|||
|
|
# System prompt (in Chinese) sent with every extraction request. It tells the
# model which item categories to extract (entities, relations, action items,
# decisions, metrics) and the rules to follow, including returning empty
# arrays for categories with no items.
EXTRACTION_SYSTEM_PROMPT = """
你是一个专业的会议纪要信息抽取专家。你的任务是从中文会议记录中抽取结构化信息,并严格按照要求的JSON格式返回。

## 抽取内容

### 1. 实体
- 人物:参会人员、提及的人员
- 组织/部门:公司、部门、团队
- 项目/任务:正在进行的项目、任务
- 指标/KPI:关键绩效指标(如转化率、退单率等)
- 概念/制度:管理概念、制度要求
- 地点:会议地点、项目地点

### 2. 关系 (主体-关系谓词-客体)
抽取事实性关系,例如:
- {"subject": "建维部", "subject_type": "组织", "predicate": "负责", "object": "网络运维", "object_type": "任务", "description": ""}
- {"subject": "弱光指标", "subject_type": "指标", "predicate": "目标值", "object": "0.5以下", "object_type": "数值", "description": ""}

### 3. 行动项
谁负责什么任务,截止时间,优先级

### 4. 决策
做出的决定和结论

### 5. 指标数据
具体的数字指标:当前值、目标值、负责人、趋势(向好/持平/恶化)

## 规则
- 只提取事实性信息
- 过滤比喻、假设、主观评价
- 数字指标要精确提取
- entities、relations、action_items、decisions、metrics 如果没有则返回空数组
"""
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _call_llm(system: str, user: str) -> str:
    """Run one chat completion and return the text of the first choice.

    Args:
        system: Content of the system message.
        user: Content of the user message.

    Returns:
        The message content of the first returned choice.

    Raises:
        ValueError: If the first choice carries no content (``None``).
    """
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    completion = client.chat.completions.create(
        model=config.llm.model,
        messages=messages,
        max_tokens=config.llm.max_tokens,
        temperature=config.llm.temperature,
    )
    reply = completion.choices[0].message.content
    if reply is not None:
        return reply
    raise ValueError("LLM returned empty response")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_meeting_info(text: str) -> MeetingExtraction:
    """Extract structured meeting information from a raw meeting record.

    Builds a user prompt that lists the expected JSON fields followed by the
    record itself, sends it together with ``EXTRACTION_SYSTEM_PROMPT`` through
    ``_call_llm``, parses the reply with ``_try_parse_json`` and validates the
    result as a ``MeetingExtraction``.

    Args:
        text: The meeting record to analyse.

    Returns:
        The validated extraction result.
    """
    prompt = f"""
从以下会议记录中抽取结构化信息。

JSON字段说明:
- title: 会议标题
- date: 会议日期
- participants: 参会人列表
- agenda: 议程列表
- entities: 实体列表,每个实体包含 name(名称), entity_type(类型), description(描述)
- relations: 关系列表,每个关系包含 subject(主体), subject_type(主体类型), predicate(关系谓词), object(客体), object_type(客体类型), description(描述)
- action_items: 行动项列表,每条包含 task(任务), assignee(负责人), deadline(截止时间), status(状态), priority(优先级)
- decisions: 决策列表,每条包含 content(决策内容), proposer(提出人), status(状态)
- metrics: 指标列表,每条包含 metric_name(指标名), value(当前值), target(目标值), owner(负责人), trend(趋势)
- summary: 会议摘要

请直接返回JSON对象。不要包含任何额外说明文字。

会议记录:
{text}
"""
    parsed = _try_parse_json(_call_llm(EXTRACTION_SYSTEM_PROMPT, prompt))
    return MeetingExtraction(**parsed)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _try_parse_json(content: str) -> dict:
|
|||
|
|
try:
|
|||
|
|
return json.loads(content)
|
|||
|
|
except json.JSONDecodeError:
|
|||
|
|
logger.warning("JSON解析失败,尝试修复...")
|
|||
|
|
match = re.search(r'\{.*\}', content, re.DOTALL)
|
|||
|
|
if match:
|
|||
|
|
try:
|
|||
|
|
return json.loads(match.group())
|
|||
|
|
except json.JSONDecodeError as e:
|
|||
|
|
logger.error(f"修复后的JSON仍无法解析: {e}")
|
|||
|
|
raise
|