import json
import logging
import re
from typing import List, Optional

from pydantic import BaseModel
from openai import OpenAI

from config import config

logger = logging.getLogger(__name__)

# Shared OpenAI-compatible client. Empty/falsy config values are normalised to
# None before being handed to the SDK.
client = OpenAI(
    api_key=config.llm.api_key or None,
    base_url=config.llm.base_url or None,
)

# Reusable decoder for the recovery path in _try_parse_json.
_JSON_DECODER = json.JSONDecoder()


# NOTE: the models below are documented with comments rather than docstrings
# so that the generated pydantic JSON schema is left untouched.

# A named entity mentioned in the meeting (person, organisation, project, ...).
class Entity(BaseModel):
    name: str
    entity_type: str
    description: str = ""


# A factual (subject, predicate, object) relation between two entities.
class Relation(BaseModel):
    subject: str
    subject_type: str
    predicate: str
    object: str
    object_type: str
    description: str = ""


# A follow-up task with optional assignee, deadline, status and priority.
class ActionItem(BaseModel):
    task: str
    assignee: str = ""
    deadline: str = ""
    status: str = "待办"
    priority: str = "中"


# A decision recorded in the meeting.
class Decision(BaseModel):
    content: str
    proposer: str = ""
    status: str = "已决"


# A numeric indicator reported in the meeting.
class MeetingMetric(BaseModel):
    metric_name: str
    value: str
    target: str = ""
    owner: str = ""
    trend: str = ""


# Top-level extraction result; only `title` is required.
class MeetingExtraction(BaseModel):
    title: str
    date: str = ""
    participants: List[str] = []
    agenda: List[str] = []
    entities: List[Entity] = []
    relations: List[Relation] = []
    action_items: List[ActionItem] = []
    decisions: List[Decision] = []
    metrics: List[MeetingMetric] = []
    summary: str = ""


EXTRACTION_SYSTEM_PROMPT = """
你是一个专业的会议纪要信息抽取专家。你的任务是从中文会议记录中抽取结构化信息,并严格按照要求的JSON格式返回。

## 抽取内容

### 1. 实体
- 人物:参会人员、提及的人员
- 组织/部门:公司、部门、团队
- 项目/任务:正在进行的项目、任务
- 指标/KPI:关键绩效指标(如转化率、退单率等)
- 概念/制度:管理概念、制度要求
- 地点:会议地点、项目地点

### 2. 关系 (主体-关系谓词-客体)
抽取事实性关系,例如:
- {"subject": "建维部", "subject_type": "组织", "predicate": "负责", "object": "网络运维", "object_type": "任务", "description": ""}
- {"subject": "弱光指标", "subject_type": "指标", "predicate": "目标值", "object": "0.5以下", "object_type": "数值", "description": ""}

### 3. 行动项
谁负责什么任务,截止时间,优先级

### 4. 决策
做出的决定和结论

### 5. 指标数据
具体的数字指标:当前值、目标值、负责人、趋势(向好/持平/恶化)

## 规则
- 只提取事实性信息
- 过滤比喻、假设、主观评价
- 数字指标要精确提取
- entities、relations、action_items、decisions、metrics 如果没有则返回空数组
"""


def _call_llm(system: str, user: str) -> str:
    """Send one system + user message pair to the chat model and return its text.

    Args:
        system: Content of the system message.
        user: Content of the user message.

    Returns:
        The message content of the first completion choice.

    Raises:
        ValueError: If the response carries no choices or the first choice
            has no content.
    """
    response = client.chat.completions.create(
        model=config.llm.model,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        max_tokens=config.llm.max_tokens,
        temperature=config.llm.temperature,
    )
    # Guard the index: a response without choices would otherwise surface as a
    # bare IndexError/TypeError instead of the "empty response" error below.
    choices = response.choices
    if not choices:
        raise ValueError("LLM returned empty response")
    content = choices[0].message.content
    if content is None:
        raise ValueError("LLM returned empty response")
    return content


def extract_meeting_info(text: str) -> MeetingExtraction:
    """Extract structured meeting information from raw meeting notes.

    Builds the user prompt around ``text``, queries the model with
    ``EXTRACTION_SYSTEM_PROMPT``, parses the reply as JSON and validates it
    against ``MeetingExtraction``.

    Args:
        text: The meeting record to analyse.

    Returns:
        The validated extraction result.

    Raises:
        ValueError: If the model reply is empty or is not a JSON object
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
            Validation errors raised by ``MeetingExtraction`` propagate
            unchanged.
    """
    user_prompt = f"""
从以下会议记录中抽取结构化信息。

JSON字段说明:
- title: 会议标题
- date: 会议日期
- participants: 参会人列表
- agenda: 议程列表
- entities: 实体列表,每个实体包含 name(名称), entity_type(类型), description(描述)
- relations: 关系列表,每个关系包含 subject(主体), subject_type(主体类型), predicate(关系谓词), object(客体), object_type(客体类型), description(描述)
- action_items: 行动项列表,每条包含 task(任务), assignee(负责人), deadline(截止时间), status(状态), priority(优先级)
- decisions: 决策列表,每条包含 content(决策内容), proposer(提出人), status(状态)
- metrics: 指标列表,每条包含 metric_name(指标名), value(当前值), target(目标值), owner(负责人), trend(趋势)
- summary: 会议摘要

请直接返回JSON对象。不要包含任何额外说明文字。

会议记录:
{text}
"""
    content = _call_llm(EXTRACTION_SYSTEM_PROMPT, user_prompt)
    data = _try_parse_json(content)
    return MeetingExtraction(**data)


def _try_parse_json(content: str) -> dict:
    """Parse a model reply into a JSON object, tolerating surrounding text.

    The reply is first parsed as-is. If that fails, parsing is retried from
    the first ``{`` using ``raw_decode``, which stops at the end of the first
    complete JSON value. This recovers replies wrapped in markdown fences or
    followed by commentary, including commentary that itself contains braces.

    Args:
        content: Raw text returned by the model.

    Returns:
        The decoded JSON object.

    Raises:
        json.JSONDecodeError: If no JSON object can be decoded from
            ``content``.
        ValueError: If the reply is valid JSON but its top-level value is not
            an object.
    """
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        logger.warning("JSON解析失败,尝试修复...")
        start = content.find("{")
        if start == -1:
            # Nothing that could be an object: surface the original error.
            raise
        try:
            # Only the first "{" is tried on purpose: scanning later braces
            # could return a nested object from a truncated reply.
            data, _ = _JSON_DECODER.raw_decode(content[start:])
        except json.JSONDecodeError as e:
            logger.error("修复后的JSON仍无法解析: %s", e)
            raise
    if not isinstance(data, dict):
        # The caller unpacks the result with **, so anything but an object
        # would fail there with an opaque TypeError.
        raise ValueError(
            f"Expected a JSON object from the LLM, got {type(data).__name__}"
        )
    return data