meeting_memory/meeting_processor.py

171 lines
6.4 KiB
Python
Raw Normal View History

2026-05-15 08:39:57 +00:00
import hashlib
import logging
from typing import Optional
from extractor import extract_meeting_info, MeetingExtraction
from vector_store import meeting_vector_store
from obsidian_manager import obsidian_manager
from meeting_state import MeetingStateStore
from config import config
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Shared state store built from config.state_path; MeetingProcessor uses it for
# content hashes and for merging action items and metrics.
state_store = MeetingStateStore(config.state_path)
class MeetingProcessor:
    """Turn raw meeting text into an Obsidian note plus a vector-index entry.

    A run de-duplicates the input (content hash, text similarity, then
    title/date), extracts structured data, merges action items and metrics
    into the shared state store, and finally writes the note and the
    vector-index entry.
    """

    def process_meeting_file(self, filepath: str, force: bool = False) -> Optional[str]:
        """Read a UTF-8 text file and process its contents as a meeting.

        Args:
            filepath: Path of the file to read.
            force: Forwarded unchanged to ``process_meeting_text``.

        Returns:
            Whatever ``process_meeting_text`` returns for the file's text.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
        return self.process_meeting_text(text, force=force)

    def process_meeting_text(self, text: str, force: bool = False) -> Optional[str]:
        """Process one meeting transcript end to end.

        Args:
            text: Raw meeting text.
            force: When true, skip the content-hash and similarity checks and
                overwrite an existing meeting with the same title/date without
                prompting.

        Returns:
            The value returned by ``obsidian_manager.add_meeting`` on success,
            or ``None`` when the text is empty, the meeting is skipped as a
            duplicate, or extraction fails.
        """
        # Nothing can be extracted from blank input, so do not spend a
        # similarity search and an LLM call on it.
        if not text.strip():
            logger.warning("会议文本为空,跳过处理")
            return None

        content_hash = self._compute_content_hash(text)

        if not force:
            if state_store.has_content_hash(content_hash):
                print("\n⚠️ 检测到重复内容(内容指纹匹配),跳过处理")
                logger.info("内容哈希重复,跳过: %s", content_hash[:12])
                return None

            similar = meeting_vector_store.find_similar_text(text, threshold=0.92)
            if similar:
                meta = similar["metadata"]
                print(f"\n⚠️ 发现高度相似的已有会议: 「{meta.get('title', '')}」({meta.get('date', '')}) 相似度: {similar['score']:.2%}")
                if not self._confirm_overwrite():
                    logger.info("跳过相似会议: %s", meta.get('title', ''))
                    return None
                logger.info("覆盖重新处理相似会议")
                # The user already agreed to overwrite; do not ask again in
                # the title/date duplicate check below.
                force = True

        meeting_data = self._extract(text)
        if not meeting_data:
            logger.error("会议信息提取失败")
            return None

        data_dict = meeting_data.model_dump()
        meeting_title = data_dict.get("title", "")
        meeting_date = data_dict.get("date", "")
        # _remove_old reads this key to drop the matching fingerprint.
        data_dict["_content_hash"] = content_hash

        if self._handle_duplicate(data_dict, force):
            return None

        raw_path = obsidian_manager.save_raw_text(
            text,
            title=meeting_title,
            date=meeting_date,
        )
        data_dict["_original_text"] = text
        data_dict["_original_text_path"] = raw_path
        obsidian_manager.mark_raw_processed(raw_path)

        meeting_filename = obsidian_manager._meeting_filename(data_dict)
        data_dict["action_items"] = state_store.merge_action_items(
            data_dict.get("action_items", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )
        data_dict["metrics"] = state_store.merge_metrics(
            data_dict.get("metrics", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )
        # Persist the merged action items / metrics before the note is written.
        state_store.save()

        vault_path = obsidian_manager.add_meeting(data_dict, text)
        meeting_vector_store.add_meeting(data_dict)

        # Record the fingerprint only after both writes succeeded. If it were
        # saved earlier, a failed write would leave this text flagged as
        # processed and the next run would silently skip it.
        state_store.add_content_hash(content_hash, meeting_title, meeting_date, meeting_filename)
        state_store.save()

        logger.info("会议处理完成: %s", meeting_data.title)
        return vault_path

    def _confirm_overwrite(self, read=None) -> bool:
        """Ask on the console whether to skip or overwrite.

        Re-prompts until the answer is ``s`` or ``o`` (case-insensitive,
        surrounding whitespace ignored). An empty answer selects the default,
        skip. End of input (``EOFError``) is also treated as skip so that a
        run without an interactive stdin does not crash.

        Args:
            read: Callable that takes the prompt and returns the answer.
                Defaults to the built-in ``input``.

        Returns:
            ``True`` when the user chose overwrite, ``False`` for skip.
        """
        reader = input if read is None else read
        while True:
            try:
                choice = reader(" 选择操作 [s]跳过 / [o]覆盖 (默认 s): ").strip().lower() or "s"
            except EOFError:
                # Finish the dangling prompt line, then fall back to the default.
                print()
                return False
            if choice == "s":
                return False
            if choice == "o":
                return True
            print(" 请输入 s(skip) 或 o(overwrite)")

    def _handle_duplicate(self, data_dict: dict, force: bool) -> bool:
        """Resolve a clash with an already stored meeting of the same title/date.

        A clash exists when the vector store finds the meeting or the note
        file already exists. With ``force`` the old data is removed without
        asking; otherwise the user is prompted.

        Args:
            data_dict: Extracted meeting data; ``title`` and ``date`` are read.
            force: Overwrite without prompting.

        Returns:
            ``True`` when processing should stop (user chose skip), ``False``
            when there is no clash or the old data has been removed.
        """
        title = data_dict.get("title", "")
        date = data_dict.get("date", "")
        existing = meeting_vector_store.find_meeting(title, date)
        file_exists = obsidian_manager.meeting_file_exists(data_dict)
        if not existing and not file_exists:
            return False

        if force:
            logger.info("发现重复会议「%s」,--force 模式自动覆盖", title)
            self._remove_old(data_dict)
            return False

        print(f"\n⚠️ 发现重复会议: 「{title}」({date})")
        if not self._confirm_overwrite():
            logger.info("跳过重复会议: %s", title)
            return True

        logger.info("覆盖重新处理: %s", title)
        self._remove_old(data_dict)
        return False

    def _remove_old(self, data_dict: dict):
        """Delete the stored vector entry, note and content hash for a meeting.

        Args:
            data_dict: Meeting data identifying what to remove. The content
                hash is only removed when ``_content_hash`` is present and
                non-empty.
        """
        meeting_id = meeting_vector_store._meeting_id(data_dict)
        meeting_vector_store.remove_meeting(meeting_id)
        obsidian_manager.remove_meeting_note(data_dict)
        content_hash = data_dict.get("_content_hash", "")
        if content_hash:
            state_store.remove_content_hash(content_hash)
        logger.info("旧数据清理完成: %s", data_dict.get('title', ''))

    def _compute_content_hash(self, text: str) -> str:
        """Return the SHA-256 hex digest of the normalised text.

        Normalisation strips leading/trailing whitespace and converts CRLF to
        LF, so the same content saved with different line endings yields the
        same fingerprint.
        """
        normalized = text.strip().replace('\r\n', '\n')
        return hashlib.sha256(normalized.encode('utf-8')).hexdigest()

    def _extract(self, text: str) -> "Optional[MeetingExtraction]":
        """Run ``extract_meeting_info`` on the text.

        Returns:
            The extraction result, or ``None`` when the extractor raises. The
            failure is logged together with its traceback.
        """
        try:
            return extract_meeting_info(text)
        except Exception as e:
            # Boundary around the extractor: report the failure to the caller
            # as None, but keep the traceback in the log for diagnosis.
            logger.error("LLM提取失败: %s", e, exc_info=True)
            return None

    def query(self, question: str, top_k: int = 3) -> str:
        """Return ``meeting_vector_store.query_as_context`` for the question.

        Args:
            question: Query text.
            top_k: Forwarded to the vector store as ``top_k``.
        """
        return meeting_vector_store.query_as_context(question, top_k=top_k)

    @staticmethod
    def _count_markdown_files(*path_parts: str) -> int:
        """Count the ``.md`` entries directly inside the joined directory path.

        Returns 0 when the path does not exist or is not a directory.
        """
        # Local import: the module's top-level imports do not include os.
        import os

        directory = os.path.join(*path_parts)
        if not os.path.isdir(directory):
            return 0
        return sum(1 for name in os.listdir(directory) if name.endswith(".md"))

    def stats(self) -> dict:
        """Collect counts and statistics from the vault, vector index and state.

        Returns:
            A dict with the keys ``obsidian_meetings`` and
            ``obsidian_entities`` (numbers of ``.md`` files in the configured
            directories), ``vector_index`` and ``state`` (the respective
            ``get_stats()`` results) and ``vault_path``.
        """
        vault = config.obsidian.vault_path
        return {
            "obsidian_meetings": self._count_markdown_files(vault, config.obsidian.meetings_dir),
            "obsidian_entities": self._count_markdown_files(vault, config.obsidian.entities_dir),
            "vector_index": meeting_vector_store.get_stats(),
            "state": state_store.get_stats(),
            "vault_path": vault,
        }
# Module-level MeetingProcessor instance, created when this module is imported.
meeting_processor = MeetingProcessor()