171 lines
6.4 KiB
Python
171 lines
6.4 KiB
Python
|
|
import hashlib
|
||
|
|
import logging
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
from extractor import extract_meeting_info, MeetingExtraction
|
||
|
|
from vector_store import meeting_vector_store
|
||
|
|
from obsidian_manager import obsidian_manager
|
||
|
|
from meeting_state import MeetingStateStore
|
||
|
|
from config import config
|
||
|
|
|
||
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
# Shared meeting state store, constructed at import time from config.state_path.
# MeetingProcessor uses it for content-hash checks and action-item/metric merging.
state_store = MeetingStateStore(config.state_path)
|
||
|
|
|
||
|
|
|
||
|
|
class MeetingProcessor:
    """Pipeline that turns raw meeting text into a vault note and a vector index entry.

    The processor de-duplicates input (exact content hash, then text
    similarity, then title/date match), extracts structured data, merges
    action items and metrics through the module-level ``state_store``, and
    finally writes to ``obsidian_manager`` and ``meeting_vector_store``.
    """

    def process_meeting_file(self, filepath: str, force: bool = False) -> Optional[str]:
        """Read *filepath* as UTF-8 and process its content.

        Args:
            filepath: Path of the text file to read.
            force: Passed through to :meth:`process_meeting_text`.

        Returns:
            Whatever :meth:`process_meeting_text` returns for the file content.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()

        return self.process_meeting_text(text, force=force)

    def process_meeting_text(self, text: str, force: bool = False) -> Optional[str]:
        """Process one meeting transcript.

        Args:
            text: Raw meeting text.
            force: When true, skip the content-hash and similarity checks and
                overwrite an existing meeting without prompting.

        Returns:
            The value returned by ``obsidian_manager.add_meeting`` on success,
            or ``None`` when the meeting is skipped or extraction fails.
        """
        content_hash = self._compute_content_hash(text)

        # Exact duplicate: same normalized content was already processed.
        if not force and state_store.has_content_hash(content_hash):
            print("\n⚠️ 检测到重复内容(内容指纹匹配),跳过处理")
            logger.info(f"内容哈希重复,跳过: {content_hash[:12]}")
            return None

        # Near duplicate: let the user decide whether to skip or overwrite.
        if not force:
            similar = meeting_vector_store.find_similar_text(text, threshold=0.92)
            if similar:
                meta = similar["metadata"]
                print(f"\n⚠️ 发现高度相似的已有会议: 「{meta.get('title', '')}」({meta.get('date', '')}) 相似度: {similar['score']:.2%}")
                if self._prompt_skip_or_overwrite() == "s":
                    logger.info(f"跳过相似会议: {meta.get('title', '')}")
                    return None
                logger.info("覆盖重新处理相似会议")
                # Overwrite was chosen, so the title/date check below must not ask again.
                force = True

        meeting_data = self._extract(text)
        if not meeting_data:
            logger.error("会议信息提取失败")
            return None

        data_dict = meeting_data.model_dump()
        meeting_title = data_dict.get("title", "")
        meeting_date = data_dict.get("date", "")
        data_dict["_content_hash"] = content_hash

        should_skip = self._handle_duplicate(data_dict, force)
        if should_skip:
            return None

        raw_path = obsidian_manager.save_raw_text(
            text,
            title=meeting_title,
            date=meeting_date,
        )
        data_dict["_original_text"] = text
        data_dict["_original_text_path"] = raw_path

        obsidian_manager.mark_raw_processed(raw_path)

        meeting_filename = obsidian_manager._meeting_filename(data_dict)

        merged_items = state_store.merge_action_items(
            data_dict.get("action_items", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )
        data_dict["action_items"] = merged_items

        merged_metrics = state_store.merge_metrics(
            data_dict.get("metrics", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )
        data_dict["metrics"] = merged_metrics

        # NOTE(review): state is persisted before the vault/vector writes
        # below; if either of those raises, the content hash is already
        # recorded. Confirm this ordering is intended.
        state_store.add_content_hash(content_hash, meeting_title, meeting_date, meeting_filename)
        state_store.save()

        vault_path = obsidian_manager.add_meeting(data_dict, text)

        meeting_vector_store.add_meeting(data_dict)

        logger.info(f"会议处理完成: {meeting_data.title}")
        return vault_path

    @staticmethod
    def _prompt_skip_or_overwrite(read=None) -> str:
        """Ask the user to skip or overwrite and return ``"s"`` or ``"o"``.

        An empty answer means skip. Any other answer prints a hint and asks
        again. If the input stream is exhausted (``EOFError``, e.g. stdin is
        closed in a non-interactive run) the default, skip, is returned
        instead of letting the exception abort processing.

        Args:
            read: Optional callable taking the prompt and returning the
                answer. Defaults to the built-in ``input``, resolved at call
                time.
        """
        reader = input if read is None else read
        while True:
            try:
                choice = reader(" 选择操作 [s]跳过 / [o]覆盖 (默认 s): ").strip().lower() or "s"
            except EOFError:
                return "s"
            if choice in ("s", "o"):
                return choice
            print(" 请输入 s(skip) 或 o(overwrite)")

    def _handle_duplicate(self, data_dict: dict, force: bool) -> bool:
        """Resolve a title/date collision with an already stored meeting.

        Args:
            data_dict: Extracted meeting data; ``title`` and ``date`` are read.
            force: When true, an existing meeting is removed without prompting.

        Returns:
            ``True`` when the caller should skip this meeting, ``False`` when
            processing should continue (no duplicate, or old data removed).
        """
        title = data_dict.get("title", "")
        date = data_dict.get("date", "")

        existing = meeting_vector_store.find_meeting(title, date)
        file_exists = obsidian_manager.meeting_file_exists(data_dict)

        if not existing and not file_exists:
            return False

        if force:
            logger.info(f"发现重复会议「{title}」,--force 模式自动覆盖")
            self._remove_old(data_dict)
            return False

        print(f"\n⚠️ 发现重复会议: 「{title}」({date})")
        if self._prompt_skip_or_overwrite() == "s":
            logger.info(f"跳过重复会议: {title}")
            return True

        logger.info(f"覆盖重新处理: {title}")
        self._remove_old(data_dict)
        return False

    def _remove_old(self, data_dict: dict):
        """Remove the stored vector entry, vault note and content hash for a meeting."""
        meeting_id = meeting_vector_store._meeting_id(data_dict)
        meeting_vector_store.remove_meeting(meeting_id)
        obsidian_manager.remove_meeting_note(data_dict)
        # NOTE(review): "_content_hash" is set by process_meeting_text to the
        # hash of the incoming text, which may differ from the hash recorded
        # for the old meeting. Confirm the old hash does not need removing too.
        content_hash = data_dict.get("_content_hash", "")
        if content_hash:
            state_store.remove_content_hash(content_hash)
        logger.info(f"旧数据清理完成: {data_dict.get('title', '')}")

    def _compute_content_hash(self, text: str) -> str:
        """Return the SHA-256 hex digest of *text*.

        The text is stripped of surrounding whitespace and CRLF line endings
        are converted to LF before hashing.
        """
        normalized = text.strip().replace('\r\n', '\n')
        return hashlib.sha256(normalized.encode('utf-8')).hexdigest()

    def _extract(self, text: str) -> Optional["MeetingExtraction"]:
        """Run ``extract_meeting_info`` on *text*; return ``None`` if it raises."""
        try:
            return extract_meeting_info(text)
        except Exception as e:
            # Deliberate catch-all: a failed extraction is reported to the
            # caller as None. logger.exception keeps the traceback in the log.
            logger.exception(f"LLM提取失败: {e}")
            return None

    def query(self, question: str, top_k: int = 3) -> str:
        """Return ``meeting_vector_store.query_as_context`` for *question*."""
        return meeting_vector_store.query_as_context(question, top_k=top_k)

    @staticmethod
    def _markdown_files(directory: str) -> list:
        """Return the ``.md`` file names in *directory*, or ``[]`` if it is not a directory."""
        import os

        # isdir (rather than exists) so a regular file at this path yields an
        # empty list instead of NotADirectoryError from os.listdir.
        if not os.path.isdir(directory):
            return []
        return [name for name in os.listdir(directory) if name.endswith(".md")]

    def stats(self) -> dict:
        """Collect counts and statistics about the vault, vector index and state store.

        Returns:
            A dict with keys ``obsidian_meetings`` and ``obsidian_entities``
            (number of ``.md`` files in the configured directories),
            ``vector_index``, ``state`` and ``vault_path``.
        """
        import os

        vault = config.obsidian.vault_path
        meetings_dir = os.path.join(vault, config.obsidian.meetings_dir)
        entities_dir = os.path.join(vault, config.obsidian.entities_dir)

        meeting_files = self._markdown_files(meetings_dir)
        entity_files = self._markdown_files(entities_dir)

        vs_stats = meeting_vector_store.get_stats()
        state_stats = state_store.get_stats()

        return {
            "obsidian_meetings": len(meeting_files),
            "obsidian_entities": len(entity_files),
            "vector_index": vs_stats,
            "state": state_stats,
            "vault_path": vault,
        }
|
||
|
|
|
||
|
|
|
||
|
|
# Module-level MeetingProcessor instance, created when this module is imported.
meeting_processor = MeetingProcessor()
|