"""Meeting-processing pipeline: de-duplicate, extract, merge state, persist."""

import hashlib
import logging
import os
from typing import Optional

from extractor import extract_meeting_info, MeetingExtraction
from vector_store import meeting_vector_store
from obsidian_manager import obsidian_manager
from meeting_state import MeetingStateStore
from config import config

logger = logging.getLogger(__name__)
state_store = MeetingStateStore(config.state_path)


class MeetingProcessor:
    """Coordinate extraction, duplicate handling and storage of meeting text."""

    def process_meeting_file(self, filepath: str, force: bool = False) -> Optional[str]:
        """Read ``filepath`` as UTF-8 and run it through ``process_meeting_text``.

        Args:
            filepath: Path of the text file to process.
            force: Forwarded unchanged to ``process_meeting_text``.

        Returns:
            Whatever ``process_meeting_text`` returns for the file's content.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
        return self.process_meeting_text(text, force=force)

    def process_meeting_text(self, text: str, force: bool = False) -> Optional[str]:
        """Process one meeting transcript end to end.

        Steps, in order: exact-duplicate check by content hash, near-duplicate
        check through the vector store (interactive prompt), extraction,
        title/date duplicate handling, raw-text save, merge of action items
        and metrics into the state store, state save, then writing the
        meeting to the vault and the vector index.

        Args:
            text: Raw meeting text.
            force: When true, skip both duplicate checks that run before
                extraction and overwrite an existing meeting without asking.

        Returns:
            The value returned by ``obsidian_manager.add_meeting``, or ``None``
            when the text was skipped as a duplicate or extraction failed.
        """
        content_hash = self._compute_content_hash(text)

        if not force and state_store.has_content_hash(content_hash):
            print("\n⚠️ 检测到重复内容(内容指纹匹配),跳过处理")
            logger.info("内容哈希重复,跳过: %s", content_hash[:12])
            return None

        if not force:
            similar = meeting_vector_store.find_similar_text(text, threshold=0.92)
            if similar:
                meta = similar["metadata"]
                print(
                    f"\n⚠️ 发现高度相似的已有会议: 「{meta.get('title', '')}」"
                    f"({meta.get('date', '')}) 相似度: {similar['score']:.2%}"
                )
                if not self._confirm_overwrite():
                    logger.info("跳过相似会议: %s", meta.get("title", ""))
                    return None
                logger.info("覆盖重新处理相似会议")
                # The user already agreed to overwrite, so the title/date
                # duplicate check below must not ask a second time.
                force = True

        meeting_data = self._extract(text)
        if not meeting_data:
            logger.error("会议信息提取失败")
            return None

        data_dict = meeting_data.model_dump()
        meeting_title = data_dict.get("title", "")
        meeting_date = data_dict.get("date", "")
        data_dict["_content_hash"] = content_hash

        if self._handle_duplicate(data_dict, force):
            return None

        raw_path = obsidian_manager.save_raw_text(
            text,
            title=meeting_title,
            date=meeting_date,
        )
        data_dict["_original_text"] = text
        data_dict["_original_text_path"] = raw_path
        obsidian_manager.mark_raw_processed(raw_path)

        meeting_filename = obsidian_manager._meeting_filename(data_dict)

        data_dict["action_items"] = state_store.merge_action_items(
            data_dict.get("action_items", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )
        data_dict["metrics"] = state_store.merge_metrics(
            data_dict.get("metrics", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )

        # NOTE(review): the content hash is recorded and the state saved
        # before the vault note and vector index are written. If either write
        # raises, a later non-forced run would be skipped as a duplicate.
        # Confirm whether add_meeting relies on saved state before reordering.
        state_store.add_content_hash(content_hash, meeting_title, meeting_date, meeting_filename)
        state_store.save()

        vault_path = obsidian_manager.add_meeting(data_dict, text)
        meeting_vector_store.add_meeting(data_dict)

        logger.info("会议处理完成: %s", meeting_data.title)
        return vault_path

    def _confirm_overwrite(self) -> bool:
        """Ask on stdin whether to skip or overwrite; return True for overwrite.

        An empty answer selects the default (skip). Unrecognized answers
        re-prompt. If stdin is exhausted (``EOFError``), the default is taken
        instead of crashing, so non-interactive runs skip.
        """
        while True:
            try:
                choice = input(" 选择操作 [s]跳过 / [o]覆盖 (默认 s): ").strip().lower() or "s"
            except EOFError:
                logger.warning("无法读取用户输入,按默认选项跳过")
                return False
            if choice == "s":
                return False
            if choice == "o":
                return True
            print(" 请输入 s(skip) 或 o(overwrite)")

    def _handle_duplicate(self, data_dict: dict, force: bool) -> bool:
        """Resolve a title/date collision with an already stored meeting.

        A collision exists when the vector store finds a meeting with the same
        title and date, or the vault already has a file for this meeting.

        Args:
            data_dict: Extracted meeting data; ``title`` and ``date`` are read.
            force: When true, old data is removed without prompting.

        Returns:
            ``True`` if processing should stop (the user chose to skip),
            ``False`` if there was no collision or old data was removed.
        """
        title = data_dict.get("title", "")
        date = data_dict.get("date", "")

        existing = meeting_vector_store.find_meeting(title, date)
        file_exists = obsidian_manager.meeting_file_exists(data_dict)

        if not existing and not file_exists:
            return False

        if force:
            logger.info("发现重复会议「%s」,--force 模式自动覆盖", title)
            self._remove_old(data_dict)
            return False

        print(f"\n⚠️ 发现重复会议: 「{title}」({date})")
        if not self._confirm_overwrite():
            logger.info("跳过重复会议: %s", title)
            return True

        logger.info("覆盖重新处理: %s", title)
        self._remove_old(data_dict)
        return False

    def _remove_old(self, data_dict: dict):
        """Delete the stored vector entry, vault note and content hash.

        The content hash removed is the one carried in
        ``data_dict["_content_hash"]``, if present and non-empty.
        """
        meeting_id = meeting_vector_store._meeting_id(data_dict)
        meeting_vector_store.remove_meeting(meeting_id)
        obsidian_manager.remove_meeting_note(data_dict)

        content_hash = data_dict.get("_content_hash", "")
        if content_hash:
            state_store.remove_content_hash(content_hash)

        logger.info("旧数据清理完成: %s", data_dict.get("title", ""))

    def _compute_content_hash(self, text: str) -> str:
        """Return the SHA-256 hex digest of ``text`` after light normalization.

        Surrounding whitespace is stripped and CRLF is converted to LF so the
        same content hashes identically regardless of those differences.
        """
        # Keep this normalization stable: hashes already recorded in the state
        # store were computed with exactly these steps.
        normalized = text.strip().replace("\r\n", "\n")
        return hashlib.sha256(normalized.encode("utf-8")).hexdigest()

    def _extract(self, text: str) -> Optional[MeetingExtraction]:
        """Run ``extract_meeting_info`` on ``text``; return ``None`` on any error.

        Failures are logged with their traceback rather than propagated, so
        the caller can treat ``None`` as "extraction failed".
        """
        try:
            return extract_meeting_info(text)
        except Exception as e:
            logger.exception("LLM提取失败: %s", e)
            return None

    def query(self, question: str, top_k: int = 3) -> str:
        """Return ``meeting_vector_store.query_as_context`` for ``question``."""
        return meeting_vector_store.query_as_context(question, top_k=top_k)

    @staticmethod
    def _count_markdown_files(directory: str) -> int:
        """Count entries ending in ``.md`` in ``directory``; 0 if not a directory."""
        try:
            names = os.listdir(directory)
        except (FileNotFoundError, NotADirectoryError):
            return 0
        return sum(1 for name in names if name.endswith(".md"))

    def stats(self) -> dict:
        """Collect counts from the vault, the vector index and the state store.

        Returns:
            A dict with keys ``obsidian_meetings`` and ``obsidian_entities``
            (numbers of ``.md`` files in the configured directories),
            ``vector_index``, ``state`` and ``vault_path``.
        """
        vault = config.obsidian.vault_path
        meetings_dir = os.path.join(vault, config.obsidian.meetings_dir)
        entities_dir = os.path.join(vault, config.obsidian.entities_dir)

        return {
            "obsidian_meetings": self._count_markdown_files(meetings_dir),
            "obsidian_entities": self._count_markdown_files(entities_dir),
            "vector_index": meeting_vector_store.get_stats(),
            "state": state_store.get_stats(),
            "vault_path": vault,
        }


meeting_processor = MeetingProcessor()