135 lines
4.8 KiB
Python
135 lines
4.8 KiB
Python
|
|
import argparse
|
||
|
|
import json
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
from agents.chat import get_qwen_response
|
||
|
|
from prompt_loader import load_prompt
|
||
|
|
|
||
|
|
# Directory containing this script; every other location is derived from it.
PROJECT_ROOT = Path(__file__).resolve().parent

# Root of the per-meeting folders searched when --meeting-id is used.
DATA_DIR = PROJECT_ROOT.joinpath("data", "meetings")

# Output roots for runs that were started with a meeting ID.
RESULTS_MD_DIR = PROJECT_ROOT.joinpath("data", "results", "md")
RESULTS_JSON_DIR = PROJECT_ROOT.joinpath("data", "results", "json")

# Directory holding the summary template files selected via --template.
TEMPLATE_DIR = PROJECT_ROOT.joinpath("template")

# Output directory for runs that were started without a meeting ID.
EXAMPLES_DIR = PROJECT_ROOT.joinpath("examples")
|
||
|
|
|
||
|
|
|
||
|
|
def parse_args():
    """Parse the command-line options of the summary generator.

    Exactly one transcript source has to be supplied: either ``--meeting-id``
    or ``--input``. Supplying both or neither (an empty string counts as
    "not supplied") is reported through ``parser.error``, which prints the
    usage text and exits the process.

    Returns:
        The populated ``argparse.Namespace`` with the attributes
        ``meeting_id``, ``input``, ``template`` and ``model``.
    """
    cli = argparse.ArgumentParser(description="Generate meeting topics and summary.")

    # Declared as a table so the option list can be read at a glance.
    option_specs = (
        ("--meeting-id", {"help": "Meeting ID under data/meetings"}),
        ("--input", {"help": "Path to a transcript file (.txt or .md)"}),
        ("--template", {"default": "template1.md", "help": "Template file name under template/"}),
        ("--model", {"default": "Qwen3.6-35B", "help": "LLM model name"}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    parsed = cli.parse_args()

    # The two transcript sources are mutually exclusive and one is required.
    chosen_sources = [source for source in (parsed.meeting_id, parsed.input) if source]
    if len(chosen_sources) != 1:
        cli.error("Use exactly one of --meeting-id or --input")

    return parsed
|
||
|
|
|
||
|
|
|
||
|
|
def load_transcript(args) -> tuple[str, str, Path]:
    """Locate and read the transcript selected on the command line.

    Args:
        args: Parsed options; ``meeting_id`` and ``input`` are read.

    Returns:
        A ``(target_name, transcript_text, transcript_path)`` tuple. The
        target name is the meeting ID when one was given, otherwise the stem
        of the input file name. The text is decoded as UTF-8.

    Raises:
        FileNotFoundError: If the input file, the meeting directory, or a
            ``transcript.txt`` / ``transcript.md`` inside it does not exist.
    """
    meeting_id = args.meeting_id

    # Direct file mode: the transcript is whatever --input points at.
    if not meeting_id:
        source = Path(args.input).resolve()
        if source.exists():
            return source.stem, source.read_text(encoding="utf-8"), source
        raise FileNotFoundError(f"Input file not found: {source}")

    # Meeting mode: look inside the meeting's own folder under DATA_DIR.
    meeting_dir = DATA_DIR / meeting_id
    if not meeting_dir.exists():
        raise FileNotFoundError(f"Meeting not found: {meeting_id}")

    # Lazy generator, so ".md" is only probed when ".txt" is absent.
    candidates = (meeting_dir / f"transcript{suffix}" for suffix in (".txt", ".md"))
    source = next((path for path in candidates if path.exists()), None)
    if source is None:
        raise FileNotFoundError(f"No transcript file found for meeting: {meeting_id}")

    return meeting_id, source.read_text(encoding="utf-8"), source
|
||
|
|
|
||
|
|
|
||
|
|
def read_template(template_name: str) -> str:
    """Return the UTF-8 text of a template stored under ``TEMPLATE_DIR``.

    Args:
        template_name: File name (relative to ``TEMPLATE_DIR``) to read.

    Raises:
        FileNotFoundError: If no such path exists under ``TEMPLATE_DIR``.
    """
    candidate = TEMPLATE_DIR / template_name
    if candidate.exists():
        return candidate.read_text(encoding="utf-8")
    raise FileNotFoundError(f"Template not found: {template_name}")
|
||
|
|
|
||
|
|
|
||
|
|
def collect_stream(response) -> str:
    """Echo a streamed reply to stdout and return its answer text.

    ``response`` is iterated as ``(chunk_type, chunk_content)`` pairs. Chunks
    whose type is ``"reasoning"`` are printed under a ``[Thinking]`` banner
    and are not part of the return value; every other chunk is printed under
    a ``[Content]`` banner and collected. A banner is printed each time the
    stream switches from one kind to the other. Falsy chunks are skipped.

    Returns:
        The concatenation of all non-reasoning chunks, each passed through
        ``str``.
    """
    banners = {"reasoning": "\n[Thinking]\n", "content": "\n[Content]\n"}
    answer_pieces = []
    active_section = None

    for kind, piece in response:
        if not piece:
            continue

        # Anything that is not explicitly reasoning counts as answer content.
        section = "reasoning" if kind == "reasoning" else "content"
        if section != active_section:
            print(banners[section])
            active_section = section

        print(piece, end="", flush=True)
        if section == "content":
            answer_pieces.append(str(piece))

    # Terminate the last echoed line.
    print()
    return "".join(answer_pieces)
|
||
|
|
|
||
|
|
|
||
|
|
def save_outputs(target_name: str, meeting_id: str | None, sub_topics: str, summary_text: str):
    """Write the topic list and the summary to disk and report the paths.

    With a meeting ID the files go to ``RESULTS_JSON_DIR / meeting_id`` and
    ``RESULTS_MD_DIR / meeting_id``; without one both go to ``EXAMPLES_DIR``.
    Missing directories are created.

    Args:
        target_name: Label of the processed transcript; only echoed in the
            final status line.
        meeting_id: Meeting ID, or a falsy value for direct-file runs.
        sub_topics: Topic text from the model. When it parses as JSON it is
            re-serialised with two-space indentation and non-ASCII
            characters kept as-is; otherwise it is written unchanged.
        summary_text: Summary text, written verbatim.
    """
    if meeting_id:
        json_dir, md_dir = RESULTS_JSON_DIR / meeting_id, RESULTS_MD_DIR / meeting_id
    else:
        json_dir = md_dir = EXAMPLES_DIR

    for directory in (json_dir, md_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # Pretty-print the topics when they are valid JSON, else keep the raw text.
    try:
        topics_payload = json.dumps(json.loads(sub_topics), ensure_ascii=False, indent=2)
    except json.JSONDecodeError:
        topics_payload = sub_topics

    json_path = json_dir / "sub_topic.json"
    json_path.write_text(topics_payload, encoding="utf-8")

    summary_path = md_dir / "meeting_summary.md"
    summary_path.write_text(summary_text, encoding="utf-8")

    print(f"\nSaved topics to: {json_path}")
    print(f"Saved summary to: {summary_path}")
    print(f"Processed target: {target_name}")
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Run the two-stage summarisation pipeline for one transcript.

    Steps: parse the CLI options, load the transcript, the template and the
    ``meeting_summary`` / ``zh`` prompt set, ask the model for the sub-topics,
    ask it again for the summary (giving it the transcript plus the
    sub-topics), save both results, and print the elapsed wall-clock time
    measured from just after argument parsing.
    """
    args = parse_args()
    clock_start = time.perf_counter()

    target_name, transcript, transcript_path = load_transcript(args)
    template = read_template(args.template)
    prompt = load_prompt("meeting_summary", "zh")

    print(f"Processing transcript: {transcript_path}")
    if args.meeting_id:
        print(f"Meeting ID: {args.meeting_id}")

    def run_stage(system_text, user_text):
        # One model round-trip: stream the reply to stdout, keep the answer.
        return collect_stream(get_qwen_response(args.model, system_text, user_text))

    # NOTE(review): the key spellings below ("data_preproces",
    # "article_preproces", "sub_topices") presumably mirror the prompt data
    # returned by load_prompt -- confirm before "correcting" any of them.

    # Stage 1: extract the sub-topics from the transcript.
    topics_system = prompt["system"]["role"] + prompt["mode_contracts"]["data_preproces"]
    topics_user = prompt["user_template"]["article_preproces"].format(article=transcript)
    sub_topics = run_stage(topics_system, topics_user)

    # Stage 2: write the summary from the transcript, sub-topics and template.
    summary_system = prompt["system"]["role"] + prompt["mode_contracts"]["data_summary"].format(template=template)
    summary_user = prompt["user_template"]["article_summary"].format(article=transcript, sub_topices=sub_topics)
    summary_text = run_stage(summary_system, summary_user)

    save_outputs(target_name, args.meeting_id, sub_topics, summary_text)

    print(f"Elapsed: {time.perf_counter() - clock_start:.2f}s")
|
||
|
|
|
||
|
|
|
||
|
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|