diff --git a/src/summeryanyfile/main.py b/src/summeryanyfile/main.py new file mode 100644 index 0000000..e22a808 --- /dev/null +++ b/src/summeryanyfile/main.py @@ -0,0 +1,345 @@ +""" +主入口模块 - 命令行界面和主要功能 +""" + +import asyncio +import json +import sys +from pathlib import Path +from typing import Optional +import click +from rich.console import Console +from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeElapsedColumn +from rich.panel import Panel +from rich.table import Table + +from .config.settings import load_settings, create_default_config, create_env_template +from .core.models import ProcessingConfig, ChunkStrategy +from .generators.ppt_generator import PPTOutlineGenerator +from .utils.logger import setup_logging, get_logger +from .utils.file_handler import FileHandler +from .utils.validators import validate_file_path, validate_url, validate_config + +console = Console() +logger = get_logger(__name__) + + +@click.group() +@click.option('--config', '-c', help='配置文件路径') +@click.option('--log-level', default='INFO', help='日志级别') +@click.option('--debug', is_flag=True, help='启用调试模式') +@click.pass_context +def cli(ctx, config, log_level, debug): + """通用文本转PPT大纲生成器""" + ctx.ensure_object(dict) + + # 设置日志 + if debug: + log_level = 'DEBUG' + + setup_logging(level=log_level, rich_logging=True) + + # 加载设置 + settings = load_settings(config_file=config, debug_mode=debug) + ctx.obj['settings'] = settings + + if debug: + console.print(f"[dim]配置加载完成: {settings.llm_provider}/{settings.llm_model}[/dim]") + + +@cli.command() +@click.argument('input_path') +@click.option('--output', '-o', help='输出文件路径') +@click.option('--encoding', help='文件编码') +@click.option('--max-slides', type=int, help='最大幻灯片数量') +@click.option('--min-slides', type=int, help='最小幻灯片数量') +@click.option('--chunk-size', type=int, help='文档块大小') +@click.option('--chunk-strategy', type=click.Choice(['paragraph', 'semantic', 'recursive', 'hybrid', 'fast']), help='分块策略') +@click.option('--model', help='LLM模型名称') +@click.option('--provider', type=click.Choice(['openai', 'anthropic', 'azure']), help='LLM提供商') +@click.option('--temperature', type=float, help='温度参数 (0.0-2.0)') +@click.option('--max-tokens', type=int, help='最大token数量') +@click.option('--base-url', help='自定义OpenAI API端点URL') +@click.option('--save-markdown', is_flag=True, help='保存转换后的Markdown文件到temp目录') +@click.option('--temp-dir', help='自定义temp目录路径') +@click.option('--no-magic-pdf', is_flag=True, help='禁用Magic-PDF,强制使用MarkItDown处理PDF') +@click.option('--no-progress', is_flag=True, help='禁用进度条') +@click.pass_context +def generate(ctx, input_path, output, encoding, max_slides, min_slides, chunk_size, chunk_strategy, + model, provider, temperature, max_tokens, base_url, save_markdown, temp_dir, no_magic_pdf, no_progress): + """生成PPT大纲""" + settings = ctx.obj['settings'] + + # 更新设置 + if max_slides: + settings.max_slides = max_slides + if min_slides: + settings.min_slides = min_slides + if chunk_size: + settings.chunk_size = chunk_size + if chunk_strategy: + settings.chunk_strategy = chunk_strategy + if model: + settings.llm_model = model + if provider: + settings.llm_provider = provider + if temperature is not None: + settings.temperature = temperature + if max_tokens: + settings.max_tokens = max_tokens + if base_url: + settings.openai_base_url = base_url + + # 验证配置 + config_errors = validate_config(settings.__dict__) + if config_errors: + console.print("[red]配置错误:[/red]") + for error in config_errors: + console.print(f" • {error}") + sys.exit(1) + + # 运行生成 + asyncio.run(_run_generation( + input_path, output, encoding, settings, not no_progress, save_markdown, temp_dir, not no_magic_pdf + )) + + +async def _run_generation( + input_path: str, + output_path: Optional[str], + encoding: Optional[str], + settings, + show_progress: bool, + save_markdown: bool = False, + temp_dir: Optional[str] = None, + use_magic_pdf: bool = True +): + """运行PPT生成""" + try: + # 验证输入 + if validate_url(input_path): + console.print(f"[blue]正在处理URL:[/blue] {input_path}") + file_handler = FileHandler() + local_path = file_handler.handle_input(input_path)[0] + is_temp = True + elif validate_file_path(input_path): + console.print(f"[blue]正在处理文件:[/blue] {input_path}") + local_path = input_path + is_temp = False + else: + console.print(f"[red]无效的输入路径:[/red] {input_path}") + sys.exit(1) + + # 创建生成器 + config = settings.to_processing_config() + generator = PPTOutlineGenerator( + config, + save_markdown=save_markdown, + temp_dir=temp_dir, + use_magic_pdf=use_magic_pdf, + cache_dir=None # 使用默认缓存目录 + ) + + # 显示配置信息 + if save_markdown: + markdown_dir = temp_dir or generator.document_processor.temp_dir + console.print(f"[yellow]Markdown文件将保存到:[/yellow] {markdown_dir}") + + # 显示PDF处理方式 + if use_magic_pdf: + console.print("[green]PDF转换:[/green] 使用Magic-PDF (本地高质量)") + else: + console.print("[yellow]PDF转换:[/yellow] 使用MarkItDown (标准质量)") + + # 进度回调 + progress_task = None + if show_progress: + progress = Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TimeElapsedColumn(), + console=console + ) + progress.start() + progress_task = progress.add_task("初始化...", total=100) + + def progress_callback(step_name: str, percent: float): + progress.update(progress_task, description=step_name, completed=percent) + else: + def progress_callback(step_name: str, percent: float): + console.print(f"[dim]{step_name} ({percent:.1f}%)[/dim]") + + try: + # 生成PPT大纲 + outline = await generator.generate_from_file( + local_path, + encoding=encoding, + progress_callback=progress_callback + ) + + if show_progress: + progress.stop() + + # 输出结果 + result_json = outline.to_dict() + + if output_path: + # 保存到文件 + output_file = Path(output_path) + output_file.parent.mkdir(parents=True, exist_ok=True) + + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(result_json, f, ensure_ascii=False, indent=2) + + console.print(f"[green]PPT大纲已保存到:[/green] {output_file}") + else: + # 输出到控制台 + console.print(json.dumps(result_json, ensure_ascii=False, indent=2)) + + # 显示摘要 + _show_generation_summary(outline) + + finally: + if show_progress and progress_task: + progress.stop() + + # 清理临时文件 + if is_temp: + file_handler.cleanup_temp_file(local_path) + + except Exception as e: + console.print(f"[red]生成失败:[/red] {e}") + logger.error(f"生成失败: {e}", exc_info=True) + sys.exit(1) + + +def _show_generation_summary(outline): + """显示生成摘要""" + table = Table(title="PPT大纲摘要") + table.add_column("属性", style="cyan") + table.add_column("值", style="green") + + table.add_row("标题", outline.title) + table.add_row("总页数", str(outline.total_pages)) + table.add_row("状态", outline.page_count_mode) + + # 统计幻灯片类型 + type_counts = {} + for slide in outline.slides: + slide_type = slide.slide_type + type_counts[slide_type] = type_counts.get(slide_type, 0) + 1 + + type_summary = ", ".join([f"{k}: {v}" for k, v in type_counts.items()]) + table.add_row("幻灯片类型分布", type_summary) + + console.print(table) + + +@cli.command() +@click.option('--provider', type=click.Choice(['openai', 'anthropic', 'azure']), help='LLM提供商') +def validate_setup(provider): + """验证设置和API连接""" + console.print("[blue]正在验证设置...[/blue]") + + settings = load_settings() + + # 验证配置 + config_errors = validate_config(settings.__dict__) + if config_errors: + console.print("[red]配置错误:[/red]") + for error in config_errors: + console.print(f" • {error}") + return + + # 验证LLM连接 + from .core.llm_manager import LLMManager + + llm_manager = LLMManager() + test_provider = provider or settings.llm_provider + + if llm_manager.validate_configuration(test_provider, **settings.get_llm_kwargs()): + console.print(f"[green]✓[/green] {test_provider} 配置有效") + else: + console.print(f"[red]✗[/red] {test_provider} 配置无效") + return + + # 测试LLM连接 + try: + llm = llm_manager.get_llm( + model=settings.llm_model, + provider=test_provider, + **settings.get_llm_kwargs() + ) + console.print(f"[green]✓[/green] LLM连接成功: {settings.llm_model}") + except Exception as e: + console.print(f"[red]✗[/red] LLM连接失败: {e}") + + +@cli.command() +def init_config(): + """初始化配置文件""" + console.print("[blue]正在创建配置文件...[/blue]") + + create_default_config() + create_env_template() + + console.print("[green]配置文件创建完成![/green]") + console.print("请编辑 .env 文件并填入您的API密钥") + + +@cli.command() +@click.argument('input_path') +def analyze(input_path): + """分析文档结构""" + console.print(f"[blue]正在分析文档:[/blue] {input_path}") + + try: + from .core.document_processor import DocumentProcessor + + processor = DocumentProcessor() + doc_info = processor.load_document(input_path) + + # 显示文档信息 + info_table = Table(title="文档信息") + info_table.add_column("属性", style="cyan") + info_table.add_column("值", style="green") + + info_table.add_row("标题", doc_info.title) + info_table.add_row("文件类型", doc_info.file_type) + info_table.add_row("编码", doc_info.encoding) + info_table.add_row("大小", f"{doc_info.size:,} 字节") + info_table.add_row("内容长度", f"{len(doc_info.content):,} 字符") + + console.print(info_table) + + # 显示内容预览 + preview = doc_info.content[:500] + "..." if len(doc_info.content) > 500 else doc_info.content + console.print(Panel(preview, title="内容预览")) + + except Exception as e: + console.print(f"[red]分析失败:[/red] {e}") + + +@cli.command() +def list_models(): + """列出支持的模型""" + from .core.llm_manager import LLMManager + + llm_manager = LLMManager() + + for provider in llm_manager.SUPPORTED_PROVIDERS: + models = llm_manager.list_available_models(provider) + + table = Table(title=f"{provider.upper()} 模型") + table.add_column("模型名称", style="cyan") + + for model in models: + table.add_row(model) + + console.print(table) + + +if __name__ == '__main__': + cli()