Add File

2025-11-07 09:05:44 +08:00
parent 01964045e3
commit de938db51b
1 changed files with 194 additions and 0 deletions
--- a/src/summeryanyfile/core/json_parser.py
+++ b/src/summeryanyfile/core/json_parser.py
@@ -0,0 +1,194 @@
 """
 JSON解析工具 - 处理LLM返回的JSON响应
 """
 import json
 import re
 from typing import Dict, Any, Optional
 import logging
 logger = logging.getLogger(__name__)


 class JSONParser:
    """Best-effort extraction and validation of JSON produced by an LLM.

    All members are static; the class only serves as a namespace.
    """

    # Chatty lead-ins stripped by ``_clean_response`` (compared case-insensitively).
    _NOISE_PREFIXES = (
        "Here's the JSON:",
        "Here is the JSON:",
        "JSON:",
        "Result:",
        "Output:",
        "Response:",
    )
    # Values accepted for a slide's ``slide_type``; anything else becomes "content".
    _SLIDE_TYPES = ("title", "content", "conclusion")
    # Characters that can start a JSON object or array.
    _OPENER_RE = re.compile(r'[{\[]')

    @staticmethod
    def extract_json_from_response(response: str) -> Dict[str, Any]:
        """Extract a JSON container from a raw LLM response.

        Strategies are tried in order and the first one that yields a JSON
        object or array wins:

        1. parse the whole (stripped) response;
        2. parse the body of the first fenced json code block;
        3. parse the body of the first fenced code block of any kind;
        4. parse the span from the first ``{`` to the last ``}``;
        5. scan the text for the first embedded, fully decodable top-level
           object (or, if there is none, the first such array);
        6. parse the span from the first ``[`` to the last ``]``;
        7. parse the output of ``_clean_response``.

        Scalar JSON values (numbers, strings, booleans, ``null``) are never
        returned; a strategy that produces one is treated as failed.

        Args:
            response: Raw text returned by the LLM.

        Returns:
            The parsed JSON object. When the payload found is a top-level
            JSON array, that list is returned instead. If nothing can be
            parsed (or the response is empty/blank) the structure from
            ``_get_default_structure`` is returned.
        """
        if not response or not response.strip():
            logger.warning("收到空响应，返回默认JSON结构")
            return JSONParser._get_default_structure()

        # Strategy 1: the response is already pure JSON.
        parsed = JSONParser._loads_container(response.strip())
        if parsed is not None:
            return parsed
        logger.debug("直接JSON解析失败，尝试其他方法")

        # Strategy 2: fenced block explicitly tagged as json.
        fenced = re.search(r'```json\s*(.*?)\s*```', response, re.DOTALL | re.IGNORECASE)
        if fenced:
            parsed = JSONParser._loads_container(fenced.group(1).strip())
            if parsed is not None:
                return parsed
            logger.debug("JSON代码块解析失败")

        # Strategy 3: any fenced block.
        fenced = re.search(r'```\s*(.*?)\s*```', response, re.DOTALL)
        if fenced:
            parsed = JSONParser._loads_container(fenced.group(1).strip())
            if parsed is not None:
                return parsed
            logger.debug("代码块解析失败")

        # Strategy 4: widest brace-delimited span.
        span = re.search(r'\{.*\}', response, re.DOTALL)
        if span:
            parsed = JSONParser._loads_container(span.group(0))
            if parsed is not None:
                return parsed

        # Strategy 5: the widest span fails as soon as the surrounding prose
        # contains a stray brace, so look for a decodable value inside the text.
        parsed = JSONParser._scan_embedded_json(response)
        if parsed is not None:
            return parsed

        # Strategy 6: widest bracket-delimited span.
        span = re.search(r'\[.*\]', response, re.DOTALL)
        if span:
            parsed = JSONParser._loads_container(span.group(0))
            if parsed is not None:
                return parsed

        # Strategy 7: strip known noise and retry.
        cleaned_response = JSONParser._clean_response(response)
        if cleaned_response:
            parsed = JSONParser._loads_container(cleaned_response)
            if parsed is not None:
                return parsed
            logger.debug("清理后的响应解析失败")

        logger.warning("所有JSON解析方法都失败，响应内容: %s...", response[:200])
        return JSONParser._get_default_structure()

    @staticmethod
    def _loads_container(text: str) -> Optional[Any]:
        """Parse *text* as JSON and return it only if it is a dict or list.

        Returns ``None`` when *text* is not valid JSON or decodes to a scalar.
        Callers must compare against ``None`` because ``{}`` and ``[]`` are
        valid (falsy) results.
        """
        try:
            value = json.loads(text)
        except json.JSONDecodeError:
            return None
        if isinstance(value, (dict, list)):
            return value
        return None

    @staticmethod
    def _scan_embedded_json(text: str) -> Optional[Any]:
        """Find a complete JSON object or array embedded in free text.

        The text is walked left to right, attempting a decode at each ``{``
        or ``[``. After a successful decode the walk resumes behind the
        decoded value; after a failed decode it resumes at the position the
        decoder reported, so pieces nested inside a malformed or truncated
        outer value are not mistaken for the payload.

        Returns:
            The first decodable object if there is one, otherwise the first
            decodable array, otherwise ``None``.
        """
        decoder = json.JSONDecoder()
        first_array = None
        opener = JSONParser._OPENER_RE.search(text)
        while opener is not None:
            start = opener.start()
            try:
                value, resume = decoder.raw_decode(text, start)
            except json.JSONDecodeError as err:
                # Always advance by at least one character.
                resume = max(err.pos, start + 1)
            else:
                if isinstance(value, dict):
                    return value
                if first_array is None:
                    first_array = value
            opener = JSONParser._OPENER_RE.search(text, resume)
        return first_array

    @staticmethod
    def _clean_response(response: str) -> Optional[str]:
        """Strip common non-JSON decoration and isolate the brace-delimited part.

        Removes known lead-in phrases from the start of the text, drops lines
        that open with a Markdown code fence as well as a closing fence at the
        end of a line, then cuts the text down to the span between the first
        ``{`` and the last ``}``.

        Args:
            response: Raw response text.

        Returns:
            The candidate JSON text, or ``None`` when no ``{`` ... ``}`` span
            exists.
        """
        cleaned = response.strip()
        for prefix in JSONParser._NOISE_PREFIXES:
            if cleaned.lower().startswith(prefix.lower()):
                cleaned = cleaned[len(prefix):].strip()

        # Fence lines such as "```json" carry no payload.
        cleaned = re.sub(r'^```.*?\n', '', cleaned, flags=re.MULTILINE)
        cleaned = re.sub(r'\n```$', '', cleaned, flags=re.MULTILINE)

        first_brace = cleaned.find('{')
        last_brace = cleaned.rfind('}')
        if 0 <= first_brace < last_brace:
            return cleaned[first_brace:last_brace + 1]
        return None

    @staticmethod
    def _get_default_structure() -> Dict[str, Any]:
        """Build the fallback PPT outline used when no JSON can be extracted.

        Returns:
            A newly created outline dict containing a single title slide.
        """
        return {
            "title": "PPT大纲",
            "total_pages": 10,
            "page_count_mode": "estimated",
            "slides": [
                {
                    "page_number": 1,
                    "title": "标题页",
                    "content_points": ["演示标题", "演示者信息", "日期"],
                    "slide_type": "title",
                    "description": "PPT的开场标题页"
                }
            ]
        }

    @staticmethod
    def validate_ppt_structure(data: Dict[str, Any]) -> Dict[str, Any]:
        """Validate a PPT outline and repair it in place.

        Missing top-level keys receive defaults, ``slides`` is forced to be a
        list, entries of ``slides`` that are not dicts are dropped, and every
        remaining slide gets its missing fields filled in. Default page
        numbers and titles are based on the slide's position among the slides
        that are kept. ``total_pages`` is always set to the number of slides
        kept.

        Args:
            data: Outline to validate. A dict is modified in place. Any other
                value is replaced by an empty outline instead of raising.

        Returns:
            The repaired outline (the same object as *data* when *data* is a
            dict).
        """
        if not isinstance(data, dict):
            logger.warning("PPT数据不是字典（实际类型: %s），将使用空结构", type(data).__name__)
            data = {}

        data.setdefault("title", "PPT大纲")
        if not isinstance(data.get("slides"), list):
            data["slides"] = []
        # Inserted here to keep the key order stable; the final value is
        # assigned once the slides have been filtered.
        data.setdefault("total_pages", len(data["slides"]))
        data.setdefault("page_count_mode", "final")

        valid_slides = []
        for slide in data["slides"]:
            if not isinstance(slide, dict):
                continue
            position = len(valid_slides) + 1
            slide.setdefault("page_number", position)
            slide.setdefault("title", f"幻灯片 {position}")
            slide.setdefault("content_points", [])
            slide.setdefault("slide_type", "content")
            slide.setdefault("description", "")
            if slide["slide_type"] not in JSONParser._SLIDE_TYPES:
                slide["slide_type"] = "content"
            if not isinstance(slide["content_points"], list):
                slide["content_points"] = []
            valid_slides.append(slide)

        data["slides"] = valid_slides
        data["total_pages"] = len(valid_slides)
        return data