diff --git a/src/summeryanyfile/core/file_cache_manager.py b/src/summeryanyfile/core/file_cache_manager.py
new file mode 100644
index 0000000..974736f
--- /dev/null
+++ b/src/summeryanyfile/core/file_cache_manager.py
@@ -0,0 +1,333 @@
+"""
+File cache manager - caches file processing results keyed by MD5 hash.
+"""
+
+import os
+import json
+import hashlib
+import tempfile
+import shutil
+import logging
+from pathlib import Path
+from typing import Optional, Tuple, Dict, Any
+from datetime import datetime, timedelta
+
+logger = logging.getLogger(__name__)
+
+
+class FileCacheManager:
+    """File cache manager for caching file processing results."""
+
+    def __init__(self, cache_dir: Optional[str] = None, cache_ttl_hours: int = 24 * 7, processing_mode: Optional[str] = None):
+        """
+        Initialize the file cache manager.
+
+        Args:
+            cache_dir: Cache directory; defaults to summeryanyfile_cache under the system temp directory.
+            cache_ttl_hours: Cache TTL in hours; defaults to 7 days.
+            processing_mode: Processing mode (e.g. markitdown, magic_pdf), used to keep caches for different modes separate.
+        """
+        self.cache_ttl_hours = cache_ttl_hours
+        self.processing_mode = processing_mode or "default"
+
+        # Resolve the cache directory
+        if cache_dir:
+            self.cache_dir = Path(cache_dir)
+        else:
+            self.cache_dir = Path(tempfile.gettempdir()) / "summeryanyfile_cache"
+
+        # If a processing mode is given, use a mode-specific subdirectory
+        if self.processing_mode != "default":
+            self.cache_dir = self.cache_dir / self.processing_mode
+
+        # Lay out the cache directory structure
+        self.files_cache_dir = self.cache_dir / "files"
+        self.markdown_cache_dir = self.cache_dir / "markdown"
+        self.metadata_cache_dir = self.cache_dir / "metadata"
+
+        # Make sure the directories exist
+        for dir_path in [self.files_cache_dir, self.markdown_cache_dir, self.metadata_cache_dir]:
+            dir_path.mkdir(parents=True, exist_ok=True)
+
+        logger.info(f"File cache manager initialized; cache dir: {self.cache_dir}, processing mode: {self.processing_mode}")
+
+    def calculate_file_md5(self, file_path: str) -> str:
+        """
+        Calculate the MD5 hash of a file.
+
+        Args:
+            file_path: Path to the file.
+
+        Returns:
+            MD5 hash as a hex string.
+        """
+        hash_md5 = hashlib.md5()
+
+        try:
+            with open(file_path, "rb") as f:
+                # Read in chunks so large files stay memory-friendly
+                for chunk in iter(lambda: f.read(4096), b""):
+                    hash_md5.update(chunk)
+
+            md5_hash = hash_md5.hexdigest()
+            logger.debug(f"MD5 of {file_path}: {md5_hash}")
+            return md5_hash
+
+        except Exception as e:
+            logger.error(f"Failed to compute MD5 for {file_path}: {e}")
+            raise
+
+    def is_cached(self, file_path: str) -> Tuple[bool, Optional[str]]:
+        """
+        Check whether a file is cached and the entry has not expired.
+
+        Args:
+            file_path: Path to the file.
+
+        Returns:
+            Tuple of (is cached, MD5 hash).
+        """
+        try:
+            md5_hash = self.calculate_file_md5(file_path)
+
+            # Check whether the metadata file exists
+            metadata_file = self.metadata_cache_dir / f"{md5_hash}.json"
+            if not metadata_file.exists():
+                return False, md5_hash
+
+            # Check whether the entry has expired
+            try:
+                with open(metadata_file, 'r', encoding='utf-8') as f:
+                    metadata = json.load(f)
+
+                cached_time = datetime.fromisoformat(metadata.get('cached_time', ''))
+                expiry_time = cached_time + timedelta(hours=self.cache_ttl_hours)
+
+                if datetime.now() > expiry_time:
+                    logger.info(f"Cache entry expired: {md5_hash}")
+                    self._remove_cache_entry(md5_hash)
+                    return False, md5_hash
+
+                # Check whether the markdown file exists
+                markdown_file = self.markdown_cache_dir / f"{md5_hash}.md"
+                if not markdown_file.exists():
+                    logger.warning(f"Cache metadata exists but the markdown file is missing: {md5_hash}")
+                    self._remove_cache_entry(md5_hash)
+                    return False, md5_hash
+
+                logger.info(f"Found valid cache entry: {md5_hash}")
+                return True, md5_hash
+
+            except (json.JSONDecodeError, ValueError, KeyError) as e:
+                logger.warning(f"Corrupt cache metadata file: {metadata_file}, error: {e}")
+                self._remove_cache_entry(md5_hash)
+                return False, md5_hash
+
+        except Exception as e:
+            logger.error(f"Failed to check cache status: {e}")
+            return False, None
+
+    def get_cached_content(self, md5_hash: str) -> Tuple[Optional[str], Optional[Dict[str, Any]]]:
+        """
+        Get cached content and metadata.
+
+        Args:
+            md5_hash: MD5 hash of the file.
+
+        Returns:
+            Tuple of (markdown content, metadata).
+        """
+        try:
+            # Read the markdown content
+            markdown_file = self.markdown_cache_dir / f"{md5_hash}.md"
+            if not markdown_file.exists():
+                return None, None
+
+            with open(markdown_file, 'r', encoding='utf-8') as f:
+                markdown_content = f.read()
+
+            # Read the metadata
+            metadata_file = self.metadata_cache_dir / f"{md5_hash}.json"
+            if not metadata_file.exists():
+                return markdown_content, {}
+
+            with open(metadata_file, 'r', encoding='utf-8') as f:
+                metadata = json.load(f)
+
+            logger.info(f"Read cached content: {md5_hash}")
+            return markdown_content, metadata
+
+        except Exception as e:
+            logger.error(f"Failed to read cached content for {md5_hash}: {e}")
+            return None, None
+
+    def save_to_cache(self, file_path: str, markdown_content: str,
+                      processing_metadata: Optional[Dict[str, Any]] = None) -> str:
+        """
+        Save a file's processing result to the cache.
+
+        Args:
+            file_path: Path to the original file.
+            markdown_content: Processed markdown content.
+            processing_metadata: Metadata about the processing run.
+
+        Returns:
+            MD5 hash.
+        """
+        try:
+            md5_hash = self.calculate_file_md5(file_path)
+
+            # Save the markdown content
+            markdown_file = self.markdown_cache_dir / f"{md5_hash}.md"
+            with open(markdown_file, 'w', encoding='utf-8') as f:
+                f.write(markdown_content)
+
+            # Assemble the metadata
+            file_info = Path(file_path)
+            metadata = {
+                'md5_hash': md5_hash,
+                'original_file_path': str(file_info.absolute()),
+                'original_file_name': file_info.name,
+                'original_file_size': file_info.stat().st_size,
+                'original_file_extension': file_info.suffix.lower(),
+                'cached_time': datetime.now().isoformat(),
+                'markdown_length': len(markdown_content),
+                'processing_metadata': processing_metadata or {}
+            }
+
+            # Save the metadata
+            metadata_file = self.metadata_cache_dir / f"{md5_hash}.json"
+            with open(metadata_file, 'w', encoding='utf-8') as f:
+                json.dump(metadata, f, ensure_ascii=False, indent=2)
+
+            # Optionally keep a copy of the original file (for debugging or backup)
+            if self._should_backup_file(file_path):
+                self._backup_original_file(file_path, md5_hash)
+
+            logger.info(f"Cached processing result: {md5_hash} ({file_info.name})")
+            return md5_hash
+
+        except Exception as e:
+            logger.error(f"Failed to save cache for {file_path}: {e}")
+            raise
+
+    def _should_backup_file(self, file_path: str) -> bool:
+        """
+        Decide whether the original file should be backed up.
+
+        Args:
+            file_path: Path to the file.
+
+        Returns:
+            Whether the file should be backed up.
+        """
+        # Only back up small files (<10MB)
+        try:
+            file_size = Path(file_path).stat().st_size
+            return file_size < 10 * 1024 * 1024  # 10MB
+        except OSError:
+            return False
+
+    def _backup_original_file(self, file_path: str, md5_hash: str):
+        """
+        Back up the original file.
+
+        Args:
+            file_path: Path to the original file.
+            md5_hash: MD5 hash.
+        """
+        try:
+            file_info = Path(file_path)
+            backup_file = self.files_cache_dir / f"{md5_hash}{file_info.suffix}"
+
+            shutil.copy2(file_path, backup_file)
+            logger.debug(f"Backed up original file: {backup_file}")
+
+        except Exception as e:
+            logger.warning(f"Failed to back up original file: {e}")
+
+    def _remove_cache_entry(self, md5_hash: str):
+        """
+        Remove a cache entry.
+
+        Args:
+            md5_hash: MD5 hash.
+        """
+        try:
+            # Remove the markdown file
+            markdown_file = self.markdown_cache_dir / f"{md5_hash}.md"
+            if markdown_file.exists():
+                markdown_file.unlink()
+
+            # Remove the metadata file
+            metadata_file = self.metadata_cache_dir / f"{md5_hash}.json"
+            if metadata_file.exists():
+                metadata_file.unlink()
+
+            # Remove any backup files (a backup may lack an extension, so match on the hash prefix)
+            for backup_file in self.files_cache_dir.glob(f"{md5_hash}*"):
+                backup_file.unlink()
+
+            logger.debug(f"Removed cache entry: {md5_hash}")
+
+        except Exception as e:
+            logger.warning(f"Failed to remove cache entry {md5_hash}: {e}")
+
+    def cleanup_expired_cache(self):
+        """Remove expired cache entries."""
+        try:
+            cleaned_count = 0
+
+            for metadata_file in self.metadata_cache_dir.glob("*.json"):
+                try:
+                    with open(metadata_file, 'r', encoding='utf-8') as f:
+                        metadata = json.load(f)
+
+                    cached_time = datetime.fromisoformat(metadata.get('cached_time', ''))
+                    expiry_time = cached_time + timedelta(hours=self.cache_ttl_hours)
+
+                    if datetime.now() > expiry_time:
+                        md5_hash = metadata_file.stem
+                        self._remove_cache_entry(md5_hash)
+                        cleaned_count += 1
+
+                except Exception as e:
+                    logger.warning(f"Failed to clean up cache entry {metadata_file}: {e}")
+
+            if cleaned_count > 0:
+                logger.info(f"Removed {cleaned_count} expired cache entries")
+
+        except Exception as e:
+            logger.error(f"Failed to clean up expired cache: {e}")
+
+    def get_cache_stats(self) -> Dict[str, Any]:
+        """
+        Get cache statistics.
+
+        Returns:
+            Cache statistics.
+        """
+        try:
+            metadata_files = list(self.metadata_cache_dir.glob("*.json"))
+            markdown_files = list(self.markdown_cache_dir.glob("*.md"))
+            backup_files = list(self.files_cache_dir.glob("*"))
+
+            total_size = 0
+            for file_path in [*metadata_files, *markdown_files, *backup_files]:
+                try:
+                    total_size += file_path.stat().st_size
+                except OSError:
+                    pass
+
+            return {
+                'cache_dir': str(self.cache_dir),
+                'total_entries': len(metadata_files),
+                'markdown_files': len(markdown_files),
+                'backup_files': len(backup_files),
+                'total_size_mb': round(total_size / (1024 * 1024), 2),
+                'cache_ttl_hours': self.cache_ttl_hours
+            }
+
+        except Exception as e:
+            logger.error(f"Failed to collect cache statistics: {e}")
+            return {}
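
For reviewers, a minimal usage sketch of the new class. The module path comes from the diff; `example.pdf`, the "markitdown" mode string, and `convert_to_markdown` are hypothetical stand-ins, not part of this change:

    # Usage sketch (hypothetical inputs; not part of the diff above).
    from summeryanyfile.core.file_cache_manager import FileCacheManager

    cache = FileCacheManager(processing_mode="markitdown")

    # Reuse the cached markdown when the same bytes were processed before;
    # is_cached() hashes the file and validates the TTL and cache files.
    hit, md5_hash = cache.is_cached("example.pdf")
    if hit:
        markdown, metadata = cache.get_cached_content(md5_hash)
    else:
        markdown = convert_to_markdown("example.pdf")  # hypothetical converter
        cache.save_to_cache("example.pdf", markdown, {"converter": "markitdown"})

    cache.cleanup_expired_cache()
    print(cache.get_cache_stats())

Because the key is the content hash, a renamed or moved file still hits the cache, while any edit to the file bytes produces a new entry.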