Add File
234 src/summeryanyfile/core/chunkers/hybrid_chunker.py Normal file
@@ -0,0 +1,234 @@
"""
Hybrid chunker - an intelligent chunker that combines multiple chunking strategies.
"""

import logging
from typing import List, Dict, Any, Optional

from .base_chunker import BaseChunker, DocumentChunk
from .semantic_chunker import SemanticChunker
from .paragraph_chunker import ParagraphChunker
from .recursive_chunker import RecursiveChunker

logger = logging.getLogger(__name__)


class HybridChunker(BaseChunker):
    """
    Hybrid chunker that intelligently selects and combines multiple chunking strategies.

    It first attempts semantic chunking, then applies paragraph chunking to
    chunks that are too large, and finally applies recursive chunking to any
    chunks that are still oversized.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        size_tolerance: float = 1.2
    ) -> None:
        """
        Initialize the hybrid chunker.

        Args:
            chunk_size: Maximum size of each chunk.
            chunk_overlap: Overlap between chunks.
            size_tolerance: Size tolerance; chunks exceeding chunk_size times
                this factor are split further.
        """
        super().__init__(chunk_size, chunk_overlap)
        self.size_tolerance = size_tolerance

        # Initialize the sub-chunkers
        self.semantic_chunker = SemanticChunker(chunk_size, chunk_overlap)
        self.paragraph_chunker = ParagraphChunker(chunk_size, chunk_overlap)
        self.recursive_chunker = RecursiveChunker(chunk_size, chunk_overlap)

    def chunk_text(self, text: str, metadata: Optional[Dict[str, Any]] = None) -> List[DocumentChunk]:
        """
        Chunk text using the hybrid strategy.

        Args:
            text: Input text.
            metadata: Optional metadata.

        Returns:
            A list of DocumentChunk objects.
        """
        if metadata is None:
            metadata = {}

        logger.info("Starting hybrid chunking strategy")

        # Step 1: attempt semantic chunking
        try:
            chunks = self.semantic_chunker.chunk_text(text, metadata)
            logger.info(f"Semantic chunking produced {len(chunks)} chunks")

            # Step 2: check for oversized chunks that need further processing
            final_chunks = []
            for chunk in chunks:
                if self._is_chunk_too_large(chunk):
                    logger.info(f"Chunk {chunk.chunk_id} is too large, applying paragraph chunking")
                    sub_chunks = self._apply_paragraph_chunking(chunk)
                    final_chunks.extend(sub_chunks)
                else:
                    final_chunks.append(chunk)

            # Step 3: apply recursive chunking to chunks that are still too large
            ultra_final_chunks = []
            for chunk in final_chunks:
                if self._is_chunk_too_large(chunk):
                    logger.info(f"Chunk {chunk.chunk_id} is still too large, applying recursive chunking")
                    sub_chunks = self._apply_recursive_chunking(chunk)
                    ultra_final_chunks.extend(sub_chunks)
                else:
                    ultra_final_chunks.append(chunk)

            # Update metadata
            for i, chunk in enumerate(ultra_final_chunks):
                chunk.metadata["final_chunk_index"] = i
                chunk.metadata["chunking_strategy"] = "hybrid"

            logger.info(f"Hybrid chunking complete, produced {len(ultra_final_chunks)} final chunks")
            return ultra_final_chunks

        except Exception as e:
            logger.error(f"Hybrid chunking failed, falling back to paragraph chunking: {e}")
            return self.paragraph_chunker.chunk_text(text, metadata)

    def _is_chunk_too_large(self, chunk: DocumentChunk) -> bool:
        """
        Check whether a chunk is too large.

        Args:
            chunk: The chunk to check.

        Returns:
            True if the chunk exceeds the size tolerance.
        """
        return chunk.size > self.chunk_size * self.size_tolerance
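
    # Example threshold: with chunk_size=1000 and size_tolerance=1.2, a
    # 1250-character chunk is split further while a 1150-character chunk is
    # kept as-is (threshold: 1000 * 1.2 = 1200).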

    def _apply_paragraph_chunking(self, chunk: DocumentChunk) -> List[DocumentChunk]:
        """
        Apply paragraph chunking to a single chunk.

        Args:
            chunk: The chunk to split.

        Returns:
            The list of resulting sub-chunks.
        """
        # Create new metadata, preserving the original information
        new_metadata = chunk.metadata.copy()
        new_metadata["parent_chunk_id"] = chunk.chunk_id
        new_metadata["parent_strategy"] = chunk.metadata.get("chunking_strategy", "unknown")

        # Apply paragraph chunking
        sub_chunks = self.paragraph_chunker.chunk_text(chunk.content, new_metadata)

        # Update metadata
        for i, sub_chunk in enumerate(sub_chunks):
            sub_chunk.metadata["sub_chunk_index"] = i
            sub_chunk.metadata["chunking_strategy"] = "hybrid_paragraph"

        return sub_chunks

    def _apply_recursive_chunking(self, chunk: DocumentChunk) -> List[DocumentChunk]:
        """
        Apply recursive chunking to a single chunk.

        Args:
            chunk: The chunk to split.

        Returns:
            The list of resulting sub-chunks.
        """
        # Create new metadata, preserving the original information
        new_metadata = chunk.metadata.copy()
        new_metadata["parent_chunk_id"] = chunk.chunk_id
        new_metadata["parent_strategy"] = chunk.metadata.get("chunking_strategy", "unknown")

        # Apply recursive chunking
        sub_chunks = self.recursive_chunker.chunk_text(chunk.content, new_metadata)

        # Update metadata
        for i, sub_chunk in enumerate(sub_chunks):
            sub_chunk.metadata["sub_chunk_index"] = i
            sub_chunk.metadata["chunking_strategy"] = "hybrid_recursive"

        return sub_chunks
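
    # The resulting metadata for a sub-chunk produced by the recursive pass
    # then carries both lineage and strategy keys (values are illustrative):
    #   {"parent_chunk_id": "chunk_3", "parent_strategy": "semantic",
    #    "sub_chunk_index": 0, "chunking_strategy": "hybrid_recursive", ...}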

    def analyze_text_structure(self, text: str) -> Dict[str, Any]:
        """
        Analyze the text structure to choose the best chunking strategy.

        Args:
            text: Input text.

        Returns:
            The structure analysis results.
        """
        analysis = {
            "text_length": len(text),
            "line_count": len(text.split('\n')),
            "paragraph_count": len([p for p in text.split('\n\n') if p.strip()]),
            "has_markdown_headers": False,
            "header_count": 0,
            "recommended_strategy": "paragraph"
        }

        # Check for Markdown headers
        lines = text.split('\n')
        header_count = 0
        for line in lines:
            line = line.strip()
            if line.startswith('#'):
                header_count += 1

        analysis["header_count"] = header_count
        analysis["has_markdown_headers"] = header_count > 0

        # Recommend a strategy
        if header_count >= 3:
            analysis["recommended_strategy"] = "semantic"
        elif analysis["paragraph_count"] >= 5:
            analysis["recommended_strategy"] = "paragraph"
        else:
            analysis["recommended_strategy"] = "recursive"

        return analysis
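
    # For instance, a Markdown document with four '#' headers yields
    # "semantic"; plain text with six paragraphs and no headers yields
    # "paragraph"; a short unstructured snippet falls through to "recursive".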

    def get_chunking_statistics(self, chunks: List[DocumentChunk]) -> Dict[str, Any]:
        """
        Get statistics about the produced chunks.

        Args:
            chunks: The list of chunks.

        Returns:
            The statistics.
        """
        if not chunks:
            return {"total_chunks": 0}

        # Base statistics
        base_stats = self.get_chunk_statistics(chunks)

        # Strategy statistics
        strategy_counts = {}
        for chunk in chunks:
            strategy = chunk.metadata.get("chunking_strategy", "unknown")
            strategy_counts[strategy] = strategy_counts.get(strategy, 0) + 1

        # Size distribution
        sizes = [chunk.size for chunk in chunks]
        oversized_count = sum(1 for size in sizes if size > self.chunk_size)

        base_stats.update({
            "strategy_distribution": strategy_counts,
            "oversized_chunks": oversized_count,
            "oversized_percentage": (oversized_count / len(chunks)) * 100 if chunks else 0,
            "size_tolerance": self.size_tolerance
        })

        return base_stats
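
A minimal usage sketch for the new chunker, assuming the package layout shown
in the diff header and that BaseChunker provides the get_chunk_statistics
helper called above (the sample text is made up):

    import logging
    from summeryanyfile.core.chunkers.hybrid_chunker import HybridChunker

    logging.basicConfig(level=logging.INFO)

    text = "# Title\n\nFirst paragraph.\n\nSecond paragraph.\n" * 50
    chunker = HybridChunker(chunk_size=500, chunk_overlap=100, size_tolerance=1.2)

    print(chunker.analyze_text_structure(text)["recommended_strategy"])
    chunks = chunker.chunk_text(text, metadata={"source": "demo.md"})
    print(chunker.get_chunking_statistics(chunks))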