This commit is contained in:
2025-08-27 19:58:42 +08:00
parent 4f45921567
commit c5304a77b8

View File

@@ -0,0 +1,97 @@
/*
* Copyright (c) 2023-2025, Agents-Flex (fuhai999@gmail.com).
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.agentsflex.core.document.splitter;
import com.agentsflex.core.document.Document;
import com.agentsflex.core.document.DocumentSplitter;
import com.agentsflex.core.document.id.DocumentIdGenerator;
import com.agentsflex.core.util.StringUtil;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class SimpleDocumentSplitter implements DocumentSplitter {
private int chunkSize;
private int overlapSize;
public SimpleDocumentSplitter(int chunkSize) {
this.chunkSize = chunkSize;
if (this.chunkSize <= 0) {
throw new IllegalArgumentException("chunkSize must be greater than 0, chunkSize: " + this.chunkSize);
}
}
public SimpleDocumentSplitter(int chunkSize, int overlapSize) {
this.chunkSize = chunkSize;
this.overlapSize = overlapSize;
if (this.chunkSize <= 0) {
throw new IllegalArgumentException("chunkSize must be greater than 0, chunkSize: " + this.chunkSize);
}
if (this.overlapSize >= this.chunkSize) {
throw new IllegalArgumentException("overlapSize must be less than chunkSize, overlapSize: " + this.overlapSize + ", chunkSize: " + this.chunkSize);
}
}
public int getChunkSize() {
return chunkSize;
}
public void setChunkSize(int chunkSize) {
this.chunkSize = chunkSize;
}
public int getOverlapSize() {
return overlapSize;
}
public void setOverlapSize(int overlapSize) {
this.overlapSize = overlapSize;
}
@Override
public List<Document> split(Document document, DocumentIdGenerator idGenerator) {
if (document == null || StringUtil.noText(document.getContent())) {
return Collections.emptyList();
}
String content = document.getContent();
int index = 0, currentIndex = index;
int maxIndex = content.length();
List<Document> chunks = new ArrayList<>();
while (currentIndex < maxIndex) {
int endIndex = Math.min(currentIndex + chunkSize, maxIndex);
String chunk = content.substring(currentIndex, endIndex).trim();
currentIndex = currentIndex + chunkSize - overlapSize;
if (chunk.isEmpty()) {
continue;
}
Document newDocument = new Document();
newDocument.addMetadata(document.getMetadataMap());
newDocument.setContent(chunk);
//we should invoke setId after setContent
newDocument.setId(idGenerator == null ? null : idGenerator.generateId(newDocument));
chunks.add(newDocument);
}
return chunks;
}
}