From c5304a77b81ad180e58a0ce876329d301771fd5e Mon Sep 17 00:00:00 2001 From: 0007 <0007@qq.com> Date: Wed, 27 Aug 2025 19:58:42 +0800 Subject: [PATCH] Add File --- .../splitter/SimpleDocumentSplitter.java | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/SimpleDocumentSplitter.java diff --git a/agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/SimpleDocumentSplitter.java b/agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/SimpleDocumentSplitter.java new file mode 100644 index 0000000..d5c4d08 --- /dev/null +++ b/agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/SimpleDocumentSplitter.java @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2023-2025, Agents-Flex (fuhai999@gmail.com). + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.agentsflex.core.document.splitter; + +import com.agentsflex.core.document.Document; +import com.agentsflex.core.document.DocumentSplitter; +import com.agentsflex.core.document.id.DocumentIdGenerator; +import com.agentsflex.core.util.StringUtil; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class SimpleDocumentSplitter implements DocumentSplitter { + private int chunkSize; + private int overlapSize; + + public SimpleDocumentSplitter(int chunkSize) { + this.chunkSize = chunkSize; + if (this.chunkSize <= 0) { + throw new IllegalArgumentException("chunkSize must be greater than 0, chunkSize: " + this.chunkSize); + } + } + + public SimpleDocumentSplitter(int chunkSize, int overlapSize) { + this.chunkSize = chunkSize; + this.overlapSize = overlapSize; + + if (this.chunkSize <= 0) { + throw new IllegalArgumentException("chunkSize must be greater than 0, chunkSize: " + this.chunkSize); + } + if (this.overlapSize >= this.chunkSize) { + throw new IllegalArgumentException("overlapSize must be less than chunkSize, overlapSize: " + this.overlapSize + ", chunkSize: " + this.chunkSize); + } + } + + public int getChunkSize() { + return chunkSize; + } + + public void setChunkSize(int chunkSize) { + this.chunkSize = chunkSize; + } + + public int getOverlapSize() { + return overlapSize; + } + + public void setOverlapSize(int overlapSize) { + this.overlapSize = overlapSize; + } + + @Override + public List split(Document document, DocumentIdGenerator idGenerator) { + if (document == null || StringUtil.noText(document.getContent())) { + return Collections.emptyList(); + } + + String content = document.getContent(); + int index = 0, currentIndex = index; + int maxIndex = content.length(); + + List chunks = new ArrayList<>(); + while (currentIndex < maxIndex) { + int endIndex = Math.min(currentIndex + chunkSize, maxIndex); + String chunk = content.substring(currentIndex, endIndex).trim(); + currentIndex = currentIndex + chunkSize - overlapSize; + + if (chunk.isEmpty()) { + continue; + } + + Document newDocument = new Document(); + newDocument.addMetadata(document.getMetadataMap()); + newDocument.setContent(chunk); + + //we should invoke setId after setContent + newDocument.setId(idGenerator == null ? null : idGenerator.generateId(newDocument)); + chunks.add(newDocument); + } + + return chunks; + } +}