diff --git a/agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/RegexDocumentSplitter.java b/agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/RegexDocumentSplitter.java new file mode 100644 index 0000000..eddca2b --- /dev/null +++ b/agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/RegexDocumentSplitter.java @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023-2025, Agents-Flex (fuhai999@gmail.com). + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.agentsflex.core.document.splitter; + +import com.agentsflex.core.document.DocumentSplitter; +import com.agentsflex.core.document.Document; +import com.agentsflex.core.document.id.DocumentIdGenerator; +import com.agentsflex.core.util.StringUtil; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class RegexDocumentSplitter implements DocumentSplitter { + + private final String regex; + + public RegexDocumentSplitter(String regex) { + this.regex = regex; + } + + @Override + public List split(Document document, DocumentIdGenerator idGenerator) { + if (document == null || StringUtil.noText(document.getContent())) { + return Collections.emptyList(); + } + String[] textArray = document.getContent().split(regex); + List chunks = new ArrayList<>(textArray.length); + for (String textString : textArray) { + if (StringUtil.noText(textString)) { + continue; + } + Document newDocument = new Document(); + newDocument.addMetadata(document.getMetadataMap()); + newDocument.setContent(textString); + + //we should invoke setId after setContent + newDocument.setId(idGenerator == null ? null : idGenerator.generateId(newDocument)); + chunks.add(newDocument); + } + return chunks; + } +}