From 2c22279d049ac21204cb525ac372e80a843471c4 Mon Sep 17 00:00:00 2001 From: 0007 <0007@qq.com> Date: Wed, 27 Aug 2025 19:58:42 +0800 Subject: [PATCH] Add File --- .../splitter/SimpleTokenizeSplitter.java | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/SimpleTokenizeSplitter.java diff --git a/agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/SimpleTokenizeSplitter.java b/agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/SimpleTokenizeSplitter.java new file mode 100644 index 0000000..05e92e6 --- /dev/null +++ b/agents-flex-core/src/main/java/com/agentsflex/core/document/splitter/SimpleTokenizeSplitter.java @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2023-2025, Agents-Flex (fuhai999@gmail.com). + *
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.agentsflex.core.document.splitter;
+
+import com.agentsflex.core.document.Document;
+import com.agentsflex.core.document.DocumentSplitter;
+import com.agentsflex.core.document.id.DocumentIdGenerator;
+import com.agentsflex.core.util.StringUtil;
+import com.knuddels.jtokkit.Encodings;
+import com.knuddels.jtokkit.api.Encoding;
+import com.knuddels.jtokkit.api.EncodingRegistry;
+import com.knuddels.jtokkit.api.EncodingType;
+import com.knuddels.jtokkit.api.IntArrayList;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+public class SimpleTokenizeSplitter implements DocumentSplitter {
+ private EncodingRegistry registry = Encodings.newLazyEncodingRegistry(); // defaults to the registry returned by Encodings.newLazyEncodingRegistry(); replaceable via setRegistry
+ private EncodingType encodingType = EncodingType.CL100K_BASE; // defaults to CL100K_BASE; replaceable via setEncodingType
+ private int chunkSize; // both constructors require a value greater than 0; presumably counted in tokens - TODO confirm against split()
+ private int overlapSize; // the two-argument constructor requires a value less than chunkSize; stays at the int default (0) when the one-argument constructor is used
+
+ /**
+  * Creates a splitter with the given chunk size; {@code overlapSize} is not assigned
+  * and therefore keeps its field default.
+  *
+  * @param chunkSize size of each chunk; must be greater than 0
+  * @throws IllegalArgumentException if {@code chunkSize} is 0 or negative
+  */
+ public SimpleTokenizeSplitter(int chunkSize) {
+     // Guard first: when the check fails the instance is never handed to the caller,
+     // so assigning the field before or after the check is indistinguishable.
+     if (chunkSize <= 0) {
+         throw new IllegalArgumentException("chunkSize must be greater than 0, chunkSize: " + chunkSize);
+     }
+     this.chunkSize = chunkSize;
+ }
+
+ /**
+  * Creates a splitter with the given chunk size and overlap size.
+  *
+  * @param chunkSize   size of each chunk; must be greater than 0
+  * @param overlapSize overlap size; must satisfy {@code 0 <= overlapSize < chunkSize}
+  * @throws IllegalArgumentException if {@code chunkSize <= 0}, if {@code overlapSize < 0},
+  *                                  or if {@code overlapSize >= chunkSize}
+  */
+ public SimpleTokenizeSplitter(int chunkSize, int overlapSize) {
+     if (chunkSize <= 0) {
+         throw new IllegalArgumentException("chunkSize must be greater than 0, chunkSize: " + chunkSize);
+     }
+     // NOTE(review): a negative overlap is assumed to have no meaningful interpretation for
+     // splitting; it previously slipped through because only the upper bound was checked.
+     if (overlapSize < 0) {
+         throw new IllegalArgumentException("overlapSize must not be negative, overlapSize: " + overlapSize);
+     }
+     if (overlapSize >= chunkSize) {
+         throw new IllegalArgumentException("overlapSize must be less than chunkSize, overlapSize: " + overlapSize + ", chunkSize: " + chunkSize);
+     }
+     // Assign only after every argument has been validated.
+     this.chunkSize = chunkSize;
+     this.overlapSize = overlapSize;
+ }
+
+ /**
+  * @return the chunk size currently stored in this splitter
+  */
+ public int getChunkSize() {
+     return this.chunkSize;
+ }
+
+ /**
+  * Replaces the chunk size, applying the same positivity rule the constructors enforce.
+  *
+  * @param chunkSize new chunk size; must be greater than 0
+  * @throws IllegalArgumentException if {@code chunkSize} is 0 or negative
+  */
+ public void setChunkSize(int chunkSize) {
+     if (chunkSize <= 0) {
+         throw new IllegalArgumentException("chunkSize must be greater than 0, chunkSize: " + chunkSize);
+     }
+     // The relation to overlapSize is intentionally not re-checked here, so that callers
+     // may update chunkSize and overlapSize in either order.
+     this.chunkSize = chunkSize;
+ }
+
+ /**
+  * @return the overlap size currently stored in this splitter
+  */
+ public int getOverlapSize() {
+     return this.overlapSize;
+ }
+
+ /**
+  * Replaces the overlap size.
+  *
+  * @param overlapSize new overlap size; must not be negative
+  * @throws IllegalArgumentException if {@code overlapSize} is negative
+  */
+ public void setOverlapSize(int overlapSize) {
+     // NOTE(review): a negative overlap is assumed to be meaningless for splitting - confirm
+     // against split(). The upper bound relative to chunkSize is not enforced here, so that
+     // callers may update chunkSize and overlapSize in either order.
+     if (overlapSize < 0) {
+         throw new IllegalArgumentException("overlapSize must not be negative, overlapSize: " + overlapSize);
+     }
+     this.overlapSize = overlapSize;
+ }
+
+ /**
+  * @return the encoding registry currently held by this splitter
+  */
+ public EncodingRegistry getRegistry() {
+     return this.registry;
+ }
+
+ /**
+  * Replaces the encoding registry held by this splitter. The argument is stored as-is,
+  * without a null check.
+  *
+  * @param registry the registry to store
+  */
+ public void setRegistry(EncodingRegistry registry) {
+     this.registry = registry;
+ }
+
+ /**
+  * @return the encoding type currently held by this splitter
+  */
+ public EncodingType getEncodingType() {
+     return this.encodingType;
+ }
+
+ /**
+  * Replaces the encoding type held by this splitter. The argument is stored as-is,
+  * without a null check.
+  *
+  * @param encodingType the encoding type to store
+  */
+ public void setEncodingType(EncodingType encodingType) {
+     this.encodingType = encodingType;
+ }
+
+ @Override
+ public List