diff --git a/agents-flex-document-parser/agents-flex-document-parser-pdfbox/src/main/java/com/agentsflex/document/parser/PdfBoxDocumentParser.java b/agents-flex-document-parser/agents-flex-document-parser-pdfbox/src/main/java/com/agentsflex/document/parser/PdfBoxDocumentParser.java new file mode 100644 index 0000000..ab008d4 --- /dev/null +++ b/agents-flex-document-parser/agents-flex-document-parser-pdfbox/src/main/java/com/agentsflex/document/parser/PdfBoxDocumentParser.java @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023-2025, Agents-Flex (fuhai999@gmail.com). + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.agentsflex.document.parser; + +import com.agentsflex.core.document.Document; +import com.agentsflex.core.document.DocumentParser; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.text.PDFTextStripper; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; + +public class PdfBoxDocumentParser implements DocumentParser { + + /** + * 返回整个文档的内容 + */ + @Override + public Document parse(InputStream stream) { + try (PDDocument pdfDocument = PDDocument.load(stream)) { + PDFTextStripper stripper = new PDFTextStripper(); + String text = stripper.getText(pdfDocument); + return new Document(text); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * 返回每页文档的内容 + */ + public List parseWithPage(InputStream inputStream) { + try (PDDocument pdfDocument = PDDocument.load(inputStream)) { + return getDocuments(pdfDocument); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private List getDocuments(PDDocument pdDocument) throws IOException { + List documents = new ArrayList<>(); + int pageCount = pdDocument.getNumberOfPages(); + for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { + PDFTextStripper stripper = new PDFTextStripper(); + stripper.setStartPage(pageNumber); + stripper.setEndPage(pageNumber); + String content = stripper.getText(pdDocument); + + Document document = new Document(); + document.setContent(content); + + document.addMetadata("pageNumber", pageNumber); + documents.add(document); + } + return documents; + } + +}