有 Java 编程相关的问题?

你可以在下面搜索框中键入要查询的问题!

Amazon Web Services:如何在 Java 中使用 AWS Textract 提取 PDF 中的表格

我发现下面的文章是用python编写的

https://docs.aws.amazon.com/textract/latest/dg/examples-export-table-csv.html

我还使用下面的文章来提取文本

https://docs.aws.amazon.com/textract/latest/dg/detecting-document-text.html

但上面的文章只能帮助获取文本。我还使用了块(Block)的 “block.getBlockType()” 方法,但即使 image/pdf 中有表,也没有任何一个块返回的类型为 “CELL”

帮助我找到类似于“boto3”的java库来提取所有表


共 (2) 个答案

  1. # 1 楼答案

    我在json响应中创建了每个数据集的模型,并可以使用这些模型在jsf中构建一个表视图

    /**
     * Builds one {@code TableModel} per TABLE block found in the given Textract model.
     *
     * <p>Every block is indexed by id so that the CHILD relationships of a table can be
     * resolved to its CELL blocks. Child ids that cannot be resolved, and tables without
     * relationships, are skipped instead of aborting the whole extraction.
     *
     * @param textractModel the parsed Textract response; may be {@code null}
     * @return {@code null} when {@code textractModel} is {@code null}; otherwise the tables
     *         collected so far (an unexpected exception is logged and ends the collection early)
     */
    public static List<TableModel> getTablesFromTextract(TextractModel textractModel) {
        if (textractModel == null) {
            return null;
        }

        List<TableModel> tables = new ArrayList<>();

        try {
            List<BlockModel> tableBlocks = new ArrayList<>();
            Map<String, BlockModel> blockMap = new HashMap<>();

            if (textractModel.getBlocks() != null) {
                for (BlockModel block : textractModel.getBlocks()) {
                    if (block == null) {
                        continue;
                    }
                    // Literal-first comparison: a block without a type must not throw.
                    if ("TABLE".equals(block.getBlockType())) {
                        tableBlocks.add(block);
                    }
                    blockMap.put(block.getId(), block);
                }
            }

            for (BlockModel tableBlock : tableBlocks) {
                tables.add(new TableModel(tableBlock, buildRowMap(tableBlock, blockMap)));
            }
            LOG.debug("row Map " + tables.toString());
        } catch (Exception e) {
            LOG.error("Could not get table from textract model", e);
        }
        return tables;
    }

    /** Maps row index to (column index to cell text) for every CELL child of the given table block. */
    private static Map<Long, Map<Long, String>> buildRowMap(BlockModel tableBlock, Map<String, BlockModel> blockMap) {
        Map<Long, Map<Long, String>> rowMap = new HashMap<>();

        if (tableBlock.getRelationships() == null) {
            return rowMap;
        }

        for (RelationshipModel relationship : tableBlock.getRelationships()) {
            if (relationship == null
                    || !"CHILD".equals(relationship.getType())
                    || relationship.getIds() == null) {
                continue;
            }

            for (String id : relationship.getIds()) {
                BlockModel cell = blockMap.get(id);

                // The id may reference a block that is not part of this response.
                if (cell == null || !"CELL".equals(cell.getBlockType())) {
                    continue;
                }

                long rowIndex = cell.getRowIndex();
                long columnIndex = cell.getColumnIndex();

                Map<Long, String> columnMap = rowMap.get(rowIndex);
                if (columnMap == null) {
                    columnMap = new HashMap<>();
                    rowMap.put(rowIndex, columnMap);
                }
                columnMap.put(columnIndex, getCellText(cell, blockMap));
            }
        }
        return rowMap;
    }
    
    /**
     * Concatenates the text of a cell from its CHILD blocks.
     *
     * <p>Each WORD child contributes its text followed by a space; each SELECTION_ELEMENT
     * child whose status is SELECTED contributes {@code "X "}. Child ids that are not present
     * in {@code blockMap} are skipped so that one dangling reference does not truncate the
     * rest of the cell.
     *
     * @param cell     the cell block; may be {@code null}
     * @param blockMap all blocks of the document keyed by block id
     * @return the collected text (with a trailing space when non-empty), or an empty string
     */
    private static String getCellText(BlockModel cell, Map<String, BlockModel> blockMap) {
        StringBuilder text = new StringBuilder();

        try {
            if (cell != null
                    && CollectionUtils.isNotEmpty(cell.getRelationships())) {

                for (RelationshipModel relationship : cell.getRelationships()) {
                    if (!"CHILD".equals(relationship.getType())) {
                        continue;
                    }

                    for (String id : relationship.getIds()) {
                        BlockModel word = blockMap.get(id);

                        if (word == null) {
                            continue;
                        }

                        if ("WORD".equals(word.getBlockType())) {
                            text.append(word.getText()).append(' ');
                        } else if ("SELECTION_ELEMENT".equals(word.getBlockType())
                                && "SELECTED".equals(word.getSelectionStatus())) {
                            text.append("X ");
                        }
                    }
                }
            }
        } catch (Exception e) {
            LOG.error("Could not get cell text of table", e);
        }
        return text.toString();
    }
    

    要从中创建视图的TableModel:

    public class TableModel {
    
    private BlockModel table;
    private Map<Long, Map<Long, String>> rowMap;
    
    public TableModel(BlockModel table, Map<Long, Map<Long, String>> rowMap) {
        this.table = table;
        this.rowMap = rowMap;
    }
    
    public BlockModel getTable() {
        return table;
    }
    
    public void setTable(BlockModel table) {
        this.table = table;
    }
    
    public Map<Long, Map<Long, String>> getRowMap() {
        return rowMap;
    }
    
    public void setRowMap(Map<Long, Map<Long, String>> rowMap) {
        this.rowMap = rowMap;
    }
    
    @Override
    public String toString() {
        return table.getId() + " - " + rowMap.toString();
    }
    
  2. # 2 楼答案

    我也有类似的想法:

    public class AnalyzeDocument {
    
        /**
         * Creates a Textract client for the EU_WEST_2 region using credentials from environment
         * variables, analyzes the given document bytes, and closes the client afterwards.
         *
         * @param content raw bytes of the document to analyze
         * @return the result of {@link #analyzeDoc(TextractClient, byte[])}
         */
        public DocumentModel startProcess(byte[] content) {

            Region region = Region.EU_WEST_2;

            // The client is auto-closeable; close it once the synchronous call has returned
            // so the resources it holds are released.
            try (TextractClient textractClient = TextractClient.builder()
                    .region(region)
                    .credentialsProvider(EnvironmentVariableCredentialsProvider.create())
                    .build()) {
                return analyzeDoc(textractClient, content);
            }
        }
    
        /**
         * Sends the document bytes to Textract with the FORMS and TABLES features enabled and
         * wraps the detected tables in a {@code DocumentModel}.
         *
         * @param textractClient the client used to call {@code analyzeDocument}
         * @param content        raw bytes of the document to analyze
         * @return a document model holding one page when tables were found and no pages
         *         otherwise, or {@code null} when the Textract call fails
         */
        public DocumentModel analyzeDoc(TextractClient textractClient, byte[] content) {

            try {
                SdkBytes sourceBytes = SdkBytes.fromByteArray(content);
                Util util = new Util();
                Document myDoc = Document.builder().bytes(sourceBytes).build();

                List<FeatureType> featureTypes = new ArrayList<>();
                featureTypes.add(FeatureType.FORMS);
                featureTypes.add(FeatureType.TABLES);

                AnalyzeDocumentRequest analyzeDocumentRequest = AnalyzeDocumentRequest.builder()
                        .featureTypes(featureTypes)
                        .document(myDoc)
                        .build();

                AnalyzeDocumentResponse analyzeDocument = textractClient.analyzeDocument(analyzeDocumentRequest);
                List<Block> docInfo = analyzeDocument.blocks();

                DocumentModel documentModel = new DocumentModel();
                PageModel pageModel = util.getTableResults(docInfo);
                // getTableResults yields null when the document has no tables; do not store
                // that null as a page.
                if (pageModel != null) {
                    documentModel.getPages().add(pageModel);
                }

                for (Block block : docInfo) {
                    log.debug("The block type is " + block.blockType());
                }
                return documentModel;
            } catch (TextractException e) {
                // Log with the exception itself so the cause and stack trace are preserved.
                log.error("Textract analyzeDocument call failed", e);
            }
            return null;
        }
    
    

    这是util文件:

    
        /**
         * Collects every TABLE block from the given blocks and converts each one into a table
         * of the returned page.
         *
         * @param blocks all blocks returned for the document
         * @return a page holding one generated table per TABLE block, or {@code null} when
         *         there is no TABLE block
         */
        public PageModel getTableResults(List<Block> blocks) {
            Map<String, Block> blocksById = new HashMap<>();
            List<Block> tables = new ArrayList<>();

            // Index every block by id and remember the tables in encounter order.
            for (Block current : blocks) {
                blocksById.put(current.id(), current);
                if (!current.blockType().equals(BlockType.TABLE)) {
                    continue;
                }
                tables.add(current);
                log.debug("added table: " + current.text());
            }

            PageModel result = new PageModel();
            if (tables.isEmpty()) {
                return null;
            }

            for (int tableIndex = 0; tableIndex < tables.size(); tableIndex++) {
                result.getTables().add(generateTable(tables.get(tableIndex), blocksById, tableIndex));
            }
            return result;
        }
    
        /**
         * Converts one TABLE block into a {@code TableModel} with id {@code "Table_<index>"}.
         *
         * <p>Rows are emitted in ascending row-index order, and every row is filled from
         * column 1 up to the widest column index seen in the table. Positions without a cell
         * become empty strings so that columns stay aligned across rows.
         *
         * @param table    the TABLE block
         * @param blockMap all blocks of the document keyed by block id
         * @param index    zero-based position of the table, used for the table id
         * @return the populated table model
         */
        private TableModel generateTable(Block table, Map<String, Block> blockMap, int index) {
            TableModel model = new TableModel();
            model.setTableId("Table_" + index);

            // Sort by row index; iterating a HashMap gives no ordering guarantee.
            Map<Integer, Map<Integer, String>> rows =
                    new java.util.TreeMap<>(getRowsColumnsMap(table, blockMap));

            int columnCount = 0;
            for (Map<Integer, String> columns : rows.values()) {
                for (Integer columnIndex : columns.keySet()) {
                    columnCount = Math.max(columnCount, columnIndex);
                }
            }

            for (Map<Integer, String> columns : rows.values()) {
                RowModel rowModel = new RowModel();
                // Textract column indices start at 1, so key 0 never exists and key
                // columnCount must be included.
                for (int column = 1; column <= columnCount; column++) {
                    rowModel.getCells().add(columns.getOrDefault(column, ""));
                }
                model.getRows().add(rowModel);
            }

            return model;
        }
    
        /**
         * Resolves the CHILD relationships of a block and groups the text of the referenced
         * cells by row index and then by column index.
         *
         * <p>Children that are missing from {@code blockMap} or that carry no row/column
         * index are skipped.
         *
         * @param block    the block whose children are read
         * @param blockMap all blocks of the document keyed by block id
         * @return row index to (column index to cell text); empty when nothing is found
         */
        private Map<Integer, Map<Integer, String>> getRowsColumnsMap(Block block, Map<String, Block> blockMap) {

            Map<Integer, Map<Integer, String>> rows = new HashMap<>();

            if (block.relationships() == null) {
                return rows;
            }

            for (Relationship relationship : block.relationships()) {
                if (!RelationshipType.CHILD.equals(relationship.type())) {
                    continue;
                }
                for (String childId : relationship.ids()) {
                    Block cell = blockMap.get(childId);
                    if (cell == null) {
                        continue;
                    }
                    // Keep the boxed values: unboxing a missing index would throw.
                    Integer rowIndex = cell.rowIndex();
                    Integer colIndex = cell.columnIndex();
                    if (rowIndex == null || colIndex == null) {
                        continue;
                    }
                    rows.computeIfAbsent(rowIndex, key -> new HashMap<>())
                            .put(colIndex, getText(cell, blockMap));
                }
            }
            return rows;
        }
    
        public String getText(Block block, Map<String, Block> blockMap) {
            String text = "";
            if (block.relationships() != null && block.relationships().size() > 0) {
                for (Relationship relationship : block.relationships()) {
                    if (relationship.type().equals(RelationshipType.CHILD)) {
                        for (String childId : relationship.ids()) {
                            Block wordBlock = blockMap.get(childId);
                            if (wordBlock != null && wordBlock.blockType() != null) {
                                if (wordBlock.blockType().equals(BlockType.WORD))) {
                                    text += wordBlock.text() + " ";
                                }
                            }
                        }
                    }
                }
            }
    
            return text;
        }