Amazon Web Services：如何在 Java 中使用 AWS Textract 提取 PDF 中的表格

3 日，4 小时 Questions & Answers 1355

我发现下面的文章是用python编写的

我还使用下面的文章来提取文本

但上面的文章只能帮助获取文本。我还调用了 Block 的函数“block.getBlockType()”，但即使 image/pdf 中存在表格，也没有任何一个块返回的类型是“CELL”

请帮我找到类似于“boto3”的 Java 库，用来提取所有表格

/**
 * Collects every TABLE block of a Textract result and resolves its cells.
 *
 * <p>Each returned {@link TableModel} carries the table block plus a map of
 * row index to (column index to cell text).
 *
 * @param textractModel the parsed Textract response; may be {@code null}
 * @return {@code null} when {@code textractModel} is {@code null}; otherwise the list of
 *         tables found (possibly empty, and possibly partial if an unexpected error was
 *         logged while processing)
 */
public static List<TableModel> getTablesFromTextract(TextractModel textractModel) {
    if (textractModel == null) {
        return null;
    }
    List<TableModel> tables = new ArrayList<>();
    try {
        List<BlockModel> tableBlocks = new ArrayList<>();
        Map<String, BlockModel> blockMap = new HashMap<>();
        for (BlockModel block : textractModel.getBlocks()) {
            // Literal-first equals: a block without a type must not abort the whole scan.
            if ("TABLE".equals(block.getBlockType())) {
                tableBlocks.add(block);
            }
            blockMap.put(block.getId(), block);
        }
        for (BlockModel tableBlock : tableBlocks) {
            tables.add(new TableModel(tableBlock, buildRowMap(tableBlock, blockMap)));
        }
        System.out.println("row Map " + tables.toString());
    } catch (Exception e) {
        // Best effort: report the failure and return whatever was collected so far.
        LOG.error("Could not get table from textract model", e);
    }
    return tables;
}

/**
 * Builds the row index to (column index to text) map for one TABLE block by following
 * its CHILD relationships to CELL blocks.
 */
private static Map<Long, Map<Long, String>> buildRowMap(BlockModel tableBlock, Map<String, BlockModel> blockMap) {
    Map<Long, Map<Long, String>> rowMap = new HashMap<>();
    // A table without relationships simply has no cells; do not treat that as an error.
    if (CollectionUtils.isNotEmpty(tableBlock.getRelationships())) {
        for (RelationshipModel relationship : tableBlock.getRelationships()) {
            if (!"CHILD".equals(relationship.getType())) {
                continue;
            }
            for (String id : relationship.getIds()) {
                BlockModel cell = blockMap.get(id);
                // Skip ids that are missing from the response as well as non-cell children.
                if (cell == null || !"CELL".equals(cell.getBlockType())) {
                    continue;
                }
                long rowIndex = cell.getRowIndex();
                long columnIndex = cell.getColumnIndex();
                Map<Long, String> columnMap = rowMap.get(rowIndex);
                if (columnMap == null) {
                    columnMap = new HashMap<>();
                    rowMap.put(rowIndex, columnMap);
                }
                columnMap.put(columnIndex, getCellText(cell, blockMap));
            }
        }
    }
    return rowMap;
}

/**
 * Concatenates the text of a cell's children.
 *
 * <p>Every WORD child contributes its text followed by one space; every
 * SELECTION_ELEMENT child whose status is SELECTED contributes {@code "X "}.
 *
 * @param cell     the cell block; may be {@code null}
 * @param blockMap all blocks of the response keyed by block id
 * @return the assembled text (with a trailing space after each fragment), or an empty
 *         string when the cell is {@code null} or has no relationships
 */
private static String getCellText(BlockModel cell, Map<String, BlockModel> blockMap) {
    StringBuilder text = new StringBuilder();
    try {
        if (cell != null && CollectionUtils.isNotEmpty(cell.getRelationships())) {
            for (RelationshipModel relationship : cell.getRelationships()) {
                if (!"CHILD".equals(relationship.getType())) {
                    continue;
                }
                for (String id : relationship.getIds()) {
                    BlockModel word = blockMap.get(id);
                    if (word == null) {
                        // Dangling id: ignore it instead of losing the rest of the cell.
                        continue;
                    }
                    if ("WORD".equals(word.getBlockType())) {
                        text.append(word.getText()).append(' ');
                    } else if ("SELECTION_ELEMENT".equals(word.getBlockType())
                            && "SELECTED".equals(word.getSelectionStatus())) {
                        text.append("X ");
                    }
                }
            }
        }
    } catch (Exception e) {
        // Best effort: keep the text gathered before the failure.
        LOG.error("Could not get cell text of table", e);
    }
    return text.toString();
}

public class TableModel { private BlockModel table; private Map<Long, Map<Long, String>> rowMap; public TableModel(BlockModel table, Map<Long, Map<Long, String>> rowMap) { this.table = table; this.rowMap = rowMap; } public BlockModel getTable() { return table; } public void setTable(BlockModel table) { this.table = table; } public Map<Long, Map<Long, String>> getRowMap() { return rowMap; } public void setRowMap(Map<Long, Map<Long, String>> rowMap) { this.rowMap = rowMap; } @Override public String toString() { return table.getId() + " - " + rowMap.toString(); }

# 2 楼答案

我有一个类似的实现：

public class AnalyzeDocument {

    /**
     * Creates a Textract client for region EU_WEST_2 using credentials from environment
     * variables, analyzes the given document bytes, and releases the client afterwards.
     *
     * @param content raw bytes of the document to analyze
     * @return the result of {@link #analyzeDoc(TextractClient, byte[])}
     */
    public DocumentModel startProcess(byte[] content) {

        Region region = Region.EU_WEST_2;
        // The client is only needed for this single call, so close it as soon as the
        // analysis has returned instead of leaving its resources open.
        try (TextractClient textractClient = TextractClient.builder()
                .region(region)
                .credentialsProvider(EnvironmentVariableCredentialsProvider.create())
                .build()) {
            return analyzeDoc(textractClient, content);
        }
    }

    /**
     * Sends the document bytes to Textract AnalyzeDocument with the FORMS and TABLES
     * features enabled and converts the returned blocks into a {@link DocumentModel}.
     *
     * @param textractClient client used to issue the request
     * @param content        raw bytes of the document to analyze
     * @return a document model holding one page when at least one table was found and no
     *         pages otherwise, or {@code null} when the Textract call fails with a
     *         {@link TextractException}
     */
    public DocumentModel analyzeDoc(TextractClient textractClient, byte[] content) {

        try {
            SdkBytes sourceBytes = SdkBytes.fromByteArray(content);
            Document myDoc = Document.builder().bytes(sourceBytes).build();

            List<FeatureType> featureTypes = new ArrayList<>();
            featureTypes.add(FeatureType.FORMS);
            featureTypes.add(FeatureType.TABLES);

            AnalyzeDocumentRequest analyzeDocumentRequest = AnalyzeDocumentRequest.builder()
                    .featureTypes(featureTypes)
                    .document(myDoc)
                    .build();

            AnalyzeDocumentResponse analyzeDocument = textractClient.analyzeDocument(analyzeDocumentRequest);
            List<Block> docInfo = analyzeDocument.blocks();

            DocumentModel documentModel = new DocumentModel();
            Util util = new Util();
            PageModel pageModel = util.getTableResults(docInfo);
            // getTableResults returns null when the response holds no TABLE block;
            // do not store that null as a page.
            if (pageModel != null) {
                documentModel.getPages().add(pageModel);
            }

            for (Block block : docInfo) {
                // Plain concatenation prints "null" for an unset type instead of throwing.
                log.debug("The block type is " + block.blockType());
            }
            return documentModel;
        } catch (TextractException e) {
            // Pass the exception itself so the stack trace is not lost.
            log.error("Textract AnalyzeDocument call failed", e);
        }
        return null;
    }

这是util文件：


    /**
     * Finds all TABLE blocks in a Textract response and converts each into a table of
     * the returned page.
     *
     * @param blocks all blocks of the Textract response
     * @return a page containing one table per TABLE block (ids numbered from 0 in
     *         encounter order), or {@code null} when there is no TABLE block
     */
    public PageModel getTableResults(List<Block> blocks) {
        List<Block> tableBlocks = new ArrayList<>();
        Map<String, Block> blockMap = new HashMap<>();
        for (Block block : blocks) {
            blockMap.put(block.id(), block);
            // Enum identity comparison is safe even when the block type is unset.
            if (block.blockType() == BlockType.TABLE) {
                tableBlocks.add(block);
                // Log the id: the text field is not populated for TABLE blocks.
                log.debug("added table: " + block.id());
            }
        }

        if (tableBlocks.isEmpty()) {
            return null;
        }

        PageModel page = new PageModel();
        int tableIndex = 0;
        for (Block table : tableBlocks) {
            page.getTables().add(generateTable(table, blockMap, tableIndex++));
        }
        return page;
    }

    /**
     * Converts one TABLE block into a {@link TableModel} whose rows and cells follow the
     * row and column indexes reported for the cells.
     *
     * @param table    the TABLE block to convert
     * @param blockMap all blocks of the response keyed by block id
     * @param index    running number used to build the table id {@code "Table_" + index}
     * @return the populated table model
     */
    private TableModel generateTable(Block table, Map<String, Block> blockMap, int index) {
        TableModel model = new TableModel();
        model.setTableId("Table_" + index);

        Map<Integer, Map<Integer, String>> rows = getRowsColumnsMap(table, blockMap);
        // Iterate over sorted copies of the maps. Textract row/column indexes start at 1
        // (per the Block API reference), so a 0-based counting loop over the column map
        // would yield a leading null and drop the last cell; sorting by key keeps every
        // cell and makes the order independent of HashMap iteration order.
        for (Map<Integer, String> columns : new java.util.TreeMap<Integer, Map<Integer, String>>(rows).values()) {
            RowModel rowModel = new RowModel();
            for (String cellText : new java.util.TreeMap<Integer, String>(columns).values()) {
                rowModel.getCells().add(cellText);
            }
            model.getRows().add(rowModel);
        }

        return model;
    }

    /**
     * Collects the text of every CELL child of the given block.
     *
     * @param block    the block whose CHILD relationships are followed (a TABLE block
     *                 when called from {@code generateTable})
     * @param blockMap all blocks of the response keyed by block id
     * @return cell text keyed by row index, then by column index; never {@code null}
     */
    private Map<Integer, Map<Integer, String>> getRowsColumnsMap(Block block, Map<String, Block> blockMap) {

        Map<Integer, Map<Integer, String>> rows = new HashMap<>();

        for (Relationship relationship : block.relationships()) {
            if (relationship.type() != RelationshipType.CHILD) {
                continue;
            }
            for (String childId : relationship.ids()) {
                Block cell = blockMap.get(childId);
                if (cell == null || cell.blockType() != BlockType.CELL) {
                    continue;
                }
                // Keep the boxed values: unboxing an unset index would throw.
                Integer rowIndex = cell.rowIndex();
                Integer colIndex = cell.columnIndex();
                if (rowIndex == null || colIndex == null) {
                    continue;
                }
                rows.computeIfAbsent(rowIndex, key -> new HashMap<>())
                        .put(colIndex, getText(cell, blockMap));
            }
        }
        return rows;
    }

    public String getText(Block block, Map<String, Block> blockMap) {
        String text = "";
        if (block.relationships() != null && block.relationships().size() > 0) {
            for (Relationship relationship : block.relationships()) {
                if (relationship.type().equals(RelationshipType.CHILD)) {
                    for (String childId : relationship.ids()) {
                        Block wordBlock = blockMap.get(childId);
                        if (wordBlock != null && wordBlock.blockType() != null) {
                            if (wordBlock.blockType().equals(BlockType.WORD))) {
                                text += wordBlock.text() + " ";
                            }
                        }
                    }
                }
            }
        }

        return text;
    }

Python中文网

有 Java 编程相关的问题?

Amazon Web Services：如何在 Java 中使用 AWS Textract 提取 PDF 中的表格

共 (2) 个答案

# 1 楼答案

# 2 楼答案