Java PDFBox:文本位置的x、y、宽度和高度相差2倍
我编写了一个简短的库,用于从PDF文档中提取锚文本的位置,以便之后把页面渲染成BufferedImage,并在其上叠加HTML表单。它可以工作,但我必须把x、y、宽度和高度都乘以2才能得到正确的结果。我目前的测试方法是:把页面渲染为BufferedImage,然后用红色画出各个锚点的边界框。问题是:为什么坐标会相差这个2倍的因子?我能否认为这个因子是常数?当然,我明白如果图像尺寸改变,x、y坐标以及宽度和高度也要相应缩放。这个缩放是在转换为图像的过程中发生的吗?
这是我的代码:
AnchorTextRipper.java
import java.awt.Rectangle;
import java.io.IOException;
import java.util.HashMap;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
/**
 * Text stripper that locates anchors of the form <code>${name}</code> in the
 * glyph stream of a PDF page and records, for each anchor name, a bounding
 * rectangle scaled into the pixel space of a rendered page image.
 *
 * <p>The rectangle starts at the position of the <code>$</code> glyph, is as
 * wide as the summed glyph widths from <code>$</code> through <code>}</code>
 * and as tall as the <code>$</code> glyph. All values are multiplied by
 * {@link #scale} before rounding.
 *
 * <p>NOTE(review): rectangles carry no page index and the scanner state is
 * kept between calls, so feeding several pages through one instance mixes
 * their anchors in a single map (same name =&gt; last one wins).
 *
 * <p>NOTE(review): {@code setSortByPosition(true)} is kept from the original,
 * but this class consumes glyphs directly in {@link #processTextPosition}
 * without delegating to the superclass, so glyphs are presumably seen in
 * content-stream order rather than sorted order -- confirm against PDFBox.
 */
public class AnchorTextRipper extends PDFTextStripper {

    /** States of the anchor scanner. */
    protected enum ScanState {
        /** Scanner must be reset before looking at the next glyph. */
        INIT,
        /** Looking for the leading <code>$</code>. */
        SEARCHING,
        /** Saw <code>$</code>; the next glyph must be <code>{</code>. */
        FOUND_POSSIBLE,
        /** Inside <code>${...</code>, collecting the anchor name. */
        SCANNING_ANCHOR,
        /** A complete anchor is pending and still has to be recorded. */
        DONE
    }

    /**
     * Default user-space-to-pixel factor. The value 2 matches the factor the
     * original code hard-coded; per the PDFBox 1.x documentation
     * PDPage.convertToImage() renders at twice the default resolution.
     */
    protected static final double DEFAULT_SCALE = 2.0;

    /** Anchor name =&gt; scaled bounding rectangle. */
    protected HashMap<String, Rectangle> anchors = new HashMap<String, Rectangle>();

    /** Factor applied to x, y, width and height before rounding. */
    protected final double scale;

    // Scanning variables
    protected ScanState state = ScanState.INIT;
    /** The <code>$</code> glyph of the anchor currently being scanned. */
    protected TextPosition lastFoundAnchor;
    /** Name collected so far between <code>{</code> and <code>}</code>. */
    protected StringBuilder lastFoundAnchorText = new StringBuilder();
    /** Summed glyph widths of the current anchor, in unscaled units. */
    protected Double lastWidth = 0.0;
    /** Rectangle of the most recently recorded anchor, null after a reset. */
    protected Rectangle lastFoundAnchorRect;

    /**
     * Creates a ripper that scales rectangles by {@link #DEFAULT_SCALE}.
     *
     * @throws IOException if the superclass constructor fails
     */
    public AnchorTextRipper() throws IOException {
        this(DEFAULT_SCALE);
    }

    /**
     * Creates a ripper with an explicit scale factor.
     *
     * @param scale factor applied to every coordinate and dimension; must be
     *              a finite value greater than zero
     * @throws IOException if the superclass constructor fails
     * @throws IllegalArgumentException if {@code scale} is not finite and positive
     */
    public AnchorTextRipper(double scale) throws IOException {
        super();
        if (!(scale > 0.0) || Double.isInfinite(scale)) {
            throw new IllegalArgumentException("scale must be finite and > 0, got " + scale);
        }
        this.scale = scale;
        this.setSortByPosition(true);
    }

    /**
     * Feeds one glyph into the anchor scanner.
     *
     * <p>Unlike the original implementation, no glyph is swallowed by the
     * reset step, a <code>$</code> that is not followed by <code>{</code>
     * abandons the candidate, and an anchor is recorded as soon as its
     * closing <code>}</code> is seen.
     *
     * @param text
     *            The text to be processed
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        String glyph = text.getCharacter();

        // DONE is never left pending by this class, but honour it in case a
        // subclass set it: record the finished anchor before moving on.
        if (state == ScanState.DONE) {
            recordAnchor();
        }
        // Reset first, then examine the current glyph in the same call so it
        // is not lost.
        if (state == ScanState.INIT) {
            resetScan();
        }

        switch (state) {
        case SEARCHING:
            if ("$".equals(glyph)) {
                startCandidate(text);
            }
            break;
        case FOUND_POSSIBLE:
            if ("{".equals(glyph)) {
                lastWidth += text.getWidth();
                state = ScanState.SCANNING_ANCHOR;
            } else if ("$".equals(glyph)) {
                // "$${x}": the second "$" is the real start of the anchor.
                startCandidate(text);
            } else {
                // Lone "$": drop the candidate instead of waiting forever.
                resetScan();
            }
            break;
        case SCANNING_ANCHOR:
            // The closing brace is part of the anchor's visual width.
            lastWidth += text.getWidth();
            if ("}".equals(glyph)) {
                state = ScanState.DONE;
                recordAnchor();
            } else if (glyph != null) {
                lastFoundAnchorText.append(glyph);
            }
            break;
        default:
            break;
        }
    }

    /** Clears all per-anchor state and starts searching for a <code>$</code>. */
    private void resetScan() {
        state = ScanState.SEARCHING;
        lastFoundAnchor = null;
        lastFoundAnchorText = new StringBuilder();
        lastWidth = 0.0;
        lastFoundAnchorRect = null;
    }

    /** Begins a new anchor candidate at the given <code>$</code> glyph. */
    private void startCandidate(TextPosition dollar) {
        lastFoundAnchor = dollar;
        lastFoundAnchorText = new StringBuilder();
        lastWidth = (double) dollar.getWidth();
        state = ScanState.FOUND_POSSIBLE;
    }

    /** Stores the scaled rectangle of the completed anchor and requests a reset. */
    private void recordAnchor() {
        if (lastFoundAnchor != null) {
            // Debug trace retained from the original implementation.
            System.out.println(String.format("%f, %f (%f, %f) [%f, %f]", lastFoundAnchor.getX(), lastFoundAnchor.getY(), lastFoundAnchor.getXScale(), lastFoundAnchor.getYScale(), lastFoundAnchor.getWidth(), lastFoundAnchor.getHeight()));

            double height = lastFoundAnchor.getHeight();
            // Scale first, round last, so all four values are treated alike.
            int x = (int) Math.round(lastFoundAnchor.getX() * scale);
            // Top edge is Y minus the glyph height (presumably Y is the
            // baseline in a top-left-origin space -- confirm against PDFBox).
            int y = (int) Math.round((lastFoundAnchor.getY() - height) * scale);
            int width = (int) Math.round(lastWidth * scale);
            int scaledHeight = (int) Math.round(height * scale);

            lastFoundAnchorRect = new Rectangle(x, y, width, scaledHeight);
            anchors.put(lastFoundAnchorText.toString(), lastFoundAnchorRect);
        }
        state = ScanState.INIT;
    }
}
AnchorTextLocatorService.java
import java.awt.Rectangle;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
/**
 * Loads a PDF file, runs every page's content stream through an
 * {@link AnchorTextRipper} and exposes the anchors it collected.
 *
 * <p>All work happens in the constructor; the document is closed again before
 * the constructor returns.
 */
public class AnchorTextLocatorService {
    /** Ripper that accumulates the anchors of all processed pages. */
    protected AnchorTextRipper ripper = new AnchorTextRipper();

    /**
     * Scans the given PDF file for anchors.
     *
     * @param filename path of the PDF file to load
     * @throws IOException if the file cannot be read or parsed, or if it is
     *                     encrypted and cannot be decrypted with an empty
     *                     password (the decryption failure is attached as the
     *                     cause)
     */
    public AnchorTextLocatorService(String filename) throws IOException {
        PDDocument document = null;
        try {
            document = PDDocument.load(filename);
            if (document.isEncrypted()) {
                // Only documents protected with an empty password are supported.
                document.decrypt("");
            }

            @SuppressWarnings("unchecked")
            List<PDPage> allPages = document.getDocumentCatalog().getAllPages();
            for (PDPage page : allPages) {
                PDStream contents = page.getContents();
                if (contents != null) {
                    ripper.processStream(page, page.findResources(), contents.getStream());
                }
            }
        } catch (CryptographyException e) {
            // Previously swallowed, which silently produced an empty anchor
            // map. Surface it through the already-declared IOException.
            IOException failure = new IOException("Unable to decrypt '" + filename + "' with an empty password");
            failure.initCause(e);
            throw failure;
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * Returns the ripper's live anchor map (anchor name =&gt; rectangle);
     * changes made to the returned map are visible to this service.
     *
     * @return the map of all anchors found
     */
    public HashMap<String, Rectangle> getAnchors() {
        return ripper.anchors;
    }

    /**
     * Looks up a single anchor by name.
     *
     * @param anchorText the anchor name, i.e. the text between the braces
     * @return the anchor's rectangle, or {@code null} if no such anchor was found
     */
    public Rectangle getAnchorRect(String anchorText) {
        return ripper.anchors.get(anchorText);
    }
}
Application.java
import java.awt.Color;
import java.awt.Graphics2D;
import java.awt.Rectangle;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.Map.Entry;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
/**
 * Demo driver: renders the first page of <code>test.pdf</code> to an image,
 * outlines every anchor found by {@link AnchorTextLocatorService} in red and
 * writes the result to <code>test.png</code>.
 */
public class Application {
    /**
     * Renders the first page, draws the anchor rectangles on it and saves the
     * image. Each anchor is also printed to standard output.
     *
     * @param args
     *            The command line arguments (unused).
     *
     * @throws Exception
     *             If there is an error loading, decrypting, rendering or
     *             writing.
     */
    public static void main(String[] args) throws Exception {
        PDDocument document = PDDocument.load("test.pdf");
        try {
            if (document.isEncrypted()) {
                document.decrypt("");
            }
            PDPage page = (PDPage) document.getDocumentCatalog().getAllPages().get(0);
            BufferedImage bi = page.convertToImage();

            AnchorTextLocatorService ats = new AnchorTextLocatorService("test.pdf");

            // One graphics context for all rectangles, released when done.
            Graphics2D g = bi.createGraphics();
            try {
                g.setColor(Color.RED);
                for (Entry<String, Rectangle> anchor : ats.getAnchors().entrySet()) {
                    System.out.println(anchor.getKey() + " => " + anchor.getValue());
                    Rectangle box = anchor.getValue();
                    g.drawRect(box.x, box.y, box.width, box.height);
                }
            } finally {
                g.dispose();
            }

            ImageIO.write(bi, "png", new File("test.png"));
        } finally {
            // The original never closed the document.
            document.close();
        }
    }
}
# 1 楼答案
https://pdfbox.apache.org/apidocs/org/apache/pdfbox/pdmodel/PDPage.html
对不起。。。我刚看了文档。。。我应该先这么做的。PDPage::convertToImage()以两倍分辨率输出。希望这对其他人有帮助。