有 Java 编程相关的问题?

你可以在下面搜索框中键入要查询的问题!

java使用ApachePDFBox在PDF中查找javascript代码

我的目标是提取和处理PDF文档可能包含的任何JavaScript代码。通过在编辑器中打开PDF,我可以看到如下对象:

    402 0 obj
<</S/JavaScript/JS(\n\r\n   /* Set day 25 */\r\n    FormRouter_SetCurrentDate\("25"\);\r)>>
endobj

我正在尝试使用ApachePDFBox来实现这一点,但到目前为止运气不佳

此行返回一个空列表:

 jsObj = doc.getObjectsByType(COSName.JAVA_SCRIPT);

谁能给我指点方向吗


共 (1) 个答案

  1. # 1 楼答案

    此工具基于PDFBox中的PrintFields示例。它会显示表单字段中的JavaScript。我去年为一个在AcroForm字段之间的关系上遇到问题的人写了这个工具(一些字段会根据其他字段的值被启用/禁用)。PDF中还有其他地方也可以使用JavaScript

    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *      http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    package pdfboxpageimageextraction;
    
    import java.io.File;
    import java.io.IOException;
    import java.util.List;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
    import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
    import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
    import org.apache.pdfbox.pdmodel.interactive.action.PDFormFieldAdditionalActions;
    import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
    import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
    import org.apache.pdfbox.pdmodel.interactive.form.PDField;
    import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
    import org.apache.pdfbox.pdmodel.interactive.form.PDTerminalField;
    
    /**
     * Prints the JavaScript attached to the AcroForm fields of a PDF document.
     * Based on the PDFBox PrintFields example.
     *
     * For every terminal field, the JavaScript found in the field's additional
     * actions (K, C, F and V) and in the action of each of its widgets is
     * written to standard output.
     *
     * @author Ben Litchfield
     *
     */
    public class PrintJavaScriptFields
    {

        /**
         * This will print the JavaScript of all the fields from the document.
         * Nothing is printed when the document has no AcroForm.
         *
         * @param pdfDocument The PDF to get the fields from.
         *
         * @throws IOException If there is an error getting the fields.
         */
        public void printFields(PDDocument pdfDocument) throws IOException
        {
            PDDocumentCatalog docCatalog = pdfDocument.getDocumentCatalog();
            PDAcroForm acroForm = docCatalog.getAcroForm();
            if (acroForm == null)
            {
                // getAcroForm() yields null for documents without an interactive form
                return;
            }

            for (PDField field : acroForm.getFields())
            {
                processField(field);
            }
        }

        /**
         * Prints the JavaScript of a terminal field, or descends into the children
         * of a non-terminal field.
         *
         * @param field The field to inspect.
         *
         * @throws IOException If there is an error getting the fields.
         */
        private void processField(PDField field) throws IOException
        {
            if (field instanceof PDTerminalField)
            {
                printTerminalFieldJS((PDTerminalField) field);
            }
            else if (field instanceof PDNonTerminalField)
            {
                for (PDField child : ((PDNonTerminalField) field).getChildren())
                {
                    processField(child);
                }
            }
        }

        /**
         * Prints the JavaScript of the additional actions of a terminal field and
         * of the action of each of its widgets.
         *
         * @param termField The terminal field to inspect.
         */
        private void printTerminalFieldJS(PDTerminalField termField)
        {
            PDFormFieldAdditionalActions fieldActions = termField.getActions();
            if (fieldActions != null)
            {
                System.out.println(termField.getFullyQualifiedName() + ": " + fieldActions.getClass().getSimpleName() + " js field actionS:\n" + fieldActions.getCOSObject());
                printPossibleJS(fieldActions.getK());
                printPossibleJS(fieldActions.getC());
                printPossibleJS(fieldActions.getF());
                printPossibleJS(fieldActions.getV());
            }
            for (PDAnnotationWidget widget : termField.getWidgets())
            {
                PDAction action = widget.getAction();
                if (action instanceof PDActionJavaScript)
                {
                    System.out.println(termField.getFullyQualifiedName() + ": " + action.getClass().getSimpleName() + " js widget action:\n" + action.getCOSObject());
                    printPossibleJS(action);
                }
            }
        }

        /**
         * Prints the script of the action followed by an empty line, if the action
         * is a JavaScript action that carries a script. Any other action, including
         * null, is ignored.
         *
         * @param kAction The action to inspect, may be null.
         */
        private void printPossibleJS(PDAction kAction)
        {
            if (!(kAction instanceof PDActionJavaScript))
            {
                return;
            }
            String jsString = ((PDActionJavaScript) kAction).getAction();
            if (jsString == null)
            {
                // a JavaScript action without a usable JS entry has nothing to show
                return;
            }
            if (!jsString.contains("\n"))
            {
                // avoid display problems with netbeans
                jsString = jsString.replace('\r', '\n').replace("\n\n", "\n");
            }
            System.out.println(jsString);
            System.out.println();
        }

        /**
         * This will read a PDF file and print out the JavaScript of its form
         * fields. <br />
         * The first command line argument, if present, is the path of the PDF
         * file; otherwise the placeholder path in the code is used.
         *
         * @param args command line arguments
         *
         * @throws IOException If there is an error reading the PDF document.
         */
        public static void main(String[] args) throws IOException
        {
            File pdfFile = args.length > 0 ? new File(args[0]) : new File("XXXX", "YYYYY.pdf");
            PDDocument pdf = null;
            try
            {
                pdf = PDDocument.load(pdfFile);
                PrintJavaScriptFields exporter = new PrintJavaScriptFields();
                exporter.printFields(pdf);
            }
            finally
            {
                if (pdf != null)
                {
                    pdf.close();
                }
            }
        }

    }
    

    作为奖励,下面是显示所有COSString对象的代码:

    /**
     * Collects every distinct COSString found in the objects of a PDF document
     * and prints them to standard output.
     */
    public class ShowAllCOSStrings
    {
        /** The distinct strings found so far; filled by crawl(). */
        static Set<COSString> strings = new HashSet<COSString>();

        /**
         * Adds all strings reachable from the given object through directly
         * nested dictionaries and arrays to the strings set. Indirect references
         * (COSObject) are not followed here; main() passes the content of every
         * indirect object of the document to this method instead.
         *
         * @param base The object to inspect, may be null.
         */
        static void crawl(COSBase base)
        {
            if (base instanceof COSString)
            {
                strings.add((COSString)base);
                return;
            }
            if (base instanceof COSDictionary)
            {
                COSDictionary dict = (COSDictionary) base;
                for (COSName key : dict.keySet())
                {
                    // Use the raw entry: getDictionaryObject() would resolve
                    // indirect references, and dictionaries that refer to each
                    // other (e.g. Parent / First) would then recurse forever.
                    crawl(dict.getItem(key));
                }
                return;
            }
            if (base instanceof COSArray)
            {
                COSArray ar = (COSArray) base;

                for (COSBase item : ar)
                {
                    crawl(item);
                }
                return;
            }
            if (base instanceof COSNull || 
                    base instanceof COSObject || 
                    base instanceof COSName || 
                    base instanceof COSNumber || 
                    base instanceof COSBoolean || 
                    base == null)
            {
                return;
            }
            // an object type this crawler does not know about
            System.out.println("huh? " + base);
        }

        /**
         * Reads a PDF file and prints all distinct strings it contains.
         * The first command line argument, if present, is the path of the PDF
         * file; otherwise the placeholder path in the code is used.
         *
         * @param args command line arguments
         *
         * @throws IOException If there is an error reading the PDF document.
         */
        public static void main(String[] args) throws IOException
        {
            File pdfFile = args.length > 0 ? new File(args[0]) : new File("XXX","YYY.pdf");
            PDDocument doc = PDDocument.load(pdfFile);
            try
            {
                for (COSObject obj : doc.getDocument().getObjects())
                {
                    COSBase base = obj.getObject();
                    //System.out.println(obj + ": " + base);
                    crawl(base);
                }
                System.out.println(strings.size() + " strings:");
                for (COSString s : strings)
                {
                    String str = s.getString();
                    if (!str.contains("\n"))
                    {
                        // avoid display problems with netbeans
                        str = str.replace('\r', '\n').replace("\n\n", "\n");
                    }
                    System.out.println(str);
                }
            }
            finally
            {
                // close the document even when crawling or printing fails
                doc.close();
            }
        }
    }
    

    但是,Javascript也可以在流中。请参见PDF规范“特定于格式副本操作的其他条目”中的JS条目:

    A text string or stream containing a JavaScript script that shall be executed when the action is triggered.

    您也可以更改上面的代码来捕获COSStream对象;COSStream是从COSDictionary扩展而来的