有 Java 编程相关的问题?

你可以在下面搜索框中键入要查询的问题!

java xpath获取表中的行

我有一个html文件,比如:http://scholar.google.gr/citations?user=v9xULZwAAAAJ&hl=el

该文件中有一个包含项目的表格。我想用xpath获得前20篇文章(如果有的话)

我试图找到第一篇文章:

// Evaluates the path as a STRING. tr[2] pins one table row, so the result is the text of that row's title link.
String str = (String) xpath.evaluate("//form[contains(@id,'citationsForm')]/div[2]/div[1]/table/tbody/tr[2]/td[@id='col-title']/a", docList.get(0), XPathConstants.STRING);

这样可以正常工作!结果是:现代信息检索

对于所有文章:

// Same path without the row index, so tr matches every row. XPathConstants.STRING still converts the
// resulting node-set to the string-value of its first node only, so at most one title is returned.
String str = (String) xpath.evaluate("//form[contains(@id,'citationsForm')]/div[2]/div[1]/table/tbody/tr/td[@id='col-title']/a", docList.get(0), XPathConstants.STRING);

但这样不起作用。有什么想法吗?

谢谢!

编辑: 我还尝试:

        // Second attempt: evaluate the same path as a NODESET so that every matching <a> node is returned.
        NodeList result = (NodeList)xpath.evaluate("//form[contains(@id,'citationsForm')]/div[2]/div[1]/table/tbody/tr/td[@id='col-title']/a",
        docList.get(0), XPathConstants.NODESET);
        ArrayList<String>liste = new ArrayList<String>();
        for(int i=0; i<result.getLength();i++){
            // The path ends in /a, so each item is an element node; the DOM API defines getNodeValue() as null for elements.
            System.out.println(result.item(i).getNodeValue());
            // Collects the node name (the tag name for an element), not the link text.
            liste.add(result.item(i).getNodeName());
        }

编辑 2:全部代码

类 FileOperations:

package xmlparse;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.ParserConfigurationException;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;

/**
 * Loads the files found in a fixed directory and converts each one from HTML
 * into a W3C DOM {@link Document} via HtmlCleaner, so that it can be queried
 * with XPath.
 */
public class FileOperations {

    /** Directory whose direct child files are loaded (sub-directories are ignored). */
    private static final String path = "C:\\Users\\Dimitris\\Desktop\\authors";

    /**
     * Reads every regular file directly inside {@link #path} and converts it to a DOM document.
     *
     * <p>I/O failures are logged rather than propagated; documents converted before the
     * failure are still returned. Files whose conversion yields no document are skipped,
     * so the returned list never contains {@code null}.
     *
     * @return the converted documents, possibly empty, never {@code null}
     */
    public ArrayList<Document> getXmlDocumt() {
        ArrayList<Document> xmlFileList = new ArrayList<>();

        try {
            for (File f : listFiles(path)) {
                Document doc = ConvertHtml2Xml(readfile(f.getAbsolutePath()));
                // ConvertHtml2Xml returns null when the DOM could not be built; keep such entries out
                // of the list so callers can use every element without a null check.
                if (doc != null) {
                    xmlFileList.add(doc);
                }
            }
        } catch (IOException ex) {
            Logger.getLogger(FileOperations.class.getName()).log(Level.SEVERE, null, ex);
        }
        return xmlFileList;
    }

    /**
     * Lists the regular files directly inside the given directory.
     *
     * @param directoryName path of the directory to scan
     * @return the regular files found (directories are skipped)
     * @throws IOException if the path is not a directory or its contents cannot be listed
     */
    private ArrayList<File> listFiles(String directoryName) throws IOException {
        File directory = new File(directoryName);

        // File.listFiles() returns null (it does not throw) when the path is not a directory
        // or an I/O error occurs; report that explicitly instead of failing with an NPE below.
        File[] fList = directory.listFiles();
        if (fList == null) {
            throw new IOException("Cannot list files in directory: " + directory.getAbsolutePath());
        }

        ArrayList<File> htmlfilelist = new ArrayList<>();
        for (File file : fList) {
            if (file.isFile()) {
                htmlfilelist.add(file);
            }
        }
        return htmlfilelist;
    }

    /**
     * Reads a whole text file into a string using the platform default charset.
     *
     * @param file path of the file to read
     * @return the file content, with every line terminated by {@code '\n'}
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading fails
     */
    private String readfile(String file) throws FileNotFoundException, IOException {
        StringBuilder content = new StringBuilder(1024);
        // try-with-resources closes the reader (and the underlying FileReader) even on failure.
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = br.readLine()) != null) {
                // readLine() strips the line terminator; put one back so that tokens on
                // adjacent lines are not glued together in the resulting markup.
                content.append(line).append('\n');
            }
        }
        return content.toString();
    }

    /**
     * Cleans the given HTML with HtmlCleaner and serializes the result to a DOM document.
     *
     * @param html raw HTML markup
     * @return the DOM document, or {@code null} if the DOM could not be created
     *         (the failure is logged)
     */
    private Document ConvertHtml2Xml(String html) {
        TagNode tagNode = new HtmlCleaner().clean(html);
        Document doc = null;

        try {
            doc = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
        } catch (ParserConfigurationException ex) {
            Logger.getLogger(FileOperations.class.getName()).log(Level.SEVERE, null, ex);
        }

        return doc;
    }

}

类XpathQueries:

XPath xpath;
    ArrayList<Document> docList;

    /**
     * Creates the XPath evaluator and loads a private copy of the documents
     * produced by {@code FileOperations.getXmlDocumt()}.
     */
    public XpathQueries() {
        final FileOperations loader = new FileOperations();
        this.docList = new ArrayList<>(loader.getXmlDocumt());
        this.xpath = XPathFactory.newInstance().newXPath();
    }

    /**
     * Prints the titles of up to the first 20 article links found in the first loaded document,
     * one per line. Prints nothing when no document was loaded.
     *
     * @throws XPathExpressionException if the XPath expression cannot be evaluated
     */
    public void getArticle() throws XPathExpressionException {
        if (docList.isEmpty()) {
            return;
        }

        // Evaluate as NODESET: with XPathConstants.STRING the node-set is reduced to the
        // string-value of its first node, so only a single title would ever be returned.
        org.w3c.dom.NodeList links = (org.w3c.dom.NodeList) xpath.evaluate(
                "//*[@id='col-title']/a", docList.get(0), XPathConstants.NODESET);

        int limit = Math.min(links.getLength(), 20);
        for (int i = 0; i < limit; i++) {
            // getNodeValue() is null for element nodes; getTextContent() yields the link text.
            System.out.println(links.item(i).getTextContent());
        }
    }
}

共 (2) 个答案

  1. # 1 楼答案

    谢谢你的帮助。 解决方案是:

    // Select every link, anywhere in the document, whose href contains 'citation_for_view'.
    NodeList nodes = (NodeList) xpath.evaluate("//a[contains(@href,'citation_for_view')]", docList.get(0), XPathConstants.NODESET);

    // Print at most the first 20 matches.
    int length = Math.min(nodes.getLength(), 20);
    for (int i = 0; i < length; i++) {
        // getTextContent() also handles links that are empty or wrap their text in a child element,
        // where getFirstChild().getNodeValue() would throw a NullPointerException or print null.
        System.out.println(nodes.item(i).getTextContent());
    }
    
  2. # 2 楼答案

    试试这个:

    // Request a NODESET: with XPathConstants.STRING evaluate() returns a java.lang.String,
    // and casting that to NodeList fails with a ClassCastException.
    NodeList nodes = (NodeList) xpath.evaluate("//*[@id='col-title']/a", docList.get(0), XPathConstants.NODESET);
    for (int i = 0; i < nodes.getLength(); i++) {
        // The matches are <a> elements, for which getNodeValue() is null; print their text instead.
        System.out.println(nodes.item(i).getTextContent());
    }