有 Java 编程相关的问题?

你可以在下面搜索框中键入要查询的问题!

使用java解析xml并保留html标记

我有一个xml,我解析它并从节点之间获取数据。但是,这些数据被html标记包围。我创建了另一个xml并将这些数据放入其中。现在我必须再次解析它,以获得正确的html语法

请帮忙

public class XMLfunctions {

/**
 * Parses an XML string into a DOM document.
 *
 * @param xml XML text to parse
 * @return the parsed document, or null when the parser cannot be created,
 *         the text is not well-formed XML, or reading it fails (the reason
 *         is printed to standard output in each case)
 */
public final static Document XMLfromString(String xml){
    try {
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        // Feed the parser from the in-memory string.
        InputSource source = new InputSource(new StringReader(xml));
        return builder.parse(source);
    } catch (ParserConfigurationException pce) {
        System.out.println("XML parse error: " + pce.getMessage());
    } catch (SAXException se) {
        System.out.println("Wrong XML file structure: " + se.getMessage());
    } catch (IOException ioe) {
        System.out.println("I/O exeption: " + ioe.getMessage());
    }
    // Every failure has been reported above; callers see it as null.
    return null;
}


/** Returns the text value of an element.
  * <p>
  * A parser may deliver an element's text as several sibling nodes (for
  * example around entity references or as CDATA sections), so the values of
  * all direct text and CDATA children are concatenated instead of stopping
  * at the first text node. Text inside nested child elements is not included.
  *
  * @param elem element (it is XML tag); may be null
  * @return concatenated text of the element's direct text/CDATA children,
  *         or an empty String when elem is null or has no such children
  */
 public final static String getElementValue( Node elem ) {
     if( elem == null ){
         return "";
     }
     StringBuilder value = new StringBuilder();
     for( Node kid = elem.getFirstChild(); kid != null; kid = kid.getNextSibling() ){
         short type = kid.getNodeType();
         // CDATA sections carry character data too, but have their own node type.
         if( type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE ){
             String part = kid.getNodeValue();
             if( part != null ){
                 value.append(part);
             }
         }
     }
     return value.toString();
 }

/*Start Parsing Body */
 /**
  * Requests the Solr documents whose content_id matches the given id and
  * passes the response body to ParseXMLBodyNode.
  *
  * @param id content id to look up; it is URL-encoded before being placed in
  *           the query string so that spaces or reserved characters cannot
  *           break the request URL
  * @return the value of ParseXMLBodyNode(response, "doc"); when the request
  *         fails with an I/O error, a fixed error document is passed to
  *         ParseXMLBodyNode instead of the response
  */
 public static String getBodyXML(String id){
        String line;
        DefaultHttpClient httpClient = new DefaultHttpClient();
        try {
            // String.valueOf keeps the old "content_id:null" query for a null id.
            String encodedId = java.net.URLEncoder.encode(String.valueOf(id), "UTF-8");
            HttpPost httpPost = new HttpPost("http://192.168.1.44:9090/solr/core0/select/?q=content_id:"+encodedId+"&version=2.2&start=0&rows=10&indent=on");
            HttpResponse httpResponse = httpClient.execute(httpPost);
            HttpEntity httpEntity = httpResponse.getEntity();
            line = EntityUtils.toString(httpEntity);
        } catch (IOException e) {
            // UnsupportedEncodingException and MalformedURLException are
            // IOExceptions and were handled identically, so one clause suffices.
            line = "<results status=\"error\"><msg>Can't connect to server</msg></results>";
        } finally {
            // Release the connections held by this one-shot client.
            httpClient.getConnectionManager().shutdown();
        }
        return ParseXMLBodyNode(line,"doc");

}

/**
 * Builds a results document from an XML response.
 * <p>
 * For every element named {@code node}, the child node values of its first
 * {@code str} descendant are concatenated, cleaned with html2text and
 * emitted as one {@code <result><news>...</news></result>} entry. Matches
 * without a {@code str} descendant are skipped.
 * <p>
 * If parsing or cleaning throws, the exception is printed and whatever was
 * built so far is returned, which may be empty or an unterminated document.
 *
 * @param str  XML text to read
 * @param node tag name of the elements to extract
 * @return the generated results XML
 */
public static String ParseXMLBodyNode(String str,String node){
     StringBuffer sb = new StringBuffer();
     try {
         DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
         DocumentBuilder db = dbf.newDocumentBuilder();
         InputSource is = new InputSource();
         is.setCharacterStream(new StringReader(str));
         Document doc = db.parse(is);
         NodeList matches = doc.getElementsByTagName(node);
         // NOTE(review): count is always "1" regardless of how many results
         // follow; kept as-is because readers of this attribute are not
         // visible here - confirm before changing.
         sb.append("<results count=");
         sb.append("\"1\"");
         sb.append(">\r\n");

         for (int i = 0; i < matches.getLength(); i++) {
            Node match = matches.item(i);
            if (match.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element firstStr = (Element) ((Element) match).getElementsByTagName("str").item(0);
            if (firstStr == null) {
                // No body field in this match; nothing to report for it.
                continue;
            }
            NodeList parts = firstStr.getChildNodes();
            // A fresh buffer per match, so one result never contains the
            // text of the results before it.
            StringBuffer text = new StringBuffer();
            for (int j = 0; j < parts.getLength(); j++) {
                String value = parts.item(j).getNodeValue();
                if (value != null) {
                    text.append(value);
                }
            }
            if (parts.getLength() > 0) {
                System.out.print(parts.item(0).getNodeValue());
            }
            // NOTE(review): the cleaned HTML is embedded as raw markup, so a
            // consumer parsing this output sees it as child elements rather
            // than text. Wrapping it in CDATA or escaping it would change
            // the output format and needs the consumer to be updated too.
            sb.append("<result>");
            sb.append("<news>");
            sb.append(html2text(text.toString()));
            sb.append("</news>");
            sb.append("</result>\r\n");
         }
         sb.append("</results>");
     } catch (Exception e) {
         System.out.println("Exception........");
         e.printStackTrace();
     }
     return sb.toString();
 }

 /**
  * Cleans an HTML fragment with jsoup using the basic whitelist.
  *
  * @param html HTML text to clean
  * @return the value returned by Jsoup.clean(html, Whitelist.basic())
  */
 public static String html2text(String html) {
    return Jsoup.clean(html, Whitelist.basic());
}

初始化进程的类

public class NewsDetails extends ListActivity{

/**
 * Called when the activity is first created.
 * <p>
 * Reads the "content_id" and "title" extras from the launching intent,
 * fetches the matching news body through XMLfunctions and shows one list
 * row per result element. When the XML cannot be parsed or holds no
 * results, a toast is shown and the activity finishes.
 */
@Override
public void onCreate(Bundle savedInstanceState) {
    super.onCreate(savedInstanceState);
    setContentView(R.layout.listplaceholder);

    Intent myIntent = getIntent();
    String id = myIntent.getStringExtra("content_id");
    String title = myIntent.getStringExtra("title");

    ArrayList<HashMap<String, String>> mylist = new ArrayList<HashMap<String, String>>();

    // NOTE(review): getBodyXML issues its HTTP request synchronously, so this
    // call blocks onCreate until the server answers; consider moving it to a
    // background thread.
    String xml = XMLfunctions.getBodyXML(id);
    Document doc = XMLfunctions.XMLfromString(xml);

    // XMLfromString returns null when the XML cannot be parsed; treat that
    // the same as an empty result set instead of dereferencing null below.
    if (doc == null || XMLfunctions.numResults(doc) <= 0) {
        Toast.makeText(NewsDetails.this, "No Result Found", Toast.LENGTH_LONG).show();
        finish();
        // Nothing to display; do not go on to build the list.
        return;
    }

    NodeList nodes = doc.getElementsByTagName("result");

    for (int i = 0; i < nodes.getLength(); i++) {
        HashMap<String, String> map = new HashMap<String, String>();
        map.put("title", title);
        Element e = (Element)nodes.item(i);
        map.put("news", XMLfunctions.getValue(e, "news"));
        mylist.add(map);
    }

    ListAdapter adapter = new SimpleAdapter(this, mylist , R.layout.list_item, new String[] { "title", "news" }, new int[] { R.id.item_title, R.id.item_subtitle });

    setListAdapter(adapter);

    final ListView lv = getListView();
    lv.setTextFilterEnabled(true);
}

从jsoup转换后得到的示例xml

<results count="1">
<result>
    <news>
        <ul><li><p>as part of its growth plan,</p></li><li><p>in a bid to achieve the target</p></li><li><p>it is pointed out that most of ccl's production came from opencast mines and only 2 mt from underground (ug) mines. ccl is now trying to increase the share underground production. the board of ccl has, thus, approved the introduction of continuous mine in chiru ug at a cost of about rs 145 crore to raise this mine's production from 2 mt to 8 mt per annum.</p></li><li><p>mr ritolia said that.</p></li></ul>
    </news>
</result>
</results>

我想提取 <news> 标签之间的内容。此xml被传入XMLfunctions类中的XMLfromString(String xml)函数,然后该函数只返回“<”,正文的其余部分都丢失了

我无法获取带有html标记的正文,因此无法保留其格式


共 (1) 个答案

  1. # 1 楼答案

    一个选项是将XML CDATA节用作:

        <results count="1">
        <result>
            <news><![CDATA[ 
    <ul><li><p>as part of its growth plan,</p></li><li><p>in a bid to achieve the target</p></li><li><p>it is pointed out that most of ccl's production came from opencast mines and only 2 mt from underground (ug) mines. ccl is now trying to increase the share underground production. the board of ccl has, thus, approved the introduction of continuous mine in chiru ug at a cost of about rs 145 crore to raise this mine's production from 2 mt to 8 mt per annum.</p></li><li><p>mr ritolia said that.</p></li></ul>
    ]]>
            </news>
        </result>
        </results>
    

    然后,解析器将不会把HTML标记当作XML来解析,并允许您访问元素的原始内容。另一个选项是对HTML标记进行编码,即把所有 < 转换为 &lt;,> 转换为 &gt;,& 转换为 &amp; 等。有关编码的更多信息,请参见here