Skip to Main Content

Java EE (Java Enterprise Edition) General Discussion

Announcement

For appeals, questions and feedback about Oracle Forums, please email oracle-forums-moderators_us@oracle.com. Technical questions should be asked in the appropriate category. Thank you!

Can someone explain why one code works and the other one doesn't?

843834Jun 19 2006 — edited Jun 27 2006
Hi,
I have been doing a little work with XML today and I wrote the following code which did not function properly. In short, it was as if there were elements in the NodeList that disappeared after the initial call to NodeList.getElementsByTagName("span"); The code completely drops through the for loop when I make a call to getTextContent, even though it is not a controlling variable and it does not throw an exception! I'm befuddled. The second portion of code works. For what it is worth, tidy is the HTML cleaner that's been ported to java (JTidy) and parseDOM(InputStream, OutputStream) is supposed to return a Document, which it does! So why I have to call a DocumentBuilderFactory and then get a DocumentBuilder is beyond me. If I don't call Node.getTextContent() the list is processed properly and calls to toString() indicate that the class nodes are in the list! Any help would be appreciated!
import com.boeing.ict.pdemo.io.NullOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.util.Properties;

import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;

public class HTMLDocumentProcessor {
   
    // class fields
    private Properties tidyProperties   = null;
    private final String tidyConfigFile =
            "com/boeing/ict/pdemo/resources/TidyConfiguration.properties";
   
    /**
     * Creates a new instance of HTMLDocumentProcessor
     */
    public HTMLDocumentProcessor() {
        initComponents();
    }
   
    /*
     *
     */
    private void initComponents() {
        try {
            tidyProperties = new Properties();
            tidyProperties.load(ClassLoader.getSystemResourceAsStream(tidyConfigFile));
        } catch (IOException ignore) {
        }
    }
   
    /**
     *
     */
    public Document cleanPage(InputStream docStream) throws IOException {
       
        Document doc = null;
        NullOutputStream nos = new NullOutputStream(); // A NullOutputStream is
                                                       // is used to keep all the
                                                       // error output from printing
       
        // check to see if we were successful at loading properties
        if (tidyProperties.isEmpty()) {
            System.err.println("Unable to load configuration file for Tidy");
            System.err.println("Proceeding with default configuration");
        }
       
        Tidy tidy = new Tidy();
        // set some local, non-destructive settings
        tidy.setQuiet(true);
        tidy.setErrout(new PrintWriter(nos));
        tidy.setConfigurationFromProps(tidyProperties);
        doc = tidy.parseDOM(docStream, nos);

        // assuming everything has gone ok, we return the root element
        return doc;
    }

   
    /**
     *
     */
    public static void main(String[] args) {
        try {
            String fileName = "C:/tmp/metars-search.htm";
           
            File htmlFile = new File(fileName);
           
            if (!htmlFile.exists()) {
                System.err.println("File : " + fileName + " does not exist for reading");
                System.exit(0);
            }
           
            FileInputStream fis = new FileInputStream(htmlFile);
            HTMLDocumentProcessor processor = new HTMLDocumentProcessor();
           
            Document doc = processor.cleanPage(fis);
           
            if (doc == null) {
               System.out.println("cleanPage(InputStream) returned null Document");
               System.exit(0);
            }
           
            NodeList spanTags = doc.getElementsByTagName("span");
           
            int numSpanTags = spanTags.getLength();
            System.out.println("Number of <span> tags = " + numSpanTags);
           
            for (int i = 0; i < numSpanTags; i++) { // Loop falls through here!
                System.out.println("Span tag (" + i + ") = " +
                                    spanTags.item(i).getTextContent());
            }
           
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            System.exit(0);
        }
    }
}
This segment of code works!
import com.boeing.ict.pdemo.io.NullOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.util.Properties;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import org.xml.sax.SAXException;



/**
 * Class designed to remove specific notam entries from the
 * HTML document returned in a request. The document will contain
 * either formatted (HTML with CSS) or raw (HTML, pre tags). The
 * Formatted HTML will extract the paragraph body information from the
 * document in it's formatted state. The raw format will extract data
 * as simple lines of text.
 *
 * @author John M. Resler (Capt. USAF, Ret.)<br/>
 * Class : NotamExtractor<br/>
 * Compiler : Sun J2SE version 1.5.0_06<br/>
 * Date : June 15, 2006<br/>
 * Time : 11:05 AM<br/>
 */
public class HTMLDocumentProcessor {
   
    // class fields
    private Properties tidyProperties   = null;
    private final String tidyConfigFile =
            "com/boeing/ict/pdemo/resources/TidyConfiguration.properties";
   
    /**
     * Creates a new instance of HTMLDocumentProcessor
     */
    public HTMLDocumentProcessor() {
        initComponents();
    }
   
    /*
     *
     */
    private void initComponents() {
        try {
            tidyProperties = new Properties();
            tidyProperties.load(ClassLoader.getSystemResourceAsStream(tidyConfigFile));
        } catch (IOException ignore) {
        }
    }
   
    /**
     *
     */
    public Document cleanPage(InputStream docStream) throws IOException {
       
        Document doc = null;
        NullOutputStream nos = new NullOutputStream(); // A NullOutputStream is
                                                       // is used to keep all the
                                                       // error output from printing
       
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
       
        // check to see if we were successful at loading properties
        if (tidyProperties.isEmpty()) {
            System.err.println("Unable to load configuration file for Tidy");
            System.err.println("Proceeding with default configuration");
        }
       
        Tidy tidy = new Tidy();
        // set some local, non-destructive settings
        tidy.setQuiet(true);
        tidy.setErrout(new PrintWriter(nos));
        tidy.setConfigurationFromProps(tidyProperties);
        doc = tidy.parseDOM(docStream, bos);
       
        DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder docBuilder = null;
        try {
            docBuilder = docFactory.newDocumentBuilder();
        } catch (ParserConfigurationException ex) {
            ex.printStackTrace();
        }
        try {
            doc = docBuilder.parse(new ByteArrayInputStream(bos.toByteArray()));
        } catch (IOException ex) {
            ex.printStackTrace();
        } catch (SAXException ex) {
            ex.printStackTrace();
        }

        // assuming everything has gone ok, we return the root element
        return doc;
    }

   
    /**
     *
     */
    public static void main(String[] args) {
        try {
            String fileName = "C:/tmp/metars-search.htm";
           
            File htmlFile = new File(fileName);
           
            if (!htmlFile.exists()) {
                System.err.println("File : " + fileName + " does not exist for reading");
                System.exit(0);
            }
           
            FileInputStream fis = new FileInputStream(htmlFile);
            HTMLDocumentProcessor processor = new HTMLDocumentProcessor();
           
            Document doc = processor.cleanPage(fis);
           
            if (doc == null) {
               System.out.println("cleanPage(InputStream) returned null Document");
               System.exit(0);
            }
           
            NodeList spanTags = doc.getElementsByTagName("span");
            int numSpanTags = spanTags.getLength();
           
            for (int i = 0; i < numSpanTags; i++ ) {
                System.out.println(spanTags.item(i).getTextContent().trim());
            }

           
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            System.exit(0);
        }
    }
}
Comments
Locked Post
New comments cannot be posted to this locked post.
Post Details
Locked on Jul 25 2006
Added on Jun 19 2006
11 comments
377 views