Re: Detect XML document encodings with SAX
Sebastian wrote:
I discovered this post:
http://www.ibm.com/developerworks/library/x-tipsaxxni/
and implemented both approaches (SAX and Xerces XNI).
Unfortunately, for the attached XML file, both methods
output an encoding of UTF-8, while looking at the file
I tried.
And I can not get it to work either.
SAX detects UTF-8 no matter what it really is.
StAX seems never to detect and W3C DOM seems to
always detect correct.
I can not offer an explanation. Obviously the parsers
need to internally detect correct. Otherwise they
could not parse correct.
Code below.
Arne
====
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.ext.Locator2;
import org.xml.sax.helpers.XMLReaderFactory;
import org.xml.sax.helpers.DefaultHandler;
public class XmlEncodingDectect {
private static final String FNM1 = "/work/foobar1.xml";
private static final String FNM2 = "/work/foobar2.xml";
private static final String FNM3 = "/work/foobar3.xml";
private static void gen1() throws IOException {
PrintWriter pw = new PrintWriter(new FileWriter(FNM1));
pw.println("<?xml version='1.0' encoding='UTF-8'?>");
pw.println("<root/>");
pw.close();
}
private static void gen2() throws IOException {
PrintWriter pw = new PrintWriter(new FileWriter(FNM2));
pw.println("<?xml version='1.0' encoding='ISO-8859-1'?>");
pw.println("<root/>");
pw.close();
}
private static void gen3() throws IOException {
PrintWriter pw = new PrintWriter(new FileWriter(FNM3));
pw.println("<?xml version='1.0'?>");
pw.println("<root/>");
pw.close();
}
private static String encoding;
private static String detectSAX(String fnm) throws SAXException,
IOException {
XMLReader parser = XMLReaderFactory.createXMLReader();
parser.setContentHandler(new DefaultHandler() {
private Locator2 locator;
@Override
public void setDocumentLocator(Locator locator) {
if (locator instanceof Locator2) {
this.locator = (Locator2) locator;
} else {
encoding = "Unknown";
}
}
@Override
public void startDocument() throws SAXException {
if (locator != null) {
encoding = locator.getEncoding();
}
}
});
parser.parse(new InputSource(new FileInputStream(fnm)));
return encoding;
}
private static String detectW3CDOM(String fnm) throws
ParserConfigurationException, FileNotFoundException, SAXException,
IOException {
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = dbf.newDocumentBuilder();
Document doc = db.parse(new InputSource(new FileInputStream(fnm)));
String encoding = doc.getXmlEncoding();
return encoding != null ? encoding : "Unknown";
}
private static String detectStAX(String fnm) throws
FileNotFoundException, XMLStreamException {
XMLInputFactory xif = XMLInputFactory.newInstance();
XMLStreamReader xsr = xif.createXMLStreamReader(new
FileInputStream(fnm));
String encoding = null;
while(xsr.hasNext()) {
xsr.next();
switch(xsr.getEventType()) {
case XMLStreamReader.START_DOCUMENT:
encoding = xsr.getEncoding();
break;
default:
break;
}
}
return encoding != null ? encoding : "Unknown";
}
public static void main(String[] args) throws IOException,
SAXException, ParserConfigurationException, XMLStreamException {
gen1();
System.out.println(detectSAX(FNM1));
System.out.println(detectW3CDOM(FNM1));
System.out.println(detectStAX(FNM1));
gen2();
System.out.println(detectSAX(FNM2));
System.out.println(detectW3CDOM(FNM2));
System.out.println(detectStAX(FNM2));
gen3();
System.out.println(detectSAX(FNM3));
System.out.println(detectW3CDOM(FNM3));
System.out.println(detectStAX(FNM3));
}
}