Re: Detect XML document encodings with SAX
On 21/11/12 14:32, Sebastian wrote:
> Does anyone have an idea why that is so? And how I could
> go about making some XML parser determine the correct encoding?
Sussed it! (Come to think of it, I feel I've sussed this before...)
The charset returned by the locator changes during parsing. At
startDocument(), it is only the assumed charset, possibly guessed from the
first four or so bytes. By the first call to startElement(), it has the
correct value. At endDocument(), it has been reset to null. There might be
an earlier event where it is already correct - I didn't investigate.
SSCCE...
import org.xml.sax.*;
import org.xml.sax.ext.*;
import org.xml.sax.helpers.*;
import java.io.*;
import java.nio.charset.*;
/**
 * Small self-contained demo that records the encoding reported by a SAX
 * {@link Locator2} at four points of a parse: document start, start and end
 * of the {@code root} element, and document end.
 *
 * <p>For every charset name given on the command line, {@link #main} builds
 * a tiny XML document in that charset, hex-dumps it, parses it with this
 * handler and prints the four recorded encodings together with the text
 * content that was read back.
 */
public class SAXEncodingDetector extends DefaultHandler {

    /**
     * Writes {@code text} to {@code out}, replacing every character that
     * {@code enc} cannot encode with a hexadecimal character reference
     * ({@code &#x...;}).
     *
     * <p>The text is processed one Unicode code point at a time rather than
     * one {@code char} at a time, so a surrogate pair is either written
     * as-is (when the charset can encode it) or replaced by a single
     * reference to the supplementary code point, never by two references to
     * its surrogate halves.
     *
     * <p>Markup characters such as {@code <} and {@code &} are not escaped.
     *
     * @param out  destination writer
     * @param enc  encoder of the charset the output will be written in
     * @param text text to write
     */
    static void escape(PrintWriter out, CharsetEncoder enc, CharSequence text) {
        final int len = text.length();
        int i = 0;
        while (i < len) {
            // codePointAt() joins a valid surrogate pair into one code point;
            // an unpaired surrogate is returned unchanged (width 1).
            final int codePoint = Character.codePointAt(text, i);
            final int width = Character.charCount(codePoint);
            final CharSequence unit = text.subSequence(i, i + width);
            if (enc.canEncode(unit)) {
                out.append(unit);
            } else {
                out.printf("&#x%x;", codePoint);
            }
            i += width;
        }
    }

    /** Sample text written into, and expected back from, each test document. */
    static final String MESSAGE = "L\u00f6we \u20ac";

    /**
     * Builds a minimal XML document containing {@link #MESSAGE} inside a
     * {@code root} element, encoded in the given charset and declaring that
     * charset name in its XML declaration.
     *
     * @param charsetName name of the charset to encode the document in
     * @return the encoded document bytes
     * @throws UnsupportedEncodingException declared for callers; the body
     *         itself resolves the charset through {@link Charset#forName}
     */
    static byte[] createXMLBytes(String charsetName)
            throws UnsupportedEncodingException {
        Charset charset = Charset.forName(charsetName);
        CharsetEncoder encoder = charset.newEncoder();
        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
        PrintWriter out =
            new PrintWriter(new OutputStreamWriter(bytesOut, charset));
        out.printf("<?xml version=\"1.0\" encoding=\"%s\" ?>%n", charsetName);
        out.print("<root>");
        escape(out, encoder, MESSAGE);
        out.println("</root>");
        // Closing flushes the writer chain into bytesOut.
        out.close();
        return bytesOut.toByteArray();
    }

    /**
     * For each charset name in {@code args}: creates a document, dumps its
     * bytes, parses it and reports the encodings seen by the handler plus
     * the recovered content.
     *
     * @param args charset names to test
     * @throws SAXException if parsing fails
     * @throws IOException  if reading the in-memory document fails
     */
    public static void main(String[] args) throws SAXException, IOException {
        for (int i = 0; i < args.length; i++) {
            String inCharset = args[i];
            byte[] bytes = createXMLBytes(inCharset);
            System.out.printf("%nCharset %s: (%d bytes)%n",
                              inCharset, bytes.length);
            printBytes(bytes, System.out);

            ByteArrayInputStream in = new ByteArrayInputStream(bytes);
            XMLReader parser = XMLReaderFactory.createXMLReader();
            // A fresh handler per document so recorded state does not leak
            // from one charset run into the next.
            SAXEncodingDetector handler = new SAXEncodingDetector();
            parser.setContentHandler(handler);
            parser.parse(new InputSource(in));

            System.out.printf("Charset at document start: %s%n",
                              handler.encodingAtDocumentStart);
            System.out.printf(" Charset at element start: %s%n",
                              handler.encodingAtElementStart);
            System.out.printf(" Charset at element end: %s%n",
                              handler.encodingAtElementEnd);
            System.out.printf(" Charset at document end: %s%n",
                              handler.encodingAtDocumentEnd);

            String content = handler.content.toString();
            System.out.println("Content: " + content);
            if (!content.equals(MESSAGE))
                System.out.println("Warning: message corrupted");
        }
    }

    // Encoding reported by the locator at each of the four observed events;
    // a field stays null if no Locator2 was supplied.
    private String encodingAtDocumentStart;
    private String encodingAtElementStart;
    private String encodingAtElementEnd;
    private String encodingAtDocumentEnd;

    /** Locator supplied by the parser, kept only if it is a Locator2. */
    private Locator2 locator;

    /** Character data collected while inside the root element. */
    private StringWriter content = new StringWriter();

    /** True between startElement and endElement of the root element. */
    private boolean inElement;

    /** Remembers the locator when it is a {@link Locator2}; others are ignored. */
    @Override
    public void setDocumentLocator(Locator locator) {
        if (locator instanceof Locator2) {
            this.locator = (Locator2) locator;
        }
    }

    /** Records the locator's encoding at the start of the document. */
    @Override
    public void startDocument() throws SAXException {
        if (locator != null) {
            this.encodingAtDocumentStart = locator.getEncoding();
        }
    }

    /** Records the locator's encoding at the end of the document. */
    @Override
    public void endDocument() throws SAXException {
        if (locator != null) {
            this.encodingAtDocumentEnd = locator.getEncoding();
        }
    }

    /**
     * On the {@code root} element: records the locator's encoding and starts
     * collecting character data.
     */
    @Override
    public void startElement(String uri, String localName,
                             String qName, Attributes atts) {
        if (localName.equals("root")) {
            if (locator != null)
                this.encodingAtElementStart = locator.getEncoding();
            inElement = true;
        }
    }

    /**
     * On the end of the {@code root} element: records the locator's encoding
     * and stops collecting character data.
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        if (localName.equals("root")) {
            if (locator != null)
                this.encodingAtElementEnd = locator.getEncoding();
            inElement = false;
        }
    }

    /** Appends character data to {@link #content} while inside the root element. */
    @Override
    public void characters(char[] ch, int start, int length) {
        if (inElement)
            content.write(ch, start, length);
    }

    /**
     * Prints a hex dump of {@code bytes}, 16 bytes per line: first the hex
     * values, then a two-column-per-byte character view.
     *
     * <p>In the character view, newline, carriage return and tab are shown
     * as {@code \n}, {@code \r} and {@code \t}; other values below 32 as
     * {@code ^} plus a letter; values 127 to 160 as two hex digits; anything
     * else as the character followed by a space. Positions past the end of
     * the data on the last line are filled with dots.
     *
     * @param bytes data to dump
     * @param out   stream to print to
     */
    static void printBytes(byte[] bytes, PrintStream out) {
        for (int major = 0; major < bytes.length; major += 16) {
            // Number of real bytes on this line (fewer than 16 on the last).
            final int lim = Math.min(major + 16, bytes.length) - major;

            // Hex columns.
            for (int minor = 0; minor < 16; minor++) {
                if (minor < lim) {
                    final int pos = major + minor;
                    out.printf("%02X ", bytes[pos]);
                } else {
                    out.print(".. ");
                }
            }

            // Character columns, two output columns per byte.
            for (int minor = 0; minor < 16; minor++) {
                if (minor < lim) {
                    final int pos = major + minor;
                    final int c = bytes[pos] & 0xff;
                    if (c == 10) {
                        out.print("\\n");
                    } else if (c == 13) {
                        out.print("\\r");
                    } else if (c == 9) {
                        out.print("\\t");
                    } else if (c < 32) {
                        // Caret notation: 1 -> ^A, 2 -> ^B, ...
                        out.printf("^%c", (char) (c + 64));
                    } else if (c >= 127 && c <= 160) {
                        out.printf("%02X", c);
                    } else {
                        out.printf("%c ", (char) c);
                    }
                } else {
                    out.print("..");
                }
            }
            out.println();
        }
    }
}
Command:
java SAXEncodingDetector US-ASCII ISO-8859-1 UTF-8 windows-1252
Output:
Charset US-ASCII: (75 bytes)
3C 3F 78 6D 6C 20 76 65 72 73 69 6F 6E 3D 22 31 < ? x m l v e r s i o n = " 1
2E 30 22 20 65 6E 63 6F 64 69 6E 67 3D 22 55 53 . 0 " e n c o d i n g = " U S
2D 41 53 43 49 49 22 20 3F 3E 0A 3C 72 6F 6F 74 - A S C I I " ? > \n< r o o t
3E 4C 26 23 78 66 36 3B 77 65 20 26 23 78 32 30 > L & # x f 6 ; w e & # x 2 0
61 63 3B 3C 2F 72 6F 6F 74 3E 0A .. .. .. .. .. a c ; < / r o o t > \n..........
Charset at document start: UTF-8
Charset at element start: US-ASCII
Charset at element end: US-ASCII
Charset at document end: null
Content: L?we ?
Charset ISO-8859-1: (72 bytes)
3C 3F 78 6D 6C 20 76 65 72 73 69 6F 6E 3D 22 31 < ? x m l v e r s i o n = " 1
2E 30 22 20 65 6E 63 6F 64 69 6E 67 3D 22 49 53 . 0 " e n c o d i n g = " I S
4F 2D 38 38 35 39 2D 31 22 20 3F 3E 0A 3C 72 6F O - 8 8 5 9 - 1 " ? > \n< r o
6F 74 3E 4C F6 77 65 20 26 23 78 32 30 61 63 3B o t > L ? w e & # x 2 0 a c ;
3C 2F 72 6F 6F 74 3E 0A .. .. .. .. .. .. .. .. < / r o o t > \n................
Charset at document start: UTF-8
Charset at element start: ISO-8859-1
Charset at element end: ISO-8859-1
Charset at document end: null
Content: L?we ?
Charset UTF-8: (63 bytes)
3C 3F 78 6D 6C 20 76 65 72 73 69 6F 6E 3D 22 31 < ? x m l v e r s i o n = " 1
2E 30 22 20 65 6E 63 6F 64 69 6E 67 3D 22 55 54 . 0 " e n c o d i n g = " U T
46 2D 38 22 20 3F 3E 0A 3C 72 6F 6F 74 3E 4C C3 F - 8 " ? > \n< r o o t > L ?
B6 77 65 20 E2 82 AC 3C 2F 72 6F 6F 74 3E 0A .. ? w e ? 82? < / r o o t > \n..
Charset at document start: UTF-8
Charset at element start: UTF-8
Charset at element end: UTF-8
Charset at document end: null
Content: L?we ?
Charset windows-1252: (67 bytes)
3C 3F 78 6D 6C 20 76 65 72 73 69 6F 6E 3D 22 31 < ? x m l v e r s i o n = " 1
2E 30 22 20 65 6E 63 6F 64 69 6E 67 3D 22 77 69 . 0 " e n c o d i n g = " w i
6E 64 6F 77 73 2D 31 32 35 32 22 20 3F 3E 0A 3C n d o w s - 1 2 5 2 " ? > \n<
72 6F 6F 74 3E 4C F6 77 65 20 80 3C 2F 72 6F 6F r o o t > L ? w e 80< / r o o
74 3E 0A .. .. .. .. .. .. .. .. .. .. .. .. .. t > \n..........................
Charset at document start: UTF-8
Charset at element start: windows-1252
Charset at element end: windows-1252
Charset at document end: null
Content: L?we ?
--
ss at comp dot lancs dot ac dot uk