Re: determining character encoding format of a file
Alan wrote:
Is there any easy way to determine what character encoding format
(e.g., UTF-8) a text file uses?
Not in general.
When the choice is between ISO-8859-1 and UTF-8 for a Western language, you can
make an educated guess.
See the attached code as a starting point (note that the
code is designed to identify text in Danish).
Arne
=============================
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * Guesses whether a text file is encoded as ISO-8859-1 or UTF-8 by counting byte values.
 *
 * <p>The heuristic only looks at the accented letters Å Æ È É Ë Ø å æ è é ë ø (the set is
 * tuned for Danish text). In ISO-8859-1 each of them is a single byte in the range 197-248;
 * in UTF-8 each becomes the lead byte 0xC3 (195) followed by a continuation byte in the
 * range 133-184. Whichever family of byte values is more frequent decides the answer.
 *
 * <p>This is a guess, not a validation: files without any of these letters (for example
 * pure ASCII) are always reported as UTF-8.
 */
public class CharSetGuesser {
    /** Byte values of Å Æ È É Ë Ø å æ è é ë ø when encoded as ISO-8859-1. */
    private static final int[] ISO_8859_1_BYTES = {
        197, 198, 200, 201, 203, 216, 229, 230, 232, 233, 235, 248
    };

    /**
     * Byte values seen when the same letters are encoded as UTF-8: each letter's
     * continuation byte (its ISO-8859-1 value minus 64) plus the shared lead byte 195 (0xC3).
     */
    private static final int[] UTF_8_BYTES = {
        133, 134, 136, 137, 139, 152, 165, 166, 168, 169, 171, 184, 195
    };

    /** Size of the chunk read from the file per I/O call. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Guesses the character encoding of a file.
     *
     * @param filename path of the file to inspect
     * @return {@code "ISO-8859-1"} if the ISO-8859-1 byte values strictly outnumber the
     *         UTF-8 ones, otherwise {@code "UTF-8"} (including ties and empty files)
     * @throws IOException if the file cannot be opened or read
     */
    public static String guess(String filename) throws IOException {
        // long counters so that very large files cannot overflow the histogram
        long[] freq = new long[256];
        byte[] buffer = new byte[BUFFER_SIZE];
        // try-with-resources closes the stream even when read() throws
        try (InputStream is = new FileInputStream(filename)) {
            int count;
            while ((count = is.read(buffer)) >= 0) {
                for (int i = 0; i < count; i++) {
                    // bytes are signed in Java; mask to get the 0-255 index
                    freq[buffer[i] & 0xFF]++;
                }
            }
        }
        if (total(freq, ISO_8859_1_BYTES) > total(freq, UTF_8_BYTES)) {
            return "ISO-8859-1";
        } else {
            return "UTF-8";
        }
    }

    /** Sums the histogram entries for the given byte values. */
    private static long total(long[] freq, int[] byteValues) {
        long sum = 0;
        for (int value : byteValues) {
            sum += freq[value];
        }
        return sum;
    }

    /**
     * Command-line entry point.
     *
     * <p>With arguments, prints {@code <file>: <encoding>} for each file named. Without
     * arguments, runs the original demo against two fixed sample paths.
     *
     * @param args optional list of files to inspect
     * @throws Exception if any of the files cannot be read
     */
    public static void main(String[] args) throws Exception {
        if (args.length == 0) {
            System.out.println(guess("C:\\iso-8859-1.txt"));
            System.out.println(guess("C:\\utf-8.txt"));
            return;
        }
        for (String filename : args) {
            System.out.println(filename + ": " + guess(filename));
        }
    }
}