SAXParseException: The declaration for the entity "ContentType" must
end with '>'.
Hi,
~
while trying to scrape, e.g., this page:
~
http://www.gutenberg.org/ebooks/18203
~
by using a custom Document Handler, I first download it and then tidy
it using JTidy
~
Everything seems OK in a browser as an XML document and JTidy reports:
~
Tidy (vers 4th August 2000) Parsing "InputStream"
InputStream: Doctype given is "-//W3C//DTD HTML 4.01//EN"
InputStream: Document content looks like HTML 4.01 Transitional
no warnings or errors were found
~
The thing is that SAX is stumbling on a line that looks totally
inoffensive to my understanding telling me:
~
SAXParseException: The declaration for the entity "ContentType" must
end with '>'.
~
Here is an outline of the involved part in my code. Could you spot
where my mistake is?
~
~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
// __
try{
SAXParserFactory SxPrsr = SAXParserFactory.newInstance();
SxPrsr.setNamespaceAware(false);
SxPrsr.setValidating(false);
// __
GutPgInfoHndlr00 GutHndlr = new GutPgInfoHndlr00();
XMLReader parser = SxPrsr.newSAXParser().getXMLReader();
parser.setContentHandler(GutHndlr);
parser.setErrorHandler(GutHndlr);
// __
Tidy tidy = new Tidy();
tidy.setNumEntities(true);
tidy.setXmlOut(true);
tidy.setErrout(new PrintWriter(new FileWriter(aTidyErrFl), true));
// __
int iIxCnt = 0, iSubDirs, iFls;
String[] aFls;
String aURL, aFOS;
File FlDir;
File Fl;
String aDir = "/media/hda4/GUTW/GUTDOWN02";
HTMLFileFilter00 HTMLFls = new HTMLFileFilter00();
// __
iSubDirs = aSubDirs.length;
iSubDirs = 1;
for(int i = 0; (i < iSubDirs); ++i){
FlDir = new File(aDir, aSubDirs[i]);
if(FlDir.exists() && FlDir.isDirectory()){
aFls = FlDir.list(HTMLFls);
// __
iFls = aFls.length;
iFls = 1;
for(int j = 0; (j < iFls); ++j){
Fl = new File(FlDir, aFls[j]);
if(Fl.exists() && Fl.isFile()){
aURL = "file://" + Fl.getAbsolutePath();
// __ first tidying page
BIS = new BufferedInputStream(new FileInputStream(new
URL(aURL).getFile()));
aFOS = Fl.getAbsolutePath() + ".jtidied";
FOS = new FileOutputStream(aFOS);
tidy.parse(BIS, FOS);
// __
FOS.close(); BIS.close();
// __ then parsing the tidied up data feed
parser.parse(aFOS);
// __
if((new File(aFOS)).delete()){ System.out.println("// __ |" +
aFOS + "| deleted!"); }
// __
++iIxCnt;
}// (Fl.exists() && Fl.isFile())
}// j
}// (FlDir.exists() && FlDir.isDirectory())
}// i
}catch(ParserConfigurationException PrsConfX){
PrsConfX.printStackTrace(System.err); }
catch(SAXException SAXX){ SAXX.printStackTrace(System.err); }
catch(IOException IOX){ IOX.printStackTrace(System.err); }
~
~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
~
and here is the SAX Exception and "GutPgInfo00Test.java:96" is th
eline where I go:
~
parser.parse(aFOS);
~
// __ Fatal error at line: |81|
org.xml.sax.SAXParseException: The declaration for the entity
"ContentType" must end with '>'.
at
com.sun.org.apache.xerces.internal.util.ErrorHandlerWrapper.createSAXParseException(ErrorHandlerWrapper.java:195)
at
com.sun.org.apache.xerces.internal.util.ErrorHandlerWrapper.fatalError(ErrorHandlerWrapper.java:174)
at
com.sun.org.apache.xerces.internal.impl.XMLErrorReporter.reportError(XMLErrorReporter.java:388)
at
com.sun.org.apache.xerces.internal.impl.XMLScanner.reportFatalError(XMLScanner.java:1411)
at
com.sun.org.apache.xerces.internal.impl.XMLDTDScannerImpl.scanEntityDecl(XMLDTDScannerImpl.java:1585)
at
com.sun.org.apache.xerces.internal.impl.XMLDTDScannerImpl.scanDecls(XMLDTDScannerImpl.java:1986)
at
com.sun.org.apache.xerces.internal.impl.XMLDTDScannerImpl.scanDTDExternalSubset(XMLDTDScannerImpl.java:320)
at
com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$DTDDriver.dispatch(XMLDocumentScannerImpl.java:1201)
at
com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$DTDDriver.next(XMLDocumentScannerImpl.java:1089)
at
com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$PrologDriver.next(XMLDocumentScannerImpl.java:1002)
at
com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl.next(XMLDocumentScannerImpl.java:648)
at
com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl.scanDocument(XMLDocumentFragmentScannerImpl.java:510)
at
com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(XML11Configuration.java:807)
at
com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(XML11Configuration.java:737)
at
com.sun.org.apache.xerces.internal.parsers.XMLParser.parse(XMLParser.java:107)
at
com.sun.org.apache.xerces.internal.parsers.AbstractSAXParser.parse(AbstractSAXParser.java:1132)
at
com.sun.org.apache.xerces.internal.jaxp.SAXParserImpl$JAXPSAXParser.parse(SAXParserImpl.java:533)
at GutPgInfo00Test.main(GutPgInfo00Test.java:96)