Re: How to slurp/get the content of a URI?
Mark Space wrote:
So I'm no expert, and I hope I'm not wasting your time by blathering,
but the question is interesting to me so I did a bit of work on it.
Here's what I have so far.
static void method4() throws MalformedURLException, IOException {
String TEST_URL =
"http://cnn.com";
URL url = new URL(TEST_URL);
URLConnection c = url.openConnection();
String type = c.getContentType();
System.out.println("Mime type: " + type );
if( type == null || type.contains("text") )
{
String enc = c.getContentEncoding();
System.out.println( "Encoding: " + enc );
if( enc == null )
{
enc = "ISO-8859-1";
}
InputStreamReader inr = new InputStreamReader(
c.getInputStream(),
enc ); // I have no idea if http encoding
strings // will work here
List<CharBuffer> result = new ArrayList<CharBuffer>();
int byteCount = 0;
for( ;; )
{
int read;
CharBuffer cb = CharBuffer.allocate( 4 * 1024 );
if( ( read = inr.read( cb )) != -1 )
{
byteCount += read;
result.add( cb );
}
else
{
break;
}
}
System.out.println( "Read: " + byteCount );
}
else // binary
{
System.out.println("binary...");
}
}
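You also need to handle the META HTTP-EQUIV way of specifying the charset. Note too that the charset comes from the Content-Type header (its "charset=" parameter); getContentEncoding() returns the Content-Encoding header (e.g. gzip), which is something else.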
My suggestion for code:
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads an HTTP resource and decodes it to a String.
 *
 * The character set is first taken from the "charset" parameter of the HTTP
 * Content-Type header (ISO-8859-1 when absent). If the body contains a
 * {@code <META HTTP-EQUIV="Content-Type" CONTENT="...">} tag that names a
 * charset, that charset is used instead.
 */
public class HttpDownloadCharset {
    /** Charset used when neither the header nor a META tag names one. */
    private static final String DEFAULT_CHARSET = "ISO-8859-1";

    /** Size of the scratch buffer used for each read from the network. */
    private static final int READ_CHUNK = 8192;

    /** Upper bound on the capacity pre-allocated from a server-supplied Content-Length. */
    private static final int MAX_INITIAL_CAPACITY = 1000000;

    /**
     * Matches a charset parameter. The value may be quoted, and may contain
     * the punctuation found in real charset names (e.g. Shift_JIS).
     */
    private static final Pattern encpat =
        Pattern.compile("charset\\s*=\\s*[\"']?([A-Za-z0-9._:+-]+)", Pattern.CASE_INSENSITIVE);

    /** Matches a META HTTP-EQUIV Content-Type tag and captures its CONTENT value. */
    private static final Pattern metaencpat =
        Pattern.compile("<META\\s+HTTP-EQUIV\\s*=\\s*[\"']Content-Type[\"']\\s+CONTENT\\s*=\\s*[\"']([^\"']*)[\"']>",
                        Pattern.CASE_INSENSITIVE);

    /**
     * Returns the charset named in a Content-Type value.
     *
     * @param contenttype the Content-Type value, may be null
     * @return the charset name, or ISO-8859-1 when none is present
     */
    private static String parseContentType(String contenttype) {
        return parseContentType(contenttype, DEFAULT_CHARSET);
    }

    /**
     * Returns the charset named in a Content-Type value.
     *
     * @param contenttype the Content-Type value, may be null
     * @param fallback value returned when no charset parameter is present
     * @return the charset name or {@code fallback}
     */
    private static String parseContentType(String contenttype, String fallback) {
        if (contenttype == null) {
            // Servers are allowed to omit Content-Type entirely.
            return fallback;
        }
        Matcher m = encpat.matcher(contenttype);
        return m.find() ? m.group(1) : fallback;
    }

    /**
     * Looks for a META HTTP-EQUIV Content-Type tag in the document.
     *
     * @param html the document text (an ASCII view of the bytes is enough)
     * @param defenc charset to return when no META tag names one
     * @return the charset from the META tag, or {@code defenc}
     */
    private static String parseMetaContentType(String html, String defenc) {
        Matcher m = metaencpat.matcher(html);
        if (m.find()) {
            // A META tag without a charset must not discard the header's charset.
            return parseContentType(m.group(1), defenc);
        }
        return defenc;
    }

    /**
     * Reads the complete response body, however long it is.
     *
     * @param con a connected HTTP connection
     * @return exactly the bytes that were received
     * @throws IOException if reading fails
     */
    private static byte[] readBody(HttpURLConnection con) throws IOException {
        int expected = con.getContentLength();
        int capacity = expected > 0 ? Math.min(expected, MAX_INITIAL_CAPACITY) : READ_CHUNK;
        ByteArrayOutputStream out = new ByteArrayOutputStream(capacity);
        InputStream is = con.getInputStream();
        try {
            byte[] chunk = new byte[READ_CHUNK];
            int n;
            while ((n = is.read(chunk)) != -1) {
                out.write(chunk, 0, n);
            }
        } finally {
            is.close();
        }
        return out.toByteArray();
    }

    /**
     * Downloads the resource at the given URL and decodes it to text.
     *
     * @param urlstr an http or https URL
     * @return the decoded response body
     * @throws IOException if the URL is malformed, the transfer fails, or the
     *         announced charset is not supported
     * @throws IllegalArgumentException if the server does not answer 200 OK
     */
    public static String download(String urlstr) throws IOException {
        URL url = new URL(urlstr);
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        try {
            con.connect();
            if (con.getResponseCode() != HttpURLConnection.HTTP_OK) {
                // Read the message while the connection is still open.
                throw new IllegalArgumentException("URL " + urlstr + " returned " + con.getResponseMessage());
            }
            String enc = parseContentType(con.getContentType());
            byte[] body = readBody(con);
            // An ASCII view is sufficient to locate the META tag, whose
            // syntax is plain ASCII in the charsets this scan can handle.
            String temp = new String(body, "US-ASCII");
            enc = parseMetaContentType(temp, enc);
            return new String(body, enc);
        } finally {
            con.disconnect();
        }
    }
}
Arne