XML Feeds

.

[java-dev] Re: rendering HTML in Jitter

Jonny jmail at nospaces.net
Thu Jan 10 14:16:45 MST 2008


Fixed the encoding (the hard way, I think).

My version of getDocumentFromUrl follows.

//---------------------------------------

	/** Number of leading bytes inspected when looking for a meta-tag charset. */
	private static final int SNIFF_LIMIT = 512;

	/** Matches the charset parameter of a Content-Type value, optionally quoted. */
	private static final Pattern HEADER_CHARSET = Pattern.compile(
			"charset\\s*=\\s*[\"']?([^\\s;\"'>]+)", Pattern.CASE_INSENSITIVE);

	/** Matches a charset declaration inside a meta tag, optionally quoted. */
	private static final Pattern META_CHARSET = Pattern.compile(
			"<meta\\b[^>]*?charset\\s*=\\s*[\"']?([^\\s;\"'/>]+)", Pattern.CASE_INSENSITIVE);

	/**
	 * Fetches the resource at {@code uri}, works out its character encoding,
	 * parses it with {@code builder} and stores the result in {@code document}.
	 *
	 * The encoding is looked up in this order: the charset parameter of the
	 * Content-Type response header, a charset declared in a meta tag within
	 * the first {@link #SNIFF_LIMIT} bytes, then UTF-8 as the last resort.
	 * A charset name this JVM does not support also falls back to UTF-8.
	 *
	 * @param uri absolute URL of the page to load
	 * @throws Exception if the URL is malformed, the connection or read fails,
	 *         or the parser rejects the input
	 */
	private void getDocumentFromUrl(String uri) throws Exception {
		URL url = new URL(uri);
		URLConnection connection = url.openConnection();

		// Buffered so the bytes inspected while sniffing can be rewound and
		// handed to the parser; a bare read() would drop them for good.
		InputStream in = new java.io.BufferedInputStream(connection.getInputStream());
		try {
			// The charset lives in Content-Type. getContentEncoding() reports
			// the Content-Encoding header (e.g. gzip), which is not a charset.
			String encoding = charsetFromContentType(connection.getContentType());
			if (encoding == null) {
				// take a peek in the file then
				encoding = charsetFromMetaTag(in);
			}
			if (encoding == null || !isUsableCharset(encoding)) {	// no idea?
				encoding = "UTF-8";
			}
			Reader reader = new InputStreamReader(in, encoding);

			// InputSourceImpl constructor with URI recommended
			// so the renderer can resolve page component URLs.
			InputSource is = new InputSourceImpl(reader, uri);

			// This example does not perform incremental rendering.
			document = builder.parse(is);
		} finally {
			// Release the connection's stream even when parsing fails.
			in.close();
		}
	}

	/**
	 * Extracts the charset parameter from a Content-Type header value.
	 *
	 * @param contentType header value such as {@code text/html; charset=UTF-8}; may be null
	 * @return the charset label, or null when the header is absent or names none
	 */
	private static String charsetFromContentType(String contentType) {
		if (contentType == null) {
			return null;
		}
		Matcher m = HEADER_CHARSET.matcher(contentType);
		return m.find() ? m.group(1) : null;
	}

	/**
	 * Looks for a charset declared in a meta tag near the start of the stream,
	 * then rewinds the stream so no content is lost.
	 *
	 * @param in stream positioned at the start of the document; must support mark/reset
	 * @return the declared charset label, or null when none is found
	 * @throws IOException if reading or rewinding the stream fails
	 */
	private static String charsetFromMetaTag(InputStream in) throws IOException {
		byte[] head = new byte[SNIFF_LIMIT];
		int filled = 0;
		in.mark(SNIFF_LIMIT);
		// A single read() may return fewer bytes than requested, so keep
		// reading until the buffer is full or the stream ends.
		while (filled < head.length) {
			int rc = in.read(head, filled, head.length - filled);
			if (rc < 0) {
				break;
			}
			filled += rc;
		}
		in.reset();
		if (filled == 0) {
			return null;
		}
		// ISO-8859-1 maps every byte to exactly one char, so the ASCII markup
		// is readable whatever the real encoding is; decode only the bytes
		// actually read rather than the whole buffer.
		String s = new String(head, 0, filled, "ISO-8859-1");
		Matcher m = META_CHARSET.matcher(s);
		return m.find() ? m.group(1) : null;
	}

	/**
	 * Tells whether this JVM can decode the named charset.
	 *
	 * @param name charset label taken from the header or the page
	 * @return true when the name is legal and supported, false otherwise
	 */
	private static boolean isUsableCharset(String name) {
		try {
			return java.nio.charset.Charset.isSupported(name);
		} catch (IllegalArgumentException e) {
			// Thrown for syntactically illegal names; treat as unsupported.
			return false;
		}
	}



More information about the java-dev mailing list