/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jasper.compiler;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
/*
* The BOM detection is derived from:
* https://svn.us.apache.org/viewvc/tomcat/trunk/java/org/apache/jasper/xmlparser/XMLEncodingDetector.java?annotate=1742248
*
* The prolog is always at least as specific as the BOM; therefore, any
* encoding specified in the prolog takes priority over the BOM.
*/
/**
 * Detects the character encoding of a byte stream by combining byte order
 * mark (BOM) sniffing on the first four bytes with the encoding declared in
 * an XML prolog, if one is present.
 * <p>
 * An encoding declared in the prolog takes priority over the encoding implied
 * by the BOM. When neither is conclusive the result defaults to UTF-8.
 * <p>
 * Note that the constructor reads from the supplied stream and does not
 * restore the stream position after the prolog has been examined.
 */
class EncodingDetector {

    private static final XMLInputFactory XML_INPUT_FACTORY;
    static {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        // Only the XML declaration is ever inspected, so DTD processing and
        // external entity resolution are never required. Switch them off so
        // the parser cannot be coaxed into resolving external resources.
        disableIfSupported(factory, XMLInputFactory.SUPPORT_DTD);
        disableIfSupported(factory, XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES);
        XML_INPUT_FACTORY = factory;
    }

    /** Number of leading bytes examined when looking for a BOM. */
    private static final int BOM_LOOKAHEAD = 4;

    /** Result used whenever the leading bytes do not identify an encoding. */
    private static final BomResult DEFAULT_BOM_RESULT = new BomResult("UTF-8", 0);

    private final String encoding;
    private final int skip;
    private final boolean encodingSpecifiedInProlog;


    /**
     * Examines the start of the given stream and records the detected
     * encoding, the BOM length and whether the encoding came from the prolog.
     *
     * @param bis the stream to examine; it is marked, read, reset and then
     *            read again by the XML parser
     *
     * @throws IOException if resetting the stream or consuming the BOM fails
     */
    EncodingDetector(BufferedInputStream bis) throws IOException {
        // The BOM is at most 4 bytes, so a 4 byte read limit is sufficient.
        bis.mark(BOM_LOOKAHEAD);

        BomResult bomResult = processBom(bis);

        // Reset the stream back to the start to allow the XML prolog detection
        // to work. Skip any BOM we discovered.
        bis.reset();
        for (int i = 0; i < bomResult.skip; i++) {
            bis.read();
        }

        String prologEncoding = getPrologEncoding(bis);
        if (prologEncoding == null) {
            encodingSpecifiedInProlog = false;
            encoding = bomResult.encoding;
        } else {
            encodingSpecifiedInProlog = true;
            encoding = prologEncoding;
        }
        skip = bomResult.skip;
    }


    /**
     * @return the encoding declared in the prolog if there was one, otherwise
     *         the encoding derived from the leading bytes of the stream
     */
    String getEncoding() {
        return encoding;
    }


    /**
     * @return the length in bytes of the BOM found at the start of the
     *         stream, or 0 if no BOM was found
     */
    int getSkip() {
        return skip;
    }


    /**
     * @return {@code true} if the reported encoding was taken from the XML
     *         prolog rather than derived from the leading bytes
     */
    boolean isEncodingSpecifiedInProlog() {
        return encodingSpecifiedInProlog;
    }


    /**
     * Sets a boolean factory property to {@code false} when the StAX
     * implementation recognises it. Unsupported properties are left at the
     * implementation default rather than failing class initialisation.
     */
    private static void disableIfSupported(XMLInputFactory factory, String property) {
        if (!factory.isPropertySupported(property)) {
            return;
        }
        try {
            factory.setProperty(property, Boolean.FALSE);
        } catch (IllegalArgumentException ignored) {
            // The implementation advertises the property but rejects the
            // value. Keep its default; detection still works.
        }
    }


    /**
     * Asks the XML parser for the encoding declared in the XML declaration.
     *
     * @param stream the stream positioned after any BOM
     *
     * @return the declared encoding, or {@code null} if the parser reports
     *         none or cannot process the start of the stream
     */
    private String getPrologEncoding(InputStream stream) {
        XMLStreamReader xmlStreamReader = null;
        try {
            xmlStreamReader = XML_INPUT_FACTORY.createXMLStreamReader(stream);
            return xmlStreamReader.getCharacterEncodingScheme();
        } catch (XMLStreamException e) {
            // Ignore. The content is not parseable as XML so there is no
            // prolog encoding to report.
            return null;
        } finally {
            if (xmlStreamReader != null) {
                try {
                    // Releases parser resources only; the underlying stream
                    // is left open for the caller.
                    xmlStreamReader.close();
                } catch (XMLStreamException ignored) {
                    // Nothing useful can be done if the close fails.
                }
            }
        }
    }


    /**
     * Reads up to four bytes from the stream and classifies them.
     *
     * @param stream the stream positioned at its first byte
     *
     * @return the classification of the leading bytes; UTF-8 with no BOM if
     *         the bytes could not be read
     */
    private static BomResult processBom(InputStream stream) {
        // Read first four bytes (or as many as are available) and determine
        // the encoding.
        try {
            final byte[] b4 = new byte[BOM_LOOKAHEAD];
            int count = 0;
            while (count < BOM_LOOKAHEAD) {
                int singleByteRead = stream.read();
                if (singleByteRead == -1) {
                    break;
                }
                b4[count] = (byte) singleByteRead;
                count++;
            }
            return parseBom(b4, count);
        } catch (IOException ioe) {
            // Failed. Fall back to the default.
            return DEFAULT_BOM_RESULT;
        }
    }


    /**
     * Classifies the leading bytes of a stream.
     *
     * @param b4    buffer holding the leading bytes
     * @param count number of valid bytes in {@code b4}
     *
     * @return the encoding implied by the bytes and the number of BOM bytes
     *         to skip (0 when the encoding was inferred without a BOM)
     */
    private static BomResult parseBom(byte[] b4, int count) {
        if (count < 2) {
            return DEFAULT_BOM_RESULT;
        }

        // UTF-16, with BOM.
        // NOTE: FF FE is matched on the first two bytes only, so a stream
        // starting FF FE 00 00 is also reported as UTF-16LE.
        int b0 = b4[0] & 0xFF;
        int b1 = b4[1] & 0xFF;
        if (b0 == 0xFE && b1 == 0xFF) {
            // UTF-16, big-endian
            return new BomResult("UTF-16BE", 2);
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            // UTF-16, little-endian
            return new BomResult("UTF-16LE", 2);
        }

        // Default to UTF-8 if we don't have enough bytes to make a good
        // determination of the encoding.
        if (count < 3) {
            return DEFAULT_BOM_RESULT;
        }

        // UTF-8 with a BOM
        int b2 = b4[2] & 0xFF;
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return new BomResult("UTF-8", 3);
        }

        // Default to UTF-8 if we don't have enough bytes to make a good
        // determination of the encoding.
        if (count < 4) {
            return DEFAULT_BOM_RESULT;
        }

        // Other encodings. No BOM. Try and ID encoding.
        int b3 = b4[3] & 0xFF;
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
            // UCS-4, big endian (1234)
            return new BomResult("ISO-10646-UCS-4", 0);
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, little endian (4321)
            return new BomResult("ISO-10646-UCS-4", 0);
        }
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
            // UCS-4, unusual octet order (2143)
            // REVISIT: What should this be?
            return new BomResult("ISO-10646-UCS-4", 0);
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
            // UCS-4, unusual octet order (3412)
            // REVISIT: What should this be?
            return new BomResult("ISO-10646-UCS-4", 0);
        }
        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
            // UTF-16, big-endian, no BOM
            // (or could turn out to be UCS-2...
            // REVISIT: What should this be?
            return new BomResult("UTF-16BE", 0);
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
            // UTF-16, little-endian, no BOM
            // (or could turn out to be UCS-2...
            return new BomResult("UTF-16LE", 0);
        }
        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
            // EBCDIC
            // a la xerces1, return CP037 instead of EBCDIC here
            return new BomResult("CP037", 0);
        }

        // Default encoding
        return DEFAULT_BOM_RESULT;
    }


    /**
     * Immutable pairing of an encoding name with the number of BOM bytes
     * that precede the content.
     */
    private static class BomResult {

        public final String encoding;
        public final int skip;

        BomResult(String encoding, int skip) {
            this.encoding = encoding;
            this.skip = skip;
        }
    }
}