How to deal with BOM in InputStream [duplicate] - java

I'm trying to read CSV files using Java. Some of the files may have a byte order mark at the beginning, but not all. When present, the byte order mark gets read along with the rest of the first line, causing problems with string comparisons.
Is there an easy way to skip the byte order mark when it is present?

EDIT: I've made a proper release on GitHub: https://github.com/gpakosz/UnicodeBOMInputStream
Here is a class I coded a while ago; I just edited the package name before pasting. Nothing special, it is quite similar to solutions posted in Sun's bug database. Incorporate it into your code and you're fine.
/* ____________________________________________________________________________
*
* File: UnicodeBOMInputStream.java
* Author: Gregory Pakosz.
* Date: 02 - November - 2005
* ____________________________________________________________________________
*/
package com.stackoverflow.answer;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
/**
* The <code>UnicodeBOMInputStream</code> class wraps any
* <code>InputStream</code> and detects the presence of any Unicode BOM
* (Byte Order Mark) at its beginning, as defined by
* RFC 3629 - UTF-8, a transformation format of ISO 10646
*
* <p>The
* Unicode FAQ
* defines 5 types of BOMs:<ul>
* <li><pre>00 00 FE FF = UTF-32, big-endian</pre></li>
* <li><pre>FF FE 00 00 = UTF-32, little-endian</pre></li>
* <li><pre>FE FF = UTF-16, big-endian</pre></li>
* <li><pre>FF FE = UTF-16, little-endian</pre></li>
* <li><pre>EF BB BF = UTF-8</pre></li>
* </ul></p>
*
* <p>Use the {@link #getBOM()} method to know whether a BOM has been detected
* or not.
* </p>
* <p>Use the {@link #skipBOM()} method to remove the detected BOM from the
* wrapped <code>InputStream</code> object.</p>
*/
public class UnicodeBOMInputStream extends InputStream
{
/**
* Type safe enumeration class that describes the different types of Unicode
* BOMs.
*/
public static final class BOM
{
/**
* NONE.
*/
public static final BOM NONE = new BOM(new byte[]{},"NONE");
/**
* UTF-8 BOM (EF BB BF).
*/
public static final BOM UTF_8 = new BOM(new byte[]{(byte)0xEF,
(byte)0xBB,
(byte)0xBF},
"UTF-8");
/**
* UTF-16, little-endian (FF FE).
*/
public static final BOM UTF_16_LE = new BOM(new byte[]{ (byte)0xFF,
(byte)0xFE},
"UTF-16 little-endian");
/**
* UTF-16, big-endian (FE FF).
*/
public static final BOM UTF_16_BE = new BOM(new byte[]{ (byte)0xFE,
(byte)0xFF},
"UTF-16 big-endian");
/**
* UTF-32, little-endian (FF FE 00 00).
*/
public static final BOM UTF_32_LE = new BOM(new byte[]{ (byte)0xFF,
(byte)0xFE,
(byte)0x00,
(byte)0x00},
"UTF-32 little-endian");
/**
* UTF-32, big-endian (00 00 FE FF).
*/
public static final BOM UTF_32_BE = new BOM(new byte[]{ (byte)0x00,
(byte)0x00,
(byte)0xFE,
(byte)0xFF},
"UTF-32 big-endian");
/**
* Returns a <code>String</code> representation of this <code>BOM</code>
* value.
*/
public final String toString()
{
return description;
}
/**
* Returns the bytes corresponding to this <code>BOM</code> value.
*/
public final byte[] getBytes()
{
final int length = bytes.length;
final byte[] result = new byte[length];
// Make a defensive copy
System.arraycopy(bytes,0,result,0,length);
return result;
}
private BOM(final byte bom[], final String description)
{
assert(bom != null) : "invalid BOM: null is not allowed";
assert(description != null) : "invalid description: null is not allowed";
assert(description.length() != 0) : "invalid description: empty string is not allowed";
this.bytes = bom;
this.description = description;
}
final byte bytes[];
private final String description;
} // BOM
/**
* Constructs a new <code>UnicodeBOMInputStream</code> that wraps the
* specified <code>InputStream</code>.
*
* @param inputStream an <code>InputStream</code>.
*
* @throws NullPointerException when <code>inputStream</code> is
* <code>null</code>.
* @throws IOException on reading from the specified <code>InputStream</code>
* when trying to detect the Unicode BOM.
*/
public UnicodeBOMInputStream(final InputStream inputStream) throws NullPointerException,
IOException
{
if (inputStream == null)
throw new NullPointerException("invalid input stream: null is not allowed");
in = new PushbackInputStream(inputStream,4);
final byte bom[] = new byte[4];
final int read = in.read(bom);
switch(read)
{
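// Note: when read == 4 but neither UTF-32 BOM matches, execution intentionally
// falls through to the shorter 3- and 2-byte BOM checks below.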
case 4:
if ((bom[0] == (byte)0xFF) &&
(bom[1] == (byte)0xFE) &&
(bom[2] == (byte)0x00) &&
(bom[3] == (byte)0x00))
{
this.bom = BOM.UTF_32_LE;
break;
}
else
if ((bom[0] == (byte)0x00) &&
(bom[1] == (byte)0x00) &&
(bom[2] == (byte)0xFE) &&
(bom[3] == (byte)0xFF))
{
this.bom = BOM.UTF_32_BE;
break;
}
case 3:
if ((bom[0] == (byte)0xEF) &&
(bom[1] == (byte)0xBB) &&
(bom[2] == (byte)0xBF))
{
this.bom = BOM.UTF_8;
break;
}
case 2:
if ((bom[0] == (byte)0xFF) &&
(bom[1] == (byte)0xFE))
{
this.bom = BOM.UTF_16_LE;
break;
}
else
if ((bom[0] == (byte)0xFE) &&
(bom[1] == (byte)0xFF))
{
this.bom = BOM.UTF_16_BE;
break;
}
default:
this.bom = BOM.NONE;
break;
}
if (read > 0)
in.unread(bom,0,read);
}
/**
* Returns the <code>BOM</code> that was detected in the wrapped
* <code>InputStream</code> object.
*
* @return a <code>BOM</code> value.
*/
public final BOM getBOM()
{
// BOM type is immutable.
return bom;
}
/**
* Skips the <code>BOM</code> that was found in the wrapped
* <code>InputStream</code> object.
*
* @return this <code>UnicodeBOMInputStream</code>.
*
* @throws IOException when trying to skip the BOM from the wrapped
* <code>InputStream</code> object.
*/
public final synchronized UnicodeBOMInputStream skipBOM() throws IOException
{
if (!skipped)
{
in.skip(bom.bytes.length);
skipped = true;
}
return this;
}
/**
* {@inheritDoc}
*/
public int read() throws IOException
{
return in.read();
}
/**
* {@inheritDoc}
*/
public int read(final byte b[]) throws IOException,
NullPointerException
{
return in.read(b,0,b.length);
}
/**
* {@inheritDoc}
*/
public int read(final byte b[],
final int off,
final int len) throws IOException,
NullPointerException
{
return in.read(b,off,len);
}
/**
* {@inheritDoc}
*/
public long skip(final long n) throws IOException
{
return in.skip(n);
}
/**
* {@inheritDoc}
*/
public int available() throws IOException
{
return in.available();
}
/**
* {@inheritDoc}
*/
public void close() throws IOException
{
in.close();
}
/**
* {@inheritDoc}
*/
public synchronized void mark(final int readlimit)
{
in.mark(readlimit);
}
/**
* {@inheritDoc}
*/
public synchronized void reset() throws IOException
{
in.reset();
}
/**
* {@inheritDoc}
*/
public boolean markSupported()
{
return in.markSupported();
}
private final PushbackInputStream in;
private final BOM bom;
private boolean skipped = false;
} // UnicodeBOMInputStream
And you're using it this way:
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
public final class UnicodeBOMInputStreamUsage
{
public static void main(final String[] args) throws Exception
{
FileInputStream fis = new FileInputStream("test/offending_bom.txt");
UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(fis);
System.out.println("detected BOM: " + ubis.getBOM());
System.out.print("Reading the content of the file without skipping the BOM: ");
InputStreamReader isr = new InputStreamReader(ubis);
BufferedReader br = new BufferedReader(isr);
System.out.println(br.readLine());
br.close();
isr.close();
ubis.close();
fis.close();
fis = new FileInputStream("test/offending_bom.txt");
ubis = new UnicodeBOMInputStream(fis);
isr = new InputStreamReader(ubis);
br = new BufferedReader(isr);
ubis.skipBOM();
System.out.print("Reading the content of the file after skipping the BOM: ");
System.out.println(br.readLine());
br.close();
isr.close();
ubis.close();
fis.close();
}
} // UnicodeBOMInputStreamUsage

The Apache Commons IO library has an InputStream that can detect and discard BOMs: BOMInputStream (javadoc):
BOMInputStream bomIn = new BOMInputStream(in);
int firstNonBOMByte = bomIn.read(); // Skips BOM
if (bomIn.hasBOM()) {
// has a UTF-8 BOM
}
If you also need to detect different encodings, it can distinguish among the various byte-order marks, e.g. UTF-8 vs. UTF-16 big and little endian; details at the doc link above. You can then use the detected ByteOrderMark to choose a Charset to decode the stream. (There's probably a more streamlined way to do this if you need all of this functionality; maybe the UnicodeReader in BalusC's answer?) Note that, in general, there's not a very good way to detect what encoding some bytes are in, but if the stream starts with a BOM, apparently this can be helpful.
Edit: If you need to detect the BOM in UTF-16, UTF-32, etc, then the constructor should be:
new BOMInputStream(is, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE,
ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE)
Upvote @martin-charlesworth's comment :)
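As a sketch of how the detected BOM can drive the charset choice (this uses Commons IO's getBOMCharsetName(), which returns null when no BOM was found; the UTF-8 fallback and the file name are assumptions):
BOMInputStream bomIn = new BOMInputStream(new FileInputStream("data.csv"),
        ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE);
String charsetName = bomIn.getBOMCharsetName(); // null if no BOM was detected
Reader reader = new InputStreamReader(bomIn,
        charsetName != null ? charsetName : "UTF-8"); // assumed fallback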

A simpler solution:
import java.io.IOException;
import java.io.Reader;

public class BOMSkipper
{
    public static void skip(Reader reader) throws IOException
    {
        reader.mark(1);
        char[] possibleBOM = new char[1];
        int read = reader.read(possibleBOM);
        // Rewind unless exactly one char was read and it was the BOM.
        if (read != 1 || possibleBOM[0] != '\ufeff')
        {
            reader.reset();
        }
    }
}
Usage sample:
BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(file), fileExpectedCharset));
BOMSkipper.skip(input);
//Now UTF prefix not present:
input.readLine();
...
It works with all 5 UTF encodings!
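Note that this relies on the Reader supporting mark/reset: a bare InputStreamReader does not, but wrapping it in a BufferedReader, as in the sample above, does.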

Google Data API has a UnicodeReader which automagically detects the encoding.
You can use it instead of InputStreamReader. Here's a slightly compacted extract of its source, which is pretty straightforward:
public class UnicodeReader extends Reader {
private static final int BOM_SIZE = 4;
private final InputStreamReader reader;
/**
* Construct UnicodeReader
* @param in Input stream.
* @param defaultEncoding Default encoding to be used if BOM is not found,
* or <code>null</code> to use system default encoding.
* @throws IOException If an I/O error occurs.
*/
public UnicodeReader(InputStream in, String defaultEncoding) throws IOException {
byte bom[] = new byte[BOM_SIZE];
String encoding;
int unread;
PushbackInputStream pushbackStream = new PushbackInputStream(in, BOM_SIZE);
int n = pushbackStream.read(bom, 0, bom.length);
// Read ahead four bytes and check for BOM marks.
if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
encoding = "UTF-8";
unread = n - 3;
} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
encoding = "UTF-16BE";
unread = n - 2;
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
encoding = "UTF-16LE";
unread = n - 2;
} else if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
encoding = "UTF-32BE";
unread = n - 4;
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
encoding = "UTF-32LE";
unread = n - 4;
} else {
encoding = defaultEncoding;
unread = n;
}
// Unread bytes if necessary and skip BOM marks.
if (unread > 0) {
pushbackStream.unread(bom, (n - unread), unread);
} else if (unread < -1) {
pushbackStream.unread(bom, 0, 0);
}
// Use given encoding.
if (encoding == null) {
reader = new InputStreamReader(pushbackStream);
} else {
reader = new InputStreamReader(pushbackStream, encoding);
}
}
public String getEncoding() {
return reader.getEncoding();
}
public int read(char[] cbuf, int off, int len) throws IOException {
return reader.read(cbuf, off, len);
}
public void close() throws IOException {
reader.close();
}
}
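A minimal usage sketch (the file name is hypothetical, and UTF-8 is the assumed fallback when no BOM is present):
UnicodeReader ur = new UnicodeReader(new FileInputStream("data.csv"), "UTF-8");
System.out.println("detected encoding: " + ur.getEncoding());
BufferedReader br = new BufferedReader(ur);
String firstLine = br.readLine(); // the BOM has already been consumed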

The Apache Commons IO library's BOMInputStream has already been mentioned by @rescdsk, but I did not see a mention of how to get an InputStream without the BOM.
Here's how I did it in Scala.
import java.io._
val file = new File(path_to_xml_file_with_BOM)
val fileInpStream = new FileInputStream(file)
val bomIn = new BOMInputStream(fileInpStream,
false); // false means don't include BOM

To simply remove the BOM characters from your file, I recommend using Apache Commons IO:
public BOMInputStream(InputStream delegate,
boolean include)
Constructs a new BOM InputStream that detects a ByteOrderMark.UTF_8 and optionally includes it.
Parameters:
delegate - the InputStream to delegate to
include - true to include the UTF-8 BOM or false to exclude it
Set include to false and your BOM characters will be excluded.
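For example (a minimal sketch; the inputStream variable is assumed):
BOMInputStream bomIn = new BOMInputStream(inputStream, false); // BOM is consumed, not returned
Excluding the BOM is also the default behavior when the include flag is omitted.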

Regrettably not. You'll have to identify and skip it yourself. This page details what you have to watch for. Also see this SO question for more details.

Here is my code to read CSV files in most character sets. It should cover 99% of situations.
try(InputStream inputStream = new FileInputStream(csvFile);){
BOMInputStream bomInputStream = new BOMInputStream(inputStream ,ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
Charset charset;
if(!bomInputStream.hasBOM()) charset = StandardCharsets.UTF_8;
else if(bomInputStream.hasBOM(ByteOrderMark.UTF_8)) charset = StandardCharsets.UTF_8;
else if(bomInputStream.hasBOM(ByteOrderMark.UTF_16LE)) charset = StandardCharsets.UTF_16LE;
else if(bomInputStream.hasBOM(ByteOrderMark.UTF_16BE)) charset = StandardCharsets.UTF_16BE;
else { throw new Exception("The charset of the file " + csvFile + " is not supported.");}
try(Reader streamReader = new InputStreamReader(bomInputStream, charset);
BufferedReader bufferedReader = new BufferedReader(streamReader);) {
for(String line; (line = bufferedReader.readLine()) != null; ) {
String[] columns = line.split(",");
//read csv columns
}
}
}
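Note that this example deliberately throws for the UTF-32 BOMs: StandardCharsets defines no UTF-32 constants, although Charset.forName("UTF-32LE") and Charset.forName("UTF-32BE") are available in typical JREs if you need to support them.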

IMO none of the given answers is really satisfying. Just skipping the BOM and then reading the rest of the stream in the current platform's default encoding is definitely wrong. Remember: the platform defaults on Unix/Linux and Windows differ: the former is UTF-8, the latter is ANSI. Such a solution only works if the rest of the stream (after the BOM) contains only 7-bit ASCII characters (which, I admit, is true of most programmer-oriented files such as configurations). But as soon as there are non-ASCII characters, this approach will fail.
That's why all Java classes/methods that can convert byte arrays/streams to strings (and vice versa) take a second parameter indicating the encoding to be used (Reader, Writer, Scanner, String.getBytes(), etc.).
There are so many character encodings out in the world, not only UTF-xx. And still, in the current year 2021, there are so many encoding problems between end-user applications, especially when they run on different platforms (iOS, Windows, Unix). All these problems exist only because the programmer was too lazy to learn how character encoding works.
Thus, it's an absolute MUST to first determine the encoding to be used, and then perform the string/stream conversion using that encoding. Consulting the respective specification(s) is the first step. And only if you cannot be sure which encoding you will encounter while reading a stream do you have to evaluate it yourself. But caution: such an evaluation will always be only a 'best guess'; there is no algorithm that can cover all possibilities.
In this sense, Lee's answer (and code example) from Feb 6, 2021 is IMO the best one, except that it falls back to UTF-8 if there is no BOM.

I had the same problem, and because I wasn't reading in a bunch of files, I used a simpler solution. I think my encoding was UTF-8, because when I printed out the offending character with the help of this page: Get unicode value of a character, I found that it was \ufeff. I used the code System.out.println( "\\u" + Integer.toHexString(str.charAt(0) | 0x10000).substring(1) ); to print out the offending Unicode value.
Once I had the offending Unicode value, I replaced it in the first line of my file before going on reading. The business logic of that section:
String str = reader.readLine().trim();
str = str.replace("\ufeff", "");
This fixed my problem. Then I was able to go on processing the file with no issue. I added trim() just in case of leading or trailing whitespace; you can do that or not, based on your specific needs.
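This works because a UTF-8 BOM, once the file is decoded as UTF-8, surfaces as the single character U+FEFF at the start of the first line, so a plain string replace removes it.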

Notepad++ is a good tool for converting between UTF-8 and UTF-8-BOM encodings.
https://notepad-plus-plus.org/downloads/
UTF8BOMTester.java
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;

public class UTF8BOMTester {
    public static void main(String[] args) throws FileNotFoundException, IOException {
        File file = new File("test.txt");
        boolean same = UTF8BOMInputStream.isSameEncodingType(file);
        System.out.println(same);
        if (same) {
            UTF8BOMInputStream is = new UTF8BOMInputStream(file);
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            System.out.println(br.readLine());
        }
    }

    static void bytesPrint(byte[] b) {
        for (byte a : b)
            System.out.printf("%x ", a);
    }
}
UTF8BOMInputStream.java
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

public class UTF8BOMInputStream extends InputStream {
    byte[] SYMBLE_BOM = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
    FileInputStream fis;
    final boolean isSameEncodingType;

    public UTF8BOMInputStream(File file) throws IOException {
        FileInputStream fis = new FileInputStream(file);
        byte[] symble = new byte[3];
        fis.read(symble); // consumes the first three bytes (the BOM, if present)
        bytesPrint(symble);
        isSameEncodingType = isSameEncodingType(symble);
        if (isSameEncodingType)
            this.fis = fis;
        else
            this.fis = null; // caution: read() will fail when no UTF-8 BOM was found
    }

    @Override
    public int read() throws IOException {
        return fis.read();
    }

    void bytesPrint(byte[] b) {
        for (byte a : b)
            System.out.printf("%x ", a);
    }

    boolean bytesCompare(byte[] a, byte[] b) {
        if (a.length != b.length)
            return false;
        for (int i = 0; i < a.length; i++) {
            if (a[i] != b[i])
                return false;
        }
        return true;
    }

    boolean isSameEncodingType(byte[] symble) {
        return bytesCompare(symble, SYMBLE_BOM);
    }

    public static boolean isSameEncodingType(File file) throws IOException {
        return (new UTF8BOMInputStream(file)).isSameEncodingType;
    }
}

Related

why is my program not reading/writing the bits?

I'm currently creating a Huffman compression program,
but I'm having some trouble with writing/reading the bits.
I want to be able to write specific bits to a file.
E.g. first "0100" then "0101" should be written as a byte to a new file using FileOutputStream as "01000101" (decimal 69).
Class BitFileWriter - writes bits to file by saving each byte in a buffer and then writing when the buffer is full (contains 8 bits).
In the main function of this class I have some tests to see if all bytes will be written to file.
But opening the text file, it doesn't read "AB".
/**
* Writes bits to a file output stream.
*/
public class BitFileWriter {
private BufferedOutputStream out;
private int buffer; // 8-bit buffer of bits to write out
private int n; // number of bits remaining in buffer
private String filename;
public BitFileWriter(String filename){
this.filename = filename;
}
private void addBitToBuffer(boolean bit) throws IOException {
// add bit to buffer
this.buffer <<= 1;
if (bit) this.buffer |= 1;
n++;
//if buffer is full write a whole byte.
if(n == 8){
writeByte(this.buffer);
this.n = 0;
this.buffer = 0;
}
}
private void writeByte(int b) throws IOException {
this.out = new BufferedOutputStream(new
FileOutputStream(filename));
out.write(b);
}
public void flush() throws IOException {
this.out.flush();
}
public static void main(String[] args) throws IOException {
BitFileWriter bitFileWriter = new
BitFileWriter("./src/result.txt");
// byte: 01000001, A
bitFileWriter.addBitToBuffer(true);
bitFileWriter.addBitToBuffer(true);
bitFileWriter.addBitToBuffer(false);
bitFileWriter.addBitToBuffer(false);
bitFileWriter.addBitToBuffer(false);
bitFileWriter.addBitToBuffer(false);
bitFileWriter.addBitToBuffer(true);
bitFileWriter.addBitToBuffer(false);
//byte 01000011, B
bitFileWriter.addBitToBuffer(false);
bitFileWriter.addBitToBuffer(true);
bitFileWriter.addBitToBuffer(true);
bitFileWriter.addBitToBuffer(false);
bitFileWriter.addBitToBuffer(false);
bitFileWriter.addBitToBuffer(false);
bitFileWriter.addBitToBuffer(false);
bitFileWriter.addBitToBuffer(false);
bitFileWriter.flush();
}
}
Class BitFileReader - reads bits from file.
But reading all 16 bits that I wanted to write to result.txt doesn't give me the bits I (think I) have written.
/**
* Reads one bit at a time from a file.
*
*
*/
public class BitFileReader {
private BufferedInputStream in;
private int currentByte; // -1 if no more data
private int bitPos; // position in currentByte
/**
* Creates a BitFileReader by opening a connection to an actual file,
* the file named by the File object file in the file system.
*/
public BitFileReader(File file) throws IOException {
in = new BufferedInputStream(new FileInputStream(file));
currentByte = in.read();
bitPos = 7;
}
/** Returns true if this reader has another bit in its input. */
public boolean hasNextBit() {
return currentByte != -1 && in != null;
}
/** Reads a single bit. */
public int nextBit() throws IOException {
int res = (currentByte>>bitPos) & 1;
--bitPos;
if (bitPos < 0) {
currentByte = in.read(); // -1 if end of file has been reached (read returns -1 if end of file).
bitPos = 7;
}
return res ;
}
/** Closes this reader. */
public void close() throws IOException {
if (in != null) {
in.close();
}
}
//Test
public static void main(String[] args) throws IOException {
File temp;
BitFileReader reader;
reader = new BitFileReader(new File("./src/result.txt"));
System.out.print("first byte: ");
for(int i = 0; i <8; i++){
System.out.print(reader.nextBit());
}
System.out.print(". second byte: ");
for(int i = 0; i <8; i++){
System.out.print(reader.nextBit());
}
reader.close();
}
}
Output is: first byte: 01100000. second byte: 11111111
The first thing I would do is move the statement:
this.out = new BufferedOutputStream(new FileOutputStream(filename));
from writeByte to the constructor.
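A minimal sketch of that change (keeping the question's field names). Opening the stream once means successive writeByte calls append to the same stream, instead of reopening, and thereby truncating, the file on every call:
public BitFileWriter(String filename) throws IOException {
    this.filename = filename;
    // Open the output stream once; it stays open for the writer's lifetime.
    this.out = new BufferedOutputStream(new FileOutputStream(filename));
}

private void writeByte(int b) throws IOException {
    out.write(b);
}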

Optimizing a Java server that is using data compression

For a multi-client server program, I'm using a wrapper for java.util.zip.Inflater and Deflater that I found online. It seems that, because I'm frequently transferring large amounts of data in the form of ImageIcons, using these zipping methods speeds up my program significantly.
One thing I noticed, however, while trying to optimize my program, is that the server is under heavy CPU load while transferring data among clients. The culprit is the server spending unnecessary CPU time unzipping objects sent by a client and re-zipping them to send to other clients.
This crude schematic of mine may explain what is happening more clearly (schematic image not reproduced here):
My question:
How can I send the raw compressed data that a client sends to the server directly to other clients without decompressing and compressing on the server side?
I'm not at all familiar with IO streams (I only code for a hobby) so I am stuck clueless. Anyone got any good resources that cover this area?
Below is the code that I am using on both server and client side to send and receive compressed data.
Creating a compressor
new ObjectOutputStream(
new BufferedOutputStream(
new CompressedBlockOutputStream(
socket.getOutputStream(), 1024)));
Creating a decompressor
new ObjectInputStream(
new BufferedInputStream(
new CompressedBlockInputStream(
socket.getInputStream())));
The code for the CompressedBlock(Input/Output)Streams is below.
Code that I copied from a source described in the license.
CompressedBlockInputStream.java
import java.io.EOFException;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
/**
* Input stream that decompresses data.
*
* Copyright 2005 - Philip Isenhour - http://javatechniques.com/
*
* This software is provided 'as-is', without any express or
* implied warranty. In no event will the authors be held liable
* for any damages arising from the use of this software.
*
* Permission is granted to anyone to use this software for any
* purpose, including commercial applications, and to alter it and
* redistribute it freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you
* must not claim that you wrote the original software. If you
* use this software in a product, an acknowledgment in the
* product documentation would be appreciated but is not required.
*
* 2. Altered source versions must be plainly marked as such, and
* must not be misrepresented as being the original software.
*
* 3. This notice may not be removed or altered from any source
* distribution.
*
* $Id: 1.2 2005/10/26 17:40:19 isenhour Exp $
*/
public class CompressedBlockInputStream extends FilterInputStream {
/**
* Buffer of compressed data read from the stream
*/
private byte[] inBuf = null;
/**
* Length of data in the input data
*/
private int inLength = 0;
/**
* Buffer of uncompressed data
*/
private byte[] outBuf = null;
/**
* Offset and length of uncompressed data
*/
private int outOffs = 0;
private int outLength = 0;
/**
* Inflater for decompressing
*/
private Inflater inflater = null;
public CompressedBlockInputStream(InputStream is) {
super(is);
inflater = new Inflater();
}
private void readAndDecompress() throws IOException {
// Read the length of the compressed block
int ch1 = in.read();
int ch2 = in.read();
int ch3 = in.read();
int ch4 = in.read();
if ((ch1 | ch2 | ch3 | ch4) < 0)
throw new EOFException();
inLength = ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0));
ch1 = in.read();
ch2 = in.read();
ch3 = in.read();
ch4 = in.read();
if ((ch1 | ch2 | ch3 | ch4) < 0)
throw new EOFException();
outLength = ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0));
// Make sure we've got enough space to read the block
if ((inBuf == null) || (inLength > inBuf.length)) {
inBuf = new byte[inLength];
}
if ((outBuf == null) || (outLength > outBuf.length)) {
outBuf = new byte[outLength];
}
// Read until we've got the entire compressed buffer.
// read(...) will not necessarily block until all
// requested data has been read, so we loop until
// we're done.
int inOffs = 0;
while (inOffs < inLength) {
int inCount = in.read(inBuf, inOffs, inLength - inOffs);
if (inCount == -1) {
throw new EOFException();
}
inOffs += inCount;
}
inflater.setInput(inBuf, 0, inLength);
try {
inflater.inflate(outBuf);
} catch(DataFormatException dfe) {
throw new IOException("Data format exception - " + dfe.getMessage());
}
// Reset the inflator so we can re-use it for the
// next block
inflater.reset();
outOffs = 0;
}
@Override
public int read() throws IOException {
if (outOffs >= outLength) {
try {
readAndDecompress();
}
catch(EOFException eof) {
return -1;
}
}
return outBuf[outOffs++] & 0xff;
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
int count = 0;
while (count < len) {
if (outOffs >= outLength) {
try {
// If we've read at least one decompressed
// byte and further decompression would
// require blocking, return the count.
if ((count > 0) && (in.available() == 0))
return count;
else
readAndDecompress();
} catch(EOFException eof) {
if (count == 0)
count = -1;
return count;
}
}
int toCopy = Math.min(outLength - outOffs, len - count);
System.arraycopy(outBuf, outOffs, b, off + count, toCopy);
outOffs += toCopy;
count += toCopy;
}
return count;
}
@Override
public int available() throws IOException {
// This isn't precise, but should be an adequate
// lower bound on the actual amount of available data
return (outLength - outOffs) + in.available();
}
}
Code that I copied from a source described in the license.
CompressedBlockOutputStream.java
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.zip.Deflater;
/**
* Output stream that compresses data. A compressed block
* is generated and transmitted once a given number of bytes
* have been written, or when the flush method is invoked.
*
* Copyright 2005 - Philip Isenhour - http://javatechniques.com/
*
* This software is provided 'as-is', without any express or
* implied warranty. In no event will the authors be held liable
* for any damages arising from the use of this software.
*
* Permission is granted to anyone to use this software for any
* purpose, including commercial applications, and to alter it and
* redistribute it freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you
* must not claim that you wrote the original software. If you
* use this software in a product, an acknowledgment in the
* product documentation would be appreciated but is not required.
*
* 2. Altered source versions must be plainly marked as such, and
* must not be misrepresented as being the original software.
*
* 3. This notice may not be removed or altered from any source
* distribution.
*
* $Id: 1.1 2005/10/26 17:19:05 isenhour Exp $
*/
public class CompressedBlockOutputStream extends FilterOutputStream {
/**
* Buffer for input data
*/
private byte[] inBuf = null;
/**
* Buffer for compressed data to be written
*/
private byte[] outBuf = null;
/**
* Number of bytes in the buffer
*/
private int len = 0;
/**
* Deflater for compressing data
*/
private Deflater deflater = null;
/**
* Constructs a CompressedBlockOutputStream that writes to
* the given underlying output stream 'os' and sends a compressed
* block once 'size' byte have been written. The default
* compression strategy and level are used.
*/
public CompressedBlockOutputStream(OutputStream os, int size) {
this(os, size, Deflater.DEFAULT_COMPRESSION, Deflater.DEFAULT_STRATEGY);
}
/**
* Constructs a CompressedBlockOutputStream that writes to the
* given underlying output stream 'os' and sends a compressed
* block once 'size' byte have been written. The compression
* level and strategy should be specified using the constants
* defined in java.util.zip.Deflator.
*/
public CompressedBlockOutputStream(OutputStream os, int size, int level, int strategy) {
super(os);
this.inBuf = new byte[size];
this.outBuf = new byte[size + 64];
this.deflater = new Deflater(level);
this.deflater.setStrategy(strategy);
}
protected void compressAndSend() throws IOException {
if (len > 0) {
deflater.setInput(inBuf, 0, len);
deflater.finish();
int size = deflater.deflate(outBuf);
// Write the size of the compressed data, followed
// by the size of the uncompressed data
out.write((size >> 24) & 0xFF);
out.write((size >> 16) & 0xFF);
out.write((size >> 8) & 0xFF);
out.write((size >> 0) & 0xFF);
out.write((len >> 24) & 0xFF);
out.write((len >> 16) & 0xFF);
out.write((len >> 8) & 0xFF);
out.write((len >> 0) & 0xFF);
out.write(outBuf, 0, size);
out.flush();
len = 0;
deflater.reset();
}
}
@Override
public void write(int b) throws IOException {
inBuf[len++] = (byte) b;
if (len == inBuf.length) {
compressAndSend();
}
}
@Override
public void write(byte[] b, int boff, int blen) throws IOException {
while ((len + blen) > inBuf.length) {
int toCopy = inBuf.length - len;
System.arraycopy(b, boff, inBuf, len, toCopy);
len += toCopy;
compressAndSend();
boff += toCopy;
blen -= toCopy;
}
System.arraycopy(b, boff, inBuf, len, blen);
len += blen;
}
@Override
public void flush() throws IOException {
compressAndSend();
out.flush();
}
@Override
public void close() throws IOException {
compressAndSend();
out.close();
}
}
You can replace the ObjectOutputStream and ObjectInputStream with a plain InputStream and OutputStream, or even BufferedInputStream and BufferedOutputStream.
Here is an example:
try (InputStream is = socket.getInputStream()) {
    byte[] b = new byte[2048]; // you can change the buffer's size
    int r;
    while ((r = is.read(b)) != -1) {
        for (OutputStream client : clients) {
            client.write(b, 0, r);
        }
    }
} catch (Exception e) {
    e.printStackTrace();
}
This will send the raw bytes received by the server to all the clients, without decompressing and compressing again.
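This works because CompressedBlockOutputStream writes self-describing blocks (two 4-byte length headers followed by the deflated payload), so relaying the raw bytes verbatim preserves the framing, and each client's CompressedBlockInputStream can still decode the blocks.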

How to split binary data into hex strings when characters are at the start and end of the strings

I want to split data based on character values: two right parentheses )) as the start of a substring and a carriage return CR as the end of the substring. The data comes in the form of bytes. I'm stuck on how to split it. This is what I have come up with so far.
public class ByteDecoder {
public static void main(String[] args) throws IOException {
InputStream is = null;
DataInputStream dis = null;
try{
is = new FileInputStream("byte.log");
dis = new DataInputStream(is);
int count = is.available();
byte[] bs = new byte[count];
dis.read(bs);
for (byte b:bs)
{
char c = (char)b;
System.out.println(c);
//convert bytes to hex string
// String c = DatatypeConverter.printHexBinary( bs);
}
}catch(Exception e){
e.printStackTrace();
}finally{
if(is!=null)
is.close();
if(dis!=null)
dis.close();
}
}
}
CR (unlucky 13) as an end marker of binary data might be a bit dangerous. More dangerous is how the text and bytes were written in the first place: the text must have been written as bytes in some encoding.
But considering that, one could wrap the FileInputStream in your own ByteLogInputStream and hold the reading state there:
/**
* An InputStream converting bytes between ASCII "))" and CR to hexadecimal.
* Typically wrapped as:
* <pre>
* try (BufferedReader in = new BufferedReader(
*          new InputStreamReader(
*              new ByteLogInputStream(
*                  new FileInputStream(file)), "UTF-8"))) {
* ...
* }
* </pre>
*/
public class ByteLogInputStream extends InputStream {
private enum State {
TEXT,
AFTER_RIGHT_PARENT,
BINARY
}
private final InputStream in;
private State state = State.TEXT;
private int nextHexDigit = 0;
public ByteLogInputStream(InputStream in) {
this.in = in;
}
@Override
public int read() throws IOException {
if (nextHexDigit != 0) {
int hex = nextHexDigit;
nextHexDigit = 0;
return hex;
}
int ch = in.read();
if (ch != -1) {
switch (state) {
case TEXT:
if (ch == ')') {
state = State.AFTER_RIGHT_PARENT;
}
break;
case AFTER_RIGHT_PARENT:
if (ch == ')') {
state = State.BINARY;
}
break;
case BINARY:
if (ch == '\r') {
state = State.TEXT;
} else {
String hex2 = String.format("%02X", ch);
ch = hex2.charAt(0);
nextHexDigit = hex2.charAt(1);
}
break;
}
}
return ch;
}
}
As one binary byte results in two hexadecimal digits, you need to buffer nextHexDigit for the second digit.
I did not override available (to account for a possibly pending nextHexDigit).
If you want to check whether \r\n follows, you should use a PushbackReader. I used an InputStream, as you did not specify the encoding.

Unchecked or unsafe operations error in javac

I am completing a lab assignment for school and get this error when I compile. The program runs fine, but I would like to fix what is causing it. The program code and the complete error are below. Thanks as always!
Note: Recompile with -Xlint:unchecked for details.
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package ie.moguntia.webcrawler;
import java.net.*;
import java.io.*;
import java.util.*;
/**
*
* @author Cong
*/
public class SaveURL
{
/**
* Opens a buffered stream on the url and copies the contents to writer
*/
public static void saveURL(URL url, Writer writer)
throws IOException {
BufferedInputStream in = new BufferedInputStream(url.openStream());
for (int c = in.read(); c != -1; c = in.read()) {
writer.write(c);
}
}
/**
* Opens a buffered stream on the url and copies the contents to OutputStream
*/
public static void saveURL(URL url, OutputStream os)
throws IOException {
InputStream is = url.openStream();
byte[] buf = new byte[1048576];
int n = is.read(buf);
while (n != -1) {
os.write(buf, 0, n);
n = is.read(buf);
}
}
/**
* Writes the contents of the url to a string by calling saveURL with a
* string writer as argument
*/
public static String getURL(URL url)
throws IOException {
StringWriter sw = new StringWriter();
saveURL(url, sw);
return sw.toString();
}
/**
* Writes the contents of the url to a new file by calling saveURL with
* a file writer as argument
*/
public static void writeURLtoFile(URL url, String filename)
throws IOException {
// FileWriter writer = new FileWriter(filename);
// saveURL(url, writer);
// writer.close();
FileOutputStream os = new FileOutputStream(filename);
saveURL(url, os);
os.close();
}
/**
* Extract links directly from a URL by calling extractLinks(getURL())
*/
public static Vector extractLinks(URL url)
throws IOException {
return extractLinks(getURL(url));
}
public static Map extractLinksWithText(URL url)
throws IOException {
return extractLinksWithText(getURL(url));
}
/**
* Extract links from a html page given as a raw and a lower case string
* In order to avoid the possible double conversion from mixed to lower case
* a second method is provided, where the conversion is done externally.
*/
public static Vector extractLinks(String rawPage, String page) {
int index = 0;
Vector links = new Vector();
while ((index = page.indexOf("<a ", index)) != -1)
{
if ((index = page.indexOf("href", index)) == -1) break;
if ((index = page.indexOf("=", index)) == -1) break;
String remaining = rawPage.substring(++index);
StringTokenizer st
= new StringTokenizer(remaining, "\t\n\r\"'>#");
String strLink = st.nextToken();
if (! links.contains(strLink)) links.add(strLink);
}
return links;
}
/**
* Extract links (key) with link text (value)
* Note that due to the nature of a Map only one link text is returned per
* URL, even if a link occurs multiple times with different texts.
*/
public static Map extractLinksWithText(String rawPage, String page) {
int index = 0;
Map links = new HashMap();
while ((index = page.indexOf("<a ", index)) != -1)
{
int tagEnd = page.indexOf(">", index);
if ((index = page.indexOf("href", index)) == -1) break;
if ((index = page.indexOf("=", index)) == -1) break;
int endTag = page.indexOf("</a", index);
String remaining = rawPage.substring(++index);
StringTokenizer st
= new StringTokenizer(remaining, "\t\n\r\"'>#");
String strLink = st.nextToken();
String strText = "";
if (tagEnd != -1 && tagEnd + 1 <= endTag) {
strText = rawPage.substring(tagEnd + 1, endTag);
}
strText = strText.replaceAll("\\s+", " ");
links.put(strLink, strText);
}
return links;
}
/**
* Extract links from a html page given as a String
* The return value is a vector of strings. This method does neither check
* the validity of its results nor does it care about html comments, so
* links that are commented out are also retrieved.
*/
public static Vector extractLinks(String rawPage) {
return extractLinks(rawPage, rawPage.toLowerCase().replaceAll("\\s", " "));
}
public static Map extractLinksWithText(String rawPage) {
return extractLinksWithText(rawPage, rawPage.toLowerCase().replaceAll("\\s", " "));
}
/**
* As a standalone program this class is capable of copying a url to a file
*/
public static void main(String[] args) {
try {
if (args.length == 1) {
URL url = new URL(args[0]);
System.out.println("Content-Type: " +
url.openConnection().getContentType());
// Vector links = extractLinks(url);
// for (int n = 0; n < links.size(); n++) {
// System.out.println((String) links.elementAt(n));
// }
Set links = extractLinksWithText(url).entrySet();
Iterator it = links.iterator();
while (it.hasNext()) {
Map.Entry en = (Map.Entry) it.next();
String strLink = (String) en.getKey();
String strText = (String) en.getValue();
System.out.println(strLink + " \"" + strText + "\" ");
}
return;
} else if (args.length == 2) {
writeURLtoFile(new URL(args[0]), args[1]);
return;
}
} catch (Exception e) {
System.err.println("An error occured: ");
e.printStackTrace();
// System.err.println(e.toString());
}
// Display usage information
// (If the program had done anything sensible, we wouldn't be here.)
System.err.println("Usage: java SaveURL <url> [<file>]");
System.err.println("Saves a URL to a file.");
System.err.println("If no file is given, extracts hyperlinks on url to console.");
}
}
You are using the raw (i.e. non-generic) forms of several classes that have generic type parameters, including
Map
HashMap
Vector
Iterator
Set
Map.Entry
Use the generic forms of these classes by supplying appropriate type parameters.
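For example, a sketch of what the generic forms would look like here (the element types are inferred from the casts in main):
Vector<String> links = new Vector<String>();
Map<String, String> linkTexts = new HashMap<String, String>();
for (Map.Entry<String, String> en : linkTexts.entrySet()) {
    String strLink = en.getKey();   // no cast needed
    String strText = en.getValue();
}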
