Optimizing a Java server that is using data compression - java

For a multi-client server program, I'm utilizing a wrapper for java.util.zip.Inflater and Deflater that I found online. It seems that--because I'm frequently transferring large amounts of data in the form of ImageIcons--using these zipping methods speeds up my program significantly.
One thing I noticed however, while trying to optimize my program, is that the server is under heavy cpu load while transferring data among clients. The culprit is the server spending unnecessary cpu time unzipping objects sent by a client and re-zipping them to send it to other clients.
This crude schematic of mine may explain what is happening more clearly:
My question:
How can I send the raw compressed data that a client sends to the server directly to other clients without decompressing and compressing on the server side?
I'm not at all familiar with IO streams (I only code for a hobby) so I am stuck clueless. Anyone got any good resources that cover this area?
Below is the code that I am using on both server and client side to send and receive compressed data.
Creating a compressor
new ObjectOutputStream(
new BufferedOutputStream(
new CompressedBlockOutputStream(
socket.getOutputStream(), 1024)));
Creating a decompressor
new ObjectInputStream(
new BufferedInputStream(
new CompressedBlockInputStream(
socket.getInputStream())));
Code for CompressedBlock(Input/Output)Streams are below
Code that I copied from a source described in the license.
CompressedBlockInputStream.java
import java.io.EOFException;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
/**
* Input stream that decompresses data.
*
* Copyright 2005 - Philip Isenhour - http://javatechniques.com/
*
* This software is provided 'as-is', without any express or
* implied warranty. In no event will the authors be held liable
* for any damages arising from the use of this software.
*
* Permission is granted to anyone to use this software for any
* purpose, including commercial applications, and to alter it and
* redistribute it freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you
* must not claim that you wrote the original software. If you
* use this software in a product, an acknowledgment in the
* product documentation would be appreciated but is not required.
*
* 2. Altered source versions must be plainly marked as such, and
* must not be misrepresented as being the original software.
*
* 3. This notice may not be removed or altered from any source
* distribution.
*
* $Id: 1.2 2005/10/26 17:40:19 isenhour Exp $
*/
public class CompressedBlockInputStream extends FilterInputStream {
/**
* Buffer of compressed data read from the stream
*/
private byte[] inBuf = null;
/**
* Length of data in the input data
*/
private int inLength = 0;
/**
* Buffer of uncompressed data
*/
private byte[] outBuf = null;
/**
* Offset and length of uncompressed data
*/
private int outOffs = 0;
private int outLength = 0;
/**
* Inflater for decompressing
*/
private Inflater inflater = null;
public CompressedBlockInputStream(InputStream is) {
super(is);
inflater = new Inflater();
}
private void readAndDecompress() throws IOException {
// Read the length of the compressed block
int ch1 = in.read();
int ch2 = in.read();
int ch3 = in.read();
int ch4 = in.read();
if ((ch1 | ch2 | ch3 | ch4) < 0)
throw new EOFException();
inLength = ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0));
ch1 = in.read();
ch2 = in.read();
ch3 = in.read();
ch4 = in.read();
if ((ch1 | ch2 | ch3 | ch4) < 0)
throw new EOFException();
outLength = ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0));
// Make sure we've got enough space to read the block
if ((inBuf == null) || (inLength > inBuf.length)) {
inBuf = new byte[inLength];
}
if ((outBuf == null) || (outLength > outBuf.length)) {
outBuf = new byte[outLength];
}
// Read until we're got the entire compressed buffer.
// read(...) will not necessarily block until all
// requested data has been read, so we loop until
// we're done.
int inOffs = 0;
while (inOffs < inLength) {
int inCount = in.read(inBuf, inOffs, inLength - inOffs);
if (inCount == -1) {
throw new EOFException();
}
inOffs += inCount;
}
inflater.setInput(inBuf, 0, inLength);
try {
inflater.inflate(outBuf);
} catch(DataFormatException dfe) {
throw new IOException("Data format exception - " + dfe.getMessage());
}
// Reset the inflator so we can re-use it for the
// next block
inflater.reset();
outOffs = 0;
}
#Override
public int read() throws IOException {
if (outOffs >= outLength) {
try {
readAndDecompress();
}
catch(EOFException eof) {
return -1;
}
}
return outBuf[outOffs++] & 0xff;
}
#Override
public int read(byte[] b, int off, int len) throws IOException {
int count = 0;
while (count < len) {
if (outOffs >= outLength) {
try {
// If we've read at least one decompressed
// byte and further decompression would
// require blocking, return the count.
if ((count > 0) && (in.available() == 0))
return count;
else
readAndDecompress();
} catch(EOFException eof) {
if (count == 0)
count = -1;
return count;
}
}
int toCopy = Math.min(outLength - outOffs, len - count);
System.arraycopy(outBuf, outOffs, b, off + count, toCopy);
outOffs += toCopy;
count += toCopy;
}
return count;
}
#Override
public int available() throws IOException {
// This isn't precise, but should be an adequate
// lower bound on the actual amount of available data
return (outLength - outOffs) + in.available();
}
}
Code that I copied from a source described in the license.
CompressedBlockOutputStream.java
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.zip.Deflater;
/**
* Output stream that compresses data. A compressed block
* is generated and transmitted once a given number of bytes
* have been written, or when the flush method is invoked.
*
* Copyright 2005 - Philip Isenhour - http://javatechniques.com/
*
* This software is provided 'as-is', without any express or
* implied warranty. In no event will the authors be held liable
* for any damages arising from the use of this software.
*
* Permission is granted to anyone to use this software for any
* purpose, including commercial applications, and to alter it and
* redistribute it freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you
* must not claim that you wrote the original software. If you
* use this software in a product, an acknowledgment in the
* product documentation would be appreciated but is not required.
*
* 2. Altered source versions must be plainly marked as such, and
* must not be misrepresented as being the original software.
*
* 3. This notice may not be removed or altered from any source
* distribution.
*
* $Id: 1.1 2005/10/26 17:19:05 isenhour Exp $
*/
public class CompressedBlockOutputStream extends FilterOutputStream {
/**
* Buffer for input data
*/
private byte[] inBuf = null;
/**
* Buffer for compressed data to be written
*/
private byte[] outBuf = null;
/**
* Number of bytes in the buffer
*/
private int len = 0;
/**
* Deflater for compressing data
*/
private Deflater deflater = null;
/**
* Constructs a CompressedBlockOutputStream that writes to
* the given underlying output stream 'os' and sends a compressed
* block once 'size' byte have been written. The default
* compression strategy and level are used.
*/
public CompressedBlockOutputStream(OutputStream os, int size) {
this(os, size, Deflater.DEFAULT_COMPRESSION, Deflater.DEFAULT_STRATEGY);
}
/**
* Constructs a CompressedBlockOutputStream that writes to the
* given underlying output stream 'os' and sends a compressed
* block once 'size' byte have been written. The compression
* level and strategy should be specified using the constants
* defined in java.util.zip.Deflator.
*/
public CompressedBlockOutputStream(OutputStream os, int size, int level, int strategy) {
super(os);
this.inBuf = new byte[size];
this.outBuf = new byte[size + 64];
this.deflater = new Deflater(level);
this.deflater.setStrategy(strategy);
}
protected void compressAndSend() throws IOException {
if (len > 0) {
deflater.setInput(inBuf, 0, len);
deflater.finish();
int size = deflater.deflate(outBuf);
// Write the size of the compressed data, followed
// by the size of the uncompressed data
out.write((size >> 24) & 0xFF);
out.write((size >> 16) & 0xFF);
out.write((size >> 8) & 0xFF);
out.write((size >> 0) & 0xFF);
out.write((len >> 24) & 0xFF);
out.write((len >> 16) & 0xFF);
out.write((len >> 8) & 0xFF);
out.write((len >> 0) & 0xFF);
out.write(outBuf, 0, size);
out.flush();
len = 0;
deflater.reset();
}
}
#Override
public void write(int b) throws IOException {
inBuf[len++] = (byte) b;
if (len == inBuf.length) {
compressAndSend();
}
}
#Override
public void write(byte[] b, int boff, int blen) throws IOException {
while ((len + blen) > inBuf.length) {
int toCopy = inBuf.length - len;
System.arraycopy(b, boff, inBuf, len, toCopy);
len += toCopy;
compressAndSend();
boff += toCopy;
blen -= toCopy;
}
System.arraycopy(b, boff, inBuf, len, blen);
len += blen;
}
#Override
public void flush() throws IOException {
compressAndSend();
out.flush();
}
#Override
public void close() throws IOException {
compressAndSend();
out.close();
}
}

You can replace the ObjectOutputStream and ObjectInputStream with normal InputStream and OutputStream or even BufferedInputStream and BufferedOutputStream
Here is an example:
try(InputStream is = socket.getInputStream()){
byte[] b = new byte[2048];// you can change the buffer's size.
for(int r = 0; (r = is.read(b))!= -1;){
for(OutputStream client : clients){
client.write(b, 0, r);
}
}
}catch(Exception e){
e.printStackTrace();
}
This will send the raw bytes received by the server to all the clients (without decompressing and compressing again)

Related

How to stop receive method when Websocket does not send a response within X amount of time (java)

I am currently trying to speed up some automation tests on a Websocket, and the only way I can cut time is to be able to stop a receive method if no response has been received from the websocket in X amount of time. Currently as it is implemented, my test sends a message through the Websocket and gets a response if a certain ACK parameter is set to true and does not get a response if it is set to false. When the ACK parameter is set to false, my tests are running for about 30 seconds each waiting on a server ping that happens every thirty seconds.
Here is my Websocket implementation:
/**
* The MIT License
*
* Copyright (c) 2009 Adam MacBeth
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.Socket;
import java.net.URI;
import java.util.HashMap;
import java.util.Map.Entry;
import javax.net.SocketFactory;
import javax.net.ssl.SSLSocketFactory;
/**
* An implementation of a WebSocket protocol client.
*/
public class WebSocket {
/** The URL. */
private URI mUrl;
/** The socket. */
private Socket mSocket;
/** Whether the handshake is complete. */
private boolean mHandshakeComplete;
/** The socket input stream. */
private InputStream mInput;
/** The socket mOutput stream. */
private OutputStream mOutput;
/** The external headers. */
private HashMap<String, String> mHeaders;
/**
* Creates a new WebSocket targeting the specified URL.
* #param url The URL for the socket.
*/
public WebSocket(URI url) {
mUrl = url;
String protocol = mUrl.getScheme();
if (!protocol.equals("ws") && !protocol.equals("wss")) {
throw new IllegalArgumentException("Unsupported protocol: " + protocol);
}
}
/**
* Sets extra headers to be sent.
* #param headers A hash of header name-values.
*/
public void setHeaders(HashMap<String, String> headers) {
mHeaders = headers;
}
/**
* Returns the underlying socket;
*/
public Socket getSocket() {
return mSocket;
}
/**
* Establishes the connection.
*/
public void connect() throws java.io.IOException {
String host = mUrl.getHost();
String path = mUrl.getPath();
if (path.equals("")) {
path = "/";
}
String query = mUrl.getQuery();
if (query != null) {
path = path + "?" + query;
}
String origin = "http://" + host;
mSocket = createSocket();
int port = mSocket.getPort();
if (port != 80) {
host = host + ":" + port;
}
mOutput = mSocket.getOutputStream();
StringBuffer extraHeaders = new StringBuffer();
if (mHeaders != null) {
for (Entry<String, String> entry : mHeaders.entrySet()) {
extraHeaders.append(entry.getKey() + ": " + entry.getValue() + "\r\n");
}
}
String request = "GET "+path+" HTTP/1.1\r\n" +
"Upgrade: WebSocket\r\n" +
"Connection: Upgrade\r\n" +
"Host: "+host+"\r\n" +
"Origin: "+origin+"\r\n" +
extraHeaders.toString() +
"\r\n";
mOutput.write(request.getBytes());
mOutput.flush();
mInput = mSocket.getInputStream();
BufferedReader reader = new BufferedReader(new InputStreamReader(mInput));
String header = reader.readLine();
if (!header.equals("HTTP/1.1 101 Web Socket Protocol Handshake")) {
throw new IOException("Invalid handshake response");
}
header = reader.readLine();
if (!header.equals("Upgrade: WebSocket")) {
throw new IOException("Invalid handshake response");
}
header = reader.readLine();
if (!header.equals("Connection: Upgrade")) {
throw new IOException("Invalid handshake response");
}
do {
header = reader.readLine();
} while (!header.equals(""));
mHandshakeComplete = true;
}
private Socket createSocket() throws java.io.IOException {
String scheme = mUrl.getScheme();
String host = mUrl.getHost();
int port = mUrl.getPort();
if (port == -1) {
if (scheme.equals("wss")) {
port = 443;
} else if (scheme.equals("ws")) {
port = 80;
} else {
throw new IllegalArgumentException("Unsupported scheme");
}
}
if (scheme.equals("wss")) {
SocketFactory factory = SSLSocketFactory.getDefault();
return factory.createSocket(host, port);
} else {
return new Socket(host, port);
}
}
/**
* Sends the specified string as a data frame.
* #param str The string to send.
* #throws java.io.IOException
*/
public void send(String str) throws java.io.IOException {
if (!mHandshakeComplete) {
throw new IllegalStateException("Handshake not complete");
}
mOutput.write(0x00);
mOutput.write(str.getBytes("UTF-8"));
mOutput.write(0xff);
mOutput.flush();
}
/**
* Receives the next data frame.
* #return The received data.
* #throws java.io.IOException
*/
public String recv() throws java.io.IOException {
if (!mHandshakeComplete) {
throw new IllegalStateException("Handshake not complete");
}
StringBuffer buf = new StringBuffer();
int b = mInput.read();
if ((b & 0x80) == 0x80) {
// Skip data frame
int len = 0;
do {
b = mInput.read() & 0x7f;
len = len * 128 + b;
} while ((b & 0x80) != 0x80);
for (int i = 0; i < len; i++) {
mInput.read();
}
}
while (true) {
b = mInput.read();
if (b == 0xff) {
break;
}
buf.append((char)b);
}
return new String(buf.toString().getBytes(), "UTF8");
}
/**
* Closes the socket.
* #throws java.io.IOException
*/
public void close() throws java.io.IOException {
mInput.close();
mOutput.close();
mSocket.close();
}
}
And here is the receive method I want to be able to speed up:
public String recv() throws java.io.IOException {
if (!mHandshakeComplete) {
throw new IllegalStateException("Handshake not complete");
}
StringBuffer buf = new StringBuffer();
int b = mInput.read();
if ((b & 0x80) == 0x80) {
// Skip data frame
int len = 0;
do {
b = mInput.read() & 0x7f;
len = len * 128 + b;
} while ((b & 0x80) != 0x80);
for (int i = 0; i < len; i++) {
mInput.read();
}
}
while (true) {
b = mInput.read();
if (b == 0xff) {
break;
}
buf.append((char)b);
}
return new String(buf.toString().getBytes(), "UTF8");
}
you can use the setSoTimeout(int timeout); function on your socket object. when the time expires a SocketTimeoutException is thrown.
here is the oracle api link: http://docs.oracle.com/javase/7/docs/api/java/net/Socket.html#setSoTimeout(int)

Java Read Large Text File With 70million line of text

I have a big test file with 70 million lines of text.
I have to read the file line by line.
I used two different approaches:
InputStreamReader isr = new InputStreamReader(new FileInputStream(FilePath),"unicode");
BufferedReader br = new BufferedReader(isr);
while((cur=br.readLine()) != null);
and
LineIterator it = FileUtils.lineIterator(new File(FilePath), "unicode");
while(it.hasNext()) cur=it.nextLine();
Is there another approach that can make this task faster?
1) I am sure there is no difference speedwise, both use FileInputStream internally and buffering
2) You can take measurements and see for yourself
3) Though there's no performance benefits I like the 1.7 approach
try (BufferedReader br = Files.newBufferedReader(Paths.get("test.txt"), StandardCharsets.UTF_8)) {
for (String line = null; (line = br.readLine()) != null;) {
//
}
}
4) Scanner based version
try (Scanner sc = new Scanner(new File("test.txt"), "UTF-8")) {
while (sc.hasNextLine()) {
String line = sc.nextLine();
}
// note that Scanner suppresses exceptions
if (sc.ioException() != null) {
throw sc.ioException();
}
}
5) This may be faster than the rest
try (SeekableByteChannel ch = Files.newByteChannel(Paths.get("test.txt"))) {
ByteBuffer bb = ByteBuffer.allocateDirect(1000);
for(;;) {
StringBuilder line = new StringBuilder();
int n = ch.read(bb);
// add chars to line
// ...
}
}
it requires a bit of coding but it can be really faster because of ByteBuffer.allocateDirect. It allows OS to read bytes from file to ByteBuffer directly, without copying
6) Parallel processing would definitely increase speed. Make a big byte buffer, run several tasks that read bytes from file into that buffer in parallel, when ready find first end of line, make a String, find next...
If you are looking out at performance, you could have a look at the java.nio.* packages - those are supposedly faster than java.io.*
In Java 8, for anyone looking now to read file large files line by line,
Stream<String> lines = Files.lines(Paths.get("c:\myfile.txt"));
lines.forEach(l -> {
// Do anything line by line
});
I actually did a research in this topic for months in my free time and came up with a benchmark and here is a code to benchmark all the different ways to read a File line by line.The individual performance may vary based on the underlying system.
I ran on a windows 10 Java 8 Intel i5 HP laptop:Here is the code.
import java.io.*;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Pattern;
import java.util.stream.Stream;
public class ReadComplexDelimitedFile {
private static long total = 0;
private static final Pattern FIELD_DELIMITER_PATTERN = Pattern.compile("\\^\\|\\^");
#SuppressWarnings("unused")
private void readFileUsingScanner() {
String s;
try (Scanner stdin = new Scanner(new File(this.getClass().getResource("input.txt").getPath()))) {
while (stdin.hasNextLine()) {
s = stdin.nextLine();
String[] fields = FIELD_DELIMITER_PATTERN.split(s, 0);
total = total + fields.length;
}
} catch (Exception e) {
System.err.println("Error");
}
}
//Winner
private void readFileUsingCustomBufferedReader() {
try (CustomBufferedReader stdin = new CustomBufferedReader(new FileReader(new File(this.getClass().getResource("input.txt").getPath())))) {
String s;
while ((s = stdin.readLine()) != null) {
String[] fields = FIELD_DELIMITER_PATTERN.split(s, 0);
total += fields.length;
}
} catch (Exception e) {
System.err.println("Error");
}
}
private void readFileUsingBufferedReader() {
try (BufferedReader stdin = new BufferedReader(new FileReader(new File(this.getClass().getResource("input.txt").getPath())))) {
String s;
while ((s = stdin.readLine()) != null) {
String[] fields = FIELD_DELIMITER_PATTERN.split(s, 0);
total += fields.length;
}
} catch (Exception e) {
System.err.println("Error");
}
}
private void readFileUsingLineReader() {
try (LineNumberReader stdin = new LineNumberReader(new FileReader(new File(this.getClass().getResource("input.txt").getPath())))) {
String s;
while ((s = stdin.readLine()) != null) {
String[] fields = FIELD_DELIMITER_PATTERN.split(s, 0);
total += fields.length;
}
} catch (Exception e) {
System.err.println("Error");
}
}
private void readFileUsingStreams() {
try (Stream<String> stream = Files.lines((new File(this.getClass().getResource("input.txt").getPath())).toPath())) {
total += stream.mapToInt(s -> FIELD_DELIMITER_PATTERN.split(s, 0).length).sum();
} catch (IOException e1) {
e1.printStackTrace();
}
}
private void readFileUsingBufferedReaderFileChannel() {
try (FileInputStream fis = new FileInputStream(this.getClass().getResource("input.txt").getPath())) {
try (FileChannel inputChannel = fis.getChannel()) {
try (CustomBufferedReader stdin = new CustomBufferedReader(Channels.newReader(inputChannel, "UTF-8"))) {
String s;
while ((s = stdin.readLine()) != null) {
String[] fields = FIELD_DELIMITER_PATTERN.split(s, 0);
total = total + fields.length;
}
}
} catch (Exception e) {
System.err.println("Error");
}
} catch (Exception e) {
System.err.println("Error");
}
}
public static void main(String args[]) {
//JVM wamrup
for (int i = 0; i < 100000; i++) {
total += i;
}
// We know scanner is slow-Still warming up
ReadComplexDelimitedFile readComplexDelimitedFile = new ReadComplexDelimitedFile();
List<Long> longList = new ArrayList<>(50);
for (int i = 0; i < 50; i++) {
total = 0;
long startTime = System.nanoTime();
//readComplexDelimitedFile.readFileUsingScanner();
long stopTime = System.nanoTime();
long timeDifference = stopTime - startTime;
longList.add(timeDifference);
}
System.out.println("Time taken for readFileUsingScanner");
longList.forEach(System.out::println);
// Actual performance test starts here
longList = new ArrayList<>(10);
for (int i = 0; i < 10; i++) {
total = 0;
long startTime = System.nanoTime();
readComplexDelimitedFile.readFileUsingBufferedReaderFileChannel();
long stopTime = System.nanoTime();
long timeDifference = stopTime - startTime;
longList.add(timeDifference);
}
System.out.println("Time taken for readFileUsingBufferedReaderFileChannel");
longList.forEach(System.out::println);
longList.clear();
for (int i = 0; i < 10; i++) {
total = 0;
long startTime = System.nanoTime();
readComplexDelimitedFile.readFileUsingBufferedReader();
long stopTime = System.nanoTime();
long timeDifference = stopTime - startTime;
longList.add(timeDifference);
}
System.out.println("Time taken for readFileUsingBufferedReader");
longList.forEach(System.out::println);
longList.clear();
for (int i = 0; i < 10; i++) {
total = 0;
long startTime = System.nanoTime();
readComplexDelimitedFile.readFileUsingStreams();
long stopTime = System.nanoTime();
long timeDifference = stopTime - startTime;
longList.add(timeDifference);
}
System.out.println("Time taken for readFileUsingStreams");
longList.forEach(System.out::println);
longList.clear();
for (int i = 0; i < 10; i++) {
total = 0;
long startTime = System.nanoTime();
readComplexDelimitedFile.readFileUsingCustomBufferedReader();
long stopTime = System.nanoTime();
long timeDifference = stopTime - startTime;
longList.add(timeDifference);
}
System.out.println("Time taken for readFileUsingCustomBufferedReader");
longList.forEach(System.out::println);
longList.clear();
for (int i = 0; i < 10; i++) {
total = 0;
long startTime = System.nanoTime();
readComplexDelimitedFile.readFileUsingLineReader();
long stopTime = System.nanoTime();
long timeDifference = stopTime - startTime;
longList.add(timeDifference);
}
System.out.println("Time taken for readFileUsingLineReader");
longList.forEach(System.out::println);
}
}
I had to rewrite BufferedReader to avoid synchronized and a couple of boundary conditions that is not needed.(Atleast that's what I felt.It is not unit tested so use it at your own risk.)
import com.sun.istack.internal.NotNull;
import java.io.*;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
/**
* Reads text from a character-input stream, buffering characters so as to
* provide for the efficient reading of characters, arrays, and lines.
* <p>
* <p> The buffer size may be specified, or the default size may be used. The
* default is large enough for most purposes.
* <p>
* <p> In general, each read request made of a Reader causes a corresponding
* read request to be made of the underlying character or byte stream. It is
* therefore advisable to wrap a CustomBufferedReader around any Reader whose read()
* operations may be costly, such as FileReaders and InputStreamReaders. For
* example,
* <p>
* <pre>
* CustomBufferedReader in
* = new CustomBufferedReader(new FileReader("foo.in"));
* </pre>
* <p>
* will buffer the input from the specified file. Without buffering, each
* invocation of read() or readLine() could cause bytes to be read from the
* file, converted into characters, and then returned, which can be very
* inefficient.
* <p>
* <p> Programs that use DataInputStreams for textual input can be localized by
* replacing each DataInputStream with an appropriate CustomBufferedReader.
*
* #author Mark Reinhold
* #see FileReader
* #see InputStreamReader
* #see java.nio.file.Files#newBufferedReader
* #since JDK1.1
*/
public class CustomBufferedReader extends Reader {
private final Reader in;
private char cb[];
private int nChars, nextChar;
private static final int INVALIDATED = -2;
private static final int UNMARKED = -1;
private int markedChar = UNMARKED;
private int readAheadLimit = 0; /* Valid only when markedChar > 0 */
/**
* If the next character is a line feed, skip it
*/
private boolean skipLF = false;
/**
* The skipLF flag when the mark was set
*/
private boolean markedSkipLF = false;
private static int defaultCharBufferSize = 8192;
private static int defaultExpectedLineLength = 80;
private ReadWriteLock rwlock;
/**
* Creates a buffering character-input stream that uses an input buffer of
* the specified size.
*
* #param in A Reader
* #param sz Input-buffer size
* #throws IllegalArgumentException If {#code sz <= 0}
*/
public CustomBufferedReader(#NotNull final Reader in, int sz) {
super(in);
if (sz <= 0)
throw new IllegalArgumentException("Buffer size <= 0");
this.in = in;
cb = new char[sz];
nextChar = nChars = 0;
rwlock = new ReentrantReadWriteLock();
}
/**
* Creates a buffering character-input stream that uses a default-sized
* input buffer.
*
* #param in A Reader
*/
public CustomBufferedReader(#NotNull final Reader in) {
this(in, defaultCharBufferSize);
}
/**
* Fills the input buffer, taking the mark into account if it is valid.
*/
private void fill() throws IOException {
int dst;
if (markedChar <= UNMARKED) {
/* No mark */
dst = 0;
} else {
/* Marked */
int delta = nextChar - markedChar;
if (delta >= readAheadLimit) {
/* Gone past read-ahead limit: Invalidate mark */
markedChar = INVALIDATED;
readAheadLimit = 0;
dst = 0;
} else {
if (readAheadLimit <= cb.length) {
/* Shuffle in the current buffer */
System.arraycopy(cb, markedChar, cb, 0, delta);
markedChar = 0;
dst = delta;
} else {
/* Reallocate buffer to accommodate read-ahead limit */
char ncb[] = new char[readAheadLimit];
System.arraycopy(cb, markedChar, ncb, 0, delta);
cb = ncb;
markedChar = 0;
dst = delta;
}
nextChar = nChars = delta;
}
}
int n;
do {
n = in.read(cb, dst, cb.length - dst);
} while (n == 0);
if (n > 0) {
nChars = dst + n;
nextChar = dst;
}
}
/**
* Reads a single character.
*
* #return The character read, as an integer in the range
* 0 to 65535 (<tt>0x00-0xffff</tt>), or -1 if the
* end of the stream has been reached
* #throws IOException If an I/O error occurs
*/
public char readChar() throws IOException {
for (; ; ) {
if (nextChar >= nChars) {
fill();
if (nextChar >= nChars)
return (char) -1;
}
return cb[nextChar++];
}
}
/**
* Reads characters into a portion of an array, reading from the underlying
* stream if necessary.
*/
private int read1(char[] cbuf, int off, int len) throws IOException {
if (nextChar >= nChars) {
/* If the requested length is at least as large as the buffer, and
if there is no mark/reset activity, and if line feeds are not
being skipped, do not bother to copy the characters into the
local buffer. In this way buffered streams will cascade
harmlessly. */
if (len >= cb.length && markedChar <= UNMARKED && !skipLF) {
return in.read(cbuf, off, len);
}
fill();
}
if (nextChar >= nChars) return -1;
int n = Math.min(len, nChars - nextChar);
System.arraycopy(cb, nextChar, cbuf, off, n);
nextChar += n;
return n;
}
/**
* Reads characters into a portion of an array.
* <p>
* <p> This method implements the general contract of the corresponding
* <code>{#link Reader#read(char[], int, int) read}</code> method of the
* <code>{#link Reader}</code> class. As an additional convenience, it
* attempts to read as many characters as possible by repeatedly invoking
* the <code>read</code> method of the underlying stream. This iterated
* <code>read</code> continues until one of the following conditions becomes
* true: <ul>
* <p>
* <li> The specified number of characters have been read,
* <p>
* <li> The <code>read</code> method of the underlying stream returns
* <code>-1</code>, indicating end-of-file, or
* <p>
* <li> The <code>ready</code> method of the underlying stream
* returns <code>false</code>, indicating that further input requests
* would block.
* <p>
* </ul> If the first <code>read</code> on the underlying stream returns
* <code>-1</code> to indicate end-of-file then this method returns
* <code>-1</code>. Otherwise this method returns the number of characters
* actually read.
* <p>
* <p> Subclasses of this class are encouraged, but not required, to
* attempt to read as many characters as possible in the same fashion.
* <p>
* <p> Ordinarily this method takes characters from this stream's character
* buffer, filling it from the underlying stream as necessary. If,
* however, the buffer is empty, the mark is not valid, and the requested
* length is at least as large as the buffer, then this method will read
* characters directly from the underlying stream into the given array.
* Thus redundant <code>CustomBufferedReader</code>s will not copy data
* unnecessarily.
*
* #param cbuf Destination buffer
* #param off Offset at which to start storing characters
* #param len Maximum number of characters to read
* #return The number of characters read, or -1 if the end of the
* stream has been reached
* #throws IOException If an I/O error occurs
*/
public int read(char cbuf[], int off, int len) throws IOException {
int n = read1(cbuf, off, len);
if (n <= 0) return n;
while ((n < len) && in.ready()) {
int n1 = read1(cbuf, off + n, len - n);
if (n1 <= 0) break;
n += n1;
}
return n;
}
/**
* Reads a line of text. A line is considered to be terminated by any one
* of a line feed ('\n'), a carriage return ('\r'), or a carriage return
* followed immediately by a linefeed.
*
* #param ignoreLF If true, the next '\n' will be skipped
* #return A String containing the contents of the line, not including
* any line-termination characters, or null if the end of the
* stream has been reached
* #throws IOException If an I/O error occurs
* #see java.io.LineNumberReader#readLine()
*/
String readLine(boolean ignoreLF) throws IOException {
StringBuilder s = null;
int startChar;
bufferLoop:
for (; ; ) {
if (nextChar >= nChars)
fill();
if (nextChar >= nChars) { /* EOF */
if (s != null && s.length() > 0)
return s.toString();
else
return null;
}
boolean eol = false;
char c = 0;
int i;
/* Skip a leftover '\n', if necessary */
charLoop:
for (i = nextChar; i < nChars; i++) {
c = cb[i];
if ((c == '\n')) {
eol = true;
break charLoop;
}
}
startChar = nextChar;
nextChar = i;
if (eol) {
String str;
if (s == null) {
str = new String(cb, startChar, i - startChar);
} else {
s.append(cb, startChar, i - startChar);
str = s.toString();
}
nextChar++;
return str;
}
if (s == null)
s = new StringBuilder(defaultExpectedLineLength);
s.append(cb, startChar, i - startChar);
}
}
/**
* Reads a line of text. A line is considered to be terminated by any one
* of a line feed ('\n'), a carriage return ('\r'), or a carriage return
* followed immediately by a linefeed.
*
* #return A String containing the contents of the line, not including
* any line-termination characters, or null if the end of the
* stream has been reached
* #throws IOException If an I/O error occurs
* #see java.nio.file.Files#readAllLines
*/
public String readLine() throws IOException {
return readLine(false);
}
/**
* Skips characters.
*
* #param n The number of characters to skip
* #return The number of characters actually skipped
* #throws IllegalArgumentException If <code>n</code> is negative.
* #throws IOException If an I/O error occurs
*/
public long skip(long n) throws IOException {
if (n < 0L) {
throw new IllegalArgumentException("skip value is negative");
}
rwlock.readLock().lock();
long r = n;
try{
while (r > 0) {
if (nextChar >= nChars)
fill();
if (nextChar >= nChars) /* EOF */
break;
if (skipLF) {
skipLF = false;
if (cb[nextChar] == '\n') {
nextChar++;
}
}
long d = nChars - nextChar;
if (r <= d) {
nextChar += r;
r = 0;
break;
} else {
r -= d;
nextChar = nChars;
}
}
} finally {
rwlock.readLock().unlock();
}
return n - r;
}
/**
* Tells whether this stream is ready to be read. A buffered character
* stream is ready if the buffer is not empty, or if the underlying
* character stream is ready.
*
* #throws IOException If an I/O error occurs
*/
public boolean ready() throws IOException {
rwlock.readLock().lock();
try {
/*
* If newline needs to be skipped and the next char to be read
* is a newline character, then just skip it right away.
*/
if (skipLF) {
/* Note that in.ready() will return true if and only if the next
* read on the stream will not block.
*/
if (nextChar >= nChars && in.ready()) {
fill();
}
if (nextChar < nChars) {
if (cb[nextChar] == '\n')
nextChar++;
skipLF = false;
}
}
} finally {
rwlock.readLock().unlock();
}
return (nextChar < nChars) || in.ready();
}
/**
* Tells whether this stream supports the mark() operation, which it does.
*/
public boolean markSupported() {
return true;
}
/**
* Marks the present position in the stream. Subsequent calls to reset()
* will attempt to reposition the stream to this point.
*
* #param readAheadLimit Limit on the number of characters that may be
* read while still preserving the mark. An attempt
* to reset the stream after reading characters
* up to this limit or beyond may fail.
* A limit value larger than the size of the input
* buffer will cause a new buffer to be allocated
* whose size is no smaller than limit.
* Therefore large values should be used with care.
* #throws IllegalArgumentException If {#code readAheadLimit < 0}
* #throws IOException If an I/O error occurs
*/
public void mark(int readAheadLimit) throws IOException {
if (readAheadLimit < 0) {
throw new IllegalArgumentException("Read-ahead limit < 0");
}
rwlock.readLock().lock();
try {
this.readAheadLimit = readAheadLimit;
markedChar = nextChar;
markedSkipLF = skipLF;
} finally {
rwlock.readLock().unlock();
}
}
/**
* Resets the stream to the most recent mark.
*
* #throws IOException If the stream has never been marked,
* or if the mark has been invalidated
*/
public void reset() throws IOException {
rwlock.readLock().lock();
try {
if (markedChar < 0)
throw new IOException((markedChar == INVALIDATED)
? "Mark invalid"
: "Stream not marked");
nextChar = markedChar;
skipLF = markedSkipLF;
} finally {
rwlock.readLock().unlock();
}
}
public void close() throws IOException {
rwlock.readLock().lock();
try {
in.close();
} finally {
cb = null;
rwlock.readLock().unlock();
}
}
public Stream<String> lines() {
Iterator<String> iter = new Iterator<String>() {
String nextLine = null;
#Override
public boolean hasNext() {
if (nextLine != null) {
return true;
} else {
try {
nextLine = readLine();
return (nextLine != null);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
}
#Override
public String next() {
if (nextLine != null || hasNext()) {
String line = nextLine;
nextLine = null;
return line;
} else {
throw new NoSuchElementException();
}
}
};
return StreamSupport.stream(Spliterators.spliteratorUnknownSize(
iter, Spliterator.ORDERED | Spliterator.NONNULL), false);
}
}
And now the results:
Time taken for readFileUsingBufferedReaderFileChannel
2902690903
1845190694
1894071377
1815161868
1861056735
1867693540
1857521371
1794176251
1768008762
1853089582
Time taken for readFileUsingBufferedReader
2022837353
1925901163
1802266711
1842689572
1899984555
1843101306
1998642345
1821242301
1820168806
1830375108
Time taken for readFileUsingStreams
1992855461
1930827034
1850876033
1843402533
1800378283
1863581324
1810857226
1798497108
1809531144
1796345853
Time taken for readFileUsingCustomBufferedReader
1759732702
1765987214
1776997357
1772999486
1768559162
1755248431
1744434555
1750349867
1740582606
1751390934
Time taken for readFileUsingLineReader
1845307174
1830950256
1829847321
1828125293
1827936280
1836947487
1832186310
1820276327
1830157935
1829171481
Process finished with exit code 0
Inference:
The test was run on a 200 MB file.
The test was repeated several times.
The data looked like this
Start Date^|^Start Time^|^End Date^|^End Time^|^Event Title ^|^All Day Event^|^No End Time^|^Event Description^|^Contact ^|^Contact Email^|^Contact Phone^|^Location^|^Category^|^Mandatory^|^Registration^|^Maximum^|^Last Date To Register
9/5/2011^|^3:00:00 PM^|^9/5/2011^|^^|^Social Studies Dept. Meeting^|^N^|^Y^|^Department meeting^|^Chris Gallagher^|^cgallagher#schoolwires.com^|^814-555-5179^|^High School^|^2^|^N^|^N^|^25^|^9/2/2011
Bottomline not much difference between BufferedReader and my CustomReader and it is very miniscule and hence you can use this to read your file.
Trust me you don't have to break your head.use BufferedReader with readLine,it is properly tested.At worst if you feel you can improve it just override and change it to StringBuilder instead of StringBuffer just to shave off half a second
I had a similar problem, but I only needed the bytes from the file. I read through links provided in the various answers, and ultimately tried writing one similar to #5 in Evgeniy's answer. They weren't kidding, it took a lot of code.
The basic premise is that each line of text is of unknown length. I will start with a SeekableByteChannel, read data into a ByteBuffer, then loop over it looking for EOL. When something is a "carryover" between loops, it increments a counter and then ultimately moves the SeekableByteChannel position around and reads the entire buffer.
It is verbose ... but it works. It was plenty fast for what I needed, but I'm sure there are more improvements that can be made.
The process method is stripped down to the basics for kicking off reading the file.
private long startOffset;
private long endOffset;
private SeekableByteChannel sbc;
private final ByteBuffer buffer = ByteBuffer.allocateDirect(1024);
public void process() throws IOException
{
startOffset = 0;
sbc = Files.newByteChannel(FILE, EnumSet.of(READ));
byte[] message = null;
while((message = readRecord()) != null)
{
// do something
}
}
public byte[] readRecord() throws IOException
{
endOffset = startOffset;
boolean eol = false;
boolean carryOver = false;
byte[] record = null;
while(!eol)
{
byte data;
buffer.clear();
final int bytesRead = sbc.read(buffer);
if(bytesRead == -1)
{
return null;
}
buffer.flip();
for(int i = 0; i < bytesRead && !eol; i++)
{
data = buffer.get();
if(data == '\r' || data == '\n')
{
eol = true;
endOffset += i;
if(carryOver)
{
final int messageSize = (int)(endOffset - startOffset);
sbc.position(startOffset);
final ByteBuffer tempBuffer = ByteBuffer.allocateDirect(messageSize);
sbc.read(tempBuffer);
tempBuffer.flip();
record = new byte[messageSize];
tempBuffer.get(record);
}
else
{
record = new byte[i];
// Need to move the buffer position back since the get moved it forward
buffer.position(0);
buffer.get(record, 0, i);
}
// Skip past the newline characters
if(isWindowsOS())
{
startOffset = (endOffset + 2);
}
else
{
startOffset = (endOffset + 1);
}
// Move the file position back
sbc.position(startOffset);
}
}
if(!eol && sbc.position() == sbc.size())
{
// We have hit the end of the file, just take all the bytes
record = new byte[bytesRead];
eol = true;
buffer.position(0);
buffer.get(record, 0, bytesRead);
}
else if(!eol)
{
// The EOL marker wasn't found, continue the loop
carryOver = true;
endOffset += bytesRead;
}
}
// System.out.println(new String(record));
return record;
}
This article is a great way to start.
Also, you need to create test cases in which you read first 10k(or something else, but shouldn't be too small) lines and calculate the reading times accordingly.
Threading might be a good way to go, but it's important that we know what you will be doing with the data.
Another thing to be considered is, how you will store that size of data.
I tried the following three methods, my file size is 1M, and I got results:
I run the program several times it looks that BufferedReader is faster.
#Test
public void testLargeFileIO_Scanner() throws Exception {
long start = new Date().getTime();
String fileName = "/Downloads/SampleTextFile_1000kb.txt"; //this path is on my local
InputStream inputStream = new FileInputStream(fileName);
try (Scanner fileScanner = new Scanner(inputStream, StandardCharsets.UTF_8.name())) {
while (fileScanner.hasNextLine()) {
String line = fileScanner.nextLine();
//System.out.println(line);
}
}
long end = new Date().getTime();
long time = end - start;
System.out.println("Scanner Time Consumed => " + time);
}
#Test
public void testLargeFileIO_BufferedReader() throws Exception {
long start = new Date().getTime();
String fileName = "/Downloads/SampleTextFile_1000kb.txt"; //this path is on my local
try (BufferedReader fileBufferReader = new BufferedReader(new FileReader(fileName))) {
String fileLineContent;
while ((fileLineContent = fileBufferReader.readLine()) != null) {
//System.out.println(fileLineContent);
}
}
long end = new Date().getTime();
long time = (long) (end - start);
System.out.println("BufferedReader Time Consumed => " + time);
}
#Test
public void testLargeFileIO_Stream() throws Exception {
long start = new Date().getTime();
String fileName = "/Downloads/SampleTextFile_1000kb.txt"; //this path is on my local
try (Stream inputStream = Files.lines(Paths.get(fileName), StandardCharsets.UTF_8)) {
//inputStream.forEach(System.out::println);
}
long end = new Date().getTime();
long time = end - start;
System.out.println("Stream Time Consumed => " + time);
}

How to deal with BOM in InputStream [duplicate]

I'm trying to read CSV files using Java. Some of the files may have a byte order mark in the beginning, but not all. When present, the byte order gets read along with the rest of the first line, thus causing problems with string compares.
Is there an easy way to skip the byte order mark when it is present?
EDIT: I've made a proper release on GitHub: https://github.com/gpakosz/UnicodeBOMInputStream
Here is a class I coded a while ago, I just edited the package name before pasting. Nothing special, it is quite similar to solutions posted in SUN's bug database. Incorporate it in your code and you're fine.
/* ____________________________________________________________________________
*
* File: UnicodeBOMInputStream.java
* Author: Gregory Pakosz.
* Date: 02 - November - 2005
* ____________________________________________________________________________
*/
package com.stackoverflow.answer;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
/**
* The <code>UnicodeBOMInputStream</code> class wraps any
* <code>InputStream</code> and detects the presence of any Unicode BOM
* (Byte Order Mark) at its beginning, as defined by
* RFC 3629 - UTF-8, a transformation format of ISO 10646
*
* <p>The
* Unicode FAQ
* defines 5 types of BOMs:<ul>
* <li><pre>00 00 FE FF = UTF-32, big-endian</pre></li>
* <li><pre>FF FE 00 00 = UTF-32, little-endian</pre></li>
* <li><pre>FE FF = UTF-16, big-endian</pre></li>
* <li><pre>FF FE = UTF-16, little-endian</pre></li>
* <li><pre>EF BB BF = UTF-8</pre></li>
* </ul></p>
*
* <p>Use the {#link #getBOM()} method to know whether a BOM has been detected
* or not.
* </p>
* <p>Use the {#link #skipBOM()} method to remove the detected BOM from the
* wrapped <code>InputStream</code> object.</p>
*/
public class UnicodeBOMInputStream extends InputStream
{
/**
* Type safe enumeration class that describes the different types of Unicode
* BOMs.
*/
public static final class BOM
{
/**
* NONE.
*/
public static final BOM NONE = new BOM(new byte[]{},"NONE");
/**
* UTF-8 BOM (EF BB BF).
*/
public static final BOM UTF_8 = new BOM(new byte[]{(byte)0xEF,
(byte)0xBB,
(byte)0xBF},
"UTF-8");
/**
* UTF-16, little-endian (FF FE).
*/
public static final BOM UTF_16_LE = new BOM(new byte[]{ (byte)0xFF,
(byte)0xFE},
"UTF-16 little-endian");
/**
* UTF-16, big-endian (FE FF).
*/
public static final BOM UTF_16_BE = new BOM(new byte[]{ (byte)0xFE,
(byte)0xFF},
"UTF-16 big-endian");
/**
* UTF-32, little-endian (FF FE 00 00).
*/
public static final BOM UTF_32_LE = new BOM(new byte[]{ (byte)0xFF,
(byte)0xFE,
(byte)0x00,
(byte)0x00},
"UTF-32 little-endian");
/**
* UTF-32, big-endian (00 00 FE FF).
*/
public static final BOM UTF_32_BE = new BOM(new byte[]{ (byte)0x00,
(byte)0x00,
(byte)0xFE,
(byte)0xFF},
"UTF-32 big-endian");
/**
* Returns a <code>String</code> representation of this <code>BOM</code>
* value.
*/
public final String toString()
{
return description;
}
/**
* Returns the bytes corresponding to this <code>BOM</code> value.
*/
public final byte[] getBytes()
{
final int length = bytes.length;
final byte[] result = new byte[length];
// Make a defensive copy
System.arraycopy(bytes,0,result,0,length);
return result;
}
private BOM(final byte bom[], final String description)
{
assert(bom != null) : "invalid BOM: null is not allowed";
assert(description != null) : "invalid description: null is not allowed";
assert(description.length() != 0) : "invalid description: empty string is not allowed";
this.bytes = bom;
this.description = description;
}
final byte bytes[];
private final String description;
} // BOM
/**
* Constructs a new <code>UnicodeBOMInputStream</code> that wraps the
* specified <code>InputStream</code>.
*
* #param inputStream an <code>InputStream</code>.
*
* #throws NullPointerException when <code>inputStream</code> is
* <code>null</code>.
* #throws IOException on reading from the specified <code>InputStream</code>
* when trying to detect the Unicode BOM.
*/
public UnicodeBOMInputStream(final InputStream inputStream) throws NullPointerException,
IOException
{
if (inputStream == null)
throw new NullPointerException("invalid input stream: null is not allowed");
in = new PushbackInputStream(inputStream,4);
final byte bom[] = new byte[4];
final int read = in.read(bom);
switch(read)
{
case 4:
if ((bom[0] == (byte)0xFF) &&
(bom[1] == (byte)0xFE) &&
(bom[2] == (byte)0x00) &&
(bom[3] == (byte)0x00))
{
this.bom = BOM.UTF_32_LE;
break;
}
else
if ((bom[0] == (byte)0x00) &&
(bom[1] == (byte)0x00) &&
(bom[2] == (byte)0xFE) &&
(bom[3] == (byte)0xFF))
{
this.bom = BOM.UTF_32_BE;
break;
}
case 3:
if ((bom[0] == (byte)0xEF) &&
(bom[1] == (byte)0xBB) &&
(bom[2] == (byte)0xBF))
{
this.bom = BOM.UTF_8;
break;
}
case 2:
if ((bom[0] == (byte)0xFF) &&
(bom[1] == (byte)0xFE))
{
this.bom = BOM.UTF_16_LE;
break;
}
else
if ((bom[0] == (byte)0xFE) &&
(bom[1] == (byte)0xFF))
{
this.bom = BOM.UTF_16_BE;
break;
}
default:
this.bom = BOM.NONE;
break;
}
if (read > 0)
in.unread(bom,0,read);
}
/**
* Returns the <code>BOM</code> that was detected in the wrapped
* <code>InputStream</code> object.
*
* #return a <code>BOM</code> value.
*/
public final BOM getBOM()
{
// BOM type is immutable.
return bom;
}
/**
* Skips the <code>BOM</code> that was found in the wrapped
* <code>InputStream</code> object.
*
* #return this <code>UnicodeBOMInputStream</code>.
*
* #throws IOException when trying to skip the BOM from the wrapped
* <code>InputStream</code> object.
*/
public final synchronized UnicodeBOMInputStream skipBOM() throws IOException
{
if (!skipped)
{
in.skip(bom.bytes.length);
skipped = true;
}
return this;
}
/**
* {#inheritDoc}
*/
public int read() throws IOException
{
return in.read();
}
/**
* {#inheritDoc}
*/
public int read(final byte b[]) throws IOException,
NullPointerException
{
return in.read(b,0,b.length);
}
/**
* {#inheritDoc}
*/
public int read(final byte b[],
final int off,
final int len) throws IOException,
NullPointerException
{
return in.read(b,off,len);
}
/**
* {#inheritDoc}
*/
public long skip(final long n) throws IOException
{
return in.skip(n);
}
/**
* {#inheritDoc}
*/
public int available() throws IOException
{
return in.available();
}
/**
* {#inheritDoc}
*/
public void close() throws IOException
{
in.close();
}
/**
* {#inheritDoc}
*/
public synchronized void mark(final int readlimit)
{
in.mark(readlimit);
}
/**
* {#inheritDoc}
*/
public synchronized void reset() throws IOException
{
in.reset();
}
/**
* {#inheritDoc}
*/
public boolean markSupported()
{
return in.markSupported();
}
private final PushbackInputStream in;
private final BOM bom;
private boolean skipped = false;
} // UnicodeBOMInputStream
And you're using it this way:
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
public final class UnicodeBOMInputStreamUsage
{
public static void main(final String[] args) throws Exception
{
FileInputStream fis = new FileInputStream("test/offending_bom.txt");
UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(fis);
System.out.println("detected BOM: " + ubis.getBOM());
System.out.print("Reading the content of the file without skipping the BOM: ");
InputStreamReader isr = new InputStreamReader(ubis);
BufferedReader br = new BufferedReader(isr);
System.out.println(br.readLine());
br.close();
isr.close();
ubis.close();
fis.close();
fis = new FileInputStream("test/offending_bom.txt");
ubis = new UnicodeBOMInputStream(fis);
isr = new InputStreamReader(ubis);
br = new BufferedReader(isr);
ubis.skipBOM();
System.out.print("Reading the content of the file after skipping the BOM: ");
System.out.println(br.readLine());
br.close();
isr.close();
ubis.close();
fis.close();
}
} // UnicodeBOMInputStreamUsage
The Apache Commons IO library has an InputStream that can detect and discard BOMs: BOMInputStream (javadoc):
BOMInputStream bomIn = new BOMInputStream(in);
int firstNonBOMByte = bomIn.read(); // Skips BOM
if (bomIn.hasBOM()) {
// has a UTF-8 BOM
}
If you also need to detect different encodings, it can also distinguish among various different byte-order marks, e.g. UTF-8 vs. UTF-16 big + little endian - details at the doc link above. You can then use the detected ByteOrderMark to choose a Charset to decode the stream. (There's probably a more streamlined way to do this if you need all of this functionality - maybe the UnicodeReader in BalusC's answer?). Note that, in general, there's not a very good way to detect what encoding some bytes are in, but if the stream starts with a BOM, apparently this can be helpful.
Edit: If you need to detect the BOM in UTF-16, UTF-32, etc, then the constructor should be:
new BOMInputStream(is, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE,
ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE)
Upvote #martin-charlesworth's comment :)
More simple solution:
public class BOMSkipper
{
public static void skip(Reader reader) throws IOException
{
reader.mark(1);
char[] possibleBOM = new char[1];
reader.read(possibleBOM);
if (possibleBOM[0] != '\ufeff')
{
reader.reset();
}
}
}
Usage sample:
BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(file), fileExpectedCharset));
BOMSkipper.skip(input);
//Now UTF prefix not present:
input.readLine();
...
It works with all 5 UTF encodings!
Google Data API has an UnicodeReader which automagically detects the encoding.
You can use it instead of InputStreamReader. Here's an -slightly compactized- extract of its source which is pretty straightforward:
public class UnicodeReader extends Reader {
private static final int BOM_SIZE = 4;
private final InputStreamReader reader;
/**
* Construct UnicodeReader
* #param in Input stream.
* #param defaultEncoding Default encoding to be used if BOM is not found,
* or <code>null</code> to use system default encoding.
* #throws IOException If an I/O error occurs.
*/
public UnicodeReader(InputStream in, String defaultEncoding) throws IOException {
byte bom[] = new byte[BOM_SIZE];
String encoding;
int unread;
PushbackInputStream pushbackStream = new PushbackInputStream(in, BOM_SIZE);
int n = pushbackStream.read(bom, 0, bom.length);
// Read ahead four bytes and check for BOM marks.
if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
encoding = "UTF-8";
unread = n - 3;
} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
encoding = "UTF-16BE";
unread = n - 2;
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
encoding = "UTF-16LE";
unread = n - 2;
} else if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
encoding = "UTF-32BE";
unread = n - 4;
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
encoding = "UTF-32LE";
unread = n - 4;
} else {
encoding = defaultEncoding;
unread = n;
}
// Unread bytes if necessary and skip BOM marks.
if (unread > 0) {
pushbackStream.unread(bom, (n - unread), unread);
} else if (unread < -1) {
pushbackStream.unread(bom, 0, 0);
}
// Use given encoding.
if (encoding == null) {
reader = new InputStreamReader(pushbackStream);
} else {
reader = new InputStreamReader(pushbackStream, encoding);
}
}
public String getEncoding() {
return reader.getEncoding();
}
public int read(char[] cbuf, int off, int len) throws IOException {
return reader.read(cbuf, off, len);
}
public void close() throws IOException {
reader.close();
}
}
The Apache Commons IO Library's BOMInputStream has already been mentioned by #rescdsk, but I did not see it mention how to get an InputStream without the BOM.
Here's how I did it in Scala.
import java.io._
val file = new File(path_to_xml_file_with_BOM)
val fileInpStream = new FileInputStream(file)
val bomIn = new BOMInputStream(fileInpStream,
false); // false means don't include BOM
To simply remove the BOM characters from your file, I recomend using Apache Common IO
public BOMInputStream(InputStream delegate,
boolean include)
Constructs a new BOM InputStream that detects a a ByteOrderMark.UTF_8 and optionally includes it.
Parameters:
delegate - the InputStream to delegate to
include - true to include the UTF-8 BOM or false to exclude it
Set include to false and your BOM characters will be excluded.
Regrettably not. You'll have to identify and skip yourself. This page details what you have to watch for. Also see this SO question for more details.
Here is my code to read csv files in most char sets. It should cover 99% situations.
try(InputStream inputStream = new FileInputStream(csvFile);){
BOMInputStream bomInputStream = new BOMInputStream(inputStream ,ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
Charset charset;
if(!bomInputStream.hasBOM()) charset = StandardCharsets.UTF_8;
else if(bomInputStream.hasBOM(ByteOrderMark.UTF_8)) charset = StandardCharsets.UTF_8;
else if(bomInputStream.hasBOM(ByteOrderMark.UTF_16LE)) charset = StandardCharsets.UTF_16LE;
else if(bomInputStream.hasBOM(ByteOrderMark.UTF_16BE)) charset = StandardCharsets.UTF_16BE;
else { throw new Exception("The charset of the file " + csvFile + " is not supported.");}
try(Reader streamReader = new InputStreamReader(bomInputStream, charset);
BufferedReader bufferedReader = new BufferedReader(streamReader);) {
for(String line; (line = bufferedReader.readLine()) != null; ) {
String[] columns = line.split(",");
//read csv columns
}
}
IMO none of the given answers is really satisfying. Just skipping the BOM and then read the rest of the stream in the current platform's default encoding is definitively wrong. Remember: The platform default on Unix/Linux and windows differ: former is UTF-8, later is ANSI. Such a solution only works if the rest of the stream (after the BOM) only contains 7-bit ASCII characters (which, I admit, in most programmer near files like configurations is true). But as soon there are non ASCII characters, you will fail with this approach.
That's why all java classes/methods, which can convert byte arrays/streams to string (and vice versa) have a second parameter indicating the encoding to be used (Reader, Writer, Scanner, String.getBytes(), etc.).
There are so much character encodings out in the world, not only UTF-xx. And still - in the current year 2021 - there are so much encoding problems between end user applications, especially if they run on different platforms (iOS, windows, unix). All these problems only exist because the programmer was too lazy to learn how character encoding works.
Thus, it's an absolute MUST to evaluate first the encoding to be used, and then performing the string/stream conversion using the found encoding. Consulting the respective specification(s) is the first step. And only if you cannot be sure which encoding you encounter while reading a stream you have to evaluate it by yourself. But caution: such an evaluation always will only be a 'best guess', there is no algorithm which can cover all possibilities.
In this sense, Lee's answer (and coding example) from Feb 6,2021 is IMO the best one, except that he falls back to UTF-8 if there is no BOM.
I had the same problem, and because I wasn't reading in a bunch of files I did a simpler solution. I think my encoding was UTF-8 because when I printed out the offending character with the help of this page: Get unicode value of a character I found that it was \ufeff. I used the code System.out.println( "\\u" + Integer.toHexString(str.charAt(0) | 0x10000).substring(1) ); to print out the offending unicode value.
Once I had the offending unicode value, I replaced it in the first line of my file before I went on reading. The business logic of that section:
String str = reader.readLine().trim();
str = str.replace("\ufeff", "");
This fixed my problem. Then I was able to go on processing the file with no issue. I added on trim() just in case of leading or trailing whitespace, you can do that or not, based on what your specific needs are.
NotePad++ is a good tool to convert UTF-8 encoding to UTF-8(BOM) encoding.
https://notepad-plus-plus.org/downloads/
UTF8BOMTester.java
public class UTF8BOMTester {
public static void main(String[] args) throws FileNotFoundException, IOException {
// TODO Auto-generated method stub
File file = new File("test.txt");
boolean same = UTF8BOMInputStream.isSameEncodingType(file);
System.out.println(same);
if (same) {
UTF8BOMInputStream is = new UTF8BOMInputStream(file);
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
System.out.println(br.readLine());
}
}
static void bytesPrint(byte[] b) {
for (byte a : b)
System.out.printf("%x ", a);
}}
UTF8BOMInputStream.java
public class UTF8BOMInputStream extends InputStream {
byte[] SYMBLE_BOM = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
FileInputStream fis;
final boolean isSameEncodingType;
public UTF8BOMInputStream(File file) throws IOException {
FileInputStream fis=new FileInputStream(file);
byte[] symble=new byte[3];
fis.read(symble);
bytesPrint(symble);
isSameEncodingType=isSameEncodingType(symble);
if(isSameEncodingType)
this.fis=fis;
else
this.fis=null;
}
#Override
public int read() throws IOException {
return fis.read();
}
void bytesPrint(byte[] b) {
for (byte a : b)
System.out.printf("%x ", a);
}
boolean bytesCompare(byte[] a, byte[] b) {
if (a.length != b.length)
return false;
for (int i = 0; i < a.length; i++) {
if (a[i] != b[i])
return false;
}
return true;
}
boolean isSameEncodingType(byte[] symble) {
return bytesCompare(symble,SYMBLE_BOM);
}
public static boolean isSameEncodingType(File file) throws IOException {
return (new UTF8BOMInputStream(file)).isSameEncodingType;
}

Multiple readers for InputStream in Java

I have an InputStream from which I'm reading characters. I would like multiple readers to access this InputStream. It seems that a reasonable way to achieve this is to write incoming data to a StringBuffer or StringBuilder, and have the multiple readers read that. Unfortunately, StringBufferInputStream is deprecated. StringReader reads a string, not a mutable object that's continuously being updated. What are my options? Write my own?
Note: My other answer is more general (and better in my opinion).
As noted by #dimo414, the answer below requires the first reader to always be ahead of the second reader. If this is indeed the case for you, then this answer might still be preferable since it builds upon standard classes.
To create two readers that read independently from the same source, you'll have to make sure they don't consume data from the same stream.
This can be achieved by combining TeeInputStream from Apache Commons and a PipedInputStream and PipedOutputStream as follows:
import java.io.*;
import org.apache.commons.io.input.TeeInputStream;
class Test {
public static void main(String[] args) throws IOException {
// Create the source input stream.
InputStream is = new FileInputStream("filename.txt");
// Create a piped input stream for one of the readers.
PipedInputStream in = new PipedInputStream();
// Create a tee-splitter for the other reader.
TeeInputStream tee = new TeeInputStream(is, new PipedOutputStream(in));
// Create the two buffered readers.
BufferedReader br1 = new BufferedReader(new InputStreamReader(tee));
BufferedReader br2 = new BufferedReader(new InputStreamReader(in));
// Do some interleaved reads from them.
System.out.println("One line from br1:");
System.out.println(br1.readLine());
System.out.println();
System.out.println("Two lines from br2:");
System.out.println(br2.readLine());
System.out.println(br2.readLine());
System.out.println();
System.out.println("One line from br1:");
System.out.println(br1.readLine());
System.out.println();
}
}
Output:
One line from br1:
Line1: Lorem ipsum dolor sit amet, <-- reading from start
Two lines from br2:
Line1: Lorem ipsum dolor sit amet, <-- reading from start
Line2: consectetur adipisicing elit,
One line from br1:
Line2: consectetur adipisicing elit, <-- resumes on line 2
As you've probably noted, once you've read a byte from an input stream, it's gone forever (unless you've saved it somewhere yourself).
The solution below does save the bytes until all subscribing input streams have read it.
It works as follows:
// Create a SplittableInputStream from the originalStream
SplittableInputStream is = new SplittableInputStream(originalStream);
// Fork this to get more input streams reading independently from originalStream
SplittableInputStream is2 = is.split();
SplittableInputStream is3 = is.split();
Each time is is split() it will yield a new InputStream that will read the bytes from the point where is was split.
The SplittableInputStream looks as follows (copy'n'paste away!):
class SplittableInputStream extends InputStream {
// Almost an input stream: The read-method takes an id.
static class MultiplexedSource {
static int MIN_BUF = 4096;
// Underlying source
private InputStream source;
// Read positions of each SplittableInputStream
private List<Integer> readPositions = new ArrayList<>();
// Data to be read by the SplittableInputStreams
int[] buffer = new int[MIN_BUF];
// Last valid position in buffer
int writePosition = 0;
public MultiplexedSource(InputStream source) {
this.source = source;
}
// Add a multiplexed reader. Return new reader id.
int addSource(int splitId) {
readPositions.add(splitId == -1 ? 0 : readPositions.get(splitId));
return readPositions.size() - 1;
}
// Make room for more data (and drop data that has been read by
// all readers)
private void readjustBuffer() {
int from = Collections.min(readPositions);
int to = Collections.max(readPositions);
int newLength = Math.max((to - from) * 2, MIN_BUF);
int[] newBuf = new int[newLength];
System.arraycopy(buffer, from, newBuf, 0, to - from);
for (int i = 0; i < readPositions.size(); i++)
readPositions.set(i, readPositions.get(i) - from);
writePosition -= from;
buffer = newBuf;
}
// Read and advance position for given reader
public int read(int readerId) throws IOException {
// Enough data in buffer?
if (readPositions.get(readerId) >= writePosition) {
readjustBuffer();
buffer[writePosition++] = source.read();
}
int pos = readPositions.get(readerId);
int b = buffer[pos];
if (b != -1)
readPositions.set(readerId, pos + 1);
return b;
}
}
// Non-root fields
MultiplexedSource multiSource;
int myId;
// Public constructor: Used for first SplittableInputStream
public SplittableInputStream(InputStream source) {
multiSource = new MultiplexedSource(source);
myId = multiSource.addSource(-1);
}
// Private constructor: Used in split()
private SplittableInputStream(MultiplexedSource multiSource, int splitId) {
this.multiSource = multiSource;
myId = multiSource.addSource(splitId);
}
// Returns a new InputStream that will read bytes from this position
// onwards.
public SplittableInputStream split() {
return new SplittableInputStream(multiSource, myId);
}
#Override
public int read() throws IOException {
return multiSource.read(myId);
}
}
Finally, a demo:
String str = "Lorem ipsum\ndolor sit\namet\n";
InputStream is = new ByteArrayInputStream(str.getBytes("UTF-8"));
// Create the two buffered readers.
SplittableInputStream is1 = new SplittableInputStream(is);
SplittableInputStream is2 = is1.split();
BufferedReader br1 = new BufferedReader(new InputStreamReader(is1));
BufferedReader br2 = new BufferedReader(new InputStreamReader(is2));
// Do some interleaved reads from them.
System.out.println("One line from br1:");
System.out.println(br1.readLine());
System.out.println();
System.out.println("Two lines from br2:");
System.out.println(br2.readLine());
System.out.println(br2.readLine());
System.out.println();
System.out.println("One line from br1:");
System.out.println(br1.readLine());
System.out.println();
Output:
One line from br1:
Lorem ipsum
Two lines from br2:
Lorem ipsum
dolor sit
One line from br1:
dolor sit
Use TeeInputStream to copy all the bytes read from InputStream to secondary OutputStream, e.g. ByteArrayOutputStream.
Input stream work like this: once you read a portion from it, it's gone forever. You can't go back and re-read it. what you could do is something like this:
class InputStreamSplitter {
InputStreamSplitter(InputStream toReadFrom) {
this.reader = new InputStreamReader(toReadFrom);
}
void addListener(Listener l) {
this.listeners.add(l);
}
void work() {
String line = this.reader.readLine();
while(line != null) {
for(Listener l : this.listeners) {
l.processLine(line);
}
}
}
}
interface Listener {
processLine(String line);
}
have all interested parties implement Listener and add them to InputStreamSplitter
Instead of using StringWriter/StringBufferInputStream, write your original InputStream to a ByteArrayOutputStream. Once you've finished reading from the original InputStream, pass the byte array returned from ByteArrayOutputStream.toByteArray to a ByteArrayInputStream. Use this InputStream as the InputStream of choice for passing around other things that need to read from it.
Essentially, all you'd be doing here is storing the contents of the original InputStream into a byte[] cache in memory as you tried to do originally with StringWriter/StringBufferInputStream.
Here's another way to read from two streams independently, without presuming one is ahead of the other, but with standard classes. It does, however, eagerly read from the underlying input stream in the background, which may be undesirable, depending on your application.
public static void main(String[] args) throws IOException {
// Create the source input stream.
InputStream is = new ByteArrayInputStream("line1\nline2\nline3".getBytes());
// Create a piped input stream for each reader;
PipedInputStream in1 = new PipedInputStream();
PipedInputStream in2 = new PipedInputStream();
// Start copying the input stream to both piped input streams.
startCopy(is, new TeeOutputStream(
new PipedOutputStream(in1), new PipedOutputStream(in2)));
// Create the two buffered readers.
BufferedReader br1 = new BufferedReader(new InputStreamReader(in1));
BufferedReader br2 = new BufferedReader(new InputStreamReader(in2));
// Do some interleaved reads from them.
// ...
}
private static void startCopy(InputStream in, OutputStream out) {
(new Thread() {
public void run() {
try {
IOUtils.copy(in, out);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}).start();
}
Looking for a possible way to have an outputstream sending bytes to two or more different Inputstream, I found this forum.
Unfortunately, the exact solution was directing to PipedOutputStream and PipedInputStream.
So, I was declined to write a PipeOutputStream extension.
Here it is. The example is written in the PipedOutputStream's "main" method.
/**
* Extensao de {#link PipedOutputStream}, onde eh possivel conectar mais de um {#link PipedInputStream}
*/
public class PipedOutputStreamEx extends PipedOutputStream {
/**
*
*/
public PipedOutputStreamEx() {
// TODO Auto-generated constructor stub
}
/* REMIND: identification of the read and write sides needs to be
more sophisticated. Either using thread groups (but what about
pipes within a thread?) or using finalization (but it may be a
long time until the next GC). */
private PipedInputStreamEx[] sinks=null;
public synchronized void connect(PipedInputStreamEx... pIns) throws IOException {
for (PipedInputStreamEx snk : pIns) {
if (snk == null) {
throw new NullPointerException();
} else if (sinks != null || snk.connected) {
throw new IOException("Already connected");
}
snk.in = -1;
snk.out = 0;
snk.connected = true;
}
this.sinks = pIns;
}
/**
* Writes the specified <code>byte</code> to the piped output stream.
* <p>
* Implements the <code>write</code> method of <code>OutputStream</code>.
*
* #param b the <code>byte</code> to be written.
* #exception IOException if the pipe is <a href=#BROKEN> broken</a>,
* {#link #connect(java.io.PipedInputStream) unconnected},
* closed, or if an I/O error occurs.
*/
public void write(int b) throws IOException {
if (this.sinks == null) {
throw new IOException("Pipe(s) not connected");
}
for (PipedInputStreamEx sink : this.sinks) {
sink.receive(b);
}
}
/**
* Writes <code>len</code> bytes from the specified byte array
* starting at offset <code>off</code> to this piped output stream.
* This method blocks until all the bytes are written to the output
* stream.
*
* #param b the data.
* #param off the start offset in the data.
* #param len the number of bytes to write.
* #exception IOException if the pipe is <a href=#BROKEN> broken</a>,
* {#link #connect(java.io.PipedInputStream) unconnected},
* closed, or if an I/O error occurs.
*/
public void write(byte b[], int off, int len) throws IOException {
if (sinks == null) {
throw new IOException("Pipe not connected");
} else if (b == null) {
throw new NullPointerException();
} else if ((off < 0) || (off > b.length) || (len < 0) ||
((off + len) > b.length) || ((off + len) < 0)) {
throw new IndexOutOfBoundsException();
} else if (len == 0) {
return;
}
for (PipedInputStreamEx sink : this.sinks) {
sink.receive(b, off, len);
}
}
/**
* Flushes this output stream and forces any buffered output bytes
* to be written out.
* This will notify any readers that bytes are waiting in the pipe.
*
* #exception IOException if an I/O error occurs.
*/
public synchronized void flush() throws IOException {
if (sinks != null) {
for (PipedInputStreamEx sink : this.sinks) {
synchronized (sink) {
sink.notifyAll();
}
}
}
}
/**
* Closes this piped output stream and releases any system resources
* associated with this stream. This stream may no longer be used for
* writing bytes.
*
* #exception IOException if an I/O error occurs.
*/
public void close() throws IOException {
if (sinks != null) {
for (PipedInputStreamEx sink : this.sinks) {
sink.receivedLast();
}
}
}
/**
* Teste desta extensao de {#link PipedOutputStream}
* #param args
* #throws InterruptedException
* #throws IOException
*/
public static void main(String[] args) throws InterruptedException, IOException {
final PipedOutputStreamEx pOut = new PipedOutputStreamEx();
final PipedInputStreamEx pInHash = new PipedInputStreamEx();
final PipedInputStreamEx pInConsole = new PipedInputStreamEx();
pOut.connect(pInHash, pInConsole);
Thread escreve = new Thread("Escrevendo") {
#Override
public void run() {
String[] paraGravar = new String[]{
"linha1 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha2 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha3 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha4 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha5 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha6 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha7 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha8 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha9 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha10 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha11 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha12 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha13 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha14 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha15 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha16 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha17 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha18 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha19 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
, "linha20 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789\n"
};
for (String s :paraGravar) {
try {
pOut.write(s.getBytes("ISO-8859-1") );
Thread.sleep(100);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
try {
pOut.close();
} catch (IOException e) {
e.printStackTrace();
}
}
};
Thread le1 = new Thread("Le1 - hash") {
#Override
public void run() {
try {
System.out.println("HASH: "+HashUtil.getHashCRC(pInHash,true));
} catch (Exception e) {
e.printStackTrace();
}
}
};
Thread le2 = new Thread("Le2 - escreve no console") {
#Override
public void run() {
BufferedReader bIn = new BufferedReader(new InputStreamReader(pInConsole));
String s;
try {
while ( (s=bIn.readLine())!=null) {
Thread.sleep(700); //teste simulando o um leitor lento...
System.out.println(s);
}
} catch (IOException e) {
throw new RuntimeException(e);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
};
escreve.start();
le1.start();
le2.start();
escreve.join();
le1.join();
le2.join();
pInHash.close();
pInConsole.close();
}
}
Here is the PipedInputStreamEx code. Unfortunately, I had to copy all JDK code, to have access to "connected", "in" and "out" properties.
/**
* Extensao de {#link PipedInputStream}, que permite conetar mais de um destes no {#link PipedOutputStream}
* Como a classe ancestral possui propriedades 'package friend', tivemos que copiar o codigo herdado :/
*/
public class PipedInputStreamEx extends PipedInputStream {
#Override
public void connect(PipedOutputStream src) throws IOException {
throw new IOException("conecte usando PipedOutputStream.connect()");
}
//----------------------------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------------------------
//--------- INICIO codigo da classe herdada (alguns metodos comentados...)----------------------------------
//----------------------------------------------------------------------------------------------------------
boolean closedByWriter = false;
volatile boolean closedByReader = false;
boolean connected = false;
/* REMIND: identification of the read and write sides needs to be
more sophisticated. Either using thread groups (but what about
pipes within a thread?) or using finalization (but it may be a
long time until the next GC). */
Thread readSide;
Thread writeSide;
private static final int DEFAULT_PIPE_SIZE = 1024;
/**
* The default size of the pipe's circular input buffer.
* #since JDK1.1
*/
// This used to be a constant before the pipe size was allowed
// to change. This field will continue to be maintained
// for backward compatibility.
protected static final int PIPE_SIZE = DEFAULT_PIPE_SIZE;
/**
* The circular buffer into which incoming data is placed.
* #since JDK1.1
*/
protected byte buffer[];
/**
* The index of the position in the circular buffer at which the
* next byte of data will be stored when received from the connected
* piped output stream. <code>in<0</code> implies the buffer is empty,
* <code>in==out</code> implies the buffer is full
* #since JDK1.1
*/
protected int in = -1;
/**
* The index of the position in the circular buffer at which the next
* byte of data will be read by this piped input stream.
* #since JDK1.1
*/
protected int out = 0;
// /**
// * Creates a <code>PipedInputStream</code> so
// * that it is connected to the piped output
// * stream <code>src</code>. Data bytes written
// * to <code>src</code> will then be available
// * as input from this stream.
// *
// * #param src the stream to connect to.
// * #exception IOException if an I/O error occurs.
// */
// public PipedInputStream(PipedOutputStream src) throws IOException {
// this(src, DEFAULT_PIPE_SIZE);
// }
//
// /**
// * Creates a <code>PipedInputStream</code> so that it is
// * connected to the piped output stream
// * <code>src</code> and uses the specified pipe size for
// * the pipe's buffer.
// * Data bytes written to <code>src</code> will then
// * be available as input from this stream.
// *
// * #param src the stream to connect to.
// * #param pipeSize the size of the pipe's buffer.
// * #exception IOException if an I/O error occurs.
// * #exception IllegalArgumentException if <code>pipeSize <= 0</code>.
// * #since 1.6
// */
// public PipedInputStream(PipedOutputStream src, int pipeSize)
// throws IOException {
// initPipe(pipeSize);
// connect(src);
// }
/**
* Creates a <code>PipedInputStream</code> so
* that it is not yet {#linkplain #connect(java.io.PipedOutputStream)
* connected}.
* It must be {#linkplain java.io.PipedOutputStream#connect(
* java.io.PipedInputStream) connected} to a
* <code>PipedOutputStream</code> before being used.
*/
public PipedInputStreamEx() {
initPipe(DEFAULT_PIPE_SIZE);
}
/**
* Creates a <code>PipedInputStream</code> so that it is not yet
* {#linkplain #connect(java.io.PipedOutputStream) connected} and
* uses the specified pipe size for the pipe's buffer.
* It must be {#linkplain java.io.PipedOutputStream#connect(
* java.io.PipedInputStream)
* connected} to a <code>PipedOutputStream</code> before being used.
*
* #param pipeSize the size of the pipe's buffer.
* #exception IllegalArgumentException if <code>pipeSize <= 0</code>.
* #since 1.6
*/
public PipedInputStreamEx(int pipeSize) {
initPipe(pipeSize);
}
private void initPipe(int pipeSize) {
if (pipeSize <= 0) {
throw new IllegalArgumentException("Pipe Size <= 0");
}
buffer = new byte[pipeSize];
}
// /**
// * Causes this piped input stream to be connected
// * to the piped output stream <code>src</code>.
// * If this object is already connected to some
// * other piped output stream, an <code>IOException</code>
// * is thrown.
// * <p>
// * If <code>src</code> is an
// * unconnected piped output stream and <code>snk</code>
// * is an unconnected piped input stream, they
// * may be connected by either the call:
// * <p>
// * <pre><code>snk.connect(src)</code> </pre>
// * <p>
// * or the call:
// * <p>
// * <pre><code>src.connect(snk)</code> </pre>
// * <p>
// * The two
// * calls have the same effect.
// *
// * #param src The piped output stream to connect to.
// * #exception IOException if an I/O error occurs.
// */
// public void connect(PipedOutputStream src) throws IOException {
// src.connect(this);
// }
/**
* Receives a byte of data. This method will block if no input is
* available.
* #param b the byte being received
* #exception IOException If the pipe is <a href=#BROKEN> <code>broken</code></a>,
* {#link #connect(java.io.PipedOutputStream) unconnected},
* closed, or if an I/O error occurs.
* #since JDK1.1
*/
protected synchronized void receive(int b) throws IOException {
checkStateForReceive();
writeSide = Thread.currentThread();
if (in == out)
awaitSpace();
if (in < 0) {
in = 0;
out = 0;
}
buffer[in++] = (byte)(b & 0xFF);
if (in >= buffer.length) {
in = 0;
}
}
/**
* Receives data into an array of bytes. This method will
* block until some input is available.
* #param b the buffer into which the data is received
* #param off the start offset of the data
* #param len the maximum number of bytes received
* #exception IOException If the pipe is <a href=#BROKEN> broken</a>,
* {#link #connect(java.io.PipedOutputStream) unconnected},
* closed,or if an I/O error occurs.
*/
synchronized void receive(byte b[], int off, int len) throws IOException {
checkStateForReceive();
writeSide = Thread.currentThread();
int bytesToTransfer = len;
while (bytesToTransfer > 0) {
if (in == out)
awaitSpace();
int nextTransferAmount = 0;
if (out < in) {
nextTransferAmount = buffer.length - in;
} else if (in < out) {
if (in == -1) {
in = out = 0;
nextTransferAmount = buffer.length - in;
} else {
nextTransferAmount = out - in;
}
}
if (nextTransferAmount > bytesToTransfer)
nextTransferAmount = bytesToTransfer;
assert(nextTransferAmount > 0);
System.arraycopy(b, off, buffer, in, nextTransferAmount);
bytesToTransfer -= nextTransferAmount;
off += nextTransferAmount;
in += nextTransferAmount;
if (in >= buffer.length) {
in = 0;
}
}
}
private void checkStateForReceive() throws IOException {
if (!connected) {
throw new IOException("Pipe not connected");
} else if (closedByWriter || closedByReader) {
throw new IOException("Pipe closed");
} else if (readSide != null && !readSide.isAlive()) {
throw new IOException("Read end dead");
}
}
private void awaitSpace() throws IOException {
while (in == out) {
checkStateForReceive();
/* full: kick any waiting readers */
notifyAll();
try {
wait(1000);
} catch (InterruptedException ex) {
throw new java.io.InterruptedIOException();
}
}
}
/**
* Notifies all waiting threads that the last byte of data has been
* received.
*/
synchronized void receivedLast() {
closedByWriter = true;
notifyAll();
}
/**
* Reads the next byte of data from this piped input stream. The
* value byte is returned as an <code>int</code> in the range
* <code>0</code> to <code>255</code>.
* This method blocks until input data is available, the end of the
* stream is detected, or an exception is thrown.
*
* #return the next byte of data, or <code>-1</code> if the end of the
* stream is reached.
* #exception IOException if the pipe is
* {#link #connect(java.io.PipedOutputStream) unconnected},
* <a href=#BROKEN> <code>broken</code></a>, closed,
* or if an I/O error occurs.
*/
public synchronized int read() throws IOException {
if (!connected) {
throw new IOException("Pipe not connected");
} else if (closedByReader) {
throw new IOException("Pipe closed");
} else if (writeSide != null && !writeSide.isAlive()
&& !closedByWriter && (in < 0)) {
throw new IOException("Write end dead");
}
readSide = Thread.currentThread();
int trials = 2;
while (in < 0) {
if (closedByWriter) {
/* closed by writer, return EOF */
return -1;
}
if ((writeSide != null) && (!writeSide.isAlive()) && (--trials < 0)) {
throw new IOException("Pipe broken");
}
/* might be a writer waiting */
notifyAll();
try {
wait(1000);
} catch (InterruptedException ex) {
throw new java.io.InterruptedIOException();
}
}
int ret = buffer[out++] & 0xFF;
if (out >= buffer.length) {
out = 0;
}
if (in == out) {
/* now empty */
in = -1;
}
return ret;
}
/**
* Reads up to <code>len</code> bytes of data from this piped input
* stream into an array of bytes. Less than <code>len</code> bytes
* will be read if the end of the data stream is reached or if
* <code>len</code> exceeds the pipe's buffer size.
* If <code>len </code> is zero, then no bytes are read and 0 is returned;
* otherwise, the method blocks until at least 1 byte of input is
* available, end of the stream has been detected, or an exception is
* thrown.
*
* #param b the buffer into which the data is read.
* #param off the start offset in the destination array <code>b</code>
* #param len the maximum number of bytes read.
* #return the total number of bytes read into the buffer, or
* <code>-1</code> if there is no more data because the end of
* the stream has been reached.
* #exception NullPointerException If <code>b</code> is <code>null</code>.
* #exception IndexOutOfBoundsException If <code>off</code> is negative,
* <code>len</code> is negative, or <code>len</code> is greater than
* <code>b.length - off</code>
* #exception IOException if the pipe is <a href=#BROKEN> <code>broken</code></a>,
* {#link #connect(java.io.PipedOutputStream) unconnected},
* closed, or if an I/O error occurs.
*/
public synchronized int read(byte b[], int off, int len) throws IOException {
if (b == null) {
throw new NullPointerException();
} else if (off < 0 || len < 0 || len > b.length - off) {
throw new IndexOutOfBoundsException();
} else if (len == 0) {
return 0;
}
/* possibly wait on the first character */
int c = read();
if (c < 0) {
return -1;
}
b[off] = (byte) c;
int rlen = 1;
while ((in >= 0) && (len > 1)) {
int available;
if (in > out) {
available = Math.min((buffer.length - out), (in - out));
} else {
available = buffer.length - out;
}
// A byte is read beforehand outside the loop
if (available > (len - 1)) {
available = len - 1;
}
System.arraycopy(buffer, out, b, off + rlen, available);
out += available;
rlen += available;
len -= available;
if (out >= buffer.length) {
out = 0;
}
if (in == out) {
/* now empty */
in = -1;
}
}
return rlen;
}
/**
* Returns the number of bytes that can be read from this input
* stream without blocking.
*
* #return the number of bytes that can be read from this input stream
* without blocking, or {#code 0} if this input stream has been
* closed by invoking its {#link #close()} method, or if the pipe
* is {#link #connect(java.io.PipedOutputStream) unconnected}, or
* <a href=#BROKEN> <code>broken</code></a>.
*
* #exception IOException if an I/O error occurs.
* #since JDK1.0.2
*/
public synchronized int available() throws IOException {
if(in < 0)
return 0;
else if(in == out)
return buffer.length;
else if (in > out)
return in - out;
else
return in + buffer.length - out;
}
/**
* Closes this piped input stream and releases any system resources
* associated with the stream.
*
* #exception IOException if an I/O error occurs.
*/
public void close() throws IOException {
closedByReader = true;
synchronized (this) {
in = -1;
}
}
//----------------------------------------------------------------------------------------------------------
//--------- FIM codigo da classe herdada -------------------------------------------------------------------
//----------------------------------------------------------------------------------------------------------
}

Byte order mark screws up file reading in Java

I'm trying to read CSV files using Java. Some of the files may have a byte order mark in the beginning, but not all. When present, the byte order gets read along with the rest of the first line, thus causing problems with string compares.
Is there an easy way to skip the byte order mark when it is present?
EDIT: I've made a proper release on GitHub: https://github.com/gpakosz/UnicodeBOMInputStream
Here is a class I coded a while ago, I just edited the package name before pasting. Nothing special, it is quite similar to solutions posted in SUN's bug database. Incorporate it in your code and you're fine.
/* ____________________________________________________________________________
*
* File: UnicodeBOMInputStream.java
* Author: Gregory Pakosz.
* Date: 02 - November - 2005
* ____________________________________________________________________________
*/
package com.stackoverflow.answer;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
/**
* The <code>UnicodeBOMInputStream</code> class wraps any
* <code>InputStream</code> and detects the presence of any Unicode BOM
* (Byte Order Mark) at its beginning, as defined by
* RFC 3629 - UTF-8, a transformation format of ISO 10646
*
* <p>The
* Unicode FAQ
* defines 5 types of BOMs:<ul>
* <li><pre>00 00 FE FF = UTF-32, big-endian</pre></li>
* <li><pre>FF FE 00 00 = UTF-32, little-endian</pre></li>
* <li><pre>FE FF = UTF-16, big-endian</pre></li>
* <li><pre>FF FE = UTF-16, little-endian</pre></li>
* <li><pre>EF BB BF = UTF-8</pre></li>
* </ul></p>
*
* <p>Use the {#link #getBOM()} method to know whether a BOM has been detected
* or not.
* </p>
* <p>Use the {#link #skipBOM()} method to remove the detected BOM from the
* wrapped <code>InputStream</code> object.</p>
*/
public class UnicodeBOMInputStream extends InputStream
{
/**
* Type safe enumeration class that describes the different types of Unicode
* BOMs.
*/
public static final class BOM
{
/**
* NONE.
*/
public static final BOM NONE = new BOM(new byte[]{},"NONE");
/**
* UTF-8 BOM (EF BB BF).
*/
public static final BOM UTF_8 = new BOM(new byte[]{(byte)0xEF,
(byte)0xBB,
(byte)0xBF},
"UTF-8");
/**
* UTF-16, little-endian (FF FE).
*/
public static final BOM UTF_16_LE = new BOM(new byte[]{ (byte)0xFF,
(byte)0xFE},
"UTF-16 little-endian");
/**
* UTF-16, big-endian (FE FF).
*/
public static final BOM UTF_16_BE = new BOM(new byte[]{ (byte)0xFE,
(byte)0xFF},
"UTF-16 big-endian");
/**
* UTF-32, little-endian (FF FE 00 00).
*/
public static final BOM UTF_32_LE = new BOM(new byte[]{ (byte)0xFF,
(byte)0xFE,
(byte)0x00,
(byte)0x00},
"UTF-32 little-endian");
/**
* UTF-32, big-endian (00 00 FE FF).
*/
public static final BOM UTF_32_BE = new BOM(new byte[]{ (byte)0x00,
(byte)0x00,
(byte)0xFE,
(byte)0xFF},
"UTF-32 big-endian");
/**
* Returns a <code>String</code> representation of this <code>BOM</code>
* value.
*/
public final String toString()
{
return description;
}
/**
* Returns the bytes corresponding to this <code>BOM</code> value.
*/
public final byte[] getBytes()
{
final int length = bytes.length;
final byte[] result = new byte[length];
// Make a defensive copy
System.arraycopy(bytes,0,result,0,length);
return result;
}
private BOM(final byte bom[], final String description)
{
assert(bom != null) : "invalid BOM: null is not allowed";
assert(description != null) : "invalid description: null is not allowed";
assert(description.length() != 0) : "invalid description: empty string is not allowed";
this.bytes = bom;
this.description = description;
}
final byte bytes[];
private final String description;
} // BOM
/**
* Constructs a new <code>UnicodeBOMInputStream</code> that wraps the
* specified <code>InputStream</code>.
*
* #param inputStream an <code>InputStream</code>.
*
* #throws NullPointerException when <code>inputStream</code> is
* <code>null</code>.
* #throws IOException on reading from the specified <code>InputStream</code>
* when trying to detect the Unicode BOM.
*/
public UnicodeBOMInputStream(final InputStream inputStream) throws NullPointerException,
IOException
{
if (inputStream == null)
throw new NullPointerException("invalid input stream: null is not allowed");
in = new PushbackInputStream(inputStream,4);
final byte bom[] = new byte[4];
final int read = in.read(bom);
switch(read)
{
case 4:
if ((bom[0] == (byte)0xFF) &&
(bom[1] == (byte)0xFE) &&
(bom[2] == (byte)0x00) &&
(bom[3] == (byte)0x00))
{
this.bom = BOM.UTF_32_LE;
break;
}
else
if ((bom[0] == (byte)0x00) &&
(bom[1] == (byte)0x00) &&
(bom[2] == (byte)0xFE) &&
(bom[3] == (byte)0xFF))
{
this.bom = BOM.UTF_32_BE;
break;
}
case 3:
if ((bom[0] == (byte)0xEF) &&
(bom[1] == (byte)0xBB) &&
(bom[2] == (byte)0xBF))
{
this.bom = BOM.UTF_8;
break;
}
case 2:
if ((bom[0] == (byte)0xFF) &&
(bom[1] == (byte)0xFE))
{
this.bom = BOM.UTF_16_LE;
break;
}
else
if ((bom[0] == (byte)0xFE) &&
(bom[1] == (byte)0xFF))
{
this.bom = BOM.UTF_16_BE;
break;
}
default:
this.bom = BOM.NONE;
break;
}
if (read > 0)
in.unread(bom,0,read);
}
/**
* Returns the <code>BOM</code> that was detected in the wrapped
* <code>InputStream</code> object.
*
* #return a <code>BOM</code> value.
*/
public final BOM getBOM()
{
// BOM type is immutable.
return bom;
}
/**
* Skips the <code>BOM</code> that was found in the wrapped
* <code>InputStream</code> object.
*
* #return this <code>UnicodeBOMInputStream</code>.
*
* #throws IOException when trying to skip the BOM from the wrapped
* <code>InputStream</code> object.
*/
public final synchronized UnicodeBOMInputStream skipBOM() throws IOException
{
if (!skipped)
{
in.skip(bom.bytes.length);
skipped = true;
}
return this;
}
/**
* {#inheritDoc}
*/
public int read() throws IOException
{
return in.read();
}
/**
* {#inheritDoc}
*/
public int read(final byte b[]) throws IOException,
NullPointerException
{
return in.read(b,0,b.length);
}
/**
* {#inheritDoc}
*/
public int read(final byte b[],
final int off,
final int len) throws IOException,
NullPointerException
{
return in.read(b,off,len);
}
/**
* {#inheritDoc}
*/
public long skip(final long n) throws IOException
{
return in.skip(n);
}
/**
* {#inheritDoc}
*/
public int available() throws IOException
{
return in.available();
}
/**
* {#inheritDoc}
*/
public void close() throws IOException
{
in.close();
}
/**
* {#inheritDoc}
*/
public synchronized void mark(final int readlimit)
{
in.mark(readlimit);
}
/**
* {#inheritDoc}
*/
public synchronized void reset() throws IOException
{
in.reset();
}
/**
* {#inheritDoc}
*/
public boolean markSupported()
{
return in.markSupported();
}
private final PushbackInputStream in;
private final BOM bom;
private boolean skipped = false;
} // UnicodeBOMInputStream
And you're using it this way:
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
public final class UnicodeBOMInputStreamUsage
{
public static void main(final String[] args) throws Exception
{
FileInputStream fis = new FileInputStream("test/offending_bom.txt");
UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(fis);
System.out.println("detected BOM: " + ubis.getBOM());
System.out.print("Reading the content of the file without skipping the BOM: ");
InputStreamReader isr = new InputStreamReader(ubis);
BufferedReader br = new BufferedReader(isr);
System.out.println(br.readLine());
br.close();
isr.close();
ubis.close();
fis.close();
fis = new FileInputStream("test/offending_bom.txt");
ubis = new UnicodeBOMInputStream(fis);
isr = new InputStreamReader(ubis);
br = new BufferedReader(isr);
ubis.skipBOM();
System.out.print("Reading the content of the file after skipping the BOM: ");
System.out.println(br.readLine());
br.close();
isr.close();
ubis.close();
fis.close();
}
} // UnicodeBOMInputStreamUsage
The Apache Commons IO library has an InputStream that can detect and discard BOMs: BOMInputStream (javadoc):
BOMInputStream bomIn = new BOMInputStream(in);
int firstNonBOMByte = bomIn.read(); // Skips BOM
if (bomIn.hasBOM()) {
// has a UTF-8 BOM
}
If you also need to detect different encodings, it can also distinguish among various different byte-order marks, e.g. UTF-8 vs. UTF-16 big + little endian - details at the doc link above. You can then use the detected ByteOrderMark to choose a Charset to decode the stream. (There's probably a more streamlined way to do this if you need all of this functionality - maybe the UnicodeReader in BalusC's answer?). Note that, in general, there's not a very good way to detect what encoding some bytes are in, but if the stream starts with a BOM, apparently this can be helpful.
Edit: If you need to detect the BOM in UTF-16, UTF-32, etc, then the constructor should be:
new BOMInputStream(is, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE,
ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE)
Upvote #martin-charlesworth's comment :)
More simple solution:
public class BOMSkipper
{
public static void skip(Reader reader) throws IOException
{
reader.mark(1);
char[] possibleBOM = new char[1];
reader.read(possibleBOM);
if (possibleBOM[0] != '\ufeff')
{
reader.reset();
}
}
}
Usage sample:
BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(file), fileExpectedCharset));
BOMSkipper.skip(input);
//Now UTF prefix not present:
input.readLine();
...
It works with all 5 UTF encodings!
Google Data API has an UnicodeReader which automagically detects the encoding.
You can use it instead of InputStreamReader. Here's an -slightly compactized- extract of its source which is pretty straightforward:
public class UnicodeReader extends Reader {
private static final int BOM_SIZE = 4;
private final InputStreamReader reader;
/**
* Construct UnicodeReader
* #param in Input stream.
* #param defaultEncoding Default encoding to be used if BOM is not found,
* or <code>null</code> to use system default encoding.
* #throws IOException If an I/O error occurs.
*/
public UnicodeReader(InputStream in, String defaultEncoding) throws IOException {
byte bom[] = new byte[BOM_SIZE];
String encoding;
int unread;
PushbackInputStream pushbackStream = new PushbackInputStream(in, BOM_SIZE);
int n = pushbackStream.read(bom, 0, bom.length);
// Read ahead four bytes and check for BOM marks.
if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
encoding = "UTF-8";
unread = n - 3;
} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
encoding = "UTF-16BE";
unread = n - 2;
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
encoding = "UTF-16LE";
unread = n - 2;
} else if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
encoding = "UTF-32BE";
unread = n - 4;
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
encoding = "UTF-32LE";
unread = n - 4;
} else {
encoding = defaultEncoding;
unread = n;
}
// Unread bytes if necessary and skip BOM marks.
if (unread > 0) {
pushbackStream.unread(bom, (n - unread), unread);
} else if (unread < -1) {
pushbackStream.unread(bom, 0, 0);
}
// Use given encoding.
if (encoding == null) {
reader = new InputStreamReader(pushbackStream);
} else {
reader = new InputStreamReader(pushbackStream, encoding);
}
}
public String getEncoding() {
return reader.getEncoding();
}
public int read(char[] cbuf, int off, int len) throws IOException {
return reader.read(cbuf, off, len);
}
public void close() throws IOException {
reader.close();
}
}
The Apache Commons IO Library's BOMInputStream has already been mentioned by #rescdsk, but I did not see it mention how to get an InputStream without the BOM.
Here's how I did it in Scala.
import java.io._
val file = new File(path_to_xml_file_with_BOM)
val fileInpStream = new FileInputStream(file)
val bomIn = new BOMInputStream(fileInpStream,
false); // false means don't include BOM
To simply remove the BOM characters from your file, I recomend using Apache Common IO
public BOMInputStream(InputStream delegate,
boolean include)
Constructs a new BOM InputStream that detects a a ByteOrderMark.UTF_8 and optionally includes it.
Parameters:
delegate - the InputStream to delegate to
include - true to include the UTF-8 BOM or false to exclude it
Set include to false and your BOM characters will be excluded.
Regrettably not. You'll have to identify and skip yourself. This page details what you have to watch for. Also see this SO question for more details.
Here is my code to read csv files in most char sets. It should cover 99% situations.
try(InputStream inputStream = new FileInputStream(csvFile);){
BOMInputStream bomInputStream = new BOMInputStream(inputStream ,ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
Charset charset;
if(!bomInputStream.hasBOM()) charset = StandardCharsets.UTF_8;
else if(bomInputStream.hasBOM(ByteOrderMark.UTF_8)) charset = StandardCharsets.UTF_8;
else if(bomInputStream.hasBOM(ByteOrderMark.UTF_16LE)) charset = StandardCharsets.UTF_16LE;
else if(bomInputStream.hasBOM(ByteOrderMark.UTF_16BE)) charset = StandardCharsets.UTF_16BE;
else { throw new Exception("The charset of the file " + csvFile + " is not supported.");}
try(Reader streamReader = new InputStreamReader(bomInputStream, charset);
BufferedReader bufferedReader = new BufferedReader(streamReader);) {
for(String line; (line = bufferedReader.readLine()) != null; ) {
String[] columns = line.split(",");
//read csv columns
}
}
IMO none of the given answers is really satisfying. Just skipping the BOM and then read the rest of the stream in the current platform's default encoding is definitively wrong. Remember: The platform default on Unix/Linux and windows differ: former is UTF-8, later is ANSI. Such a solution only works if the rest of the stream (after the BOM) only contains 7-bit ASCII characters (which, I admit, in most programmer near files like configurations is true). But as soon there are non ASCII characters, you will fail with this approach.
That's why all java classes/methods, which can convert byte arrays/streams to string (and vice versa) have a second parameter indicating the encoding to be used (Reader, Writer, Scanner, String.getBytes(), etc.).
There are so much character encodings out in the world, not only UTF-xx. And still - in the current year 2021 - there are so much encoding problems between end user applications, especially if they run on different platforms (iOS, windows, unix). All these problems only exist because the programmer was too lazy to learn how character encoding works.
Thus, it's an absolute MUST to evaluate first the encoding to be used, and then performing the string/stream conversion using the found encoding. Consulting the respective specification(s) is the first step. And only if you cannot be sure which encoding you encounter while reading a stream you have to evaluate it by yourself. But caution: such an evaluation always will only be a 'best guess', there is no algorithm which can cover all possibilities.
In this sense, Lee's answer (and coding example) from Feb 6,2021 is IMO the best one, except that he falls back to UTF-8 if there is no BOM.
I had the same problem, and because I wasn't reading in a bunch of files I did a simpler solution. I think my encoding was UTF-8 because when I printed out the offending character with the help of this page: Get unicode value of a character I found that it was \ufeff. I used the code System.out.println( "\\u" + Integer.toHexString(str.charAt(0) | 0x10000).substring(1) ); to print out the offending unicode value.
Once I had the offending unicode value, I replaced it in the first line of my file before I went on reading. The business logic of that section:
String str = reader.readLine().trim();
str = str.replace("\ufeff", "");
This fixed my problem. Then I was able to go on processing the file with no issue. I added on trim() just in case of leading or trailing whitespace, you can do that or not, based on what your specific needs are.
NotePad++ is a good tool to convert UTF-8 encoding to UTF-8(BOM) encoding.
https://notepad-plus-plus.org/downloads/
UTF8BOMTester.java
public class UTF8BOMTester {
public static void main(String[] args) throws FileNotFoundException, IOException {
// TODO Auto-generated method stub
File file = new File("test.txt");
boolean same = UTF8BOMInputStream.isSameEncodingType(file);
System.out.println(same);
if (same) {
UTF8BOMInputStream is = new UTF8BOMInputStream(file);
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
System.out.println(br.readLine());
}
}
static void bytesPrint(byte[] b) {
for (byte a : b)
System.out.printf("%x ", a);
}}
UTF8BOMInputStream.java
public class UTF8BOMInputStream extends InputStream {
byte[] SYMBLE_BOM = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
FileInputStream fis;
final boolean isSameEncodingType;
public UTF8BOMInputStream(File file) throws IOException {
FileInputStream fis=new FileInputStream(file);
byte[] symble=new byte[3];
fis.read(symble);
bytesPrint(symble);
isSameEncodingType=isSameEncodingType(symble);
if(isSameEncodingType)
this.fis=fis;
else
this.fis=null;
}
#Override
public int read() throws IOException {
return fis.read();
}
void bytesPrint(byte[] b) {
for (byte a : b)
System.out.printf("%x ", a);
}
boolean bytesCompare(byte[] a, byte[] b) {
if (a.length != b.length)
return false;
for (int i = 0; i < a.length; i++) {
if (a[i] != b[i])
return false;
}
return true;
}
boolean isSameEncodingType(byte[] symble) {
return bytesCompare(symble,SYMBLE_BOM);
}
public static boolean isSameEncodingType(File file) throws IOException {
return (new UTF8BOMInputStream(file)).isSameEncodingType;
}

Categories