org.apache.commons.logging.Log cannot be resolved - java

When I declare a byte array with private byte[] startTag;, Eclipse marks the line as erroneous.
Hovering over it, I get this message:
The type org.apache.commons.logging.Log cannot be resolved. It is indirectly referenced from required .class files
I tried adding a jar file to the classpath after reading other solutions, but I am unable to remove the error.
What should I do now?
If a specific jar file needs to be added, please mention it.

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class XmlInputFormat extends TextInputFormat {
public static final String START_TAG_KEY = "<student>";
public static final String END_TAG_KEY = "</student>";
@Override
public RecordReader<LongWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) {
return new XmlRecordReader();
}
public static class XmlRecordReader extends
RecordReader<LongWritable, Text> {
private byte[] startTag;
private byte[] endTag;
private long start;
private long end;
private FSDataInputStream fsin;
private DataOutputBuffer buffer = new DataOutputBuffer();
private LongWritable key = new LongWritable();
private Text value = new Text();
@Override
public void initialize(InputSplit is, TaskAttemptContext tac)
throws IOException, InterruptedException {
FileSplit fileSplit = (FileSplit) is;
String START_TAG_KEY = "<employee>";
String END_TAG_KEY = "</employee>";
startTag = START_TAG_KEY.getBytes("utf-8");
endTag = END_TAG_KEY.getBytes("utf-8");
start = fileSplit.getStart();
end = start + fileSplit.getLength();
Path file = fileSplit.getPath();
FileSystem fs =file.getFileSystem(tac.getConfiguration());
fsin = fs.open(fileSplit.getPath());
fsin.seek(start);
}
@Override
public boolean nextKeyValue() throws
IOException,InterruptedException {
if (fsin.getPos() < end) {
if (readUntilMatch(startTag, false)) {
try {
buffer.write(startTag);
if (readUntilMatch(endTag, true)) {
value.set(buffer.getData(), 0,
buffer.getLength());
key.set(fsin.getPos());
return true;
}
} finally {
buffer.reset();
}
}
}
return false;
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException,
InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException,
InterruptedException {
return (fsin.getPos() - start) / (float) (end - start);
}
@Override
public void close() throws IOException {
fsin.close();
}
private boolean readUntilMatch(byte[] match, boolean
withinBlock)throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
if (b == -1)
return false;
if (withinBlock)
buffer.write(b);
if (b == match[i]) {
i++;
if (i >= match.length)
return true;
} else
i = 0;
if (!withinBlock && i == 0 && fsin.getPos() >= end)
return false;
}
}
}
}

I have solved the issue by finding the missing .jar library inside $HADOOP_HOME and adding it to the classpath.
I've also answered a similar problem on this thread:
https://stackoverflow.com/a/73427233/6685449
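For reference, once the compile error is resolved, a minimal driver sketch for wiring the XmlInputFormat above into a job could look like the following. The class name, the unset mapper (so the identity mapper is used), and the argument paths are illustrative assumptions, not part of the original post:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class XmlDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "xml record extraction");
        job.setJarByClass(XmlDriver.class);
        // use the custom input format from the question
        job.setInputFormatClass(XmlInputFormat.class);
        // no mapper/reducer is set, so the identity mapper simply emits each (offset, record) pair
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}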

Related

Flink ParquetSinkWriter FileAlreadyExistsException

I am trying to use Apache Flink to write Parquet files to HDFS using a BucketingSink and a custom ParquetSinkWriter.
The code is below. The error indicates that when checkpointing is enabled (snapshotState() is called in the BucketingSink class), the flush() method below is not quite working: even though the writer is closed with writer.close(), I still get the error from writer = createWriter(). Any thoughts? Thanks.
The error looks like this:
org.apache.hadoop.fs.FileAlreadyExistsException:
/user/hive/flink_parquet_fils_with_checkingpoint/year=20/month=2/day=1/hour=17/_part-4-9.in-progress
for client 192.168.56.202 already exists
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFileInternal(FSNamesystem.java:3003)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFileInt(FSNamesystem.java:2890)
....
at flink.untils.ParquetSinkWriter.flush(ParquetSinkWriterForecast.java:81)
at org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink.snapshotState(BucketingSink.java:749)
import org.apache.flink.streaming.connectors.fs.Writer;
import org.apache.flink.util.Preconditions;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import java.io.IOException;
/**
* Parquet writer.
*
* @param <T>
*/
public class ParquetSinkWriter<T extends GenericRecord> implements Writer<T> {
private static final long serialVersionUID = -975302556515811398L;
private final CompressionCodecName compressionCodecName = CompressionCodecName.SNAPPY;
private final int pageSize = 64 * 1024;
private final String schemaRepresentation;
private transient Schema schema;
private transient ParquetWriter<GenericRecord> writer;
private transient Path path;
private int position;
public ParquetSinkWriter(String schemaRepresentation) {
this.schemaRepresentation = Preconditions.checkNotNull(schemaRepresentation);
}
@Override
public void open(FileSystem fs, Path path) throws IOException {
this.position = 0;
this.path = path;
if (writer != null) {
writer.close();
}
writer = createWriter();
}
@Override
public long flush() throws IOException {
Preconditions.checkNotNull(writer);
position += writer.getDataSize();
writer.close();
writer = createWriter();
return position;
}
@Override
public long getPos() throws IOException {
Preconditions.checkNotNull(writer);
return position + writer.getDataSize();
}
@Override
public void close() throws IOException {
if (writer != null) {
writer.close();
writer = null;
}
}
@Override
public void write(T element) throws IOException {
Preconditions.checkNotNull(writer);
writer.write(element);
}
@Override
public Writer<T> duplicate() {
return new ParquetSinkWriter<>(schemaRepresentation);
}
private ParquetWriter<GenericRecord> createWriter() throws IOException {
if (schema == null) {
schema = new Schema.Parser().parse(schemaRepresentation);
}
return AvroParquetWriter.<GenericRecord>builder(path)
.withSchema(schema)
.withDataModel(new GenericData())
.withCompressionCodec(compressionCodecName)
.withPageSize(pageSize)
.build();
}
}
It seems that the file you are trying to create already exists. This is because you are using the default write mode, CREATE, which fails when the file already exists. What you can try is changing your code to use the OVERWRITE mode. You can change the createWriter() method to return something like the following:
return AvroParquetWriter.<GenericRecord>builder(path)
.withSchema(schema)
.withDataModel(new GenericData())
.withCompressionCodec(compressionCodecName)
.withPageSize(pageSize)
.withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
.build();
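For this to compile, ParquetFileWriter needs to be imported from the parquet-hadoop artifact the writer already depends on:

import org.apache.parquet.hadoop.ParquetFileWriter;

With OVERWRITE, createWriter() can replace the in-progress file left behind by an earlier attempt instead of failing with FileAlreadyExistsException.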

Read parquet data from ByteArrayOutputStream instead of file

I would like to convert this code:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class ParquetReaderUtils {
public static Parquet getParquetData(String filePath) throws IOException {
List<SimpleGroup> simpleGroups = new ArrayList<>();
ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(filePath), new Configuration()));
MessageType schema = reader.getFooter().getFileMetaData().getSchema();
//List<Type> fields = schema.getFields();
PageReadStore pages;
while ((pages = reader.readNextRowGroup()) != null) {
long rows = pages.getRowCount();
MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
RecordReader recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
for (int i = 0; i < rows; i++) {
SimpleGroup simpleGroup = (SimpleGroup) recordReader.read();
simpleGroups.add(simpleGroup);
}
}
reader.close();
return new Parquet(simpleGroups, schema);
}
}
(which is from https://www.arm64.ca/post/reading-parquet-files-java/)
to take a ByteArrayOutputStream parameter instead of a filePath.
Is this possible? I don't see a ParquetStreamReader in org.apache.parquet.hadoop.
Any help is appreciated. I am trying to write a test app for Parquet data coming from Kafka, and writing each of the many messages out to a file is rather slow.
Without deeper testing, I would try the class below (note that the content of the output stream must be Parquet-compatible). I added a streamId there to make identifying the processed byte array easier (ParquetFileReader prints the instance's toString() if something goes wrong).
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.parquet.io.DelegatingSeekableInputStream;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.SeekableInputStream;
public class ParquetStream implements InputFile {
private final String streamId;
private final byte[] data;
private static class SeekableByteArrayInputStream extends ByteArrayInputStream {
public SeekableByteArrayInputStream(byte[] buf) {
super(buf);
}
public void setPos(int pos) {
this.pos = pos;
}
public int getPos() {
return this.pos;
}
}
public ParquetStream(String streamId, ByteArrayOutputStream stream) {
this.streamId = streamId;
this.data = stream.toByteArray();
}
@Override
public long getLength() throws IOException {
return this.data.length;
}
@Override
public SeekableInputStream newStream() throws IOException {
return new DelegatingSeekableInputStream(new SeekableByteArrayInputStream(this.data)) {
@Override
public void seek(long newPos) throws IOException {
((SeekableByteArrayInputStream) this.getStream()).setPos((int) newPos);
}
@Override
public long getPos() throws IOException {
return ((SeekableByteArrayInputStream) this.getStream()).getPos();
}
};
}
@Override
public String toString() {
return "ParquetStream[" + streamId + "]";
}
}
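A usage sketch for the class above (the stream id, class name, and method name are placeholders, and parquetBytes is assumed to already hold one complete Parquet file, for example a buffered Kafka message):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.schema.MessageType;
public class ParquetStreamExample {
    // Prints the schema of the Parquet data buffered in parquetBytes.
    public static void printSchema(ByteArrayOutputStream parquetBytes) throws IOException {
        ParquetStream inputFile = new ParquetStream("kafka-msg", parquetBytes);
        try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
            MessageType schema = reader.getFooter().getFileMetaData().getSchema();
            System.out.println(schema);
        }
    }
}

From there, the same row-group loop as in getParquetData() above can be run against the reader.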

Changing number of splits for Hadoop job

I am currently writing code to process a single image using Hadoop, so my input is only one file (a .png). I have working code that will run a job, but instead of running several mappers, it runs only one mapper and never spawns others.
I have created my own extensions of the FileInputFormat and RecordReader classes in order to create (what I thought were) "n" custom splits -> "n" map tasks.
I've been searching the web like crazy for examples of this nature to learn from, but all I've been able to find are examples which deal with using entire files as a split (meaning exactly one mapper) or using a fixed number of lines from a text file (e.g., 3) per map task.
What I'm trying to do is send a pair of coordinates ((x1, y1), (x2, y2)) to each mapper where the coordinates correspond to the top-left/bottom-right pixels of some rectangle in the image.
Any suggestions/guidance/examples/links to examples would greatly be appreciated.
Custom FileInputFormat
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.io.IOException;
public class FileInputFormat1 extends FileInputFormat
{
@Override
public RecordReader createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
return new RecordReader1();
}
@Override
protected boolean isSplitable(JobContext context, Path filename) {
return true;
}
}
Custom RecordReader
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
public class RecordReader1 extends RecordReader<KeyChunk1, NullWritable> {
private KeyChunk1 key;
private NullWritable value;
private ImagePreprocessor IMAGE;
public RecordReader1()
{
}
@Override
public void close() throws IOException {
}
@Override
public float getProgress() throws IOException, InterruptedException {
return IMAGE.getProgress();
}
@Override
public KeyChunk1 getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public NullWritable getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
boolean gotNextValue = IMAGE.hasAnotherChunk();
if (gotNextValue)
{
if (key == null)
{
key = new KeyChunk1();
}
if (value == null)
{
value = NullWritable.get();
}
int[] data = IMAGE.getChunkIndicesAndIndex();
key.setChunkIndex(data[2]);
key.setStartRow(data[0]);
key.setStartCol(data[1]);
key.setChunkWidth(data[3]);
key.setChunkHeight(data[4]);
}
else
{
key = null;
value = null;
}
return gotNextValue;
}
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
Configuration config = taskAttemptContext.getConfiguration();
IMAGE = new ImagePreprocessor(
config.get("imageName"),
config.getInt("v_slices", 1),
config.getInt("h_slices", 1),
config.getInt("kernel_rad", 2),
config.getInt("grad_rad", 1),
config.get("hdfs_address"),
config.get("local_directory")
);
}
}
ImagePreprocessor Class (Used in custom RecordReader - only showing necessary information)
import java.awt.image.BufferedImage;
import java.io.IOException;
public class ImagePreprocessor {
private String filename;
private int num_v_slices;
private int num_h_slices;
private int minSize;
private int width, height;
private int chunkWidth, chunkHeight;
private int indexI, indexJ;
String hdfs_address, local_directory;
public ImagePreprocessor(String filename, int num_v_slices, int num_h_slices, int kernel_radius, int gradient_radius,
String hdfs_address, String local_directory) throws IOException{
this.hdfs_address = hdfs_address;
this.local_directory = local_directory;
// all "validate" methods throw errors if input data is invalid
checkValidFilename(filename);
checkValidNumber(num_v_slices, "vertical strips");
this.num_v_slices = num_v_slices;
checkValidNumber(num_h_slices, "horizontal strips");
this.num_h_slices = num_h_slices;
checkValidNumber(kernel_radius, "kernel radius");
checkValidNumber(gradient_radius, "gradient radius");
this.minSize = 1 + 2 * (kernel_radius + gradient_radius);
getImageData(); // loads image and saves width/height to class variables
validateImageSize();
chunkWidth = validateWidth((int)Math.ceil(((double)width) / num_v_slices));
chunkHeight = validateHeight((int)Math.ceil(((double)height) / num_h_slices));
indexI = 0;
indexJ = 0;
}
public boolean hasAnotherChunk()
{
return indexI < num_h_slices;
}
public int[] getChunkIndicesAndIndex()
{
int[] ret = new int[5];
ret[0] = indexI;
ret[1] = indexJ;
ret[2] = indexI*num_v_slices + indexJ;
ret[3] = chunkWidth;
ret[4] = chunkHeight;
indexJ += 1;
if (indexJ >= num_v_slices)
{
indexJ = 0;
indexI += 1;
}
return ret;
}
}
Thank you for your time!
You should override the getSplits() method in your FileInputFormat1 class (in the new mapreduce API its signature is public List<InputSplit> getSplits(JobContext context)). Create your own InputSplit subclass that carries the rectangle coordinates; your record reader can then read those coordinates from its split and return the correct key/value pairs to the mapper.
The implementation of getSplits() in FileInputFormat may be a useful reference; see here.
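A rough sketch of that idea with the new mapreduce API is below. The class names, the image_width/image_height configuration keys, and the reuse of KeyChunk1 and RecordReader1 from the question are illustrative assumptions rather than a tested implementation:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class RectangleInputFormat extends FileInputFormat<KeyChunk1, NullWritable> {
    // One split (and therefore one mapper) per rectangle of the image grid.
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        int vSlices = context.getConfiguration().getInt("v_slices", 1);
        int hSlices = context.getConfiguration().getInt("h_slices", 1);
        int width = context.getConfiguration().getInt("image_width", 0);   // assumed to be set by the driver
        int height = context.getConfiguration().getInt("image_height", 0);
        int chunkW = (int) Math.ceil((double) width / vSlices);
        int chunkH = (int) Math.ceil((double) height / hSlices);
        List<InputSplit> splits = new ArrayList<>();
        for (int i = 0; i < hSlices; i++) {
            for (int j = 0; j < vSlices; j++) {
                splits.add(new RectangleSplit(j * chunkW, i * chunkH,
                        Math.min((j + 1) * chunkW, width),
                        Math.min((i + 1) * chunkH, height)));
            }
        }
        return splits;
    }
    @Override
    public RecordReader<KeyChunk1, NullWritable> createRecordReader(InputSplit split, TaskAttemptContext context) {
        return new RecordReader1(); // the reader would cast its split to RectangleSplit in initialize()
    }
}
// Carries the top-left (x1, y1) and bottom-right (x2, y2) pixel coordinates of one rectangle.
class RectangleSplit extends InputSplit implements Writable {
    private int x1, y1, x2, y2;
    public RectangleSplit() { } // no-arg constructor required for deserialization
    public RectangleSplit(int x1, int y1, int x2, int y2) {
        this.x1 = x1; this.y1 = y1; this.x2 = x2; this.y2 = y2;
    }
    @Override
    public long getLength() { return (long) (x2 - x1) * (y2 - y1); }
    @Override
    public String[] getLocations() { return new String[0]; }
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(x1); out.writeInt(y1); out.writeInt(x2); out.writeInt(y2);
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        x1 = in.readInt(); y1 = in.readInt(); x2 = in.readInt(); y2 = in.readInt();
    }
}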

Using Jackcess with JCIFS to manipulate an Access database on an SMB share

I need to work with an MS Access file in Java using Jackcess. The file is located on an SMB share so I assume I would have to use JCIFS.
I tried this
String testdirectory = "smb://" + "file location";
SmbFile testsmbdir = null;
try{
testsmbdir = new SmbFile(testdirectory,auth);
}catch(Exception e){
e.printStackTrace();
}
SmbFileInputStream smbFilestream = new SmbFileInputStream(testsmbdir);
db = DatabaseBuilder.open(testsmbdir);
However, it says SmbFile cannot be converted to File for the
db = DatabaseBuilder.open(testsmbdir);
line. If I try using smbFilestream instead, it says it cannot convert SmbFileInputStream to File either.
Do I have to copy the file to the local machine, or do something completely different? If so, how can I do it?
(I'm a Windows user, by the way. I am just converting my application to Mac, so sorry if my lingo is off.)
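As a side note on the copy-to-the-local-machine route mentioned in the question, a minimal sketch (class and method names are placeholders; jcifs 1.x style authentication is assumed) would stream the file to a temporary local copy and open that. Any changes then stay local and do not propagate back to the share:

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import com.healthmarketscience.jackcess.Database;
import com.healthmarketscience.jackcess.DatabaseBuilder;
import jcifs.smb.NtlmPasswordAuthentication;
import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileInputStream;
public class CopyLocalExample {
    // Streams the remote .accdb to a local temp file and opens that copy with Jackcess.
    public static Database openViaLocalCopy(String smbUrl, NtlmPasswordAuthentication auth) throws IOException {
        SmbFile remote = new SmbFile(smbUrl, auth);
        File local = File.createTempFile("jackcess", ".accdb");
        try (SmbFileInputStream in = new SmbFileInputStream(remote)) {
            Files.copy(in, local.toPath(), StandardCopyOption.REPLACE_EXISTING);
        }
        return DatabaseBuilder.open(local); // edits made to this copy will not reach the share
    }
}

The accepted approach below avoids the copy entirely by giving Jackcess a FileChannel backed by SmbRandomAccessFile.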
In reply to a thread on the Jackcess forums here, James suggested that
it should be relatively straightforward to implement a version of FileChannel which works with a SmbRandomAccessFile
I just tried it in a Maven project named smb4jackcess in Eclipse, and I got it working without having to write too much code. The class I created is named SmbFileChannel:
// FileChannel using jcifs.smb.SmbRandomAccessFile
package smb4jackcess;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
import jcifs.smb.SmbRandomAccessFile;
public class SmbFileChannel extends FileChannel {
private final SmbRandomAccessFile _file;
private long _length;
public SmbFileChannel(String smbURL) throws SmbException, MalformedURLException, UnknownHostException {
_file = new SmbRandomAccessFile(smbURL, "rw", SmbFile.FILE_NO_SHARE);
_length = _file.length();
}
@Override
public void force(boolean metaData) throws SmbException, MalformedURLException, UnknownHostException {
// do nothing
}
@Override
public FileLock lock(long position, long size, boolean shared) {
throw new UnsupportedOperationException();
}
@Override
public MappedByteBuffer map(MapMode mode, long position, long size) {
throw new UnsupportedOperationException();
}
@Override
public long position() throws SmbException {
return _file.getFilePointer();
}
@Override
public FileChannel position(long newPosition) throws SmbException {
_file.seek(newPosition);
return this;
}
@Override
public int read(ByteBuffer dst) {
throw new UnsupportedOperationException();
}
@Override
public int read(ByteBuffer dst, long position) throws SmbException {
byte[] b = new byte[dst.remaining()];
_file.seek(position);
int bytesRead =_file.read(b);
dst.put(b);
return bytesRead;
}
@Override
public long read(ByteBuffer[] dsts, int offset, int length) {
throw new UnsupportedOperationException();
}
@Override
public long size() throws SmbException {
return _length;
}
@Override
public long transferFrom(ReadableByteChannel src, long position, long count) throws IOException {
ByteBuffer bb = ByteBuffer.allocate((int)count);
int bytesWritten = src.read(bb);
bb.rewind();
bb.limit(bytesWritten);
this.write(bb, position);
return bytesWritten;
}
@Override
public long transferTo(long position, long count, WritableByteChannel target) {
throw new UnsupportedOperationException();
}
@Override
public FileChannel truncate(long newSize) throws SmbException {
if (newSize < 0L) {
throw new IllegalArgumentException("negative size");
}
_file.setLength(newSize);
_length = newSize;
return this;
}
@Override
public FileLock tryLock(long position, long size, boolean shared) {
throw new UnsupportedOperationException();
}
@Override
public int write(ByteBuffer src) throws SmbException {
throw new UnsupportedOperationException();
}
@Override
public int write(ByteBuffer src, long position) throws SmbException {
byte[] b = new byte[src.remaining()];
src.get(b);
_file.seek(position);
_file.write(b);
long endPos = position + b.length;
if(endPos > _length) {
_length = endPos;
}
return b.length;
}
@Override
public long write(ByteBuffer[] srcs, int offset, int length) {
throw new UnsupportedOperationException();
}
@Override
protected void implCloseChannel() throws SmbException {
_file.close();
}
}
and the main class I used was
package smb4jackcess;
import java.nio.channels.FileChannel;
import com.healthmarketscience.jackcess.Column;
import com.healthmarketscience.jackcess.ColumnBuilder;
import com.healthmarketscience.jackcess.DataType;
import com.healthmarketscience.jackcess.Database;
import com.healthmarketscience.jackcess.Database.FileFormat;
import com.healthmarketscience.jackcess.DatabaseBuilder;
import com.healthmarketscience.jackcess.IndexBuilder;
import com.healthmarketscience.jackcess.Table;
import com.healthmarketscience.jackcess.TableBuilder;
public class Smb4jackcessMain {
public static void main(String[] args) {
String smbURL = "smb://gord:mypassword@SERVERNAME/sharename/etc/newdb.accdb";
try (SmbFileChannel sfc = new SmbFileChannel(smbURL)) {
// create a brand new database file
Database db = new DatabaseBuilder()
.setChannel(sfc)
.setFileFormat(FileFormat.V2010)
.create();
// add a table to it
Table newTable = new TableBuilder("NewTable")
.addColumn(new ColumnBuilder("ID", DataType.LONG)
.setAutoNumber(true))
.addColumn(new ColumnBuilder("TextField", DataType.TEXT))
.addIndex(new IndexBuilder(IndexBuilder.PRIMARY_KEY_NAME)
.addColumns("ID").setPrimaryKey())
.toTable(db);
// insert a row into the table
newTable.addRow(Column.AUTO_NUMBER, "This is a new row.");
db.close();
} catch (Exception e) {
e.printStackTrace(System.err);
}
}
}
Updated 2016-02-04: Code improvements. Many thanks to James at Dell Boomi for his assistance!
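Since the original question was about working with an existing database rather than creating a new one, a minimal variation of the main class above (the URL and table name are placeholders, assuming the same SmbFileChannel) could open it like this:

package smb4jackcess;
import com.healthmarketscience.jackcess.Database;
import com.healthmarketscience.jackcess.DatabaseBuilder;
import com.healthmarketscience.jackcess.Row;
import com.healthmarketscience.jackcess.Table;
public class Smb4jackcessOpenExisting {
    public static void main(String[] args) {
        // placeholder URL pointing at an existing .accdb on the share
        String smbURL = "smb://gord:mypassword@SERVERNAME/sharename/etc/existingdb.accdb";
        try (SmbFileChannel sfc = new SmbFileChannel(smbURL)) {
            Database db = new DatabaseBuilder().setChannel(sfc).open();
            Table table = db.getTable("SomeTable"); // hypothetical table name
            for (Row row : table) {
                System.out.println(row);
            }
            db.close();
        } catch (Exception e) {
            e.printStackTrace(System.err);
        }
    }
}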

Getting Filename/FileData as key/value input for Map when running a Hadoop MapReduce Job

I went through the question How to get Filename/File Contents as key/value input for MAP when running a Hadoop MapReduce Job? here. Though it explains the concept, I am unable to successfully transform it to code.
Basically, I want the file name as key and the file data as value. For that I wrote a custom RecordReader as recommended in the aforementioned question. But I couldn't understand how to get the file name as the key in this class. Also, while writing the custom FileInputFormat class, I couldn't understand how to return the custom RecordReader I wrote previously.
The RecordReader code is:
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
public class CustomRecordReader extends RecordReader<Text, Text> {
private static final String LINE_SEPARATOR = System.getProperty("line.separator");
private StringBuffer valueBuffer = new StringBuffer("");
private Text key = new Text();
private Text value = new Text();
private RecordReader<Text, Text> recordReader;
public SPDRecordReader(RecordReader<Text, Text> recordReader) {
this.recordReader = recordReader;
}
@Override
public void close() throws IOException {
recordReader.close();
}
@Override
public Text getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return recordReader.getProgress();
}
@Override
public void initialize(InputSplit arg0, TaskAttemptContext arg1)
throws IOException, InterruptedException {
recordReader.initialize(arg0, arg1);
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (valueBuffer.equals("")) {
while (recordReader.nextKeyValue()) {
valueBuffer.append(recordReader.getCurrentValue());
valueBuffer.append(LINE_SEPARATOR);
}
value.set(valueBuffer.toString());
return true;
}
return false;
}
}
And the incomplete FileInputFormat class is:
import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
public class CustomFileInputFormat extends FileInputFormat<Text, Text> {
@Override
protected boolean isSplitable(FileSystem fs, Path filename) {
return false;
}
@Override
public RecordReader<Text, Text> getRecordReader(InputSplit arg0, JobConf arg1,
Reporter arg2) throws IOException {
return null;
}
}
Have this code in your CustomRecordReader class.
private LineRecordReader lineReader;
private LongWritable lineKey = new LongWritable();
private String fileName;
public CustomRecordReader(JobConf job, FileSplit split) throws IOException {
lineReader = new LineRecordReader(job, split);
fileName = split.getPath().getName();
}
public boolean next(Text key, Text value) throws IOException {
// read the next line; LineRecordReader's byte-offset key is discarded
if (!lineReader.next(lineKey, value)) {
return false;
}
// the file name becomes the key; value already holds the line just read
key.set(fileName);
return true;
}
public Text createKey() {
return new Text("");
}
public Text createValue() {
return new Text("");
}
Remove the SPDRecordReader constructor (it is an error).
And have this code in your CustomFileInputFormat class
public RecordReader<Text, Text> getRecordReader(
InputSplit input, JobConf job, Reporter reporter)
throws IOException {
reporter.setStatus(input.toString());
return new CustomRecordReader(job, (FileSplit)input);
}
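For completeness, a minimal old-API driver sketch for plugging in CustomFileInputFormat (class name and paths are placeholders; no mapper is set, so the identity mapper just forwards each filename/contents pair):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
public class FilenameValueDriver {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(FilenameValueDriver.class);
        conf.setJobName("filename-as-key");
        conf.setInputFormat(CustomFileInputFormat.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    }
}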
