Generating a file and writing to it in map function - java

I am generating a CSV file in my map function, so each map task generates one CSV file. This is a side effect, not the output of the mapper. I name these files something like filename_inputkey. However, when I run the application on a single-node cluster, only one file is generated. I have 10 lines in my input, and as I understand it, there should be 10 mapper tasks and 10 files generated. Let me know if I am thinking about this the wrong way.
Here is my GWASInputFormat class
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
public class GWASInputFormat extends FileInputFormat<LongWritable, GWASGenotypeBean>{
@Override
public RecordReader<LongWritable, GWASGenotypeBean> getRecordReader(org.apache.hadoop.mapred.InputSplit input, JobConf job, Reporter arg2) throws IOException {
return (RecordReader<LongWritable, GWASGenotypeBean>) new GWASRecordReader(job, (FileSplit)input);
}
}
Here is GWASRecordReader
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.LineRecordReader;
import org.apache.hadoop.mapred.RecordReader;
public class GWASRecordReader implements RecordReader<LongWritable, GWASGenotypeBean>{
private LineRecordReader lineReader;
private LongWritable lineKey;
private Text lineValue;
@Override
public void close() throws IOException {
if(lineReader != null) {
lineReader.close();
}
}
public GWASRecordReader(JobConf job, FileSplit split) throws IOException {
lineReader = new LineRecordReader(job, split);
lineKey = lineReader.createKey();
lineValue = lineReader.createValue();
}
@Override
public LongWritable createKey() {
return new LongWritable();
}
@Override
public GWASGenotypeBean createValue() {
return new GWASGenotypeBean();
}
@Override
public long getPos() throws IOException {
return lineReader.getPos();
}
@Override
public boolean next(LongWritable key, GWASGenotypeBean value) throws IOException {
if(!lineReader.next(lineKey, lineValue)){
return false;
}
String[] values = lineValue.toString().split(",");
if(values.length !=32) {
throw new IOException("Invalid Record ");
}
value.setPROJECT_NAME(values[0]);
value.setRESEARCH_CODE(values[1]);
value.setFACILITY_CODE(values[2]);
value.setPROJECT_CODE(values[3]);
value.setINVESTIGATOR(values[4]);
value.setPATIENT_NUMBER(values[5]);
value.setSAMPLE_COLLECTION_DATE(values[6]);
value.setGENE_NAME(values[7]);
value.setDbSNP_RefSNP_ID(values[8]);
value.setSNP_ID(values[9]);
value.setALT_SNP_ID(values[10]);
value.setSTRAND(values[11]);
value.setASSAY_PLATFORM(values[12]);
value.setSOFTWARE_NAME(values[13]);
value.setSOFTWARE_VERSION_NUMBER(values[14]);
value.setTEST_DATE(values[15]);
value.setPLATE_POSITION(values[16]);
value.setPLATE_ID(values[17]);
value.setOPERATOR(values[18]);
value.setGENOTYPE(values[19]);
value.setGENOTYPE_QS1_NAME(values[20]);
value.setGENOTYPE_QS2_NAME(values[21]);
value.setGENOTYPE_QS3_NAME(values[22]);
value.setGENOTYPE_QS4_NAME(values[23]);
value.setGENOTYPE_QS5_NAME(values[24]);
value.setGENOTYPE_QS1_RESULT(values[25]);
value.setGENOTYPE_QS2_RESULT(values[26]);
value.setGENOTYPE_QS3_RESULT(values[27]);
value.setGENOTYPE_QS4_RESULT(values[28]);
value.setGENOTYPE_QS5_RESULT(values[29]);
value.setSTAGE(values[30]);
value.setLAB(values[31]);
return true;
}
@Override
public float getProgress() throws IOException {
return lineReader.getProgress();
}
}
Mapper class
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import com.google.common.base.Strings;
public class GWASMapper extends MapReduceBase implements Mapper<LongWritable, GWASGenotypeBean, Text, Text> {
private static Configuration conf;
@SuppressWarnings("rawtypes")
public void setup(org.apache.hadoop.mapreduce.Mapper.Context context) throws IOException {
conf = context.getConfiguration();
// Path[] otherFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
}
@Override
public void map(LongWritable inputKey, GWASGenotypeBean inputValue, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
checkForNulls(inputValue, inputKey.toString());
output.collect(new Text(inputValue.getPROJECT_CODE()), new Text(inputValue.getFACILITY_CODE()));
}
private void checkForNulls(GWASGenotypeBean user, String inputKey) {
String f1 = " does not have a value_fail";
String p1 = "Must not contain NULLS for required fields";
// have to initialize these two to some paths in hdfs
String edtChkRptDtl = "/user/hduser/output6/detail" + inputKey + ".csv";
String edtChkRptSmry = "/user/hduser/output6/summary" + inputKey + ".csv";
List<String> errSmry = new ArrayList<String>();
Map<String, String> loc = new TreeMap<String, String>();
if(Strings.isNullOrEmpty(user.getPROJECT_NAME())) {
loc.put("test", "PROJECT_NAME ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getRESEARCH_CODE())) {
loc.put("test", "RESEARCH_CODE ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getFACILITY_CODE())) {
loc.put("test", "FACILITY_CODE ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getPROJECT_CODE())) {
loc.put("test", "PROJECT_CODE ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getINVESTIGATOR())) {
loc.put("test", "INVESTIGATOR ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getPATIENT_NUMBER())) {
loc.put("test", "PATIENT_NUMBER ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getSAMPLE_COLLECTION_DATE())) {
loc.put("test", "SAMPLE_COLLECTION_DATE ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getGENE_NAME())) {
loc.put("test", "GENE_NAME ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getSTRAND())) {
loc.put("test", "STRAND ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getASSAY_PLATFORM())) {
loc.put("test", "ASSAY_PLATFORM ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getSOFTWARE_NAME())) {
loc.put("test", "SOFTWARE_NAME ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getTEST_DATE())) {
loc.put("test", "TEST_DATE ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getPLATE_POSITION())) {
loc.put("test", "PLATE_POSITION ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getPLATE_ID())) {
loc.put("test", "PLATE_ID ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getOPERATOR())) {
loc.put("test", "OPERATOR ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getGENOTYPE())) {
loc.put("test", "GENOTYPE ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getSTAGE())) {
loc.put("test", "STAGE ");
errSmry.add("_fail");
} else if(Strings.isNullOrEmpty(user.getLAB())) {
loc.put("test", "LAB ");
errSmry.add("_fail");
}
String customNullMsg = "Required Genotype column(s)";
List<String> error = new ArrayList<String>();
String message = null;
if(!loc.isEmpty()) {
for (Map.Entry<String, String> entry : loc.entrySet()) {
message = "line:" + entry.getKey() + " column:" + entry.getValue() + " " + f1;
error.add(message);
}
} else {
message = "_pass";
error.add(message);
}
int cnt = 0;
if(!errSmry.isEmpty()) {
// not able to understand this. Are we trying to get the occurrences
// of the last key that contains _fail?
for (String key : errSmry) {
if(key.contains("_fail")) {
cnt = Collections.frequency(errSmry, key);
// ******************** Nikhil added this
break;
}
}
if(cnt > 0) {
writeCsvFileSmry(edtChkRptSmry, customNullMsg, p1, "failed", Integer.toString(cnt));
} else {
writeCsvFileSmry(edtChkRptSmry, customNullMsg, p1, "passed", "0");
}
} else {
writeCsvFileSmry(edtChkRptSmry, customNullMsg, p1, "passed", "0");
}
// loop the list and write out items to the error report file
if(!error.isEmpty()) {
for (String s : error) {
//System.out.println(s);
if(s.contains("_fail")) {
String updatedFailmsg = s.replace("_fail", "");
writeCsvFileDtl(edtChkRptDtl, "genotype", updatedFailmsg, "failed");
}
if(s.contains("_pass")) {
writeCsvFileDtl(edtChkRptDtl, "genotype", p1, "passed");
}
}
} else {
writeCsvFileDtl(edtChkRptDtl, "genotype", p1, "passed");
}
// end loop
}
private void writeCsvFileDtl(String edtChkRptDtl, String col1, String col2, String col3) {
try {
if(conf == null) {
conf = new Configuration();
}
FileSystem fs = FileSystem.get(conf);
Path path = new Path(edtChkRptDtl);
if (!fs.exists(path)) {
FSDataOutputStream out = fs.create(path);
out.writeChars(col1);
out.writeChar(',');
out.writeChars(col2);
out.writeChar(',');
out.writeChars(col3);
out.writeChar('\n');
out.flush();
out.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
private void writeCsvFileSmry(String edtChkRptSmry, String col1, String col2, String col3, String col4) {
try {
if(conf == null) {
conf = new Configuration();
}
FileSystem fs = FileSystem.get(conf);
Path path = new Path(edtChkRptSmry);
if (!fs.exists(path)) {
FSDataOutputStream out = fs.create(path);
out.writeChars(col1);
out.writeChar(',');
out.writeChars(col2);
out.writeChar(',');
out.writeChars(col3);
out.writeChar(',');
out.writeChars(col4);
out.writeChar('\n');
out.flush();
out.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
Here is my driver class
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class GWASMapReduce extends Configured implements Tool{
/**
* @param args
*/
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
ToolRunner.run(configuration, new GWASMapReduce(), args);
}
@Override
public int run(String[] arg0) throws Exception {
JobConf conf = new JobConf(new Configuration());
conf.setInputFormat(GWASInputFormat.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setJarByClass(GWASMapReduce.class);
conf.setMapperClass(GWASMapper.class);
conf.setNumReduceTasks(0);
FileInputFormat.addInputPath(conf, new Path(arg0[0]));
FileOutputFormat.setOutputPath(conf, new Path(arg0[1]));
JobClient.runJob(conf);
return 0;
}
}

There will probably be only one Mapper task, and ten invocations of its map method. If you wish to write out one file per Mapper, you should do so in its configure method. If you wish to write out one file per input record, you should do so in its map method.
Edit: The above turned out to be unrelated to the problem. The issue is that in your GWASRecordReader you do not set the key in the next method, so your map input key is always the same. Simply add key.set(lineKey.get()); to the next method, and it should work.
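For reference, a minimal sketch of the corrected next method (it mirrors the reader above; only the key.set line is new):

@Override
public boolean next(LongWritable key, GWASGenotypeBean value) throws IOException {
    if (!lineReader.next(lineKey, lineValue)) {
        return false;
    }
    // Propagate the byte offset produced by the wrapped LineRecordReader.
    // Without this, every map() call sees the same default key, so the
    // side-effect files named filename_inputkey all collapse onto one name.
    key.set(lineKey.get());
    String[] values = lineValue.toString().split(",");
    // ... populate the GWASGenotypeBean fields exactly as before ...
    return true;
}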

Related

How to use spark Java API to read binary file stream from HDFS?

I am writing a component which needs to pick up new binary files in a specific HDFS path, so that I can do some online learning based on this data. So, I want to read binary files created by Flume from HDFS as a stream. I found several functions provided by the Spark API, such as
public JavaDStream<byte[]> binaryRecordsStream(String directory,int recordLength)
and
public <K,V,F extends org.apache.hadoop.mapreduce.InputFormat<K,V>>
JavaPairInputDStream<K,V> fileStream(String directory, Class<K> kClass, Class<V> vClass, Class<F> fClass)
But I really do not know how to use these functions. I have tried binaryRecordsStream, but it requires a fixed record length, so it does not fit my case.
For the fileStream function, I have used:
SparkConf sparkConf = new SparkConf().setAppName("SparkFileStreamTest").setMaster("local[2]");
// Create the context with the specified batch size
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(durationInMillis));
//**********************************************************************
JavaPairInputDStream<LongWritable, BytesWritable> inputDStream = jssc.fileStream(
hdfsPath, LongWritable.class, BytesWritable.class, CustomInputFormat.class);
JavaDStream<byte[]> content = inputDStream.map(new Function<Tuple2<LongWritable, BytesWritable>, byte[]>() {
@Override
public byte[] call(Tuple2<LongWritable, BytesWritable> tuple2) {
System.out.println("----------------[testReadFileStreamFromHDFS] ENTER ......");
if (tuple2 == null) {
System.out.println("----------------[testReadFileStreamFromHDFS] TUPLE = NULL");
System.out.println("----------------[testReadFileStreamFromHDFS] END.");
return null;
}
else {
System.out.println("----------------[testReadFileStreamFromHDFS] KEY = [" + tuple2._1().toString() + "]");
System.out.println("----------------[testReadFileStreamFromHDFS] VAL-LENGTH = [" + tuple2._2().getBytes().length + "]");
System.out.println("----------------[testReadFileStreamFromHDFS] END.");
return tuple2._2().getBytes();
}
}
});
/***********************************************************************/
if (content == null) {
System.out.println("----------------[testReadFileStreamFromHDFS] CONTENT = NULL");
}
else {
System.out.println("----------------[testReadFileStreamFromHDFS] CONTENT-length = [" + content.count());
content.print();
}
System.out.println("----------------[testReadFileStreamFromHDFS] END-111.");
jssc.start();
jssc.awaitTermination();
System.out.println("----------------[testReadFileStreamFromHDFS] END-222.");
For CustomInputFormat, I created
public class CustomInputFormat extends FileInputFormat<LongWritable, BytesWritable> {
private CustomInputSplit mInputSplit;
public CustomInputFormat() {
mInputSplit = new CustomInputSplit();
}
@Override
public List<InputSplit> getSplits(JobContext context)
throws IOException {
System.out.println("----------------[CustomInputFormat] 1111 ......");
final ArrayList<InputSplit> result = new ArrayList<InputSplit>();
result.add(mInputSplit);
System.out.println("----------------[CustomInputFormat] 2222 ......");
return result;
}
@Override
public RecordReader<LongWritable, BytesWritable> createRecordReader(
InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
System.out.println("----------------[CustomInputFormat] 3333 ......");
System.out.println("----------------[CustomInputFormat] ENTER createRecordReader, inputSplit-length = ["
+ inputSplit.getLength() + "]");
mInputSplit.init(inputSplit);
System.out.println("----------------[CustomInputFormat] 4444 ......");
return new CustomRecordReader();
}
@Override
protected boolean isSplitable(JobContext context, Path filename) {
System.out.println("----------------[CustomInputFormat] 5555 ......");
return false;
}
public class CustomRecordReader extends RecordReader<LongWritable, BytesWritable> {
private BytesWritable mValues;
private int mCursor;
public CustomRecordReader() {
System.out.println("----------------[CustomRecordReader] 1111 ......");
mValues = null;
mCursor = 0;
System.out.println("----------------[CustomRecordReader] 2222 ......");
}
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
System.out.println("----------------[CustomRecordReader] 3333 ......");
CustomInputSplit customInputSplit = (CustomInputSplit) inputSplit;
mValues = customInputSplit.getValues();
System.out.println("----------------[CustomRecordReader] 4444 ......");
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
System.out.println("----------------[CustomRecordReader] 5555 ......");
boolean existNext = (mCursor == 0);
mCursor++;
System.out.println("----------------[CustomRecordReader] 6666 ......");
return existNext;
}
@Override
public LongWritable getCurrentKey() throws IOException, InterruptedException {
System.out.println("----------------[CustomRecordReader] 7777 ......");
return new LongWritable(0);
}
@Override
public BytesWritable getCurrentValue() throws IOException, InterruptedException {
System.out.println("----------------[CustomRecordReader] 8888 ......");
return mValues;
}
@Override
public float getProgress() throws IOException, InterruptedException {
System.out.println("----------------[CustomRecordReader] 9999 ......");
return 0;
}
@Override
public void close() throws IOException {
System.out.println("----------------[CustomRecordReader] AAAA ......");
mValues = null;
}
}
public class CustomInputSplit extends InputSplit implements Writable {
private long mLength;
private String[] mLocations;
private final BytesWritable mContent;
public CustomInputSplit() {
System.out.println("----------------[CustomInputSplit] 1111 ......");
mLength = 0;
mLocations = null;
mContent = new BytesWritable();
System.out.println("----------------[CustomInputSplit] 2222 ......");
}
public void init(InputSplit inputSplit) throws IOException, InterruptedException {
System.out.println("----------------[CustomInputSplit] 3333 ......");
mLength = inputSplit.getLength();
String[] locations = inputSplit.getLocations();
if (locations != null) {
int numLocations = locations.length;
mLocations = new String[numLocations];
for (int i = 0; i < numLocations; i++) {
mLocations[i] = locations[i];
}
}
System.out.println("----------------[CustomInputSplit] 4444 ......");
}
@Override
public long getLength() throws IOException, InterruptedException {
System.out.println("----------------[CustomInputSplit] 5555 ......");
return mLength;
}
@Override
public String[] getLocations() throws IOException, InterruptedException {
if (mLocations == null) {
System.out.println("----------------[CustomInputSplit] 6666-0001 ...... mLocations = [NULL]");
mLocations = new String[] {"localhost"};
}
System.out.println("----------------[CustomInputSplit] 6666-0002 ...... mLocations-length = [" + mLocations.length + "]");
return mLocations;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
System.out.println("----------------[CustomInputSplit] 7777 ......");
mContent.write(dataOutput);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
System.out.println("----------------[CustomInputSplit] 8888 ......");
mContent.readFields(dataInput);
}
public BytesWritable getValues() {
System.out.println("----------------[CustomInputSplit] 9999 ......");
return mContent;
}
}
But when I print:
System.out.println("----------------[testReadFileStreamFromHDFS] VAL-LENGTH = [" + tuple2._2().getBytes().length + "]");
I always get 0 length:
----------------[testReadFileStreamFromHDFS] VAL-LENGTH = [0]
Are there some problems with CustomInputFormat.class? Does anybody know how to use the Spark Streaming Java API to read binary files from HDFS?
Try this:
JavaStreamingContext context
JavaSparkContext jContext = context.sparkContext();
JavaPairRDD<String, PortableDataStream> rdd = jContext.binaryFiles(fsURI + directoryPath);
JavaRDD<Object> rdd1 = rdd.map(new Function<Tuple2<String, PortableDataStream>, Object>() {
private static final long serialVersionUID = -7894402430221488712L;
@Override
public Object call(Tuple2<String, PortableDataStream> arg0) throws Exception {
byte[] imageInByte = arg0._2().toArray();
String base64Encoded = DatatypeConverter.printBase64Binary(imageInByte);
return (arg0._1 + Constants.COMMA_DELIMITER + base64Encoded).getBytes();
}
});
java.util.Queue<JavaRDD<Object>> queue = new LinkedList();
queue.add(rdd1);
JavaDStream<Object> dStream = context.queueStream(queue);
The only limitation with this approach is that it will not be able to read new files created in HDFS after this pipeline has started.
Use this Approach:
Write a Custom Receiver:
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import javax.xml.bind.DatatypeConverter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.input.PortableDataStream;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.receiver.Receiver;
class DFSReceiver extends Receiver<byte[]> {
/** The Constant serialVersionUID. */
private static final long serialVersionUID = -1051061769769056605L;
Long windowSize = 20000l;
/** Instantiates a new RMQ receiver. */
DFSReceiver() {
super(StorageLevel.MEMORY_AND_DISK_SER_2());
}
@Override
public void onStart() {
System.out.println("Inside onStart method");
new Thread() {
@Override
public void run() {
try {
receive();
} catch (Exception e) {
e.printStackTrace();
LOGGER.error("Exception raised at DFSReceiverHelper , exception : " + e);
}
}
}.start();
}
/** Receive.
*
* @throws Exception
* the exception */
protected void receive() throws Exception {
try {
ConnectionMetadata connectionMetadata = ConnectionMetadataFactory.getConnectionMetadataObj(ConnectionConstants.HDFS_DATA_STORE);
String connectionId = connectionMetadata.getConnectionId(ConnectionConstants.HDFS_DATA_STORE, connectionName);
ConnectionMetaDataDTO c = connectionMetadata.getConnectionMetaDataById(connectionId);
Map<String, Object> map = connectionMetadata.getConnectionConfigParameters(c);
FileSystem fs = HDFSUtils.getFileSystemInstance(map);
JavaPairRDD<String, PortableDataStream> rdd = sparkContext.binaryFiles(fsURI + directoryPath);
List<Tuple2<String, PortableDataStream>> rddList = rdd.collect();
for (Tuple2<String, PortableDataStream> arg0 : rddList) {
byte[] imageInByte = arg0._2().toArray();
String base64Encoded = DatatypeConverter.printBase64Binary(imageInByte);
store((arg0._1 + Constants.COMMA_DELIMITER + base64Encoded).getBytes());
}
Long time = System.currentTimeMillis();
System.out.println();
Thread.currentThread().sleep(windowSize);
while (true) {
List<Path> newFiles = checkIfNewFileCreated(fs, new Path(fsURI + directoryPath), time);
for (Path p : newFiles) {
JavaPairRDD<String, PortableDataStream> rdd11 = sparkContext.binaryFiles(p.toString());
Tuple2<String, PortableDataStream> arg0 = rdd11.first();
byte[] imageInByte = arg0._2().toArray();
String base64Encoded = DatatypeConverter.printBase64Binary(imageInByte);
store((arg0._1 + Constants.COMMA_DELIMITER + base64Encoded).getBytes());
}
Thread.currentThread().sleep(windowSize);
time += windowSize;
}
} catch (ShutdownSignalException s) {
LOGGER.error("ShutdownSignalException raised in receive method of DFSReceiver", s);
}
}
private List<Path> checkIfNewFileCreated(FileSystem fs, Path p, Long timeStamp) throws IOException {
List<Path> fileList = new ArrayList<>();
if (fs.isDirectory(p)) {
FileStatus[] fStatus = fs.listStatus(p);
for (FileStatus status : fStatus) {
if (status.isFile() && timeStamp < status.getModificationTime() && timeStamp + windowSize >= status.getModificationTime()) {
fileList.add(status.getPath());
}
}
}
return fileList;
}
@Override
public void onStop() {
}
}
With this receiver you will also be able to read newly created files, polling every 20 seconds.
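For completeness, a rough sketch of how such a receiver would be plugged into the streaming context (assuming the fields the snippet leaves out, such as sparkContext, fsURI and directoryPath, are wired up elsewhere in your project, and that DFSReceiver is accessible from your code):

SparkConf sparkConf = new SparkConf().setAppName("DFSReceiverTest").setMaster("local[2]");
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(20000));
// receiverStream() turns the custom Receiver<byte[]> into a DStream of byte[] records
JavaReceiverInputDStream<byte[]> stream = jssc.receiverStream(new DFSReceiver());
stream.print();
jssc.start();
jssc.awaitTermination();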

Different Result on DBPedia Spotlight by using the code and DBPedia Spotlight endpoint

This is the main class from which the query is fired.
package extractKeyword;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.methods.GetMethod;
import org.dbpedia.spotlight.exceptions.AnnotationException;
import org.dbpedia.spotlight.model.DBpediaResource;
import org.dbpedia.spotlight.model.Text;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.LinkedList;
import java.util.List;
public class db extends AnnotationClient {
//private final static String API_URL = "http://jodaiber.dyndns.org:2222/";
private static String API_URL = "http://spotlight.dbpedia.org/";
private static double CONFIDENCE = 0.0;
private static int SUPPORT = 0;
// private static String powered_by ="non";
// private static String spotter ="CoOccurrenceBasedSelector";//"LingPipeSpotter"=Annotate all spots
//AtLeastOneNounSelector"=No verbs and adjs.
//"CoOccurrenceBasedSelector" =No 'common words'
//"NESpotter"=Only Per.,Org.,Loc.
//private static String disambiguator ="Default";//Default ;Occurrences=Occurrence-centric;Document=Document-centric
//private static String showScores ="yes";
@SuppressWarnings("static-access")
public void configiration(double CONFIDENCE,int SUPPORT)
//, String powered_by,String spotter,String disambiguator,String showScores)
{
this.CONFIDENCE=CONFIDENCE;
this.SUPPORT=SUPPORT;
// this.powered_by=powered_by;
//this.spotter=spotter;
//this.disambiguator=disambiguator;
//showScores=showScores;
}
public List<DBpediaResource> extract(Text text) throws AnnotationException {
// LOG.info("Querying API.");
String spotlightResponse;
try {
String Query=API_URL + "rest/annotate/?" +
"confidence=" + CONFIDENCE
+ "&support=" + SUPPORT
// + "&spotter=" + spotter
// + "&disambiguator=" + disambiguator
// + "&showScores=" + showScores
// + "&powered_by=" + powered_by
+ "&text=" + URLEncoder.encode(text.text(), "utf-8");
//LOG.info(Query);
GetMethod getMethod = new GetMethod(Query);
getMethod.addRequestHeader(new Header("Accept", "application/json"));
spotlightResponse = request(getMethod);
} catch (UnsupportedEncodingException e) {
throw new AnnotationException("Could not encode text.", e);
}
assert spotlightResponse != null;
JSONObject resultJSON = null;
JSONArray entities = null;
try {
resultJSON = new JSONObject(spotlightResponse);
entities = resultJSON.getJSONArray("Resources");
} catch (JSONException e) {
//throw new AnnotationException("Received invalid response from DBpedia Spotlight API.");
}
LinkedList<DBpediaResource> resources = new LinkedList<DBpediaResource>();
if(entities!=null)
for(int i = 0; i < entities.length(); i++) {
try {
JSONObject entity = entities.getJSONObject(i);
resources.add(
new DBpediaResource(entity.getString("@URI"),
Integer.parseInt(entity.getString("@support"))));
} catch (JSONException e) {
//((Object) LOG).error("JSON exception "+e);
}
}
return resources;
}
}
The extended class
package extractKeyword;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethodBase;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.dbpedia.spotlight.exceptions.AnnotationException;
import org.dbpedia.spotlight.model.DBpediaResource;
import org.dbpedia.spotlight.model.Text;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Logger;
import javax.ws.rs.HttpMethod;
/**
* @author pablomendes
*/
public abstract class AnnotationClient {
//public Logger LOG = Logger.getLogger(this.getClass());
private List<String> RES = new ArrayList<String>();
// Create an instance of HttpClient.
private static HttpClient client = new HttpClient();
public List<String> getResu(){
return RES;
}
public String request(GetMethod getMethod) throws AnnotationException {
String response = null;
// Provide custom retry handler is necessary
( getMethod).getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
new DefaultHttpMethodRetryHandler(3, false));
try {
// Execute the method.
int statusCode = client.executeMethod((org.apache.commons.httpclient.HttpMethod) getMethod);
if (statusCode != HttpStatus.SC_OK) {
// LOG.error("Method failed: " + ((HttpMethodBase) method).getStatusLine());
}
// Read the response body.
byte[] responseBody = ((HttpMethodBase) getMethod).getResponseBody(); //TODO Going to buffer response body of large or unknown size. Using getResponseBodyAsStream instead is recommended.
// Deal with the response.
// Use caution: ensure correct character encoding and is not binary data
response = new String(responseBody);
} catch (HttpException e) {
// LOG.error("Fatal protocol violation: " + e.getMessage());
throw new AnnotationException("Protocol error executing HTTP request.",e);
} catch (IOException e) {
//((Object) LOG).error("Fatal transport error: " + e.getMessage());
//((Object) LOG).error(((HttpMethodBase) method).getQueryString());
throw new AnnotationException("Transport error executing HTTP request.",e);
} finally {
// Release the connection.
((HttpMethodBase) getMethod).releaseConnection();
}
return response;
}
protected static String readFileAsString(String filePath) throws java.io.IOException{
return readFileAsString(new File(filePath));
}
protected static String readFileAsString(File file) throws IOException {
byte[] buffer = new byte[(int) file.length()];
@SuppressWarnings("resource")
BufferedInputStream f = new BufferedInputStream(new FileInputStream(file));
f.read(buffer);
return new String(buffer);
}
static abstract class LineParser {
public abstract String parse(String s) throws ParseException;
static class ManualDatasetLineParser extends LineParser {
public String parse(String s) throws ParseException {
return s.trim();
}
}
static class OccTSVLineParser extends LineParser {
public String parse(String s) throws ParseException {
String result = s;
try {
result = s.trim().split("\t")[3];
} catch (ArrayIndexOutOfBoundsException e) {
throw new ParseException(e.getMessage(), 3);
}
return result;
}
}
}
public void saveExtractedEntitiesSet(String Question, LineParser parser, int restartFrom) throws Exception {
String text = Question;
int i=0;
//int correct =0 ; int error = 0;int sum = 0;
for (String snippet: text.split("\n")) {
String s = parser.parse(snippet);
if (s!= null && !s.equals("")) {
i++;
if (i<restartFrom) continue;
List<DBpediaResource> entities = new ArrayList<DBpediaResource>();
try {
entities = extract(new Text(snippet.replaceAll("\\s+"," ")));
System.out.println(entities.get(0).getFullUri());
} catch (AnnotationException e) {
// error++;
//LOG.error(e);
e.printStackTrace();
}
for (DBpediaResource e: entities) {
RES.add(e.uri());
}
}
}
}
public abstract List<DBpediaResource> extract(Text text) throws AnnotationException;
public void evaluate(String Question) throws Exception {
evaluateManual(Question,0);
}
public void evaluateManual(String Question, int restartFrom) throws Exception {
saveExtractedEntitiesSet(Question,new LineParser.ManualDatasetLineParser(), restartFrom);
}
}
The Main Class
package extractKeyword;
public class startAnnonation {
public static void main(String[] args) throws Exception {
String question = "What is the winning chances of BJP in New Delhi elections?";
db c = new db ();
c.configiration(0.25,0);
//, 0, "non", "AtLeastOneNounSelector", "Default", "yes");
c.evaluate(question);
System.out.println("resource : "+c.getResu());
}
}
The main problem is that when I use DBpedia Spotlight through the Spotlight jar (the code above), I get a different result than the DBpedia Spotlight endpoint (dbpedia-spotlight.github.io/demo/).
Result using the above code:-
Text :-What is the winning chances of BJP in New Delhi elections?
Confidence level:-0.35
resource : [Election]
Result on DBPedia Spotlight endpoint(//dbpedia-spotlight.github.io/demo/)
Text:-What is the winning chances of BJP in New Delhi elections?
Confidence level:-0.35
resource : [Bharatiya_Janata_Party, New_Delhi, Election]
Also, why does Spotlight no longer have support as a parameter?

map.size() not working with static map

Hey, I am trying to get the size of a static map from another class.
I am defining the static map in one class, as follows:
tasklet.class
package com.hcsc.ccsp.nonadj.subrogation.integration;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.springframework.batch.core.StepContribution;
import org.springframework.batch.core.scope.context.ChunkContext;
import org.springframework.batch.core.step.tasklet.Tasklet;
import org.springframework.batch.repeat.RepeatStatus;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.core.io.Resource;
import org.springframework.util.Assert;
import com.hcsc.ccsp.nonadj.subrogation.batch.Subrogation;
import com.hcsc.ccsp.nonadj.subrogation.common.SubrogationConstants;
/**
* @author Manan Shah
*
*/
public class SubrogationFileTransferTasklet implements Tasklet,
InitializingBean {
private Logger logger = LogManager
.getLogger(SubrogationFileTransferTasklet.class);
private Resource inputfile;
private Resource outputfile;
public static String fileLastName;
public static String header = null;
public static String trailer = null;
public static List<Subrogation> fileDataListSubro = new ArrayList<Subrogation>();
public List<String> fileDataListS = new ArrayList<String>();
public static TreeMap<String, Subrogation> map = new TreeMap<String, Subrogation>();
public int counter = 0;
public String value;
@Override
public void afterPropertiesSet() throws Exception {
Assert.notNull(inputfile, "inputfile must be set");
}
public void setTrailer(String trailer) {
this.trailer = trailer;
}
public void setHeader(String header) {
this.header = header;
}
public String getTrailer() {
return trailer;
}
public String getHeader() {
return header;
}
public Resource getInputfile() {
return inputfile;
}
public void setInputfile(Resource inputfile) {
this.inputfile = inputfile;
}
public Resource getOutputfile() {
return outputfile;
}
public void setOutputfile(Resource outputfile) {
this.outputfile = outputfile;
}
public static void setFileDataListSubro(List<Subrogation> fileDataListSubro) {
SubrogationFileTransferTasklet.fileDataListSubro = fileDataListSubro;
}
public static List<Subrogation> getFileDataListSubro() {
return fileDataListSubro;
}
public static void setMap(TreeMap<String, Subrogation> map) {
SubrogationFileTransferTasklet.map = map;
}
public static TreeMap<String, Subrogation> getMap() {
return map;
}
@Override
public RepeatStatus execute(StepContribution contribution,
ChunkContext chunkContext) throws Exception {
value = (String) chunkContext.getStepContext().getStepExecution()
.getJobExecution().getExecutionContext().get("outputFile");
readFromFile();
return RepeatStatus.FINISHED;
}
public void readFromFile() {
BufferedReader br = null;
try {
String sCurrentLine;
br = new BufferedReader(new FileReader(inputfile.getFile()));
fileLastName = inputfile.getFile().getName();
while ((sCurrentLine = br.readLine()) != null) {
if (sCurrentLine.indexOf("TRAILER") != -1) {
setTrailer(sCurrentLine);
} else if (sCurrentLine.indexOf("HEADER") != -1) {
setHeader(sCurrentLine);
} else if (sCurrentLine.equalsIgnoreCase("")) {
} else {
fileDataListS.add(sCurrentLine);
}
}
convertListOfStringToListOfSubrogaion(fileDataListS);
writeDataToFile();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (br != null)
br.close();
} catch (IOException ex) {
ex.printStackTrace();
}
}
}
public void convertListOfStringToListOfSubrogaion(List<String> list) {
Iterator<String> iterator = list.iterator();
while (iterator.hasNext()) {
Subrogation subrogration = new Subrogation();
String s = iterator.next();
subrogration.setGRP_NBR(StringUtils.substring(s, 0, 6));
subrogration.setSECT_NBR(StringUtils.substring(s, 6, 10));
subrogration.setAFP_VAL(StringUtils.substring(s, 10, 13));
subrogration.setDOL_MIN_VAL(StringUtils.substring(s, 13, 20));
subrogration
.setCORP_ENT_CD(StringUtils.substring(s, 20, s.length()));
map.put(subrogration.getGRP_NBR() + subrogration.getSECT_NBR(),
subrogration);
fileDataListSubro.add(subrogration);
}
}
public void writeDataToFile() {
try {
File file = new File(value);
if (!file.exists()) {
logger.info("output file is:-" + file.getAbsolutePath());
file.createNewFile();
}
FileWriter fw = new FileWriter(file.getAbsoluteFile());
BufferedWriter bw = new BufferedWriter(fw);
Iterator it = map.entrySet().iterator();
while (it.hasNext()) {
Map.Entry subrogation = (Map.Entry) it.next();
// System.out.println(subrogation.getKey() + " = " +
// subrogation.getValue());
// it.remove(); // avoids a ConcurrentModificationException
bw.append(subrogation.getValue().toString()
+ SubrogationConstants.filler58);
}
bw.close();
} catch (IOException e) {
e.printStackTrace();
}
logger.info("subrogationFileTransferTasklet Step completes");
}
}
In the processor I want to put the map size into an int.
processor.class
package com.hcsc.ccsp.nonadj.subrogation.processor;
import org.apache.commons.lang3.StringUtils;
import org.springframework.batch.item.ItemProcessor;
import com.hcsc.ccsp.nonadj.subrogation.Utils.SubrogationUtils;
import com.hcsc.ccsp.nonadj.subrogation.batch.Subrogation;
import com.hcsc.ccsp.nonadj.subrogation.common.SubrogationConstants;
import com.hcsc.ccsp.nonadj.subrogation.integration.SubrogationFileTransferTasklet;
public class SubrogationProcessor implements
ItemProcessor<Subrogation, Subrogation> {
public SubrogationFileTransferTasklet fileTransferTasklet = new SubrogationFileTransferTasklet();
SubrogationUtils subrogationUtils = new SubrogationUtils();
public int countFromFile=SubrogationFileTransferTasklet.map.size();
public static int totalRecords = 0;
public static int duplicate = 0;
@Override
public Subrogation process(Subrogation subrogration) throws Exception {
// TODO Auto-generated method stub
if (subrogationUtils.validateData(subrogration)) {
Subrogation newSubro = new Subrogation();
newSubro.setGRP_NBR(StringUtils.leftPad(subrogration.getGRP_NBR()
.trim(), SubrogationConstants.length6, "0"));
if (subrogration.getSECT_NBR().trim().length() < 5) {
newSubro.setSECT_NBR(StringUtils.leftPad(subrogration
.getSECT_NBR().trim(), SubrogationConstants.length4,
"0"));
} else if (subrogration.getSECT_NBR().trim().length() == 5) {
newSubro.setSECT_NBR(StringUtils.substring(subrogration.getSECT_NBR().trim(), 1));
} else {
return null;
}
newSubro.setAFP_VAL(StringUtils.leftPad(subrogration.getAFP_VAL()
.trim(), SubrogationConstants.length3, "0"));
if (subrogration.getDOL_MIN_VAL().trim().contains(".")) {
newSubro.setDOL_MIN_VAL(StringUtils.leftPad(StringUtils.substring(subrogration.getDOL_MIN_VAL(),0,subrogration.getDOL_MIN_VAL().indexOf(".")), SubrogationConstants.length7,
"0"));
} else {
newSubro.setDOL_MIN_VAL(StringUtils.leftPad(subrogration
.getDOL_MIN_VAL().trim(), SubrogationConstants.length7,
"0"));
}
newSubro.setCORP_ENT_CD(StringUtils.substring(
subrogration.getCORP_ENT_CD(), 0, 2));
if (SubrogationFileTransferTasklet.map.containsKey(newSubro
.getGRP_NBR() + newSubro.getSECT_NBR())) {
duplicate++;
return null;
} else {
if(SubrogationFileTransferTasklet.fileLastName.contains("TX")){
if(newSubro.getCORP_ENT_CD().equalsIgnoreCase("TX")){
SubrogationFileTransferTasklet.map.put(newSubro
.getGRP_NBR() + newSubro.getSECT_NBR(), newSubro);
totalRecords++;
return newSubro;
}
}
else{
if(SubrogationFileTransferTasklet.fileLastName.contains("IL")){
if(!newSubro.getCORP_ENT_CD().equalsIgnoreCase("TX"))
{
newSubro.setCORP_ENT_CD("IL");
SubrogationFileTransferTasklet.map.put(newSubro
.getGRP_NBR() + newSubro.getSECT_NBR(), newSubro);
totalRecords++;
return newSubro;
}
}
else{
return null;
}
}
return null;
}
}
else {
return null;
}
}
}
class SubrogrationException extends RuntimeException {
private static final long serialVersionUID = -8971030257905108630L;
public SubrogrationException(String message) {
super(message);
}
}
And finally I want to use that countFromFile in another class:
writer.class
package com.hcsc.ccsp.nonadj.subrogation.writer;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Writer;
import java.util.Date;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.springframework.batch.item.ItemStreamException;
import org.springframework.batch.item.ItemWriter;
import org.springframework.batch.item.file.FlatFileFooterCallback;
import org.springframework.batch.item.file.FlatFileHeaderCallback;
import com.hcsc.ccsp.nonadj.subrogation.Utils.SubrogationUtils;
import com.hcsc.ccsp.nonadj.subrogation.batch.Subrogation;
import com.hcsc.ccsp.nonadj.subrogation.common.SubrogationConstants;
import com.hcsc.ccsp.nonadj.subrogation.integration.SubrogationFileTransferTasklet;
import com.hcsc.ccsp.nonadj.subrogation.processor.SubrogationProcessor;
public class SubrogationHeaderFooterWriter implements FlatFileFooterCallback,FlatFileHeaderCallback{
private Logger logger = LogManager
.getLogger(SubrogationHeaderFooterWriter.class);
SubrogationFileTransferTasklet fileTransferTasklet = new SubrogationFileTransferTasklet();
SubrogationUtils subrogationUtils=new SubrogationUtils();
SubrogationProcessor processor=new SubrogationProcessor();
private ItemWriter<Subrogation> delegate;
public void setDelegate(ItemWriter<Subrogation> delegate) {
this.delegate = delegate;
}
public ItemWriter<Subrogation> getDelegate() {
return delegate;
}
@Override
public void writeHeader(Writer writer) throws IOException {
//writer.write(SubrogationFileTransferTasklet.header);
}
@Override
public void writeFooter(Writer writer) throws IOException {
String trailer = SubrogationFileTransferTasklet.trailer;
String s1 = StringUtils.substring(trailer, 0, 23);
logger.info(" Data from input file size is---- "+new SubrogationProcessor().countFromFile);
int trailerCounter=new SubrogationProcessor().countFromFile+SubrogationProcessor.totalRecords;
logger.info(" Data comming from database is"+SubrogationProcessor.totalRecords);
logger.info(" Duplicate data From DataBase is " +SubrogationProcessor.duplicate);
logger.info(" Traileer is " + s1+ trailerCounter);
writer.write(s1 + trailerCounter);
SubrogationFileTransferTasklet.map.clear();
SubrogationFileTransferTasklet.fileDataListSubro.clear();
SubrogationProcessor.totalRecords=0;
SubrogationProcessor.duplicate=0;
}
public void writeErrorDataToFile(List<String> errorDataList,String errorfile){
File file;
try {
file = new File(errorfile);
logger.info("error file is "+errorfile);
FileWriter fileWriter = new FileWriter(file,true);
BufferedWriter bufferWritter = new BufferedWriter(fileWriter);
for(String data:errorDataList){
bufferWritter.write(new Date()+" "+data);
bufferWritter.write(SubrogationConstants.LINE_SEPARATOR);
}
bufferWritter.close();
}
catch (IOException e) {
throw new ItemStreamException("Could not convert resource to file: [" + errorfile + "]", e);
}
}
/*
public void write(List<? extends Subrogation> subrogation) throws Exception {
System.out.println("inside writer");
delegate.write(subrogation);
}*/
}
So here, in the logger message, the map size prints 0.
I am not able to understand why.
Do it this way to make sure that it is initialized with the current size of the map when the object is constructed:
class SubrogationProcessor{
public int countFromFile;
public SubrogationProcessor(){
countFromFile=SubrogationFileTransferTasklet.map.size();
}
}
This depends on when the map.put line of code is run. Is it in a static block in the tasklet class?
If the processor instance is initialized before records have been added to the map, then map.size() will indeed be 0.
My suggestion would be to populate the map in a static block if at all possible, or to debug the code and see when the put() method is called compared to when size() is called:
public static TreeMap<String, Subrogation> map = new TreeMap<String, Subrogation>();
static{
map.put(subrogration.getGRP_NBR() + subrogration.getSECT_NBR(), subrogration);
}
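Another option, shown here only as a sketch (the getter is hypothetical, not part of the original code): read the size at the moment you actually need it, so it no longer matters whether the tasklet has already filled the map when the processor object is constructed.

// inside SubrogationProcessor
public int getCountFromFile() {
    // read the size lazily instead of caching it once at construction time
    return SubrogationFileTransferTasklet.map.size();
}

SubrogationHeaderFooterWriter.writeFooter would then call getCountFromFile() on the processor instance instead of reading the cached countFromFile field.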

Run two classes one after the other

How can I run two classes where one writes some data to a text file and the other takes that file and processes it?
I have two Java files. File1 processes something and outputs a text file. File2 should take that text file and process it to create the final output.
My requirement is to have two independent Java files that work together.
File1
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.Map;
import java.util.TreeMap;
import java.util.Iterator;
import java.util.List;
import java.util.ArrayList;
public class FlatFileParser
{
public static void main(String[] args)
{
try
{
// The stream we're reading from
BufferedReader in;
List<String> ls = new ArrayList<String>();
BufferedWriter out1 = new BufferedWriter(new FileWriter("inValues.txt" , true ));
BufferedReader out11 = new BufferedReader(new FileReader("inValues.txt"));
// Return value of next call to next()
String nextline;
String line="";
if (args[0].equals("1"))
{
in = new BufferedReader(new FileReader(args[1]));
nextline = in.readLine();
while(nextline != null)
{
nextline = nextline.replaceAll("\\<packet","\n<packet");
System.out.println(nextline);
nextline = in.readLine();
}
in.close();
}
else
{
in = new BufferedReader(new FileReader(args[1]));
nextline = in.readLine();
HashMap<String,String> inout = new HashMap<String,String>();
while(nextline != null)
{
try
{
if (nextline.indexOf("timetracker")>0)
{
String from = "";
String indate = "";
if (nextline.indexOf("of in")>0)
{
int posfrom = nextline.indexOf("from");
int posnextAt = nextline.indexOf("#", posfrom);
int posts = nextline.indexOf("timestamp");
from = nextline.substring(posfrom+5,posnextAt);
indate = nextline.substring(posts+11, posts+23);
String dd = indate.split(" ")[1];
String key = dd+"-"+from+"-"+indate;
//String key = from+"-"+indate;
String intime = "-in-"+nextline.substring(posts+24, posts+35);
inout.put(key, intime);
}
else if (nextline.indexOf("of out")>0)
{
int posfrom = nextline.indexOf("from");
int posnextAt = nextline.indexOf("#", posfrom);
int posts = nextline.indexOf("timestamp");
from = nextline.substring(posfrom+5,posnextAt);
indate = nextline.substring(posts+11, posts+23);
String dd = indate.split(" ")[1];
String key = dd+"-"+from+"-"+indate;
String outtime = "-out-"+nextline.substring(posts+24, posts+35);
if (inout.containsKey(key))
{
String val = inout.get(key);
if (!(val.indexOf("out")>0))
inout.put(key, val+outtime);
}
else
{
inout.put(key, outtime);
}
}
}
}
catch(Exception e)
{
System.err.println(nextline);
System.err.println(e.getMessage());
}
nextline = in.readLine();
}
in.close();
for(String key: inout.keySet())
{
String val = inout.get(key);
out1.write(key+" , "+val+"\n");
}
out1.close();
}
}
catch (IOException e)
{
throw new IllegalArgumentException(e);
}
}
}
File2
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.io.File;
import java.io.FileReader;
public class RecordParser
{
private static BufferedReader reader;
private List<Person> resource;
private List<String> finalRecords;
public RecordParser(BufferedReader reader)
{
this.reader = reader;
this.resource = new ArrayList<Person>();
this.finalRecords = new ArrayList<String>();
}
public void execute() throws IOException
{
String line = null;
while ((line = reader.readLine()) != null)
{
String[] parts = line.split(" , ");
addPerson(new Person(parts[0]));
if ((parts[1].contains("-in-")) && (parts[1].contains("-out-")))
{
String[] inout = parts[1].split("-out-");
Person person = getPerson(parts[0]);
person.setInTime(inout[0]);
person.setOutTime("-out-" + inout[1]);
}
else if (parts[1].contains("-in-"))
{
Person person = getPerson(parts[0]);
person.setInTime(parts[1]);
}
else
{
Person person = getPerson(parts[0]);
person.setOutTime(parts[1]);
}
}
// finalRecords the resource to the String list
for (Person p : resource)
{
finalRecords.add(p.getPerson());
}
}
private void addPerson(Person person)
{
for (Person p : resource)
{
if (p.getNameDate().equals(person.getNameDate()))
{
return;
}
}
resource.add(person);
}
private Person getPerson(String nameDate)
{
for (Person p : resource)
{
if (p.getNameDate().equals(nameDate))
{
return p;
}
}
return null;
}
public List<String> getfinalRecords()
{
return finalRecords;
}
public static void main(String[] args)
{
try {
BufferedReader reader = new BufferedReader(new FileReader("sample.txt"));
RecordParser recordParser = new RecordParser(reader);
recordParser.execute();
for (String s : recordParser.getfinalRecords())
{
System.out.println(s);
}
reader.close();
} catch (IOException e)
{
e.printStackTrace();
}
}
public class Person
{
private String nameDate;
private String inTime;
private String outTime;
public Person (String nameDate)
{
this.nameDate = nameDate;
this.inTime = "missing in";
this.outTime = "missing out";
}
public void setInTime(String inTime)
{
this.inTime = inTime;
}
public void setOutTime(String outTime)
{
this.outTime = outTime;
}
public String getNameDate()
{
return nameDate;
}
public String getPerson()
{
StringBuilder builder = new StringBuilder();
builder.append(nameDate);
builder.append(" , ");
builder.append(inTime);
builder.append(" , ");
builder.append(outTime);
return builder.toString();
}
}
}
I want to be able to import the values from inValues.txt (created in File1) and process them in File2.
Create a batch/sh file and run one Java program after the other. If you want to pass the file details to the second program, you can do that by providing a runtime argument.
On Windows:
java -classpath .;yourjars FlatFileParser
java -classpath .;yourjars RecordParser {optionalfiledetails}
On Linux:
java -classpath .:yourjars FlatFileParser
java -classpath .:yourjars RecordParser {optionalfiledetails}
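If you would rather stay inside Java instead of using a batch/sh file, a small driver class that simply calls one main after the other also works. This is only a sketch; the argument values ("2" and input.log) are placeholders for whatever you normally pass to FlatFileParser, and you would point RecordParser at the file produced by the first step:

public class RunBoth {
    public static void main(String[] args) throws Exception {
        // step 1: FlatFileParser writes inValues.txt
        FlatFileParser.main(new String[] { "2", "input.log" });
        // step 2: RecordParser reads and processes the generated data
        RecordParser.main(new String[0]);
    }
}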

How can I write large output to Process getOutputStream?

I am trying to execute a command (e.g. ps -ef | grep apache) using ProcessBuilder and Process. The code works as long as the output of 'ps -ef' is small. But if the output is too big, the program hangs. Is there a way to fix this? Here is my code based on [http://www.javaworld.com/javaworld/jw-12-2000/jw-1229-traps.html]
#### Program.java ####
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
public class Program {
private List<String> command;
public Program(String commandString) throws IOException {
this(commandString, null);
}
public List<String> getCommand() {
return this.command;
}
private void setCommand(String filename, String location, String commandString, List<String> parameters) throws IOException {
if(filename != null) {
commandString = new File(location, filename).getCanonicalPath();
}
this.command =
Collections.synchronizedList(new ArrayList<String>());
this.command.add(commandString);
if (parameters != null) {
for (String arg: parameters) {
command.add(arg);
}
}
}
public String[] run() throws IOException, InterruptedException {
return this.run(null);
}
public String[] run(String input) throws IOException, InterruptedException {
ProcessBuilder processBuilder = new ProcessBuilder(this.command);
List<String> commandList = processBuilder.command();
Process process = processBuilder.start();
if(input != null) {
PrintWriter writer = new PrintWriter(new OutputStreamWriter(new BufferedOutputStream(process.getOutputStream())), true);
writer.println(input);
writer.flush();
writer.close();
}
process.getOutputStream().close();
Gobbler outGobbler = new Gobbler(process.getInputStream());
Gobbler errGobbler = new Gobbler(process.getErrorStream());
Thread outThread = new Thread(outGobbler);
Thread errThread = new Thread(errGobbler);
outThread.start();
errThread.start();
outThread.join();
errThread.join();
int exitVal = process.waitFor();
System.out.println("PROCESS WAIT FOR: " + exitVal);
List<String> output = outGobbler.getOuput();
return output.toArray(new String[output.size()]);
}
}
#### CommandExecutor.java ####
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
public class CommandExecutor {
public List<List<Object>> programs;
public static void main(String[] args) {
try {
CommandExecutor ce = new CommandExecutor(args[0]);
String output = ce.run();
System.out.println("Command: " + args[0]);
System.out.println("Output: " + output);
} catch (IOException e) {
// TODO Auto-generated catch block
System.out.println(e.getLocalizedMessage());
e.printStackTrace();
} catch (InterruptedException ie) {
// TODO Auto-generated catch block
System.out.println(ie.getLocalizedMessage());
ie.printStackTrace();
}
}
public CommandExecutor(String command) throws IOException {
this.setPrograms(command);
}
private void setPrograms(String command) throws IOException {
this.programs = new ArrayList<List<Object>>();
//String cmdstring = "";
String[] commands = command.split("\\s*;\\s*");
for(String c: commands) {
//String subcmdstr = "";
String file = null;
String[] chainedCommands = c.split("\\s*\\|\\s*");
String lastCmd = chainedCommands[chainedCommands.length-1];
String[] fileCmd = lastCmd.split("\\s*>\\s*");
if(fileCmd.length > 1) {
chainedCommands[chainedCommands.length-1] = fileCmd[0];
file = fileCmd[1];
}
List<Object> l = new ArrayList<Object>();
for(String p: chainedCommands) {
/*if(subcmdstr.equals("")) {
subcmdstr = p;
}
else {
subcmdstr += " redirects to " + p;
}*/
String[] cmdparams = p.split(" ");
String cmd = cmdparams[0];
List<String> params = new ArrayList<String>();
for(int j = 1; j < cmdparams.length; j++) {
params.add(cmdparams[j]);
}
Program prog = new Program(cmd, params);
l.add(prog);
}
if(file != null) {
//subcmdstr += " redirects to file: " + file;
l.add(file);
}
this.programs.add(l);
//cmdstring += "new command: " + subcmdstr + "\n";
}
//System.out.println("Actual Command: " + command);
//System.out.println("Command String:\n" + cmdstring);
}
public String run() throws IOException, InterruptedException {
String output = "";
for(List<Object> l: this.programs) {
String[] out = new String[0];
int count = 0;
boolean filenotfound = true;
for(Object o: l) {
if(o instanceof Program) {
Program p = (Program) o;
if(count == 0) {
out = p.run();
}
else {
out = p.run(CommandExecutor.arrayToString(out));
}
}
else if(o instanceof String) {
PrintWriter f = new PrintWriter(new File((String)o));
f.print(CommandExecutor.arrayToString(out));
f.close();
filenotfound = false;
}
count++;
}
if(filenotfound) {
output += CommandExecutor.arrayToString(out);
}
}
return output;
}
public static String arrayToString(String[] strArray) {
String str = "";
for(String s: strArray) {
str += s;
}
return str;
}
}
Thanks,
Quadir
OK, I got it working. Below is the code; given a list of commands, it pipes the output of one command to the next.
/*
####### PipeRedirection.java
*/
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
public class PipeRedirection {
public static void main(String[] args) throws FileNotFoundException {
if(args.length < 2) {
System.err.println("Need at least two arguments");
System.exit(1);
}
try {
String input = null;
for(int i = 0; i < args.length; i++) {
String[] commandList = args[i].split(" ");
ProcessBuilder pb = new ProcessBuilder(commandList);
//pb.redirectErrorStream(true);
Process p = pb.start();
if(input != null) {
PrintWriter writer = new PrintWriter(new OutputStreamWriter(new BufferedOutputStream(p.getOutputStream())), true);
writer.println(input);
writer.flush();
writer.close();
}
InputProcess.Gobbler outGobbler = new InputProcess.Gobbler(p.getInputStream());
InputProcess.Gobbler errGobbler = new InputProcess.Gobbler(p.getErrorStream());
Thread outThread = new Thread(outGobbler);
Thread errThread = new Thread(errGobbler);
outThread.start();
errThread.start();
outThread.join();
errThread.join();
int exitVal = p.waitFor();
System.out.println("\n****************************");
System.out.println("Command: " + args[i]);
System.out.println("Exit Value = " + exitVal);
List<String> output = outGobbler.getOuput();
input = "";
for(String o: output) {
input += o;
}
}
System.out.println("Final Output:");
System.out.println(input);
} catch (IOException ioe) {
// TODO Auto-generated catch block
System.err.println(ioe.getLocalizedMessage());
ioe.printStackTrace();
} catch (InterruptedException ie) {
// TODO Auto-generated catch block
System.err.println(ie.getLocalizedMessage());
ie.printStackTrace();
}
}
public static class Gobbler implements Runnable {
private BufferedReader reader;
private List<String> output;
public Gobbler(InputStream inputStream) {
this.reader = new BufferedReader(new InputStreamReader(inputStream));
}
public void run() {
String line;
this.output = new ArrayList<String>();
try {
while((line = this.reader.readLine()) != null) {
this.output.add(line + "\n");
}
this.reader.close();
}
catch (IOException e) {
// TODO
System.err.println("ERROR: " + e.getMessage());
}
}
public List<String> getOuput() {
return this.output;
}
}
}
Don't print it as a String; instead, give the CommandExecutor an optional OutputStream (in your case you would pass System.out as the argument) and write the output to that stream as it is produced.
In your current program, the main method executes the process and won't print anything (it appears to hang) until your run method returns.
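A sketch of what that could look like (an illustration, not the poster's code): replace the list-collecting gobbler with a pump that copies the child's output straight to a caller-supplied OutputStream, so large output is consumed while the process is still running instead of being buffered into a String first.

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

// Copies a child process stream to a destination as it arrives, so the
// process never blocks on a full stdout/stderr pipe.
class StreamPump implements Runnable {
    private final InputStream in;
    private final OutputStream out;

    StreamPump(InputStream in, OutputStream out) {
        this.in = in;
        this.out = out;
    }

    public void run() {
        byte[] buf = new byte[8192];
        int n;
        try {
            while ((n = in.read(buf)) != -1) {
                out.write(buf, 0, n); // e.g. System.out, a file, or the next process's stdin
            }
            out.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

Usage would be something like new Thread(new StreamPump(process.getInputStream(), System.out)).start() before process.waitFor().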
