Reading an ORC file in Java

How do you read an ORC file in Java? I want to read in a small file to verify some unit test output, but I can't find a solution.

I came across this and recently implemented a reader myself:
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import java.util.List;
public class OrcFileDirectReaderExample {
    public static void main(String[] argv) {
        try {
            Reader reader = OrcFile.createReader(HdfsFactory.getFileSystem(), new Path("/user/hadoop/000000_0"));
            StructObjectInspector inspector = (StructObjectInspector) reader.getObjectInspector();
            System.out.println(reader.getMetadata());
            RecordReader records = reader.rows();
            Object row = null;
            // These objects are the metadata for each column. They give you the type of each column
            // and can parse it, unless you want to parse each column yourself.
            List fields = inspector.getAllStructFieldRefs();
            for (int i = 0; i < fields.size(); ++i) {
                System.out.print(((StructField) fields.get(i)).getFieldObjectInspector().getTypeName() + '\t');
            }
            while (records.hasNext()) {
                row = records.next(row);
                List value_lst = inspector.getStructFieldsDataAsList(row);
                StringBuilder builder = new StringBuilder();
                // Iterate over the fields.
                // A field can be null if a null was passed as the input value when the job that wrote this file processed it.
                for (Object field : value_lst) {
                    if (field != null)
                        builder.append(field.toString());
                    builder.append('\t');
                }
                // This writes out the row as it would appear in a Text tab-separated file.
                System.out.println(builder.toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

As per the Apache wiki, the ORC file format was introduced in Hive 0.11, so you will need the Hive packages on your project classpath to read ORC files. The relevant classes are:
org.apache.hadoop.hive.ql.io.orc.Reader
org.apache.hadoop.hive.ql.io.orc.OrcFile
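If you would rather avoid the Hive runtime, the standalone Apache ORC reader (the org.apache.orc classes from the orc-core artifact) can read the same files. A minimal sketch, assuming ORC 1.x and a reachable path (the path below is a placeholder):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
public class OrcCoreReaderExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Reader reader = OrcFile.createReader(new Path("/user/hadoop/000000_0"), OrcFile.readerOptions(conf));
        System.out.println("Schema: " + reader.getSchema());
        RecordReader rows = reader.rows();
        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
        while (rows.nextBatch(batch)) {
            for (int r = 0; r < batch.size; r++) {
                // batch.cols holds typed column vectors; cast them according to the schema,
                // e.g. ((LongColumnVector) batch.cols[0]).vector[r] for a bigint column.
            }
        }
        rows.close();
    }
}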

Read ORC test case:
@Test
public void read_orc() throws Exception {
    // TODO: do Kerberos auth
    String orcPath = "hdfs://user/hive/warehouse/demo.db/orc_path";
    // load HDFS conf
    Configuration conf = new Configuration();
    conf.addResource(getClass().getResource("/hdfs-site.xml"));
    conf.addResource(getClass().getResource("/core-site.xml"));
    FileSystem fs = FileSystem.get(conf);
    // columns to read
    List<String> columns = Arrays.asList("id", "title");
    final List<Map<String, Object>> maps = OrcUtil.readOrcFile(fs, orcPath, columns);
    System.out.println(new Gson().toJson(maps));
}
OrcUtil to read an ORC path with specific columns:
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.ql.io.orc.OrcSplit;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
public class OrcUtil {
public static List<Map<String, Object>> readOrcFile(FileSystem fs, String orcPath, List<String> readColumns)
throws IOException, SerDeException {
JobConf jobConf = new JobConf();
for (Map.Entry<String, String> entry : fs.getConf()) {
jobConf.set(entry.getKey(), entry.getValue());
}
FileInputFormat.setInputPaths(jobConf, orcPath);
FileInputFormat.setInputPathFilter(jobConf, ((PathFilter) path1 -> true).getClass());
InputSplit[] splits = new OrcInputFormat().getSplits(jobConf, 1);
InputFormat<NullWritable, OrcStruct> orcInputFormat = new OrcInputFormat();
List<Map<String, Object>> rows = new ArrayList<>();
for (InputSplit split : splits) {
OrcSplit orcSplit = (OrcSplit) split;
System.out.printf("read orc split %s%n", ((OrcSplit) split).getPath());
StructObjectInspector inspector = getStructObjectInspector(orcSplit.getPath(), jobConf, fs);
List<? extends StructField> readFields = inspector.getAllStructFieldRefs()
.stream().filter(e -> readColumns.contains(e.getFieldName())).collect(Collectors.toList());
// files of 49 bytes or less contain no rows (just the ORC header/footer), so skip them
if (orcSplit.getLength() > 49) {
RecordReader<NullWritable, OrcStruct> recordReader = orcInputFormat.getRecordReader(orcSplit, jobConf, Reporter.NULL);
NullWritable key = recordReader.createKey();
OrcStruct value = recordReader.createValue();
while (recordReader.next(key, value)) {
Map<String, Object> entity = new HashMap<>();
for (StructField field : readFields) {
entity.put(field.getFieldName(), inspector.getStructFieldData(value, field));
}
rows.add(entity);
}
}
}
return rows;
}
private static StructObjectInspector getStructObjectInspector(Path path, JobConf jobConf, FileSystem fs)
throws IOException, SerDeException {
OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(jobConf);
readerOptions.filesystem(fs);
Reader reader = OrcFile.createReader(path, readerOptions);
String typeStruct = reader.getObjectInspector().getTypeName();
System.out.println(typeStruct);
List<String> columnList = parseColumnAndType(typeStruct);
String[] fullColNames = new String[columnList.size()];
String[] fullColTypes = new String[columnList.size()];
for (int i = 0; i < columnList.size(); ++i) {
String[] temp = columnList.get(i).split(":");
fullColNames[i] = temp[0];
fullColTypes[i] = temp[1];
}
Properties p = new Properties();
p.setProperty("columns", StringUtils.join(fullColNames, ","));
p.setProperty("columns.types", StringUtils.join(fullColTypes, ":"));
OrcSerde orcSerde = new OrcSerde();
orcSerde.initialize(jobConf, p);
return (StructObjectInspector) orcSerde.getObjectInspector();
}
private static List<String> parseColumnAndType(String typeStruct) {
int startIndex = typeStruct.indexOf("<") + 1;
int endIndex = typeStruct.lastIndexOf(">");
typeStruct = typeStruct.substring(startIndex, endIndex);
List<String> columnList = new ArrayList<>();
List<String> splitList = Arrays.asList(typeStruct.split(","));
Iterator<String> it = splitList.iterator();
while (it.hasNext()) {
StringBuilder current = new StringBuilder(it.next());
String currentStr = current.toString();
boolean left = currentStr.contains("(");
boolean right = currentStr.contains(")");
if (!left && !right) {
columnList.add(currentStr);
continue;
}
if (left && right) {
columnList.add(currentStr);
continue;
}
if (left && !right) {
while (it.hasNext()) {
String next = it.next();
current.append(",").append(next);
if (next.contains(")")) {
break;
}
}
columnList.add(current.toString());
}
}
return columnList;
}
}

Try this for getting the ORC file row count:
private long getRowCount(FileSystem fs, String fName) throws Exception {
    long tempCount = 0;
    Reader rdr = OrcFile.createReader(fs, new Path(fName));
    StructObjectInspector insp = (StructObjectInspector) rdr.getObjectInspector();
    Iterable<StripeInformation> iterable = rdr.getStripes();
    for (StripeInformation stripe : iterable) {
        tempCount = tempCount + stripe.getNumberOfRows();
    }
    return tempCount;
}
// fName is the HDFS path to the file.
long rowCount = getRowCount(fs, fName);
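If your Reader version exposes it, the footer also records the total row count directly, so the stripe loop is not required. A small sketch under that assumption:
private long getRowCountDirect(FileSystem fs, String fName) throws Exception {
    // getNumberOfRows() returns the row count stored in the ORC file footer.
    Reader rdr = OrcFile.createReader(fs, new Path(fName));
    return rdr.getNumberOfRows();
}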

Related

Tensorflow lite model(.tflite) in android is always giving the same result for text classification

I am trying to classify the messages that I receive into 5 categories (sports, politics, business, tech, entertainment), but for every message I send, the TFLite model classifies it as sports only.
This is my model:
I am using the Average Word Vector model to train and test my data, and it gives me correct predictions when testing it. However, when I integrate the model into Android Studio, the model always predicts the message as sports with high confidence (around 95%).
!pip install -q tflite-model-maker
import numpy as np
import os
from tflite_model_maker import configs
from tflite_model_maker import ExportFormat
from tflite_model_maker import model_spec
from tflite_model_maker import text_classifier
from tflite_model_maker import TextClassifierDataLoader
import pandas as pd
import tensorflow as tf
assert tf.__version__.startswith('2')
data = pd.read_csv("bbc-text.csv")
print(data)
awv_spec = model_spec.get('average_word_vec')
awv_train_data = TextClassifierDataLoader.from_csv(
filename='bbc-text.csv',
text_column='text',
label_column='category',
model_spec=awv_spec,
is_training=True)
awv_test_data = TextClassifierDataLoader.from_csv(
filename='bbc-text1.csv',
text_column='text',
label_column='category',
model_spec=awv_spec,
is_training=False)
awv_model = text_classifier.create(awv_train_data, model_spec=awv_spec, epochs=20)
awv_model.evaluate(awv_test_data)
awv_model.export(export_dir='average_word_vec/')
And this is how I retrieve the result in Android:
package com.example.letstalk.lib_interpreter;
import android.content.Context;
import android.content.res.AssetFileDescriptor;
import android.content.res.AssetManager;
import android.util.Log;
import com.example.letstalk.lib_interpreter.Result;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import org.tensorflow.lite.Interpreter;
import org.tensorflow.lite.support.metadata.MetadataExtractor;
public class TextClassificationClient {
private static final String TAG = "Interpreter";
private static final int SENTENCE_LEN = 256; // The maximum length of an input sentence.
// Simple delimiter to split words.
private static final String SIMPLE_SPACE_OR_PUNCTUATION = " |\\,|\\.|\\!|\\?|\n";
private static final String MODEL_PATH = "sentiment_analysis.tflite";
/*
* Reserved values in ImdbDataSet dic:
* dic["<PAD>"] = 0 used for padding
* dic["<START>"] = 1 mark for the start of a sentence
* dic["<UNKNOWN>"] = 2 mark for unknown words (OOV)
*/
private static final String START = "<START>";
private static final String PAD = "<PAD>";
private static final String UNKNOWN = "<UNKNOWN>";
/** Number of results to show in the UI. */
private static final int MAX_RESULTS = 3;
private final Context context;
private final Map<String, Integer> dic = new HashMap<>();
private final List<String> labels = new ArrayList<>();
private Interpreter tflite;
public TextClassificationClient(Context context) {
this.context = context;
}
/** Load the TF Lite model and dictionary so that the client can start classifying text. */
public void load() {
loadModel();
}
/** Load TF Lite model. */
private synchronized void loadModel() {
try {
// Load the TF Lite model
ByteBuffer buffer = loadModelFile(this.context.getAssets(), MODEL_PATH);
tflite = new Interpreter(buffer);
Log.v(TAG, "TFLite model loaded.");
// Use metadata extractor to extract the dictionary and label files.
MetadataExtractor metadataExtractor = new MetadataExtractor(buffer);
// Extract and load the dictionary file.
InputStream dictionaryFile = metadataExtractor.getAssociatedFile("vocab.txt");
loadDictionaryFile(dictionaryFile);
Log.v(TAG, "Dictionary loaded.");
// Extract and load the label file.
InputStream labelFile = metadataExtractor.getAssociatedFile("labels.txt");
loadLabelFile(labelFile);
Log.v(TAG, "Labels loaded.");
} catch (IOException ex) {
Log.e(TAG, "Error loading TF Lite model.\n", ex);
}
}
/** Free up resources as the client is no longer needed. */
public synchronized void unload() {
tflite.close();
dic.clear();
labels.clear();
}
/** Classify an input string and returns the classification results. */
public synchronized List<Result> classify(String text) {
// Pre-processing.
int[][] input = tokenizeInputText(text);
// Run inference.
Log.v(TAG, "Classifying text with TF Lite...");
float[][] output = new float[1][labels.size()];
tflite.run(input, output);
// Find the best classifications.
PriorityQueue<Result> pq =
new PriorityQueue<>(
MAX_RESULTS, (lhs, rhs) -> Float.compare(rhs.getConfidence(), lhs.getConfidence()));
for (int i = 0; i < labels.size(); i++) {
pq.add(new Result("" + i, labels.get(i), output[0][i]));
}
final ArrayList<Result> results = new ArrayList<>();
while (!pq.isEmpty()) {
results.add(pq.poll());
}
Collections.sort(results);
// Return the probability of each class.
return results;
}
/** Load TF Lite model from assets. */
private static MappedByteBuffer loadModelFile(AssetManager assetManager, String modelPath)
throws IOException {
try (AssetFileDescriptor fileDescriptor = assetManager.openFd(modelPath);
FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor())) {
FileChannel fileChannel = inputStream.getChannel();
long startOffset = fileDescriptor.getStartOffset();
long declaredLength = fileDescriptor.getDeclaredLength();
return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
}
}
/** Load labels from model file. */
private void loadLabelFile(InputStream ins) throws IOException {
BufferedReader reader = new BufferedReader(new InputStreamReader(ins));
// Each line in the label file is a label.
while (reader.ready()) {
labels.add(reader.readLine());
}
}
/** Load dictionary from model file. */
private void loadDictionaryFile(InputStream ins) throws IOException {
BufferedReader reader = new BufferedReader(new InputStreamReader(ins));
// Each line in the dictionary has two columns.
// First column is a word, and the second is the index of this word.
while (reader.ready()) {
List<String> line = Arrays.asList(reader.readLine().split(" "));
if (line.size() < 2) {
continue;
}
dic.put(line.get(0), Integer.parseInt(line.get(1)));
}
}
/** Pre-processing: tokenize and map the input words into an int array. */
int[][] tokenizeInputText(String text) {
Log.d("hello", "tokenize: "+ text);
int[] tmp = new int[SENTENCE_LEN];
List<String> array = Arrays.asList(text.split(SIMPLE_SPACE_OR_PUNCTUATION));
int index = 0;
// Prepend <START> if it is in vocabulary file.
if (dic.containsKey(START)) {
tmp[index++] = dic.get(START);
}
for (String word : array) {
if (index >= SENTENCE_LEN) {
break;
}
tmp[index++] = dic.containsKey(word) ? dic.get(word) : (int) dic.get(UNKNOWN);
}
// Padding and wrapping.
Arrays.fill(tmp, index, SENTENCE_LEN - 1, (int) dic.get(PAD));
int[][] ans = {tmp};
return ans;
}
Map<String, Integer> getDic() {
return this.dic;
}
Interpreter getTflite() {
return this.tflite;
}
List<String> getLabels() {
return this.labels;
}
}

Is there any logic to compare two Word documents (docx) and catch missing strings, special characters, spaces and so on?

I'm comparing two Word documents manually, where I must not miss any strings, special characters, spaces and so on, and each document is around 150 pages or more, so the comparison is a real headache. I have written a small Java program to compare the two documents, but I'm not able to list the missing words.
I am using the Apache POI library.
Thanks in advance.
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFFooter;
import org.apache.poi.xwpf.usermodel.XWPFHeader;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
public class ReadDocFile {
private static XWPFDocument docx;
// private static String path = "C:\\States wise\\NH\\Assessment 2nd\\test.docx";
private static ArrayList<String> firstList = new ArrayList<String>(); // refers to first document list
private static ArrayList<String> secondList = new ArrayList<String>(); // refers to second document list
private static List<XWPFParagraph> paragraphList;
private static Map<String, String> map = null;
private static LinkedHashSet<String> firstMissedArray = new LinkedHashSet<String>(); // refers to first document Linked hash set
private static LinkedHashSet<String> secondMissedArray = new LinkedHashSet<String>(); // refers to second document Linked hash set
public static void getFilePath(String path) {
FileInputStream fis;
try {
fis = new FileInputStream(path);
docx = new XWPFDocument(fis);
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static void get_First_Doc_Data() {
getFilePath("C:\\States wise\\NH\\Assessment 2nd\\test.docx");
paragraphList = docx.getParagraphs();
System.out.println("******************** first list Starts here ******************** ");
System.out.println();
for (int i = 0; i < paragraphList.size() - 1; i++) {
firstList.add(paragraphList.get(i).getText().toString());
System.out.println(firstList.get(i).toString());
}
System.out.println("*********** first list Ends here ********************");
}
public static void get_Second_Doc_Data() {
getFilePath("C:\\States wise\\NH\\Assessment 2nd\\test1.docx");
paragraphList = docx.getParagraphs();
System.out.println("******************** Second list Starts here ******************** ");
System.out.println();
for (int i = 0; i < paragraphList.size() - 1; i++) {
secondList.add(paragraphList.get(i).getText().toString());
System.out.println(secondList.get(i).toString());
}
System.out.println("*********** Second list Ends here ********************");
}
public static void main(String[] args) {
get_First_Doc_Data();
get_Second_Doc_Data();
//System.out.println("First Para: " + firstList.contains(secondList));
compare();
compare_Two_List();
}
private static void compare() {
String firstMiss = null;
//String secondMiss = null;
for (int i = 0; i < firstList.size(); i++) {
for (int j = 0; j < secondList.size(); j++) {
if (!firstList.get(i).toString().equals(secondList.get(i).toString())) {
firstMiss = firstList.get(i).toString();
//secondMiss = secondList.get(i).toString();
map = new HashMap<String, String>();
}
}
firstMissedArray.add(firstMiss);
//secondMissedArray.add(secondMiss);
// System.out.println(missedArray.get(i).toString());
}
}
private static void compare_Two_List() {
int num = 0;
map.clear();
Iterator<String> first = firstMissedArray.iterator();
//Iterator<String> second = secondMissedArray.iterator();
while (first.hasNext()) {
map.put(""+num, first.next());
num++;
}
System.out.println(firstMissedArray.size());
Iterator it = map.entrySet().iterator();
while (it.hasNext()) {
Map.Entry pair = (Map.Entry) it.next();
System.out.println(pair.getKey() + " = " + pair.getValue());
// it.remove(); // avoids a ConcurrentModificationException
}
}
}
I have taken the liberty of modifying your code to arrive at a solution for your problem. Please go through it.
This should pretty much solve your problem. Put System.out.println statements wherever you think is necessary and tweak the flow of the program to achieve the desired checks as per your requirements. In my hurry I may not have followed coding standards such as try/catch blocks for error handling or covered the negative scenarios, so please take care of that when implementing it live.
If the documents are not .docx but .pdf, make use of the Apache PDFBox API, as sketched below.
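For the PDF case, a minimal text-extraction sketch with Apache PDFBox (assuming PDFBox 2.x; path handling is up to you) could feed the same paragraph comparison:
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
public class PdfTextExtractor {
    public static String extractText(String pdfPath) throws IOException {
        // Load the PDF and pull out its plain text; split the result into lines or
        // paragraphs before running the comparison logic shown below.
        try (PDDocument document = PDDocument.load(new File(pdfPath))) {
            return new PDFTextStripper().getText(document);
        }
    }
}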
Here is the Code:
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
public class Comapre_Docs {
private static final String FIRST_DOC_PATH = "E:\\Workspace_Luna\\assignments\\Expected.docx";
private static final String SECOND_DOC_PATH = "E:\\Workspace_Luna\\assignments\\Actual.docx";
private static XWPFDocument docx;
private static List<XWPFParagraph> paragraphList;
private static ArrayList<String> firstList = new ArrayList<String>();
private static ArrayList<String> secondList = new ArrayList<String>();
public static void get_Doc_Data(String filePath, ArrayList listName)
throws IOException {
File file = new File(filePath);
FileInputStream fis = new FileInputStream(file);
docx = new XWPFDocument(fis);
paragraphList = docx.getParagraphs();
for (int i = 0; i <= paragraphList.size() - 1; i++) {
listName.add(paragraphList.get(i).getText().toString());
}
fis.close();
}
public static void main(String[] args) throws IOException {
get_Doc_Data(FIRST_DOC_PATH, firstList);
get_Doc_Data(SECOND_DOC_PATH, secondList);
compare(firstList, secondList);
}
private static void compare(ArrayList<String> firstList_1,
ArrayList<String> secondList_1) {
simpleCheck(firstList_1, secondList_1);
int size = firstList_1.size();
for (int i = 0; i < size; i++) {
paragraphCheck(firstList_1.get(i).toString().split(" "),
secondList_1.get(i).toString().split(" "), i);
}
}
private static void paragraphCheck(String[] firstParaArray,
String[] secondParaArray, int paraNumber) {
System.out
.println("=============================================================");
System.out.println("Paragraph No." + (paraNumber + 1) + ": Started");
if (firstParaArray.length != secondParaArray.length) {
System.out.println("There is mismatch of "
+ Math.abs(firstParaArray.length - secondParaArray.length)
+ " words in this paragraph");
}
TreeMap<String, Integer> firstDocPara = getOccurence(firstParaArray);
TreeMap<String, Integer> secondDocPara = getOccurence(secondParaArray);
ArrayList<String> keyData = new ArrayList<String>(firstDocPara.keySet());
for (int i = 0; i < keyData.size(); i++) {
    Integer expectedCount = firstDocPara.get(keyData.get(i));
    Integer actualCount = secondDocPara.get(keyData.get(i));
    // Compare Integer objects with equals(), not != (reference comparison);
    // actualCount is null when the word does not occur in the actual document at all.
    if (!expectedCount.equals(actualCount)) {
        System.out.println("The following word is missing in actual document : " + keyData.get(i));
    }
}
System.out.println("Paragraph No." + (paraNumber + 1) + ": Done");
System.out
.println("=============================================================");
}
private static TreeMap<String, Integer> getOccurence(String[] paraArray) {
TreeMap<String, Integer> paragraphStringCountHolder = new TreeMap<String, Integer>();
paragraphStringCountHolder.clear();
for (String a : paraArray) {
int count = 1;
if (paragraphStringCountHolder.containsKey(a)) {
count = paragraphStringCountHolder.get(a) + 1;
paragraphStringCountHolder.put(a, count);
} else {
paragraphStringCountHolder.put(a, count);
}
}
return paragraphStringCountHolder;
}
private static boolean simpleCheck(ArrayList<String> firstList,
ArrayList<String> secondList) {
boolean flag = false;
if (firstList.size() > secondList.size()) {
System.out
.println("There are more paragraph in Expected document than in Actual document");
} else if (firstList.size() < secondList.size()) {
System.out
.println("There are more paragraph in Actual document than in Expected document");
} else if (firstList.size() == secondList.size()) {
System.out.println("The paragraph count in both documents match");
flag = true;
}
return flag;
}
}

MAPREDUCE error: method write in interface TaskInputOutputContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> cannot be applied to given types

package br.edu.ufam.anibrata;
import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import java.util.Arrays;
import java.util.HashSet;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.ParserProperties;
import tl.lin.data.array.ArrayListWritable;
import tl.lin.data.pair.PairOfStringInt;
import tl.lin.data.pair.PairOfWritables;
import br.edu.ufam.data.Dataset;
import com.google.gson.JsonSyntaxException;
public class BuildIndexWebTables extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(BuildIndexWebTables.class);
public static void main(String[] args) throws Exception
{
ToolRunner.run(new BuildIndexWebTables(), args);
}
@Override
public int run(String[] argv) throws Exception {
// Creates a new job configuration for this Hadoop job.
Args args = new Args();
CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(100));
try
{
parser.parseArgument(argv);
}
catch (CmdLineException e)
{
System.err.println(e.getMessage());
parser.printUsage(System.err);
return -1;
}
Configuration conf = getConf();
conf.setBoolean("mapreduce.map.output.compress", true);
conf.setBoolean("mapreduce.map.output.compress", true);
conf.set("mapreduce.map.failures.maxpercent", "10");
conf.set("mapreduce.max.map.failures.percent", "10");
conf.set("mapred.max.map.failures.percent", "10");
conf.set("mapred.map.failures.maxpercent", "10");
conf.setBoolean("mapred.compress.map.output", true);
conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
conf.setBoolean("mapreduce.map.output.compress", true);
/*String inputPrefixes = args[0];
String outputFile = args[1];*/
Job job = Job.getInstance(conf);
/*FileInputFormat.addInputPath(job, new Path(inputPrefixes));
FileOutputFormat.setOutputPath(job, new Path(outputFile));*/
FileInputFormat.setInputPaths(job, new Path(args.input));
FileOutputFormat.setOutputPath(job, new Path(args.output));
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job,org.apache.hadoop.io.compress.GzipCodec.class);
job.setMapperClass(BuildIndexWebTablesMapper.class);
job.setReducerClass(BuildIndexWebTablesReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(PairOfWritables.class);
//job.setOutputFormatClass(MapFileOutputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
/*job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);*/
job.setJarByClass(BuildIndexWebTables.class);
job.setNumReduceTasks(args.numReducers);
//job.setNumReduceTasks(500);
FileInputFormat.setInputPaths(job, new Path(args.input));
FileOutputFormat.setOutputPath(job, new Path(args.output));
System.out.println(Arrays.deepToString(FileInputFormat.getInputPaths(job)));
// Delete the output directory if it exists already.
Path outputDir = new Path(args.output);
FileSystem.get(getConf()).delete(outputDir, true);
long startTime = System.currentTimeMillis();
job.waitForCompletion(true);
System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
return 0;
}
private BuildIndexWebTables() {}
public static class Args
{
@Option(name = "-input", metaVar = "[path]", required = true, usage = "input path")
public String input;
@Option(name = "-output", metaVar = "[path]", required = true, usage = "output path")
public String output;
@Option(name = "-reducers", metaVar = "[num]", required = false, usage = "number of reducers")
public int numReducers = 1;
}
public static class BuildIndexWebTablesMapper extends Mapper<LongWritable, Text, Text, Text> {
//public static final Log log = LogFactory.getLog(BuildIndexWebTablesMapper.class);
private static final Text WORD = new Text();
private static final Text OPVAL = new Text();
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// Log to stdout file
System.out.println("Map key : TEST");
//log to the syslog file
//log.info("Map key "+ key);
/*if(log.isDebugEanbled()){
log.debug("Map key "+ key);
}*/
Dataset ds;
String pgTitle; // Table page title
List<String> tokens = new ArrayList<String>(); // terms for frequency and other data
ds = Dataset.fromJson(value.toString()); // Get all text values from the json corpus
String[][] rel = ds.getRelation(); // Extract relation from the first json
int numCols = rel.length; // Number of columns in the relation
String[] attributes = new String[numCols]; // To store attributes for the relation
for (int j = 0; j < numCols; j++) { // Attributes of the relation
attributes[j] = rel[j][0];
}
int numRows = rel[0].length; //Number of rows of the relation
//dsTabNum = ds.getTableNum(); // Gets the table number from json
// Reads terms from relation and stores in tokens
for (int i = 0; i < numRows; i++ ){
for (int j = 0; j < numCols; j++ ){
String w = rel[i][j].toLowerCase().replaceAll("(^[^a-z]+|[^a-z]+$)", "");
if (w.length() == 0)
continue;
else {
w = w + "|" + pgTitle + "." + j + "|" + i; // Concatenate the term/PageTitle.Column number/row number in term
tokens.add(w);
}
}
}
// Emit postings.
for (String token : tokens){
String[] tokenPart = token.split("|", -2); // Split based on "|", -2(any negative) to split multiple times.
String newkey = tokenPart[0] + "|" + tokenPart[1];
WORD.set(newkey); // Emit term as key
//String valstr = Arrays.toString(Arrays.copyOfRange(tokenPart, 2, tokenPart.length)); // Emit rest of the string as value
String valstr = tokenPart[2];
OPVAL.set(valstr);
context.write(WORD,OPVAL);
}
}
}
public static class BuildIndexWebTablesReducer extends Reducer<Text, Text, Text, Text> {
private static final Text TERM = new Text();
private static final IntWritable TF = new IntWritable();
private String PrevTerm = null;
private int termFrequency = 0;
@Override
protected void reduce(Text key, Iterable<Text> textval, Context context) throws IOException, InterruptedException {
Iterator<Text> iter = textval.iterator();
IntWritable tnum = new IntWritable();
ArrayListWritable<IntWritable> postings = new ArrayListWritable<IntWritable>();
PairOfStringInt relColInfo = new PairOfStringInt();
PairOfWritables keyVal = new PairOfWritables<PairOfStringInt, ArrayListWritable<IntWritable>>();
if((!key.toString().equals(PrevTerm)) && (PrevTerm != null)) {
String[] parseKey = PrevTerm.split("|", -2);
TERM.set(parseKey[0]);
relColInfo.set(parseKey[1],termFrequency);
keyVal.set(relColInfo, postings);
context.write(TERM, keyVal);
termFrequency = 0;
postings.clear();
}
PrevTerm = key.toString();
while (iter.hasNext()) {
int tupleset = Integer.parseInt(iter.next().toString());
tnum.set(tupleset);
postings.add(tnum);
termFrequency++;
}
}
}
}
I am getting the below error during compilation:
[ERROR] Failed to execute goal org.apache.maven.plugins:maven-compiler-plugin:2.3.2:compile (default-compile) on project projeto-final: Compilation failure
[ERROR] /home/cloudera/topicosBD-pis/topicosBD-pis/projeto-final/src/main/java/br/edu/ufam/anibrata/BuildIndexWebTables.java:[278,11] error: method write in interface TaskInputOutputContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> cannot be applied to given types;
The exact line where this occurs is context.write(TERM, keyVal);. This code has some dependencies that are based on my local machine, though. I am stuck on this error since I cannot find any clue about it anywhere. Could someone help me understand the origin of the issue and how it can be tackled? I am pretty new to Hadoop / MapReduce.
I have tried toggling the output format class between job.setOutputFormatClass(MapFileOutputFormat.class); and job.setOutputFormatClass(TextOutputFormat.class);, but both throw the same error. I am using "mvn clean package" to compile.
Any help is very much appreciated.
Thanks in advance.
As I can see, you are trying to write to the context a key (TERM) of type Text and a value (keyVal) of type PairOfWritables, but your reducer class extends Reducer with a VALUEOUT (the last type parameter) of Text. You should change VALUEOUT to the proper type.
In your case:
public static class BuildIndexWebTablesReducer extends Reducer<Text, Text, Text, PairOfWritables>
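A minimal sketch of how the corrected declaration lines up with the write call (the generic parameters below are an assumption based on the types used in the question):
public static class BuildIndexWebTablesReducer
        extends Reducer<Text, Text, Text, PairOfWritables<PairOfStringInt, ArrayListWritable<IntWritable>>> {
    @Override
    protected void reduce(Text key, Iterable<Text> textval, Context context)
            throws IOException, InterruptedException {
        // ... build TERM and keyVal exactly as in the question ...
        // VALUEOUT now matches the type of keyVal, so this call compiles:
        // context.write(TERM, keyVal);
    }
}
The driver already calls job.setOutputValueClass(PairOfWritables.class), which matches this declaration, so no change is needed there.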

How to read duplicate word counts from a directory or folder

I got the program below from a coding site.
The following code reads a text file and finds duplicate words.
I want to read from each text file and display its duplicate word counts line by line.
Also, how do I process the files if the content is not stored as a String? I used a BufferedReader but I am not getting my output.
My questions:
How can I make the program read multiple files from a given folder?
How do I save the results in Excel file format?
Any suggestions welcome.
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Map.Entry;
public class MaxDuplicateWordCount {
public Map<String, Integer> getWordCount(String fileName){
FileInputStream fis = null;
DataInputStream dis = null;
BufferedReader br = null;
Map<String, Integer> wordMap = new HashMap<String, Integer>();
try {
fis = new FileInputStream(fileName);
dis = new DataInputStream(fis);
br = new BufferedReader(new InputStreamReader(dis));
String line = null;
while((line = br.readLine()) != null){
StringTokenizer st = new StringTokenizer(line, " ");
while(st.hasMoreTokens()){
String tmp = st.nextToken().toLowerCase();
if(wordMap.containsKey(tmp)){
wordMap.put(tmp, wordMap.get(tmp)+1);
} else {
wordMap.put(tmp, 1);
}
}
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally{
try{if(br != null) br.close();}catch(Exception ex){}
}
return wordMap;
}
public List<Entry<String, Integer>> sortByValue(Map<String, Integer> wordMap){
Set<Entry<String, Integer>> set = wordMap.entrySet();
List<Entry<String, Integer>> list = new ArrayList<Entry<String, Integer>>(set);
Collections.sort( list, new Comparator<Map.Entry<String, Integer>>()
{
public int compare( Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2 )
{
return (o2.getValue()).compareTo( o1.getValue() );
}
} );
return list;
}
public static void main(String a[]){
MaxDuplicateWordCount mdc = new MaxDuplicateWordCount();
Map<String, Integer> wordMap = mdc.getWordCount("E:\\Blog 39.txt");
List<Entry<String, Integer>> list = mdc.sortByValue(wordMap);
for(Map.Entry<String, Integer> entry:list){
System.out.println(entry.getKey()+" ="+entry.getValue());
}
}
}
Intro
After chatting with the OP, here is briefly what the OP requires:
1- Read file(s) from a specific folder; the files are typically Unicode text files.
2- The files will be processed by the OP's algorithm from the question, and the results should be saved as a Unicode file again. (Later the OP asked for the results to be saved as an Excel file (.xls) because of Excel's Unicode compatibility.)
Solution
This can be solved in the following steps:
Step 1: We define (declare) our workspace.
Step 2: We create an output folder in the workspace if it does not exist.
Step 3: We read all existing files in the workspace folder and process them with the algorithm.
Step 4: The results for each file are saved as an Excel file in the output folder.
The code
First of all you need to import the POI package, which will allow you to create the XLS sheet. I downloaded poi/poi-3.5-FINAL.jar.zip (1,372 k), and the following imports should be added to your code:
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFRow;
Next, add the following code to your program; it is fairly self-explanatory:
final static String WORKSPACE = "C:/testfolder/";
private static void createOutputFolder(String outputFolderName) {
File outputDirectory = new File(WORKSPACE + outputFolderName);
if (!outputDirectory.exists()) {
try {
outputDirectory.mkdir();
} catch (Exception e) {
}
}
}
private static void exlCreator() {
String outputFolder = "output/";
String fileName, fileNameWPathInput;
int serialNumber = 1;
createOutputFolder(outputFolder);
MaxDuplicateWordCount mdc = new MaxDuplicateWordCount();
File folder = new File(WORKSPACE);
File[] listOfFiles = folder.listFiles();
for (int i = 0; i < listOfFiles.length; i++) {
if (listOfFiles[i].isFile()) {
fileName = listOfFiles[i].getName();
fileNameWPathInput = WORKSPACE + fileName;
Map<String, Integer> wordMap = mdc.getWordCount(fileNameWPathInput);
List<Entry<String, Integer>> list = mdc.sortByValue(wordMap);
String fileNameWPathOutput = WORKSPACE + outputFolder +
fileName.substring(0, fileName.length() - 4)
+ "output.xls";
try {
HSSFWorkbook workbook = new HSSFWorkbook();
HSSFSheet sheet = workbook.createSheet("ResultSheet");
HSSFRow rowhead = sheet.createRow((short) 0);
rowhead.createCell(0).setCellValue("Serial No.");
rowhead.createCell(1).setCellValue("Word");
rowhead.createCell(2).setCellValue("Count");
for (Map.Entry<String, Integer> entry : list) {
HSSFRow row = sheet.createRow((short) serialNumber);
row.createCell(0).setCellValue(serialNumber);
row.createCell(1).setCellValue(entry.getKey());
row.createCell(2).setCellValue(entry.getValue());
serialNumber++;
}
FileOutputStream fileOut = new FileOutputStream(fileNameWPathOutput);
workbook.write(fileOut);
fileOut.close();
serialNumber = 1;
System.out.println(fileNameWPathOutput + " is created");
} catch (Exception ex) {
System.out.println(ex);
}
}
}
}
public static void main(String [] args) throws IOException {
exlCreator();
}
Finally
By tweaking the code further, it is possible to create a single output file and put each input file's results in its own worksheet, as sketched below.
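A rough sketch of that variant (one workbook, one sheet per input file; the method name, sheet-name cleanup, and output file name are assumptions):
private static void exlCreatorSingleWorkbook() throws IOException {
    HSSFWorkbook workbook = new HSSFWorkbook();
    File[] inputFiles = new File(WORKSPACE).listFiles();
    for (File f : inputFiles) {
        if (!f.isFile()) continue;
        // Sheet names must be unique, at most 31 characters, and free of characters such as / \ ? * [ ] :
        String sheetName = f.getName().replaceAll("[\\\\/?*\\[\\]:]", "_");
        sheetName = sheetName.substring(0, Math.min(31, sheetName.length()));
        HSSFSheet sheet = workbook.createSheet(sheetName);
        // ... fill this sheet with the word counts for f, as done per file in exlCreator() ...
    }
    FileOutputStream fileOut = new FileOutputStream(WORKSPACE + "output/all_results.xls");
    workbook.write(fileOut);
    fileOut.close();
}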
Opened in Excel, the output file shows the Unicode text without problems, which was the issue with my first solution.
Links
Download POI
POI documentation
Unicode problem in CSV
More about CSV
Full code, as requested by the OP:
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Map.Entry;
// for the Excel workbook (Apache POI HSSF)
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFRow;
public class MaxDuplicateWordCount {
public Map<String, Integer> getWordCount(String fileName) {
FileInputStream fis = null;
DataInputStream dis = null;
BufferedReader br = null;
Map<String, Integer> wordMap = new HashMap<String, Integer>();
try {
fis = new FileInputStream(fileName);
dis = new DataInputStream(fis);
br = new BufferedReader(new InputStreamReader(dis));
String line = null;
while ((line = br.readLine()) != null) {
StringTokenizer st = new StringTokenizer(line, " ");
while (st.hasMoreTokens()) {
String tmp = st.nextToken().toLowerCase();
if (wordMap.containsKey(tmp)) {
wordMap.put(tmp, wordMap.get(tmp) + 1);
} else {
wordMap.put(tmp, 1);
}
}
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (br != null) br.close();
} catch (Exception ex) {
}
}
return wordMap;
}
public List<Entry<String, Integer>> sortByValue(Map<String, Integer> wordMap) {
Set<Entry<String, Integer>> set = wordMap.entrySet();
List<Entry<String, Integer>> list = new ArrayList<Entry<String, Integer>>(set);
Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
return (o2.getValue()).compareTo(o1.getValue());
}
});
return list;
}
final static String WORKSPACE = "C:/testfolder/";
private static void createOutputFolder(String outputFolderName) {
File outputDirectory = new File(WORKSPACE + outputFolderName);
if (!outputDirectory.exists()) {
try {
outputDirectory.mkdir();
} catch (Exception e) {
}
}
}
private static void exlCreator() {
String outputFolder = "output/";
String fileName, fileNameWPathInput;
int serialNumber = 1;
createOutputFolder(outputFolder);
MaxDuplicateWordCount mdc = new MaxDuplicateWordCount();
File folder = new File(WORKSPACE);
File[] listOfFiles = folder.listFiles();
for (int i = 0; i < listOfFiles.length; i++) {
if (listOfFiles[i].isFile()) {
fileName = listOfFiles[i].getName();
fileNameWPathInput = WORKSPACE + fileName;
Map<String, Integer> wordMap = mdc.getWordCount(fileNameWPathInput);
List<Entry<String, Integer>> list = mdc.sortByValue(wordMap);
String fileNameWPathOutput = WORKSPACE + outputFolder +
fileName.substring(0, fileName.length() - 4)
+ "output.xls";
try {
HSSFWorkbook workbook = new HSSFWorkbook();
HSSFSheet sheet = workbook.createSheet("ResultSheet");
HSSFRow rowhead = sheet.createRow((short) 0);
rowhead.createCell(0).setCellValue("Serial No.");
rowhead.createCell(1).setCellValue("Word");
rowhead.createCell(2).setCellValue("Count");
for (Map.Entry<String, Integer> entry : list) {
HSSFRow row = sheet.createRow((short) serialNumber);
row.createCell(0).setCellValue(serialNumber);
row.createCell(1).setCellValue(entry.getKey());
row.createCell(2).setCellValue(entry.getValue());
serialNumber++;
}
FileOutputStream fileOut = new FileOutputStream(fileNameWPathOutput);
workbook.write(fileOut);
fileOut.close();
serialNumber = 1;
System.out.println(fileNameWPathOutput + " is created");
} catch (Exception ex) {
System.out.println(ex);
}
}
}
}
public static void main(String[] args) throws IOException {
exlCreator();
}
}
Let's say you have a directory with all the files you want to read from.
File folder = new File("/Users/you/folder/");
File[] listOfFiles = folder.listFiles();
for (File file : listOfFiles) {
if (file.isFile()) {
/*
 * Here, if your file is not a text file.
 * If I understood you correctly:
 * "And how to call that files if it is not stored as String"
 * you can get it as byte[] and parse it to a String.
 */
byte[] bytes = Files.readAllBytes(file.toPath());
String decoded = new String(bytes, "UTF-8");
String[] words = decoded.split("\\s+");
for (int i = 0; i < words.length; i++) {
/* You may want to check for a non-word character before blindly
* performing a replacement
* It may also be necessary to adjust the character class
*/
words[i] = words[i].replaceAll("[^\\w]", "");
//Here are all the words from a file. You can do whatever you want with them
}
}
}

jsoup Java HTML parsing

I'm a new French user on Stack Overflow and I have a problem ^^
I use the HTML parser Jsoup to parse an HTML page. That part works, but I can't parse several URLs at the same time.
This is my code:
First class, for parsing a web page:
package test2;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
public final class Utils {
public static Map<String, String> parse(String url){
Map<String, String> out = new HashMap<String, String>();
try
{
Document doc = Jsoup.connect(url).get();
doc.select("img").remove();
Elements denomination = doc.select(".AmmDenomination");
Elements composition = doc.select(".AmmComposition");
Elements corptexte = doc.select(".AmmCorpTexte");
for(int i = 0; i < denomination.size(); i++)
{
out.put("denomination" + i, denomination.get(i).text());
}
for(int i = 0; i < composition.size(); i++)
{
out.put("composition" + i, composition.get(i).text());
}
for(int i = 0; i < corptexte.size(); i++)
{
out.put("corptexte" + i, corptexte.get(i).text());
System.out.println(corptexte.get(i));
}
} catch(IOException e){
e.printStackTrace();
}
return out;
} // end of the parse() method
public static void excelizer(int fileId, Map<String, String> values){
try
{
FileOutputStream out = new FileOutputStream("C:/Documents and Settings/c.bon/git/clinsearch/drugs/src/main/resources/META-INF/test/fichier2.xls" );
Workbook wb = new HSSFWorkbook();
Sheet mySheet = wb.createSheet();
Row row1 = mySheet.createRow(0);
Row row2 = mySheet.createRow(1);
String entete[] = {"CIS", "Denomination", "Composition", "Form pharma", "Indication therapeutiques", "Posologie", "Contre indication", "Mise en garde",
"Interraction", "Effet indesirable", "Surdosage", "Pharmacodinamie", "Liste excipients", "Incompatibilité", "Duree conservation",
"Conservation", "Emballage", "Utilisation Manipulation", "TitulaireAMM"};
for (int i = 0; i < entete.length; i++)
{
row1.createCell(i).setCellValue(entete[i]);
}
Set<String> set = values.keySet();
int rowIndexDenom = 1;
int rowIndexCompo = 1;
for(String key : set)
{
if(key.contains("denomination"))
{
mySheet.createRow(1).createCell(1).setCellValue(values.get(key));
rowIndexDenom++;
}
else if(key.contains("composition"))
{
row2.createCell(2).setCellValue(values.get(key));
rowIndexDenom++;
}
}
wb.write(out);
out.close();
}
catch(Exception e)
{
e.printStackTrace();
}
}
}
Second class:
package test2;
public final class Task extends Thread {
private static int fileId = 0;
private int id;
private String url;
public Task(String url)
{
this.url = url;
id = fileId;
fileId++;
}
@Override
public void run()
{
Utils.excelizer(id, Utils.parse(url));
}
}
The main class (entry point):
package test2;
import java.util.ArrayList;
public class Main {
public static void main(String[] args)
{
ArrayList<String> urls = new ArrayList<String>();
urls.add("http://base-donnees-publique.medicaments.gouv.fr/affichageDoc.php?specid=61266250&typedoc=R");
urls.add("http://base-donnees-publique.medicaments.gouv.fr/affichageDoc.php?specid=66207341&typedoc=R");
for(String url : urls)
{
new Task(url).run();
}
}
}
When the data is copied to my Excel file, the second URL doesn't work.
Can you help me solve my problem, please?
Thanks
I think it's because your main() exits before your second thread has a chance to do its job. You should wait for all spawned threads to complete using Thread.join(). Or better yet, create one of the ExecutorServices and use awaitTermination(...) to block until all URLs are parsed.
EDIT: See some examples here: http://www.javacodegeeks.com/2013/01/java-thread-pool-example-using-executors-and-threadpoolexecutor.html
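A minimal sketch of that ExecutorService approach, reusing the Task class from the question (the pool size and timeout are arbitrary):
import java.util.ArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
public class Main {
    public static void main(String[] args) throws InterruptedException {
        ArrayList<String> urls = new ArrayList<String>();
        urls.add("http://base-donnees-publique.medicaments.gouv.fr/affichageDoc.php?specid=61266250&typedoc=R");
        urls.add("http://base-donnees-publique.medicaments.gouv.fr/affichageDoc.php?specid=66207341&typedoc=R");
        ExecutorService pool = Executors.newFixedThreadPool(2);
        for (String url : urls) {
            pool.submit(new Task(url)); // Task extends Thread, which implements Runnable
        }
        pool.shutdown();                             // stop accepting new tasks
        pool.awaitTermination(10, TimeUnit.MINUTES); // block until all submitted parses finish
    }
}
Note that Utils.excelizer() writes every result to the same hard-coded fichier2.xls path, so the second result will still overwrite the first unless the output file name is made unique per task (for example by using the fileId that is already passed in).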
