Use two CSV files linked by an id column in Hadoop - Java

I'm new to Hadoop and I don't know how to join two CSV files together.
Here are my two CSV files:
order_dataset.csv
order_id   order_approved_at     order_delivered_customer_date
--------   -------------------   -----------------------------
1          2017-10-02 19:55:00   2017-10-04 04:39:00
2          2017-01-26 14:16:31   2017-02-02 14:08:10
3          2018-06-09 03:13:12   2018-06-19 12:05:52
order_review_dataset.csv
order_id   customer_id   review_score
--------   -----------   ------------
1          12            3
2          23            4
3          93            5
I would like to have a result file like this :
delivery_time in day   avg_review_score
--------------------   ----------------
1                      3.03
2                      4.5
3                      3.76
For now, I have counted the orders per delivery time. I don't know how to use the second CSV file to add the review_score. Here is my code:
public class Question {
    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            System.err.println("Usage: Question <input path>");
            System.exit(-1);
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Job Title");
        job.setJarByClass(Question.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path outputPath = new Path("./output/question3");
        FileOutputFormat.setOutputPath(job, outputPath);
        outputPath.getFileSystem(conf).delete(outputPath, true);
        job.setMapperClass(QuestionMapper.class);
        job.setReducerClass(QuestionReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
public class QuestionMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private Text jourDelaiLivraison = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Skip the header line
        if (key.get() == 0)
            return;
        String ligne = value.toString();
        String[] tokens = ligne.split(",");
        DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        try {
            Date order_approved_at = formatter.parse(tokens[4]);
            Date order_delivered_customer_date = formatter.parse(tokens[6]);
            // The order must have been delivered
            // and the dates must be consistent
            if (tokens[2].equals("delivered") && order_approved_at.compareTo(order_delivered_customer_date) < 0) {
                long diff = order_delivered_customer_date.getTime() - order_approved_at.getTime();
                String delai = String.valueOf(TimeUnit.DAYS.convert(diff, TimeUnit.MILLISECONDS));
                jourDelaiLivraison.set(delai);
                // Write only when a delivery delay was actually computed;
                // writing outside this block would emit an empty or stale key
                context.write(jourDelaiLivraison, new IntWritable(1));
            }
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}
public class QuestionReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private Map<Text, Integer> delaiNoteSatisfaction;

    @Override
    public void setup(Context context) {
        this.delaiNoteSatisfaction = new HashMap<>();
    }

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) {
        int count = StreamSupport.stream(values.spliterator(), false)
                .mapToInt(IntWritable::get)
                .sum();
        delaiNoteSatisfaction.put(new Text(key), count);
    }

    @Override
    public void cleanup(Context context) {
        List<Text> keyList = new ArrayList<>(delaiNoteSatisfaction.keySet());
        keyList.sort(Comparator.comparingInt((Text t) -> Integer.valueOf(t.toString())));
        keyList.forEach(key -> {
            try {
                context.write(key, new IntWritable(delaiNoteSatisfaction.get(key)));
            } catch (IOException | InterruptedException e) {
                e.printStackTrace();
            }
        });
    }
}
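A common way to combine two files like this in plain MapReduce is a reduce-side join: read each CSV with its own mapper via MultipleInputs, key every record by order_id, tag each value with its source file, and merge the tagged values in the reducer. Below is a minimal sketch of that approach. The column positions follow the simplified samples above (order_id first in both files, review_score third in the review file), and JoinDriver, DeliveryMapper and ReviewMapper are illustrative names, not part of the original code:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JoinDriver {

    // Emits order_id -> "D,<approved_at>,<delivered_at>" from order_dataset.csv
    public static class DeliveryMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (key.get() == 0) return; // skip header
            String[] t = value.toString().split(",");
            context.write(new Text(t[0]), new Text("D," + t[1] + "," + t[2]));
        }
    }

    // Emits order_id -> "R,<review_score>" from order_review_dataset.csv
    public static class ReviewMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (key.get() == 0) return; // skip header
            String[] t = value.toString().split(",");
            context.write(new Text(t[0]), new Text("R," + t[2]));
        }
    }

    // For each order_id, pairs the delivery dates with the review score
    public static class JoinReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text orderId, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String delivery = null, score = null;
            for (Text v : values) {
                String s = v.toString();
                if (s.startsWith("D,")) delivery = s.substring(2);
                else if (s.startsWith("R,")) score = s.substring(2);
            }
            if (delivery != null && score != null) {
                context.write(orderId, new Text(delivery + "," + score));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "order join");
        job.setJarByClass(JoinDriver.class);
        MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, DeliveryMapper.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, ReviewMapper.class);
        job.setReducerClass(JoinReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

From the joined records, a second job can then compute the delivery time in days (as the existing QuestionMapper already does), key by that delivery time, and average the review scores per key to produce the avg_review_score column.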

Related

Why is the output empty in Hadoop MapReduce? How to debug?

I want to aggregate the total_amount (float) for each date ("yyyy-MM-dd HH:mm:ss.S").
The input data and the (empty) output were shown via links in the original post.
The basic idea of the mapper is to skip the first line (the header), then get the date and total_amount of each line, and change the date format to "yyyy-MM-dd". I guess the errors happen in the try {} catch () {} block.
By the way, I am wondering: if there are some bad lines in the dataset, for instance the value of total_amount is not a float but contains '$' like "$ 9.5", or the date is not in the "yyyy-MM-dd HH:mm:ss.S" format, how can I detect those lines and discard them? Do you have any ideas? (See the sketch after the code below.)
Here is the code:
public class RevenueSum extends Configured implements Tool {
    static int printUsage() {
        System.out.println("revenuesum [-m <maps>] [-r <reduces>] <input> <output>");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    public static class RevenueSumMapper
            extends Mapper<LongWritable, Text, Text, FloatWritable> {
        @Override
        public void map(LongWritable key, Text value, Context context
        ) throws IOException, InterruptedException {
            try {
                /* Some condition satisfying it is header */
                if (key.get() == 0 && value.toString().contains("header")) {
                    return;
                } else {
                    // For the rest of the data it goes here
                    String line = value.toString();
                    String[] items = line.split(",");
                    String old_date = items[3];
                    LocalDateTime datetime = LocalDateTime.parse(old_date,
                            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.S"));
                    String date = datetime.format(DateTimeFormatter.ofPattern("yyyy-MM-dd"));
                    Float revenue = Float.parseFloat(items[10]);
                    context.write(new Text(date), new FloatWritable(revenue));
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static class RevenueSumReducer
            extends Reducer<Text, FloatWritable, Text, FloatWritable> {
        private FloatWritable result = new FloatWritable();

        public void reduce(Text key, Iterable<FloatWritable> values,
                           Context context
        ) throws IOException, InterruptedException {
            float sum = 0; // accumulate as float: an int would truncate each value
            for (FloatWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "revenue sum");
        job.setJarByClass(RevenueSum.class);
        job.setMapperClass(RevenueSumMapper.class);
        job.setCombinerClass(RevenueSumReducer.class);
        job.setReducerClass(RevenueSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FloatWritable.class);
        List<String> other_args = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            try {
                if ("-r".equals(args[i])) {
                    job.setNumReduceTasks(Integer.parseInt(args[++i]));
                } else {
                    other_args.add(args[i]);
                }
            } catch (NumberFormatException except) {
                System.out.println("ERROR: Integer expected instead of " + args[i]);
                return printUsage();
            } catch (ArrayIndexOutOfBoundsException except) {
                System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
                return printUsage();
            }
        }
        // Make sure there are exactly 2 parameters left.
        if (other_args.size() != 2) {
            System.out.println("ERROR: Wrong number of parameters: " +
                    other_args.size() + " instead of 2.");
            return printUsage();
        }
        FileInputFormat.setInputPaths(job, other_args.get(0));
        FileOutputFormat.setOutputPath(job, new Path(other_args.get(1)));
        return (job.waitForCompletion(true) ? 0 : 1);
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new RevenueSum(), args);
        System.exit(res);
    }
}
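On the bad-lines question: a common pattern is to validate each field explicitly in the mapper and count rejected records with job counters, so discarded lines are visible in the job output rather than silently swallowed by a blanket catch. A minimal sketch, assuming the same column layout and the same imports as above plus java.time.format.DateTimeParseException; the counter group/names and the '$'-stripping rule are illustrative assumptions:

public static class SafeRevenueSumMapper
        extends Mapper<LongWritable, Text, Text, FloatWritable> {
    private static final DateTimeFormatter IN =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.S");
    private static final DateTimeFormatter OUT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd");

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] items = value.toString().split(",");
        if (items.length < 11) {
            // Too few columns: count and discard
            context.getCounter("RevenueSum", "MALFORMED_LINE").increment(1);
            return;
        }
        try {
            // Strip a leading '$' and whitespace before parsing the amount
            float revenue = Float.parseFloat(items[10].replace("$", "").trim());
            LocalDateTime dt = LocalDateTime.parse(items[3], IN);
            context.write(new Text(dt.format(OUT)), new FloatWritable(revenue));
        } catch (NumberFormatException | DateTimeParseException e) {
            // Bad amount or bad date (this also catches the header line):
            // count and discard
            context.getCounter("RevenueSum", "BAD_FIELD").increment(1);
        }
    }
}

After the job finishes, the MALFORMED_LINE and BAD_FIELD counters appear in the job's counter summary. This also makes an unexpectedly empty output easy to diagnose: if every record increments BAD_FIELD, the date pattern or a column index is wrong.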

MapReduce: Reduce function is writing strange values that are not expected

My reduce function in Java is writing values to the output file that are not expected. I inspected my code with breakpoints and saw that, for each context.write call I make, the key and the value I'm writing are correct. Where am I making a mistake?
What I'm trying to do is take input rows of the form date, customer, vendor, amount that represent transactions, and generate a dataset with rows like date, user, balance, where the balance is the sum of all transactions in which the user was either the customer or the vendor.
Here is my code:
public class Transactions {

    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, Text> {

        public void map(Object key, Text value, Context context
        ) throws IOException, InterruptedException {
            var splittedValues = value.toString().split(",");
            var date = splittedValues[0];
            var customer = splittedValues[1];
            var vendor = splittedValues[2];
            var amount = splittedValues[3];
            var reduceValue = new Text(customer + "," + vendor + "," + amount);
            context.write(new Text(date), reduceValue);
        }
    }

    public static class IntSumReducer
            extends Reducer<Text, Text, Text, Text> {

        public void reduce(Text key, Iterable<Text> values, Context context
        ) throws IOException, InterruptedException {
            Map<String, Integer> balanceByUserId = new ConcurrentHashMap<>();
            values.forEach(transaction -> {
                var splittedTransaction = transaction.toString().split(",");
                var customer = splittedTransaction[0];
                var vendor = splittedTransaction[1];
                var amount = 0;
                if (splittedTransaction.length > 2) {
                    amount = Integer.parseInt(splittedTransaction[2]);
                }
                if (!balanceByUserId.containsKey(customer)) {
                    balanceByUserId.put(customer, 0);
                }
                if (!balanceByUserId.containsKey(vendor)) {
                    balanceByUserId.put(vendor, 0);
                }
                balanceByUserId.put(customer, balanceByUserId.get(customer) - amount);
                balanceByUserId.put(vendor, balanceByUserId.get(vendor) + amount);
            });
            balanceByUserId.entrySet().forEach(entry -> {
                var reducerValue = new Text(entry.getKey() + "," + entry.getValue().toString());
                try {
                    context.write(key, reducerValue);
                } catch (IOException | InterruptedException e) {
                    e.printStackTrace();
                }
            });
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "transactions");
        job.setJarByClass(Transactions.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
"where the balance is the sum of all transactions in which user was both customer or vendor"
balanceByUserId exists only per unique date, since your map key is the date.
If you want to aggregate by customer info (name / ID?), then the customer should be the key of the mapper output.
Once all data for each customer is grouped by the reducer, you can then sort by date, if needed, but aggregate by the other details.
Note also that reusing IntSumReducer as a combiner corrupts the data: the combiner emits "user,balance" pairs, which the reducer then re-parses as if they were "customer,vendor,amount" records. That is exactly the kind of unexpected value you are seeing.
Also worth pointing out that this would be easier in Hive or SparkSQL than in MapReduce.
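A minimal sketch of that restructuring, assuming integer amounts and the same four-column input; UserMapper and BalanceReducer are illustrative names, and the IntWritable import is assumed:

public static class UserMapper extends Mapper<Object, Text, Text, IntWritable> {
    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] f = value.toString().split(",");
        int amount = Integer.parseInt(f[3]);
        // Emit one signed amount per side of the transaction:
        // the customer pays, the vendor receives.
        context.write(new Text(f[1]), new IntWritable(-amount));
        context.write(new Text(f[2]), new IntWritable(amount));
    }
}

public static class BalanceReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text user, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int balance = 0;
        for (IntWritable v : values) {
            balance += v.get();
        }
        context.write(user, new IntWritable(balance));
    }
}

Because plain summation is associative, this reducer is also safe to register as a combiner, unlike the original IntSumReducer.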

Hadoop secondary sorting

I'm trying to implement secondary sort, following this example: https://www.safaribooksonline.com/library/view/data-algorithms/9781491906170/ch01.html
But my problem is different. I have a list of products with the year and month and the price, like this:
201505011000######PEN DRIVE00951
201505011000######PEN DRIVE00952
201505011000######PEN DRIVE00458
201505011000######PEN DRIVE00459
201505011000#######NOTEBOOK11470
201605011000#######NOTEBOOK21471
201705011000#######NOTEBOOK21472
201705011000###GAVETA DE HD01472
201703011000###GAVETA DE HD01473
201705011000###GAVETA DE HD01474
For example, 201505 represents the year and month, after the # signs comes the product name, and at the end is the price: 01470 represents 14,70.
What I need to do is get the lowest price for each product and show the year and month of that price. But I don't know how to do that; what I can show is only the lowest price and the product.
Here is my program:
MAPPER
public class GroupMR {
    public static class GroupMapper extends Mapper<LongWritable, Text, Product, IntWritable> {
        Product prdt = new Product();
        Text cntText = new Text();
        Text YearMonthText = new Text();
        IntWritable price = new IntWritable();

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String produto = line.substring(13, 27); // product name
            produto = produto.substring(produto.lastIndexOf("#") + 1);
            String ano = line.substring(0, 6);
            int valor = Integer.parseInt(line.substring(27, 32));
            cntText.set(new Text(produto));
            YearMonthText.set(ano);
            price.set(valor);
            Product prdt = new Product(cntText, YearMonthText);
            context.write(prdt, price);
        }
    }
REDUCER
    public static class GroupReducer extends Reducer<Product, IntWritable, Product, IntWritable> {
        public void reduce(Product key, Iterator<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int minValue = Integer.MAX_VALUE;
            while (values.hasNext()) {
                minValue = Math.min(minValue, values.next().get());
            }
            context.write(key, new IntWritable(minValue));
        }
    }
COMPARABLE
    private static class Product implements WritableComparable<Product> {
        Text Product;
        Text YearMonth;

        public Product(Text Product, Text YearMonth) {
            this.Product = Product;
            this.YearMonth = YearMonth;
        }

        public Product() {
            this.Product = new Text();
            this.YearMonth = new Text();
        }

        public void write(DataOutput out) throws IOException {
            this.Product.write(out);
            this.YearMonth.write(out);
        }

        public void readFields(DataInput in) throws IOException {
            this.Product.readFields(in);
            this.YearMonth.readFields(in);
        }

        public int compareTo(Product pric) {
            if (pric == null)
                return 0;
            int intcnt = Product.compareTo(pric.Product);
            return intcnt;
        }

        @Override
        public String toString() {
            return Product.toString() + " DATA: " + YearMonth.toString();
        }
    }
DRIVER
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        FileUtils.deleteDirectory(new File("/Local/data/output"));
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "GroupMR");
        job.setJarByClass(GroupMR.class);
        job.setMapperClass(GroupMapper.class);
        job.setReducerClass(GroupReducer.class);
        job.setOutputKeyClass(Product.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
RESULT
201605011000######PEN DRIVE00950
201505011000######PEN DRIVE00951
201505011000######PEN DRIVE00952
201505011000######PEN DRIVE00458
201505011000######PEN DRIVE00459
201505011000#######NOTEBOOK11470
201605011000#######NOTEBOOK21471
201705011000#######NOTEBOOK21472
201705011000###GAVETA DE HD01472
201703011000###GAVETA DE HD01473
201705011000###GAVETA DE HD01474
I think the problem is in the reduce() and in the compareTo(), but I have no idea how to fix them. Could someone help me with it?
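Two issues stand out here. First, reduce() takes Iterator<IntWritable> instead of Iterable<IntWritable>, so it never actually overrides Reducer.reduce() and Hadoop runs the default identity reducer, which is why the output simply echoes the input. Second, compareTo() only compares the product name, so whichever YearMonth happens to reach the reducer first gets printed, not the one belonging to the minimum price. A minimal sketch of a fix that sidesteps the composite key entirely and carries the year/month inside the value; MinPriceMapper and MinPriceReducer are illustrative names:

// Key = product name, value = "yyyyMM,price"; the reducer keeps the
// minimum price together with the year/month it occurred in.
public static class MinPriceMapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String product = line.substring(13, 27);
        product = product.substring(product.lastIndexOf("#") + 1);
        String yearMonth = line.substring(0, 6);
        String price = line.substring(27, 32);
        context.write(new Text(product), new Text(yearMonth + "," + price));
    }
}

public static class MinPriceReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    public void reduce(Text product, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        int minPrice = Integer.MAX_VALUE;
        String minYearMonth = "";
        for (Text v : values) {
            String[] parts = v.toString().split(",");
            int price = Integer.parseInt(parts[1]);
            if (price < minPrice) {
                minPrice = price;
                minYearMonth = parts[0];
            }
        }
        context.write(product, new Text("DATA: " + minYearMonth + " MIN: " + minPrice));
    }
}

If the true secondary-sort pattern from the linked chapter is required instead, the composite Product key must include the price, compareTo() must order by product first and price second, and a grouping comparator over the product name alone is needed so each product's records arrive in a single reduce call with the cheapest first. The sketch above trades that machinery for a simple scan in the reducer.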

reducer not being called in the mapreduce program

I am writing a simple extension of a MapReduce program and found that my code only displays output from map(). The MapReduce job runs in Eclipse without any errors, but reduce() is never invoked.
Here is my map():
public static class KVMapper
        extends Mapper<Text, Text, IntWritable, Text> {
    // extends Mapper<Text, Text, Text, IntWritable>{
    private final static IntWritable one = new IntWritable(1);
    private String word; // = new Text();
    private IntWritable iw;
    private final LongWritable val = new LongWritable();

    public void map(Text key, Text value, Context context
    ) throws IOException, InterruptedException {
        iw = new IntWritable(Integer.parseInt(value.toString()));
        System.out.println(value + " hello , world " + key);
        context.write(iw, key);
    }
}
Reduce()
public static class KVReducer
        extends Reducer<IntWritable, Text, IntWritable, Text> {
    KVReducer() {
        System.out.println("Inside reducer");
    }

    public void reduce(IntWritable key, Text value, Context context
    ) throws IOException, InterruptedException {
        System.out.println(value + " hello2 , world " + key);
        context.write(key, value);
    }
}
main()
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");
    //conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ",");
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: wordcount <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word desc");
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setJarByClass(WordDesc.class);
    job.setMapperClass(KVMapper.class);
    job.setCombinerClass(KVReducer.class);
    job.setReducerClass(KVReducer.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job,
            new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Sample of the input:
1500s 1
1960s 1
Aldus 1
Sample output from the program, while I was expecting the mapper to swap the key and value pairs:
1500s 1
1960s 1
Aldus 1
Not sure why the reduce() is not being invoked in the above code
You are not overriding the reduce() method of the Reducer class.
For your case its signature should be public void reduce(IntWritable key, Iterable<Text> values, Context context); note Iterable, not a single Text value. Because the signature does not match, Hadoop falls back to the default identity reduce().
Here is the updated KVReducer:
public static class KVReducer
        extends Reducer<IntWritable, Text, IntWritable, Text> {
    KVReducer() {
        System.out.println("Inside reducer");
    }

    @Override
    public void reduce(IntWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Process every value inside the loop; the original snippet had an
        // empty loop body and used `value` outside it, which does not compile
        for (Text value : values) {
            System.out.println(value + " hello2 , world " + key);
            context.write(key, value);
        }
    }
}

Mapreduce - FloatArrayWritable printing address

I have a MapReduce program whose reduce method outputs a Text as the key and a FloatArrayWritable as the value. However, the output shows the object address instead of the values from the toString() method.
The output I am getting is:
IYE marketDataPackage.MarketData@69204998
IYE marketDataPackage.MarketData@69204998
The output should be:
IYE 38.89, 38.50, etc.
Could someone please advise the error in my code? Thanks.
public static class Map extends Mapper<LongWritable, Text, Text, MarketData> {
    private Text symbol = new Text();

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line);
        while (tokenizer.hasMoreTokens()) {
            StringTokenizer tokenizer2 = new StringTokenizer(tokenizer.nextToken().toString(), ",");
            symbol.set(tokenizer2.nextToken());
            context.write(symbol, new MarketData(tokenizer2.nextToken(), Float.parseFloat(tokenizer2.nextToken())));
        }
    }
}
public static class Reduce extends Reducer<Text, FloatWritable, Text, FloatArrayWritable> {
    public void reduce(Text key, Iterable<MarketData> values, Context context) throws IOException, InterruptedException, ParseException {
        Calendar today = Calendar.getInstance();
        today.add(Calendar.DAY_OF_MONTH, -45);
        Calendar testDate = Calendar.getInstance();
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy/m/d");
        List<FloatWritable> prices = new ArrayList<FloatWritable>();
        for (MarketData m : values) {
            testDate.setTime(sdf.parse(m.getTradeDate()));
            if (testDate.after(today)) {
                prices.add(new FloatWritable(m.getPrice()));
            }
        }
        context.write(key, new FloatArrayWritable(prices.toArray(new FloatWritable[prices.size()])));
    }
}
public static void main(String[] args) {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "Security_Closing_Prices");
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(MarketData.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.waitForCompletion(true);
}
FloatArrayWritable class:
public class FloatArrayWritable extends ArrayWritable {
    public FloatArrayWritable() {
        super(FloatWritable.class);
    }

    public FloatArrayWritable(FloatWritable[] values) {
        super(FloatWritable.class, values);
    }

    @Override
    public FloatWritable[] get() {
        return (FloatWritable[]) super.get();
    }

    @Override
    public String toString() {
        FloatWritable[] values = get();
        String prices = "";
        for (FloatWritable f : values) {
            prices = prices + f.toString() + ", ";
        }
        if (prices != null && !prices.isEmpty()) {
            prices = prices.substring(0, prices.length() - 2);
        }
        return prices;
    }
}
The MarketData class should override toString(). You don't provide code for that class, but I suspect that it doesn't.
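A minimal sketch of what that class could look like with the override in place, reconstructed from the getters used in the reducer; the field names and the Writable serialization are assumptions:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class MarketData implements Writable {
    private String tradeDate;
    private float price;

    public MarketData() {
    }

    public MarketData(String tradeDate, float price) {
        this.tradeDate = tradeDate;
        this.price = price;
    }

    public String getTradeDate() {
        return tradeDate;
    }

    public float getPrice() {
        return price;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(tradeDate);
        out.writeFloat(price);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        tradeDate = in.readUTF();
        price = in.readFloat();
    }

    @Override
    public String toString() {
        // Without this override, TextOutputFormat falls back to
        // Object.toString(), e.g. "marketDataPackage.MarketData@69204998"
        return tradeDate + " " + price;
    }
}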
