My UDF:
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.IntWritable;

public class HoursDiff extends UDF {

    public IntWritable evaluate(String date, String time) {
        String dateStart = "2014-12-01 00:00:00";
        String currentdate = date + " " + time;
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        try {
            Date d1 = format.parse(dateStart);
            Date d2 = format.parse(currentdate);
            // whole hours elapsed between the fixed start date and the input date/time
            long diff = d2.getTime() - d1.getTime();
            long diffHours = diff / 3600000 % 24;
            long diffDays = diff / 86400000;
            int hours = (int) (diffDays * 24 + diffHours);
            return new IntWritable(hours);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
I exported it to /home/hadoop/mapreduce/HoursDiff.jar.
I opened the Hive shell:
add jar /home/hadoop/mapreduce/HoursDiff.jar;
create temporary function hoursdiff as 'HoursDiff';
When I try to execute the following command, I get a FileNotFoundException:
select hoursdiff(date,time) as hours from date_test;
STACK TRACE
create temporary function hoursdiff as 'HoursDiff';
OK
Time taken: 0.009 seconds
hive> select hoursdiff(date,time) as hours from date_test;
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
15/10/11 15:17:03 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Execution log at: /tmp/hadoop/hadoop_20151011151616_2c15561f-7cd2-4012-8bd2-b7dfcf488432.log
java.io.FileNotFoundException: File does not exist: hdfs://172.16.253.17:54310/home/hadoop/mapreduce/HoursDiff.jar
at org.apache.hadoop.hdfs.DistributedFileSystem$18.doCall(DistributedFileSystem.java:1122)
at org.apache.hadoop.hdfs.DistributedFileSystem$18.doCall(DistributedFileSystem.java:1114)
at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1114)
at org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.getFileStatus(ClientDistributedCacheManager.java:288)
at org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.getFileStatus(ClientDistributedCacheManager.java:224)
at org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.determineTimestamps(ClientDistributedCacheManager.java:93)
at org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.determineTimestampsAndCacheVisibilities(ClientDistributedCacheManager.java:57)
at org.apache.hadoop.mapreduce.JobSubmitter.copyAndConfigureFiles(JobSubmitter.java:269)
at org.apache.hadoop.mapreduce.JobSubmitter.copyAndConfigureFiles(JobSubmitter.java:390)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:483)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1296)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1293)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1628)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1293)
at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:562)
at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:557)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1628)
at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:557)
at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:548)
at org.apache.hadoop.hive.ql.exec.mr.ExecDriver.execute(ExecDriver.java:420)
at org.apache.hadoop.hive.ql.exec.mr.ExecDriver.main(ExecDriver.java:740)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:601)
at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
Job Submission failed with exception 'java.io.FileNotFoundException(File does not exist: hdfs://172.16.253.17:54310/home/hadoop/mapreduce/HoursDiff.jar)'
Execution failed with exit status: 1
Everything you have done is correct, but Hive is searching an HDFS path while you registered the jar with a local path.
Copy the jar to an HDFS location and register it with the HDFS path.
You presumably opened the Hive shell as the HDFS user, which is why it resolves the path against HDFS.
Note: Hive will also accept a local path to register the jar.
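A minimal sketch of the fix, assuming your Hive version accepts hdfs:// URIs in add jar (the target directory /user/hadoop/jars is an arbitrary choice; any HDFS path you can write to works):

hadoop fs -mkdir -p /user/hadoop/jars
hadoop fs -put /home/hadoop/mapreduce/HoursDiff.jar /user/hadoop/jars/

Then, in the Hive shell:

add jar hdfs://172.16.253.17:54310/user/hadoop/jars/HoursDiff.jar;
create temporary function hoursdiff as 'HoursDiff';
select hoursdiff(date,time) as hours from date_test;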
I'm attempting to use the Java Spark libraries with a cluster running Spark 2.3.0 over Hadoop 3.1.0 (and using those versions of the Java libraries).
I've run into a problem where I simply cannot use groupByKey, and I am at a loss to explain why. Any attempted usage of groupByKey for any reason in any circumstance is returning a java.lang.IllegalArgumentException.
I've boiled this down to about the simplest test I can think of:
package com.failuretest;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class TestReport {

    public static void main(String[] args) throws Exception {
        SparkConf conf = new SparkConf().setAppName("TestReport").set("spark.executor.memory", "20G");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> test = sc.parallelize(generateTestData());
        test.saveAsTextFile("/TEST/testfile1");
        test.mapToPair(line -> {
            String[] testParts = line.split(" ");
            return new Tuple2<String, String>(testParts[0], testParts[1]);
        }).groupByKey().saveAsTextFile("/TEST/testfile2");
        sc.close();
    }

    private static List<String> generateTestData() {
        List<String> testList = new ArrayList<String>();
        int keyCount = 0;
        int valCount = 0;
        while (valCount++ < 2000000) {
            if (valCount % 10 == 0) {
                keyCount++;
            }
            testList.add("Key" + keyCount + " " + "Val" + valCount);
        }
        return testList;
    }
}
I'm just programmatically creating an RDD that produces 10 values per key, then creating my JavaPairRDD with a simple split, then attempting groupByKey.
When it runs, I receive the following stack:
Exception in thread "main" java.lang.IllegalArgumentException
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.spark.util.ClosureCleaner$.getClassReader(ClosureCleaner.scala:46)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:449)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:432)
at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:733)
at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:103)
at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:103)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:230)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40)
at scala.collection.mutable.HashMap$$anon$1.foreach(HashMap.scala:103)
at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:732)
at org.apache.spark.util.FieldAccessFinder$$anon$3.visitMethodInsn(ClosureCleaner.scala:432)
at org.apache.xbean.asm5.ClassReader.a(Unknown Source)
at org.apache.xbean.asm5.ClassReader.b(Unknown Source)
at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)
at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:262)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:261)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:261)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:159)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2292)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$combineByKeyWithClassTag$1.apply(PairRDDFunctions.scala:88)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$combineByKeyWithClassTag$1.apply(PairRDDFunctions.scala:77)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.PairRDDFunctions.combineByKeyWithClassTag(PairRDDFunctions.scala:77)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$groupByKey$1.apply(PairRDDFunctions.scala:505)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$groupByKey$1.apply(PairRDDFunctions.scala:498)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.PairRDDFunctions.groupByKey(PairRDDFunctions.scala:498)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$groupByKey$3.apply(PairRDDFunctions.scala:641)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$groupByKey$3.apply(PairRDDFunctions.scala:641)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.PairRDDFunctions.groupByKey(PairRDDFunctions.scala:640)
at org.apache.spark.api.java.JavaPairRDD.groupByKey(JavaPairRDD.scala:559)
at com.failuretest.TestReport.main(TestReport.java:22)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:564)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:879)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:197)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:227)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:136)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
It doesn't get any further than the groupByKey (I'm writing a file above with the results, but it really doesn't matter since it never gets there).
I can run it all day long in my local dev instance, but running spark-submit with a jar containing the above fails every time in the cluster.
I'm really not sure where to go from here - what I am trying to do is a bit of a challenge if I cannot group by key.
Am I messing up? Is this a version conflict somewhere?
Dave
I actually figured this out before posting this, but in the interests of helping others...
I discovered that one of my colleagues had decided to have a play around with Java 10 on this particular cluster. I moved it back to Java 8 (sorry, didn't try 9) and the problem went away. (Spark 2.3 inspects closure bytecode with ASM 5, visible in the org.apache.xbean.asm5 frames above, and ASM 5 cannot parse class files compiled for Java 9+, hence the bare IllegalArgumentException from ClassReader.<init>.)
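If you want to confirm before downgrading, a quick sanity check on a cluster node (assuming shell access):

java -version    # Spark 2.3.0 targets Java 8; Java 9/10 class files break its ASM-based ClosureCleaner
echo $JAVA_HOME  # make sure the cluster's Spark config points at the same JDK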
Dave
I am trying to run a Spark sample in local mode, but am getting the following stack trace:
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/spark/internal/io/HadoopMapReduceCommitProtocol
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClass(ClassLoader.java:763)
at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
at java.net.URLClassLoader.defineClass(URLClassLoader.java:467)
at java.net.URLClassLoader.access$100(URLClassLoader.java:73)
at java.net.URLClassLoader$1.run(URLClassLoader.java:368)
at java.net.URLClassLoader$1.run(URLClassLoader.java:362)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:361)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at org.apache.spark.sql.internal.SQLConf$.<init>(SQLConf.scala:383)
at org.apache.spark.sql.internal.SQLConf$.<clinit>(SQLConf.scala)
at org.apache.spark.sql.internal.StaticSQLConf$$anonfun$buildConf$1.apply(SQLConf.scala:930)
at org.apache.spark.sql.internal.StaticSQLConf$$anonfun$buildConf$1.apply(SQLConf.scala:928)
at org.apache.spark.internal.config.TypedConfigBuilder$$anonfun$createWithDefault$1.apply(ConfigBuilder.scala:122)
at org.apache.spark.internal.config.TypedConfigBuilder$$anonfun$createWithDefault$1.apply(ConfigBuilder.scala:122)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.internal.config.TypedConfigBuilder.createWithDefault(ConfigBuilder.scala:122)
at org.apache.spark.sql.internal.StaticSQLConf$.<init>(SQLConf.scala:937)
at org.apache.spark.sql.internal.StaticSQLConf$.<clinit>(SQLConf.scala)
at org.apache.spark.sql.SparkSession$.org$apache$spark$sql$SparkSession$$sessionStateClassName(SparkSession.scala:962)
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:111)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:109)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$getOrCreate$5.apply(SparkSession.scala:878)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$getOrCreate$5.apply(SparkSession.scala:878)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:230)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40)
at scala.collection.mutable.HashMap.foreach(HashMap.scala:99)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:878)
at com.megaport.PipelineExample$.main(PipelineExample.scala:37)
at com.megaport.PipelineExample.main(PipelineExample.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
I can see the class in the GitHub repo, but it is not in the Maven lib or in the distro's spark-core_2.11-2.0.2.jar (I have the distro bundled with Hadoop).
The code I am trying to run is taken from the examples in the Spark distro, and it fails at the getOrCreate stage...
// scalastyle:off println
package com.megaport

// $example on$
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
// $example off$
import org.apache.spark.sql.SparkSession

object PipelineExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("My Spark Application")  // optional and will be autogenerated if not specified
      .master("local[*]")               // avoid hardcoding the deployment environment
      // .enableHiveSupport()           // self-explanatory, isn't it?
      .getOrCreate

    // $example on$
    // Prepare training documents from a list of (id, text, label) tuples.
    val training = spark.createDataFrame(Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    )).toDF("id", "text", "label")

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.01)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training)

    // Now we can optionally save the fitted pipeline to disk
    model.write.overwrite().save("/tmp/spark-logistic-regression-model")

    // We can also save this unfit pipeline to disk
    pipeline.write.overwrite().save("/tmp/unfit-lr-model")

    // And load it back in during production
    val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model")

    // Prepare test documents, which are unlabeled (id, text) tuples.
    val test = spark.createDataFrame(Seq(
      (4L, "spark i j k"),
      (5L, "l m n"),
      (6L, "mapreduce spark"),
      (7L, "apache hadoop")
    )).toDF("id", "text")

    // Make predictions on test documents.
    model.transform(test)
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }
    // $example off$

    spark.stop()
  }
}
Well, if it's not in your Java library, then you should download the dependent jar and add it.
Check this SO question for more details:
How to import a jar in Eclipse
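Since the class is present in newer Spark releases but not in your spark-core_2.11-2.0.2.jar, the likely culprit is mixed Spark artifact versions on the classpath (for example, a newer spark-sql whose SQLConf references the class, sitting next to the older core jar). A sketch of aligned Maven coordinates, assuming a Maven build (2.1.0 is illustrative; the point is one consistent version across all Spark artifacts):

<!-- one consistent Spark version across all artifacts (2.1.0 here is illustrative) -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-mllib_2.11</artifactId>
    <version>2.1.0</version>
</dependency>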
I had a specific filtering problem (described here: Pig - How to manipulate and compare dates?), so, as was suggested to me, I decided to write my own filter UDF. Here is the code:
import java.io.IOException;

import org.apache.pig.FilterFunc;
import org.apache.pig.data.Tuple;
import org.joda.time.*;
import org.joda.time.format.*;

public class DateCloseEnough extends FilterFunc {

    int nbmois;

    /**
     * @param nbmois_ if the number of months between two dates is less than
     *                this value, then we consider that the two dates are close
     */
    public DateCloseEnough(String nbmois_) {
        nbmois = Integer.valueOf(nbmois_);
    }

    public Boolean exec(Tuple input) throws IOException {
        // We're getting the date
        String date1 = (String) input.get(0);
        // We convert it into a date
        final DateTimeFormatter dtf = DateTimeFormat.forPattern("MM yyyy");
        LocalDate d1 = LocalDate.parse(date1, dtf);
        d1 = d1.withDayOfMonth(1);
        // We're getting today's date
        DateTime today = new DateTime();
        int mois = today.getMonthOfYear();
        String real_mois;
        if (mois >= 1 && mois <= 9) real_mois = "0" + mois;
        else real_mois = "" + mois;
        LocalDate d2 = LocalDate.parse(real_mois + " " + today.getYear(), dtf);
        d2 = d2.withDayOfMonth(1);
        // Number of months between these two dates
        String nb_months_between = "" + Months.monthsBetween(d1, d2);
        return (Integer.parseInt(nb_months_between) <= nbmois);
    }
}
I created a jar file of this code from Eclipse.
I'm filtering my data with these lines of Pig Latin code:
REGISTER Desktop/myUDFs.jar
DEFINE DateCloseEnough DateCloseEnough('12');
experiences1 = LOAD '/home/training/Desktop/BDD/experience.txt' USING PigStorage(',') AS (id_cv:int, id_experience:int, date_deb:chararray, date_fin:chararray, duree:int, contenu_experience:chararray);
experiences = FILTER experiences1 BY DateCloseEnough(date_fin);
I'm launching my program with this linux command:
pig -x local "myScript.pig"
And I get this error:
2013-06-19 07:27:17,253 [main] INFO org.apache.pig.Main - Logging error messages to: /home/training/pig_1371652037252.log
2013-06-19 07:27:17,933 [main] ERROR org.apache.pig.tools.grunt.Grunt - ERROR 2998: Unhandled internal error. org/joda/time/ReadablePartial Details at logfile: /home/training/pig_1371652037252.log
I checked into the log file and I saw this:
Pig Stack Trace
ERROR 2998: Unhandled internal error. org/joda/time/ReadablePartial
java.lang.NoClassDefFoundError: org/joda/time/ReadablePartial
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:247)
at org.apache.pig.impl.PigContext.resolveClassName(PigContext.java:441)
at org.apache.pig.impl.PigContext.instantiateFuncFromSpec(PigContext.java:471)
at org.apache.pig.impl.PigContext.instantiateFuncFromAlias(PigContext.java:544)
at org.apache.pig.impl.logicalLayer.parser.QueryParser.EvalFuncSpec(QueryParser.java:4834)
at org.apache.pig.impl.logicalLayer.parser.QueryParser.PUnaryCond(QueryParser.java:1949)
at org.apache.pig.impl.logicalLayer.parser.QueryParser.PAndCond(QueryParser.java:1790)
at org.apache.pig.impl.logicalLayer.parser.QueryParser.POrCond(QueryParser.java:1734)
at org.apache.pig.impl.logicalLayer.parser.QueryParser.PCond(QueryParser.java:1700)
at org.apache.pig.impl.logicalLayer.parser.QueryParser.FilterClause(QueryParser.java:1548)
at org.apache.pig.impl.logicalLayer.parser.QueryParser.BaseExpr(QueryParser.java:1276)
at org.apache.pig.impl.logicalLayer.parser.QueryParser.Expr(QueryParser.java:893)
at org.apache.pig.impl.logicalLayer.parser.QueryParser.Parse(QueryParser.java:682)
at org.apache.pig.impl.logicalLayer.LogicalPlanBuilder.parse(LogicalPlanBuilder.java:63)
at org.apache.pig.PigServer$Graph.parseQuery(PigServer.java:1031)
at org.apache.pig.PigServer$Graph.registerQuery(PigServer.java:981)
at org.apache.pig.PigServer.registerQuery(PigServer.java:383)
at org.apache.pig.tools.grunt.GruntParser.processPig(GruntParser.java:717)
at org.apache.pig.tools.pigscript.parser.PigScriptParser.parse(PigScriptParser.java:273)
at org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:166)
at org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:142)
at org.apache.pig.tools.grunt.Grunt.exec(Grunt.java:89)
at org.apache.pig.Main.main(Main.java:320)
Caused by: java.lang.ClassNotFoundException: org.joda.time.ReadablePartial
at java.net.URLClassLoader$1.run(URLClassLoader.java:200)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:188)
at java.lang.ClassLoader.loadClass(ClassLoader.java:307)
at java.lang.ClassLoader.loadClass(ClassLoader.java:252)
at java.lang.ClassLoader.loadClassInternal(ClassLoader.java:320)
... 24 more
I tried to modify my PIG_CLASSPATH variable, but I figured out that this variable doesn't exist at all (some other Pig scripts are working, though).
Do you have an idea how to solve the problem?
Thanks.
First, you need to tell Pig which jars you are using. See this answer: how to include external jar file using PIG. Configuring the build path in Eclipse is not enough; Eclipse will not generate the correct jar for you.
Secondly, String nb_months_between = "" + Months.monthsBetween(d1,d2); is wrong. Use int nb_months_between = Months.monthsBetween(d1,d2).getMonths(); instead. If you read Months.toString, it returns "P" + String.valueOf(getValue()) + "M", so you cannot take that string and parse it back into an int.
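Putting both fixes together, a sketch (the joda-time jar path is illustrative; point it at your local copy):

REGISTER Desktop/myUDFs.jar;
REGISTER /path/to/joda-time-1.5.jar;  -- ship Joda-Time to the backend along with the UDF jar

and, inside exec(), compare the numeric month count directly:

int nb_months_between = Months.monthsBetween(d1, d2).getMonths();
return nb_months_between <= nbmois;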
You need this package: org/joda/time/ReadablePartial.
You can find it here: jarfinder
Download joda-time-1.5.jar and add it to your project; that should resolve it.
I am trying to load up my own UDF in Pig. I have made it into a jar using Eclipse's export function. I am getting this 1066 error when running my Pig script. I am not sure about the B = ... line, as I can dump A but I cannot dump B.
Script
REGISTER myudfs.jar;
DEFINE HOUR myudfs.HOUR;
A = load 'access_log_Jul95' using PigStorage(' ') as (ip:chararray, dash1:chararray, dash2:chararray, date:chararray, getRequset:chararray, status:int, port:int);
B = FOREACH A GENERATE HOUR(ip);
DUMP B;
Function
package myudfs;

import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.WrappedIOException;

public class HOUR extends EvalFunc<String> {

    @SuppressWarnings("deprecation")
    public String exec(Tuple input) throws IOException {
        if (input == null || input.size() == 0)
            return null;
        try {
            String str = (String) input.get(0);
            return str.toUpperCase();
        } catch (Exception e) {
            throw WrappedIOException.wrap("Caught exception processing input row ", e);
        }
    }
}
Running command
pig -x mapreduce 2.pig
Data Format
199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
(ip = 199.72.81.55, date = [01/Jul/1995:00:00:01 -0400], getRequset = "GET /history/apollo/ HTTP/1.0", status = 200, port = 6245)
Pig Stack Trace
ERROR 1066: Unable to open iterator for alias B
org.apache.pig.impl.logicalLayer.FrontendException: ERROR 1066: Unable to open iterator for alias B
at org.apache.pig.PigServer.openIterator(PigServer.java:836)
at org.apache.pig.tools.grunt.GruntParser.processDump(GruntParser.java:696)
at org.apache.pig.tools.pigscript.parser.PigScriptParser.parse(PigScriptParser.java:320)
at org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:194)
at org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:170)
at org.apache.pig.tools.grunt.Grunt.exec(Grunt.java:84)
at org.apache.pig.Main.run(Main.java:604)
at org.apache.pig.Main.main(Main.java:157)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:601)
at org.apache.hadoop.util.RunJar.main(RunJar.java:208)
Caused by: java.io.IOException: Job terminated with anomalous status FAILED
at org.apache.pig.PigServer.openIterator(PigServer.java:828)
... 12 more
I am extremely unfamiliar with pig, and any and all pointers would be greatly appreciated. I know this is a lot of information to look at, but I have had no luck in mutating any data in a UDF, and I am just not sure where I went wrong.
Thanks
I am using Tomcat 6 on Windows. Here is the code I am testing.
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.StringReader;

import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;

/**
 * Create and run THREAD_COUNT PHP threads, concurrently accessing a
 * shared resource.
 *
 * Create 5 script engines, passing each a shared resource allocated
 * from Java. Each script engine has to implement Runnable.
 *
 * Java accesses the Runnable script engine using
 * scriptEngine.getInterface() and calls thread.start() to invoke each
 * PHP Runnable implementations concurrently.
 */
class PhpThreads {

    public static final String runnable = new String("<?php\n" +
            "function run() {\n" +
            " $out = java_context()->getAttribute('sharedResource', 100);\n" +
            " $nr = (string)java_context()->getAttribute('nr', 100);\n" +
            " echo \"started thread: $nr\n\";\n" +
            " for($i=0; $i<100; $i++) {\n" +
            " $out->write(ord($nr));\n" +
            " java('java.lang.Thread')->sleep(1);\n" +
            " }\n" +
            "}\n" +
            "?>\n");

    static final int THREAD_COUNT = 5;

    public static void main(String[] args) throws Exception {
        ScriptEngineManager manager = new ScriptEngineManager();
        Thread threads[] = new Thread[THREAD_COUNT];
        ScriptEngine engines[] = new ScriptEngine[THREAD_COUNT];
        ByteArrayOutputStream sharedResource = new ByteArrayOutputStream();
        StringReader runnableReader = new StringReader(runnable);

        // create THREAD_COUNT PHP threads
        for (int i = 0; i < THREAD_COUNT; i++) {
            engines[i] = manager.getEngineByName("php-invocable");
            if (engines[i] == null)
                throw new NullPointerException("php script engine not found");
            engines[i].put("nr", new Integer(i + 1));
            engines[i].put("sharedResource", sharedResource);
            engines[i].eval(runnableReader);
            runnableReader.reset();
            // cast the whole script to Runnable; note also getInterface(specificClosure, type)
            Runnable r = (Runnable) ((Invocable) engines[i]).getInterface(Runnable.class);
            threads[i] = new Thread(r);
        }

        // run the THREAD_COUNT PHP threads
        for (int i = 0; i < THREAD_COUNT; i++) {
            threads[i].start();
        }

        // wait for the THREAD_COUNT PHP threads to finish
        for (int i = 0; i < THREAD_COUNT; i++) {
            threads[i].join();
            ((Closeable) engines[i]).close();
        }

        // print the output generated by the THREAD_COUNT concurrent threads
        String result = sharedResource.toString();
        System.out.println(result);

        // Check result
        Object res = manager.getEngineByName("php").eval(
                "<?php " +
                "exit((int)('10011002100310041005'!=" +
                "#system(\"echo -n " + result + "|sed 's/./&\\\n/g'|sort|uniq -c|tr -d ' \\\n'\")));" +
                "?>");
        System.exit(((Number) res).intValue());
    }
}
I have added all the libraries. When I run the file I get the following error -
run:
Exception in thread "main" javax.script.ScriptException: java.io.IOException: Cannot run program "php-cgi": CreateProcess error=2, The system cannot find the file specified
at php.java.script.InvocablePhpScriptEngine.eval(InvocablePhpScriptEngine.java:209)
at php.java.script.SimplePhpScriptEngine.eval(SimplePhpScriptEngine.java:178)
at javax.script.AbstractScriptEngine.eval(AbstractScriptEngine.java:232)
at PhpThreads.main(NewClass.java:53)
Caused by: java.io.IOException: Cannot run program "php-cgi": CreateProcess error=2, The system cannot find the file specified
at java.lang.ProcessBuilder.start(ProcessBuilder.java:459)
at java.lang.Runtime.exec(Runtime.java:593)
at php.java.bridge.Util$Process.start(Util.java:1064)
at php.java.bridge.Util$ProcessWithErrorHandler.start(Util.java:1166)
at php.java.bridge.Util$ProcessWithErrorHandler.start(Util.java:1217)
at php.java.script.CGIRunner.doRun(CGIRunner.java:126)
at php.java.script.HttpProxy.doRun(HttpProxy.java:63)
at php.java.script.CGIRunner.run(CGIRunner.java:111)
at php.java.bridge.ThreadPool$Delegate.run(ThreadPool.java:60)
Caused by: java.io.IOException: CreateProcess error=2, The system cannot find the file specified
at java.lang.ProcessImpl.create(Native Method)
at java.lang.ProcessImpl.<init>(ProcessImpl.java:81)
at java.lang.ProcessImpl.start(ProcessImpl.java:30)
at java.lang.ProcessBuilder.start(ProcessBuilder.java:452)
... 8 more
What am I missing?
Just add this to your command line:
-Dphp.java.bridge.php_exec=/usr/bin/php
Problem solved!
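For context, that system property tells the bridge where the PHP executable lives. A standalone launch might look like this (the classpath entries and the PHP path are illustrative):

java -Dphp.java.bridge.php_exec=/usr/bin/php -cp .:JavaBridge.jar PhpThreads

For a Tomcat deployment, add the same -D flag to JAVA_OPTS / CATALINA_OPTS instead.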
Copy the correct version of the PHP files php5ts.dll and php-cgi.exe to the "WEB-INF\cgi\amd64-windows" directory, then restart Tomcat. Good luck.
...php-cgi...The system cannot find the file specified
I'm guessing that manager.getEngineByName("php-invocable") should return a wrapper around a system call to run PHP - but that wrapper doesn't know where to find the PHP executable.
A quick glance at the website for the PHP/Java bridge, and I infer that the path is hard coded in the Java - "For further information please see the INSTALL.J2EE file from the documentation download"
The Javadoc is decidedly vague on the topic.
You need to specifically make the -cgi version of PHP at compile time. Assuming you've done that and it is called php-cgi, then as a quick hack you could pepper your filesystem with links named "php-cgi" (it's probably expected to be in /bin, /usr/bin, /usr/local/bin, or the Java may be smart enough to check $PATH).
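For example (the source path is illustrative; point it at wherever your php-cgi binary actually lives):

ln -s /usr/local/php/bin/php-cgi /usr/local/bin/php-cgi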
C.
When you get an error like:
Fatal Error: Failed to start PHP ["php-cgi", "-v"], reason: java.io.IOException: Cannot run program ""php-cgi"" (in directory "C:\Documents and Settings\Administrator"): CreateProcess error=2, The system cannot find the file specified
Could not start FCGI server: java.io.IOException: PHP not found. Please install php-cgi. PHP test command was: [php-cgi, -v]
php.java.bridge.http.FCGIConnectException: Could not connect to server
        at php.java.bridge.http.NPChannelFactory.test(NPChannelFactory.java:64)
        at php.java.bridge.http.FCGIConnectionPool.<init>(FCGIConnectionPool.java:175)
        at php.java.bridge.http.FCGIConnectionPool.<init>(FCGIConnectionPool.java:189)
        at php.java.servlet.ContextLoaderListener.createConnectionPool(ContextLoaderListener.java:541)
        at php.java.servlet.ContextLoaderListener.contextInitialized(ContextLoaderListener.java:185)
        at org.apache.catalina.core.StandardContext.listenerStart(StandardContext.java:4135)
        at org.apache.catalina.core.StandardContext.start(StandardContext.java:4630)
        at org.apache.catalina.core.ContainerBase.addChildInternal(ContainerBase.java:791)
        at org.apache.catalina.core.ContainerBase.addChild(ContainerBase.java:771)
        at org.apache.catalina.core.StandardHost.addChild(StandardHost.java:546)
        at org.apache.catalina.startup.HostConfig.deployDirectory(HostConfig.java:1041)
        at org.apache.catalina.startup.HostConfig.deployDirectories(HostConfig.java:964)
        at org.apache.catalina.startup.HostConfig.deployApps(HostConfig.java:502)
        at org.apache.catalina.startup.HostConfig.start(HostConfig.java:1277)
        at org.apache.catalina.startup.HostConfig.lifecycleEvent(HostConfig.java:321)
        at org.apache.catalina.util.LifecycleSupport.fireLifecycleEvent(LifecycleSupport.java:119)
        at org.apache.catalina.core.ContainerBase.start(ContainerBase.java:1053)
        at org.apache.catalina.core.StandardHost.start(StandardHost.java:785)
        at org.apache.catalina.core.ContainerBase.start(ContainerBase.java:1045)
        at org.apache.catalina.core.StandardEngine.start(StandardEngine.java:445)
        at org.apache.catalina.core.StandardService.start(StandardService.java:519)
        at org.apache.catalina.core.StandardServer.start(StandardServer.java:710)
        at org.apache.catalina.startup.Catalina.start(Catalina.java:581)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
        at java.lang.reflect.Method.invoke(Unknown Source)
        at org.apache.catalina.startup.Bootstrap.start(Bootstrap.java:289)
        at org.apache.catalina.startup.Bootstrap.main(Bootstrap.java:414)
Caused by: java.io.IOException: File \\.\pipe\C:\Documents and Settings\Administrator\Desktop\softwares\apache-tomcat-6.0.29\temp\JavaBridge3144995283109409611.socket not writable
        at php.java.bridge.http.FCGIConnectException.<init>(FCGIConnectException.java:37)
        ... 29 more
Caused by: java.io.IOException: PHP not found. Please install php-cgi. PHP test command was: [php-cgi, -v]
        at php.java.bridge.Util$Process.start(Util.java:1145)
        at php.java.servlet.fastcgi.FCGIProcess.start(FCGIProcess.java:68)
        at php.java.bridge.http.NPChannelFactory.doBind(NPChannelFactory.java:94)
        at php.java.bridge.http.FCGIConnectionFactory.runFcgi(FCGIConnectionFactory.java:88)
        at php.java.bridge.http.FCGIConnectionFactory$1.run(FCGIConnectionFactory.java:109)
with the JavaBridge.war deployment (Windows, Tomcat),
specify the path to the PHP installation in your PATH environment variable.
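On Windows that means putting the directory containing php-cgi.exe on the PATH before starting Tomcat, for example (the install directory C:\php is hypothetical):

set PATH=%PATH%;C:\php

After restarting Tomcat, the bridge's test command [php-cgi, -v] should be able to find the binary.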