I have two Spark DataFrames. I want to iterate over one of them with foreach and, for each row, fetch the records from the other DataFrame that match a particular someId.
Every time I do this I get a java.lang.NullPointerException.
I have posted my code below, with comments inside the foreach loop. I have tried three approaches, but each one fails with the same error.
Please help me fix this issue.
val schListDf = spark.read.format("csv")
  .option("header", "true")
  .load("/home/user/projects/scheduled.csv")
schListDf.createOrReplaceTempView("scheduled")

val trsListDf = spark.read.format("csv")
  .option("header", "true")
  .load("/home/user/projects/transaction.csv")
trsListDf.createOrReplaceTempView("transaction")

// THIS WORKS FINE
val df3 = spark.sql("select * from transaction limit 5").show()

schListDf.foreach(row => {
  if (row(2) != null) {
    // FIRST ATTEMPT: FAILS WITH THE SAME ERROR
    val df = spark.sql("select * from transaction where someid = '" + row(2) + "'")
    // SECOND ATTEMPT (WITHOUT THE someid FILTER): FAILS WITH THE SAME ERROR
    val df2 = spark.sql("select * from transaction limit 5")
    // THIRD ATTEMPT (FILTER ON THE DF): FAILS WITH THE SAME ERROR
    val filteredDataListDf = trsListDf.filter($"someid" === row(2))
  }
})
18/12/02 10:36:34 ERROR Executor: Exception in task 0.0 in stage 4.0 (TID 4)
java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:142)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:140)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:641)
at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:52)
at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:48)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
18/12/02 10:36:34 ERROR Executor: Exception in task 3.0 in stage 4.0 (TID 7)
18/12/02 10:36:34 ERROR Executor: Exception in task 1.0 in stage 4.0 (TID 5)
18/12/02 10:36:34 ERROR Executor: Exception in task 2.0 in stage 4.0 (TID 6)
18/12/02 10:36:34 WARN TaskSetManager: Lost task 2.0 in stage 4.0 (TID 6, localhost, executor driver): java.lang.NullPointerException
(each with the same java.lang.NullPointerException stack trace as above)
Some Spark constructs are driver-side only. A DataFrame cannot be accessed from within a foreach, because the foreach body runs on the executors; the same applies to RDDs and to the SparkSession itself, which is why spark.sql throws a NullPointerException there.
That is the paradigm: foreach itself is fine, but not with a DataFrame val or spark.sql inside it. You would need to loop on the driver instead (for example over collected rows), or better, express the lookup as a join.
This is a common misconception when one starts out with Spark.
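To make this concrete, here is a minimal sketch of both alternatives, using the DataFrames and the someid column from the question; the handling of the scheduled column via columns(2) is an assumption, since the schedule CSV's header is not shown:
import org.apache.spark.sql.functions.col

// Option 1 (preferred): express the per-row lookup as a join that Spark plans itself.
// columns(2) is the scheduled column the question reads as row(2); "someid" comes from the question.
val schSomeId = schListDf.columns(2)
val joined = schListDf
  .filter(col(schSomeId).isNotNull)
  .join(trsListDf, schListDf(schSomeId) === trsListDf("someid"))
joined.show()

// Option 2: if row-by-row logic is unavoidable, collect the rows to the driver first,
// so that trsListDf is only ever used on the driver side.
schListDf.collect().foreach { row =>
  if (row(2) != null) {
    trsListDf.filter(col("someid") === row.getString(2)).show()
  }
}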
I'm trying to submit a Spark application to my Spark master. The master and several slaves are running in an OpenShift environment, and the web UI of the Spark master shows the connected workers.
The application jar is deployed to /jars in every Spark pod.
This is my submit script:
spark-submit2.cmd --conf "spark.driver.extraClassPath=/jars"
--conf "spark.executor.extraClassPath=/jars"
--conf "spark.submit.deployMode=cluster"
--master spark://******:31824
--class Main 'local:/jars/SparkHelloWorld-1.0-SNAPSHOT.jar'
The application itself is simple:
import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class Main {

    private static String MASTER = "spark://******:31824";

    public static void main(String[] args) throws Exception {
        // Create a SparkContext to initialize
        SparkConf conf = new SparkConf()
                .setMaster(MASTER)
                .setAppName("SparkPi");

        // Create a Java version of the Spark Context
        JavaSparkContext sc = new JavaSparkContext(conf);

        final int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 4;
        final int n = 100000 * slices;
        final List<Integer> l = new ArrayList<>(n);
        for (int i = 0; i < n; i++) {
            l.add(i);
        }

        final JavaRDD<Integer> dataSet = sc.parallelize(l, slices);

        final int count = dataSet.map(integer -> {
            double x = Math.random() * 2 - 1;
            double y = Math.random() * 2 - 1;
            return (x * x + y * y < 1) ? 1 : 0;
        }).reduce((a, b) -> a + b);

        System.out.println("Pi is roughly " + 4.0 * count / n);
    }
}
Every time I run this script, I get the following exception:
Exception in thread "main" org.apache.spark.SparkException: Exception thrown in awaitResult:
at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:205)
at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:100)
at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:108)
at org.apache.spark.deploy.Client$$anonfun$7.apply(Client.scala:233)
at org.apache.spark.deploy.Client$$anonfun$7.apply(Client.scala:233)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at org.apache.spark.deploy.Client$.main(Client.scala:233)
at org.apache.spark.deploy.Client.main(Client.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:755)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.RuntimeException: java.io.StreamCorruptedException: invalid stream header: 01000D31
at java.io.ObjectInputStream.readStreamHeader(ObjectInputStream.java:857)
at java.io.ObjectInputStream.<init>(ObjectInputStream.java:349)
at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.<init>(JavaSerializer.scala:63)
at org.apache.spark.serializer.JavaDeserializationStream.<init>(JavaSerializer.scala:63)
at org.apache.spark.serializer.JavaSerializerInstance.deserializeStream(JavaSerializer.scala:122)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:107)
at org.apache.spark.rpc.netty.NettyRpcEnv$$anonfun$deserialize$1$$anonfun$apply$1.apply(NettyRpcEnv.scala:259)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.rpc.netty.NettyRpcEnv.deserialize(NettyRpcEnv.scala:308)
at org.apache.spark.rpc.netty.NettyRpcEnv$$anonfun$deserialize$1.apply(NettyRpcEnv.scala:258)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.rpc.netty.NettyRpcEnv.deserialize(NettyRpcEnv.scala:257)
at org.apache.spark.rpc.netty.NettyRpcHandler.internalReceive(NettyRpcEnv.scala:577)
at org.apache.spark.rpc.netty.NettyRpcHandler.receive(NettyRpcEnv.scala:562)
at org.apache.spark.network.server.TransportRequestHandler.processRpcRequest(TransportRequestHandler.java:159)
at org.apache.spark.network.server.TransportRequestHandler.handle(TransportRequestHandler.java:107)
at org.apache.spark.network.server.TransportChannelHandler.channelRead(TransportChannelHandler.java:118)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
at io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:266)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
at org.apache.spark.network.util.TransportFrameDecoder.channelRead(TransportFrameDecoder.java:85)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346)
at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1294)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353)
at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:911)
at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:652)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:575)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:489)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:451)
at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:140)
at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:144)
at java.lang.Thread.run(Thread.java:748)
at org.apache.spark.network.client.TransportResponseHandler.handle(TransportResponseHandler.java:207)
at org.apache.spark.network.server.TransportChannelHandler.channelRead(TransportChannelHandler.java:120)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:357)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:343)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:336)
at io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:287)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:357)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:343)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:336)
at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:357)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:343)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:336)
at org.apache.spark.network.util.TransportFrameDecoder.channelRead(TransportFrameDecoder.java:85)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:357)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:343)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:336)
at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1294)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:357)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:343)
at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:911)
at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:643)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:566)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:480)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:442)
at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:131)
at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:144)
at java.lang.Thread.run(Unknown Source)
I couldn't find this problem in any Spark documentation. Am I missing something?
It may be related to different Spark versions on the master and on the slaves.
It may happen due to one of the following:
The Spark version used in your code is different from the cluster's Spark version.
The Scala version your Spark dependencies were built with is different from the one the cluster's Spark was built with.
In my case I was using Spark dependencies built with Scala 2.10 while the cluster was running Spark built with Scala 2.11.
Hope it helps someone. Cheers!
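If you are unsure which versions are actually in play, a quick diagnostic is to print them from a spark-shell started against the same installation and compare them with what the master's web UI reports and with the _2.xx suffix of your spark-core dependency. This is only a sketch of the check, not part of the fix:
// Run in spark-shell on the machine you submit from:
println(s"Spark version: ${sc.version}")
println(s"Scala version: ${scala.util.Properties.versionString}")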
In my case this was due to a version mismatch between the pyspark package I had installed with pip and my Spark installation.
I'm new to Scala, Spark and its MLlib, and I'm currently struggling with an error I can't explain.
I have an RDD with multiple partitions, containing data like this (output from take(#)):
Array[TermDoc] = Array(TermDoc(142389495503925248,Set(NEU),ArrayBuffer(salg, veotv, día, largooooo)), TermDoc(142389933619945473,Set(NEU),ArrayBuffer(librar, ayudar, bes, graci)), TermDoc(142391947707940864,Set(P),ArrayBuffer(graci, mar)), TermDoc(142416095012339712,Set(N+),ArrayBuffer(off, pensand, regalit, sind, va, sgae, van, corrupt, intent, sacar, conclusion, intent)), TermDoc(142422495721562112,Set(P+),ArrayBuffer(conozc, alguien, q, adict, dram, ja, ja, ja, suen, d)), TermDoc(142424715175280640,Set(NEU),ArrayBuffer(rt, si, amas, alguien, dejal, libr, si, grit, hombr, paurubi)), TermDoc(142483342040907776,Set(P+),ArrayBuffer(toca, grabacion, dl, especial, navideñ, mari, crism)), TermDoc(142493511634259968,Set(NEU))
Since there's output, I assume the RDD is not empty, but when I try to execute:
val count = rdd.count()
java.lang.UnsupportedOperationException: empty.init
at scala.collection.TraversableLike$class.init(TraversableLike.scala:475)
at scala.collection.mutable.ArrayOps$ofRef.scala$collection$IndexedSeqOptimized$$super$init(ArrayOps.scala:108)
at scala.collection.IndexedSeqOptimized$class.init(IndexedSeqOptimized.scala:129)
at scala.collection.mutable.ArrayOps$ofRef.init(ArrayOps.scala:108)
at $line24.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$.buildDocument(<console>:58)
at $line24.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at $line24.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1598)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
17/03/13 10:15:11 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 2.0 (TID 2, localhost): java.lang.UnsupportedOperationException: empty.init
(same stack trace as above)
17/03/13 10:15:11 ERROR scheduler.TaskSetManager: Task 0 in stage 2.0 failed 1 times; aborting job
17/03/13 10:15:11 WARN scheduler.TaskSetManager: Lost task 1.0 in stage 2.0 (TID 3, localhost): TaskKilled (killed intentionally)
17/03/13 10:15:11 WARN spark.ExecutorAllocationManager: No stages are running, but numRunningTasks != 0
17/03/13 10:15:11 ERROR scheduler.LiveListenerBus: Listener SQLListener threw an exception
java.lang.NullPointerException
at org.apache.spark.sql.execution.ui.SQLListener.onTaskEnd(SQLListener.scala:167)
at org.apache.spark.scheduler.SparkListenerBus$class.onPostEvent(SparkListenerBus.scala:42)
at org.apache.spark.scheduler.LiveListenerBus.onPostEvent(LiveListenerBus.scala:31)
at org.apache.spark.scheduler.LiveListenerBus.onPostEvent(LiveListenerBus.scala:31)
at org.apache.spark.util.ListenerBus$class.postToAll(ListenerBus.scala:55)
at org.apache.spark.util.AsynchronousListenerBus.postToAll(AsynchronousListenerBus.scala:37)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(AsynchronousListenerBus.scala:80)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(AsynchronousListenerBus.scala:65)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(AsynchronousListenerBus.scala:65)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1.apply$mcV$sp(AsynchronousListenerBus.scala:64)
at org.apache.spark.util.Utils$.tryOrStopSparkContext(Utils.scala:1181)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1.run(AsynchronousListenerBus.scala:63)
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 2, localhost): java.lang.UnsupportedOperationException: empty.init
at scala.collection.TraversableLike$class.init(TraversableLike.scala:475)
at scala.collection.mutable.ArrayOps$ofRef.scala$collection$IndexedSeqOptimized$$super$init(ArrayOps.scala:108)
at scala.collection.IndexedSeqOptimized$class.init(IndexedSeqOptimized.scala:129)
at scala.collection.mutable.ArrayOps$ofRef.init(ArrayOps.scala:108)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$.buildDocument(<console>:58)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1598)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1843)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1856)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1869)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1940)
at org.apache.spark.rdd.RDD.count(RDD.scala:1157)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:62)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:67)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:69)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:71)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:73)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:75)
at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:77)
at $iwC$$iwC$$iwC$$iwC.<init>(<console>:79)
at $iwC$$iwC$$iwC.<init>(<console>:81)
at $iwC$$iwC.<init>(<console>:83)
at $iwC.<init>(<console>:85)
at <init>(<console>:87)
at .<init>(<console>:91)
at .<clinit>(<console>)
at .<init>(<console>:7)
at .<clinit>(<console>)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1045)
at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1326)
at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:821)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:852)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:800)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1064)
at org.apache.spark.repl.Main$.main(Main.scala:31)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.UnsupportedOperationException: empty.init
at scala.collection.TraversableLike$class.init(TraversableLike.scala:475)
at scala.collection.mutable.ArrayOps$ofRef.scala$collection$IndexedSeqOptimized$$super$init(ArrayOps.scala:108)
at scala.collection.IndexedSeqOptimized$class.init(IndexedSeqOptimized.scala:129)
at scala.collection.mutable.ArrayOps$ofRef.init(ArrayOps.scala:108)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$.buildDocument(<console>:58)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1598)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Apparently, it is saying that I'm trying to call count on an empty RDD. What's happening? It also fails on this line:
val terms = termDocsRdd.flatMap(_.terms).distinct().sortBy(identity)
Same empty.init exception.
Thanks.
UPDATE: Adding required information
object TweetParser extends Serializable {

  val headerPart = "polarity"
  val mentionRegex = """#(.)+?\s""".r
  val fullRegex = """(\d+),(.+?),(N|P|NEU|NONE)(,\w+|;\w+)*""".r

  def parseAll(csvFiles: Iterable[String], sc: SparkContext): RDD[Document] = {
    val csv = sc.textFile(csvFiles mkString ",")
    //val docs = scala.collection.mutable.ArrayBuffer.empty[Document]
    val docs = csv.filter(!_.contains(headerPart)).map(buildDocument(_))
    docs
    //docs.filter(!_.docId.equals("INVALID"))
  }

  def buildDocument(line: String): Document = {
    val lineSplit = line.split(",")
    val id = lineSplit.head
    val txt = lineSplit.tail.init.init.mkString(",")
    val sent = lineSplit.init.last
    val opt = lineSplit.last
    if (id != null && txt != null && sent != null) {
      if (txt.equals("")) {
        //the line does not contain the option after sentiment
        new Document(id, mentionRegex.replaceAllIn(sent, ""), Set(opt))
      } else {
        new Document(id, mentionRegex.replaceAllIn(txt, ""), Set(sent))
      }
    } else {
      println("Invalid")
      new Document("INVALID")
    }
  }
}
case class Document(docId: String, body: String = "", labels: Set[String] = Set.empty)
Tokenizer object:
import java.io.StringReader

import org.apache.lucene.analysis.es.SpanishAnalyzer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.Version
import org.apache.spark.rdd.RDD

object Tokenizer extends Serializable {

  //val LuceneVersion = Version.LUCENE_5_1_0

  def tokenizeAll(docs: RDD[Document]) = docs.map(tokenize)

  def tokenize(doc: Document): TermDoc = TermDoc(doc.docId, doc.labels, tokenize(doc.body))

  def tokenize(content: String): Seq[String] = {
    val result = scala.collection.mutable.ArrayBuffer.empty[String]
    /*content.split("\n").foreach(line => line.split(" ").foreach(
      word => if (word.startsWith("#")) result += word.substring(1) else word
    ))*/
    val analyzer = new SpanishAnalyzer()
    analyzer.setVersion(Version.LUCENE_5_1_0)
    val tReader = new StringReader(content)
    val tStream = analyzer.tokenStream("", tReader)
    val term = tStream.addAttribute(classOf[CharTermAttribute])
    tStream.reset()
    while (tStream.incrementToken()) {
      val termValue = term.toString
      if (termValue.startsWith("#")) {
        result += termValue.substring(1)
      } else {
        result += termValue
      }
    }
    result
  }
}
case class TermDoc(doc: String, labels: Set[String], terms: Seq[String])
Driver:
val csvFiles = List("/path/to/file.csv", "/path/to/file2.csv", "/path/to/file3.csv")
val docs = TweetParser.parseAll(csvFiles, sc)
val termDocsRdd = Tokenizer.tokenizeAll(docs)
val numDocs = termDocsRdd.count()
val terms = termDocsRdd.flatMap(_.terms).distinct().sortBy(identity)
I'm testing this in spark-shell, which is why the driver code looks like this. I hope this clarifies the question.
Apparently, it is saying that I'm trying to call count on an empty RDD
Actually, no, that's not what the error says: count triggers the computation of the RDD, and this exception is thrown while computing one of the RDD's records.
Specifically, the error states:
java.lang.UnsupportedOperationException: empty.init
This is probably thrown from one of these expressions within buildDocument:
val txt = lineSplit.tail.init.init.mkString(",")
val sent = lineSplit.init.last
This code fragment assumes lineSplit is a collection with at least 3 elements - and the exception you see is the result of that assumption being incorrect for at least one record: for example, if lineSplit had just 2 elements, lineSplit.tail.init would be an empty collection, and therefore lineSplit.tail.init.init would throw the exception you see.
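To make the failure mode concrete, here is a minimal spark-shell sketch using a hypothetical two-field line (not taken from the actual data):
val lineSplit = "12345,NEU".split(",")   // Array(12345, NEU): only two fields
lineSplit.tail.init                      // Array(): empty, no error yet
lineSplit.tail.init.init                 // throws java.lang.UnsupportedOperationException: empty.init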
To overcome this, you can rewrite your parsing method to handle such irregularities in the data properly.
One option is to wrap it in a Try(...) and keep only the successful records, e.g.:
import scala.util.{Try, Success}

def parseAll(csvFiles: Iterable[String], sc: SparkContext): RDD[Document] = {
  val csv = sc.textFile(csvFiles mkString ",")
  val docs = csv.filter(!_.contains(headerPart))
    .map(s => Try(buildDocument(s)))
    .collect { case Success(v) => v }
  docs
}
Alternatively, change the parsing so that "missing" parts of lineSplit are set to null (as the subsequent lines seem to expect), e.g.:
def buildDocument(line: String): Document = {
  val (id, txt, sent, opt) = line.split(",").padTo(5, null) match {
    case Array(a, b, c, d, e, _*) => (a, s"$b,$c", d, e)
  }
  // continue as before....
}
I'm just trying to set up logback-demo, using Maven to package the files and run the demo for the first time, but I keep getting these weird Java exception messages:
10:12:03.103 [pool-3-thread-1] INFO ch.qos.logback.demo.LoggingTask - Howdydy-diddly-ho - 8499
java.lang.Exception: e
at ch.qos.logback.demo.LoggingTask.run(LoggingTask.java:28) [classes/:na]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471) [na:1.7.0_65]
at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:304) [na:1.7.0_65]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:178) [na:1.7.0_65]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) [na:1.7.0_65]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) [na:1.7.0_65]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) [na:1.7.0_65]
at java.lang.Thread.run(Thread.java:745) [na:1.7.0_65]
10:12:03,104 |-INFO in c.q.l.core.rolling.DefaultTimeBasedFileNamingAndTriggeringPolicy - Elapsed period: Thu May 21 10:11:03 PDT 2015
10:12:03,104 |-INFO in c.q.l.co.rolling.helper.RenameUtil - Renaming file [logFile.log] to [logFile.log4499498230724.tmp]
10:12:03,105 |-INFO in ch.qos.logback.core.rolling.helper.Compressor - ZIP compressing [logFile.log4499498230724.tmp] as [logFile.2015-05-21_10-11.log.zip]
This message repeats over and over again for hours. Any ideas on what I am doing wrong?
According to the source code, this seems normal:
public class LoggingTask implements Runnable {

    // ...

    public void run() {
        if (i % 100 == 99) {
            logger.info(HOWDY_MARKER, msg + " - " + (i++), new Exception("e"));
        } else {
            logger.trace("x {]", i++);
        }
    }
}
Also note that no exception is actually thrown here; the new Exception("e") is simply passed to logger.info as the last argument, so Logback prints its stack trace.
My Spark Java application throws a NotSerializableException on Hadoop Writables.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public final class myAPP {

    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: myAPP <file>");
            System.exit(1);
        }
        SparkConf sparkConf = new SparkConf().setAppName("myAPP").setMaster("local");
        JavaSparkContext ctx = new JavaSparkContext(sparkConf);
        Configuration conf = new Configuration();
        JavaPairRDD<LongWritable, Text> lines = ctx.newAPIHadoopFile(args[0], TextInputFormat.class, LongWritable.class, Text.class, conf);
        System.out.println(lines.collect().toString());
        ctx.stop();
    }
}
Running it fails with:
java.io.NotSerializableException: org.apache.hadoop.io.LongWritable
Serialization stack:
- object not serializable (class: org.apache.hadoop.io.LongWritable, value: 15227295)
- field (class: scala.Tuple2, name: _1, type: class java.lang.Object)
- object (class scala.Tuple2, (15227295,))
- element of array (index: 0)
- array (class [Lscala.Tuple2;, size 1153163)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:38)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:80)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
15/04/26 16:05:05 ERROR TaskSetManager: Task 0.0 in stage 0.0 (TID 0) had a not serializable result: org.apache.hadoop.io.LongWritable
Serialization stack:
- object not serializable (class: org.apache.hadoop.io.LongWritable, value: 15227295)
- field (class: scala.Tuple2, name: _1, type: class java.lang.Object)
- object (class scala.Tuple2, (15227295,))
- element of array (index: 0)
- array (class [Lscala.Tuple2;, size 1153163); not retrying
15/04/26 16:05:05 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
15/04/26 16:05:05 INFO TaskSchedulerImpl: Cancelling stage 0
15/04/26 16:05:05 INFO DAGScheduler: Job 0 failed: collect at Parser2.java:60, took 0.460181 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 0.0 (TID 0) had a not serializable result: org.apache.hadoop.io.LongWritable
In my Spark Scala program I register the Hadoop Writables as below and it works fine.
sparkConf.registerKryoClasses(Array(classOf[org.apache.hadoop.io.LongWritable], classOf[org.apache.hadoop.io.Text]))
Apparently this approach doesn't work with the Apache Spark Java API:
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
sparkConf.set("spark.kryo.registrator", LongWritable.class.getName());
This fails with:
Exception in thread "main" org.apache.spark.SparkException: Failed to register classes with Kryo
at org.apache.spark.serializer.KryoSerializer.newKryo(KryoSerializer.scala:101)
at org.apache.spark.serializer.KryoSerializerInstance.<init>(KryoSerializer.scala:153)
at org.apache.spark.serializer.KryoSerializer.newInstance(KryoSerializer.scala:115)
at org.apache.spark.broadcast.TorrentBroadcast$.blockifyObject(TorrentBroadcast.scala:200)
at org.apache.spark.broadcast.TorrentBroadcast.writeBlocks(TorrentBroadcast.scala:101)
at org.apache.spark.broadcast.TorrentBroadcast.<init>(TorrentBroadcast.scala:84)
at org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:34)
at org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:29)
at org.apache.spark.broadcast.BroadcastManager.newBroadcast(BroadcastManager.scala:62)
at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1051)
at org.apache.spark.rdd.NewHadoopRDD.<init>(NewHadoopRDD.scala:77)
at org.apache.spark.SparkContext.newAPIHadoopFile(SparkContext.scala:848)
at org.apache.spark.api.java.JavaSparkContext.newAPIHadoopFile(JavaSparkContext.scala:488)
at com.nsn.PMParser.Parser2.main(Parser2.java:56)
Caused by: java.lang.ClassCastException: org.apache.hadoop.io.LongWritable cannot be cast to org.apache.spark.serializer.KryoRegistrator
at org.apache.spark.serializer.KryoSerializer$$anonfun$newKryo$3.apply(KryoSerializer.scala:97)
at org.apache.spark.serializer.KryoSerializer$$anonfun$newKryo$3.apply(KryoSerializer.scala:97)
at scala.Option.map(Option.scala:145)
at org.apache.spark.serializer.KryoSerializer.newKryo(KryoSerializer.scala:97)
... 13 more
How can I avoid the NotSerializableException on Hadoop Writables with the Apache Spark Java API?
As of Spark v1.4.0, you can use this Java API to register classes to be serialized using Kryo:
https://spark.apache.org/docs/latest/api/java/org/apache/spark/SparkConf.html#registerKryoClasses(java.lang.Class[])
You pass in an array of Class objects, each of which can be obtained using
http://docs.oracle.com/javase/7/docs/api/java/lang/Class.html#forName(java.lang.String)
such as:
new SparkConf().registerKryoClasses(new Class<?>[]{
        Class.forName("org.apache.hadoop.io.LongWritable"),
        Class.forName("org.apache.hadoop.io.Text")
});
Hope this helps.
Use
sparkConf.set("spark.kryo.classesToRegister", "org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text");
or you can simply use
ctx.textFile(args[0]);
to load the RDD as a JavaRDD<String>, which avoids the Hadoop Writables entirely.