I'm new to Scala, Spark, and MLlib, and I'm currently struggling with an error that I can't explain.
I have an RDD with multiple partitions, containing data like this (output from take(#)):
Array[TermDoc] = Array(TermDoc(142389495503925248,Set(NEU),ArrayBuffer(salg, veotv, día, largooooo)), TermDoc(142389933619945473,Set(NEU),ArrayBuffer(librar, ayudar, bes, graci)), TermDoc(142391947707940864,Set(P),ArrayBuffer(graci, mar)), TermDoc(142416095012339712,Set(N+),ArrayBuffer(off, pensand, regalit, sind, va, sgae, van, corrupt, intent, sacar, conclusion, intent)), TermDoc(142422495721562112,Set(P+),ArrayBuffer(conozc, alguien, q, adict, dram, ja, ja, ja, suen, d)), TermDoc(142424715175280640,Set(NEU),ArrayBuffer(rt, si, amas, alguien, dejal, libr, si, grit, hombr, paurubi)), TermDoc(142483342040907776,Set(P+),ArrayBuffer(toca, grabacion, dl, especial, navideñ, mari, crism)), TermDoc(142493511634259968,Set(NEU))
Since take produces output, I assume the RDD is not empty, but when I try to execute:
val count = rdd.count()
java.lang.UnsupportedOperationException: empty.init
at scala.collection.TraversableLike$class.init(TraversableLike.scala:475)
at scala.collection.mutable.ArrayOps$ofRef.scala$collection$IndexedSeqOptimized$$super$init(ArrayOps.scala:108)
at scala.collection.IndexedSeqOptimized$class.init(IndexedSeqOptimized.scala:129)
at scala.collection.mutable.ArrayOps$ofRef.init(ArrayOps.scala:108)
at $line24.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$.buildDocument(<console>:58)
at $line24.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at $line24.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1598)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
17/03/13 10:15:11 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 2.0 (TID 2, localhost): java.lang.UnsupportedOperationException: empty.init
at scala.collection.TraversableLike$class.init(TraversableLike.scala:475)
at scala.collection.mutable.ArrayOps$ofRef.scala$collection$IndexedSeqOptimized$$super$init(ArrayOps.scala:108)
at scala.collection.IndexedSeqOptimized$class.init(IndexedSeqOptimized.scala:129)
at scala.collection.mutable.ArrayOps$ofRef.init(ArrayOps.scala:108)
at $line24.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$.buildDocument(<console>:58)
at $line24.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at $line24.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1598)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
17/03/13 10:15:11 ERROR scheduler.TaskSetManager: Task 0 in stage 2.0 failed 1 times; aborting job
17/03/13 10:15:11 WARN scheduler.TaskSetManager: Lost task 1.0 in stage 2.0 (TID 3, localhost): TaskKilled (killed intentionally)
17/03/13 10:15:11 WARN spark.ExecutorAllocationManager: No stages are running, but numRunningTasks != 0
17/03/13 10:15:11 ERROR scheduler.LiveListenerBus: Listener SQLListener threw an exception
java.lang.NullPointerException
at org.apache.spark.sql.execution.ui.SQLListener.onTaskEnd(SQLListener.scala:167)
at org.apache.spark.scheduler.SparkListenerBus$class.onPostEvent(SparkListenerBus.scala:42)
at org.apache.spark.scheduler.LiveListenerBus.onPostEvent(LiveListenerBus.scala:31)
at org.apache.spark.scheduler.LiveListenerBus.onPostEvent(LiveListenerBus.scala:31)
at org.apache.spark.util.ListenerBus$class.postToAll(ListenerBus.scala:55)
at org.apache.spark.util.AsynchronousListenerBus.postToAll(AsynchronousListenerBus.scala:37)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(AsynchronousListenerBus.scala:80)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(AsynchronousListenerBus.scala:65)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(AsynchronousListenerBus.scala:65)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1.apply$mcV$sp(AsynchronousListenerBus.scala:64)
at org.apache.spark.util.Utils$.tryOrStopSparkContext(Utils.scala:1181)
at org.apache.spark.util.AsynchronousListenerBus$$anon$1.run(AsynchronousListenerBus.scala:63)
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 2, localhost): java.lang.UnsupportedOperationException: empty.init
at scala.collection.TraversableLike$class.init(TraversableLike.scala:475)
at scala.collection.mutable.ArrayOps$ofRef.scala$collection$IndexedSeqOptimized$$super$init(ArrayOps.scala:108)
at scala.collection.IndexedSeqOptimized$class.init(IndexedSeqOptimized.scala:129)
at scala.collection.mutable.ArrayOps$ofRef.init(ArrayOps.scala:108)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$.buildDocument(<console>:58)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1598)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1843)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1856)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1869)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1940)
at org.apache.spark.rdd.RDD.count(RDD.scala:1157)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:62)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:67)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:69)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:71)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:73)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:75)
at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:77)
at $iwC$$iwC$$iwC$$iwC.<init>(<console>:79)
at $iwC$$iwC$$iwC.<init>(<console>:81)
at $iwC$$iwC.<init>(<console>:83)
at $iwC.<init>(<console>:85)
at <init>(<console>:87)
at .<init>(<console>:91)
at .<clinit>(<console>)
at .<init>(<console>:7)
at .<clinit>(<console>)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1045)
at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1326)
at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:821)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:852)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:800)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1064)
at org.apache.spark.repl.Main$.main(Main.scala:31)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.UnsupportedOperationException: empty.init
at scala.collection.TraversableLike$class.init(TraversableLike.scala:475)
at scala.collection.mutable.ArrayOps$ofRef.scala$collection$IndexedSeqOptimized$$super$init(ArrayOps.scala:108)
at scala.collection.IndexedSeqOptimized$class.init(IndexedSeqOptimized.scala:129)
at scala.collection.mutable.ArrayOps$ofRef.init(ArrayOps.scala:108)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$.buildDocument(<console>:58)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$TweetParser$$anonfun$2.apply(<console>:49)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1598)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1157)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1869)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Apparently, it is saying that I'm trying to call count on an empty RDD. What's happening? It also fails with this line:
val terms = termDocsRdd.flatMap(_.terms).distinct().sortBy(identity)
Same empty.init exception.
Thanks.
UPDATE: Adding required information
object TweetParser extends Serializable {

  val headerPart = "polarity"
  val mentionRegex = """#(.)+?\s""".r
  val fullRegex = """(\d+),(.+?),(N|P|NEU|NONE)(,\w+|;\w+)*""".r

  def parseAll(csvFiles: Iterable[String], sc: SparkContext): RDD[Document] = {
    val csv = sc.textFile(csvFiles mkString ",")
    //val docs = scala.collection.mutable.ArrayBuffer.empty[Document]
    val docs = csv.filter(!_.contains(headerPart)).map(buildDocument(_))
    docs
    //docs.filter(!_.docId.equals("INVALID"))
  }

  def buildDocument(line: String): Document = {
    val lineSplit = line.split(",")
    val id = lineSplit.head
    val txt = lineSplit.tail.init.init.mkString(",")
    val sent = lineSplit.init.last
    val opt = lineSplit.last
    if (id != null && txt != null && sent != null) {
      if (txt.equals("")) {
        //the line does not contain the option after sentiment
        new Document(id, mentionRegex.replaceAllIn(sent, ""), Set(opt))
      } else {
        new Document(id, mentionRegex.replaceAllIn(txt, ""), Set(sent))
      }
    } else {
      println("Invalid")
      new Document("INVALID")
    }
  }
}

case class Document(docId: String, body: String = "", labels: Set[String] = Set.empty)
Tokenizer object:
import java.io.StringReader
import org.apache.lucene.analysis.es.SpanishAnalyzer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.Version
import org.apache.spark.rdd.RDD
object Tokenizer extends Serializable {
  //val LuceneVersion = Version.LUCENE_5_1_0

  def tokenizeAll(docs: RDD[Document]) = docs.map(tokenize)

  def tokenize(doc: Document): TermDoc = TermDoc(doc.docId, doc.labels, tokenize(doc.body))

  def tokenize(content: String): Seq[String] = {
    val result = scala.collection.mutable.ArrayBuffer.empty[String]
    /*content.split("\n").foreach(line => line.split(" ").foreach(
      word => if (word.startsWith("#")) result += word.substring(1) else word
    ))*/
    val analyzer = new SpanishAnalyzer()
    analyzer.setVersion(Version.LUCENE_5_1_0)
    val tReader = new StringReader(content)
    val tStream = analyzer.tokenStream("", tReader)
    val term = tStream.addAttribute(classOf[CharTermAttribute])
    tStream.reset()
    while (tStream.incrementToken()) {
      val termValue = term.toString
      if (termValue.startsWith("#")) {
        result += termValue.substring(1)
      } else {
        result += termValue
      }
    }
    result
  }
}

case class TermDoc(doc: String, labels: Set[String], terms: Seq[String])
Driver:
val csvFiles = List("/path/to/file.csv", "/path/to/file2.csv", "/path/to/file3.csv")
val docs = TweetParser.parseAll(csvFiles, sc)
val termDocsRdd = Tokenizer.tokenizeAll(docs)
val numDocs = termDocsRdd.count()
val terms = termDocsRdd.flatMap(_.terms).distinct().sortBy(identity)
I'm testing this in spark-shell, which is why the driver code looks like this. I hope this clarifies the question.
Apparently, it is saying that I'm trying to call count on an empty RDD
Actually - no, that's not what the error says. count triggers the computation of this RDD, and this exception is thrown while calculating one of the RDD's records.
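That also explains why take(#) printed results: take only evaluates as many records as it needs (usually from the first partition), while count runs your parsing over every line. A quick, hedged illustration using the termDocsRdd from your driver code:

    termDocsRdd.take(8)   // may succeed: only a handful of records are actually computed
    termDocsRdd.count()   // materializes every record, so one malformed line fails the whole job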
Specifically, the error states:
java.lang.UnsupportedOperationException: empty.init
This is probably thrown from one of these expressions within buildDocument:
val txt = lineSplit.tail.init.init.mkString(",")
val sent = lineSplit.init.last
This code fragment assumes lineSplit is a collection with at least 3 elements - and the exception you see is the result of that assumption being incorrect for at least one record: for example, if lineSplit had just 2 elements, lineSplit.tail.init would be an empty collection, and therefore lineSplit.tail.init.init would throw the exception you see.
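You can reproduce the mechanics with a two-field line (for example, a CSV record that only has an id and a sentiment):

    val lineSplit = "142493511634259968,NEU".split(",")
    lineSplit.tail            // Array(NEU)
    lineSplit.tail.init       // Array()  - empty
    lineSplit.tail.init.init  // throws java.lang.UnsupportedOperationException: empty.init

To find the offending lines in your data, you can filter the raw text RDD with scala.util.Try(buildDocument(line)).isFailure and take a few of the matches.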
To overcome this, you can rewrite your parsing method to handle such irregularities in the data properly. One option is to wrap the call in a Try(...) and keep only the successful records, e.g.:
import scala.util.{Try, Success}
def parseAll(csvFiles: Iterable[String], sc: SparkContext): RDD[Document] = {
  val csv = sc.textFile(csvFiles mkString ",")
  val docs = csv.filter(!_.contains(headerPart))
    .map(s => Try(buildDocument(s)))
    .collect { case Success(v) => v }
  docs
}
Another option is to change the parsing so that "missing" parts of lineSplit are set to null (as the following lines seem to expect), e.g.:
def buildDocument(line: String): Document = {
  val (id, txt, sent, opt) = line.split(",").padTo(5, null) match {
    case Array(a, b, c, d, e, _*) => (a, s"$b,$c", d, e)
  }
  // continue as before....
}
I have two Spark DataFrames. I want to iterate over one of them with foreach and, for each row, fetch the records from the other DataFrame that match a particular someId.
Every time I do this I get a java.lang.NullPointerException.
I have posted my code with comments inside the foreach loop. I have tried three ways of doing this, but every one fails with the same error.
Please help me fix this issue.
val schListDf = spark.read.format("csv")
  .option("header", "true")
  .load("/home/user/projects/scheduled.csv")
schListDf.createOrReplaceTempView("scheduled")

val trsListDf = spark.read.format("csv")
  .option("header", "true")
  .load("/home/user/projects/transaction.csv")
trsListDf.createOrReplaceTempView("transaction")

// this works fine
val df3 = spark.sql("select * from transaction limit 5").show()

schListDf.foreach(row => {
  if (row(2) != null) {
    // first attempt: same error
    val df = spark.sql("select * from transaction where someid = '" + row(2) + "'")
    // second attempt (without the someid filter): same error
    val df2 = spark.sql("select * from transaction limit 5")
    // third attempt (filtering the DataFrame directly): same error
    val filteredDataListDf = trsListDf.filter($"someid" === row(2))
  }
})
18/12/02 10:36:34 ERROR Executor: Exception in task 0.0 in stage 4.0 (TID 4)
java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:142)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:140)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:641)
at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:52)
at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:48)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
18/12/02 10:36:34 ERROR Executor: Exception in task 3.0 in stage 4.0 (TID 7)
java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:142)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:140)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:641)
at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:52)
at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:48)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
18/12/02 10:36:34 ERROR Executor: Exception in task 1.0 in stage 4.0 (TID 5)
java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:142)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:140)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:641)
at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:52)
at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:48)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
18/12/02 10:36:34 ERROR Executor: Exception in task 2.0 in stage 4.0 (TID 6)
java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:142)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:140)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:641)
at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:52)
at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:48)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
18/12/02 10:36:34 WARN TaskSetManager: Lost task 2.0 in stage 4.0 (TID 6, localhost, executor driver): java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:142)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:140)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:641)
at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:52)
at controllers.FileProcess$$anonfun$hnbFile$1.apply(FileProcess.scala:48)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Some Spark constructs are driver-only.
A DataFrame cannot be accessed from within a foreach, because the foreach body runs on the executors. That is the paradigm, and the same applies to RDDs and the SparkSession.
That is to say, foreach itself is fine, but you cannot use a DataFrame val or call spark.sql inside it. If you really need row-by-row logic, run it on the driver, for example by collecting first and looping there.
This is a common misconception when starting out with Spark.
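If the goal is simply to pull the matching transaction rows for every scheduled someid, the usual pattern is to express it as a join rather than issuing one query per row. A minimal sketch, under the assumption that the third column of scheduled holds the someid and that transaction has a someid column:

    // join once instead of running spark.sql inside foreach
    val schedIds = schListDf
      .select(schListDf.columns(2))
      .withColumnRenamed(schListDf.columns(2), "someid")
      .where("someid is not null")
    val matched = trsListDf.join(schedIds, Seq("someid"))
    matched.show()

    // if row-by-row driver logic is really needed, collect first, then loop on the driver
    schListDf.collect().foreach { row =>
      if (row(2) != null) {
        spark.sql("select * from transaction where someid = '" + row(2) + "'").show()
      }
    }

The join keeps all the work on the executors; the collect variant only makes sense when scheduled is small enough to fit on the driver.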
I am trying to generate customer statistics using the following combineByKey transformation, but I get an ArrayIndexOutOfBoundsException and I don't see why. Can anyone clarify why this exception occurs? Thank you.
def createComb = (t: Array[String]) => {
  val total = t(5).toDouble
  val q = t(4).toInt
  (total / q, total / q, q, total)
}

def mergeValues: ((Double, Double, Int, Double), Array[String]) => (Double, Double, Int, Double) = {
  case ((mx, mn, q, tot), t) => {
    val total = t(5).toDouble
    val quan = t(4).toInt
    val mxx = scala.math.max(mx, total / q)
    val minn = scala.math.min(mn, total / q)
    (mxx, minn, quan + q, total + tot)
  }
}

def mergeComb: ((Double, Double, Int, Double), (Double, Double, Int, Double)) => (Double, Double, Int, Double) = {
  case ((mx1, mn1, q1, tot1), (mx2, mn2, q2, tot2)) =>
    (scala.math.max(mx1, mx2), scala.math.min(mn1, mn2), q1 + q2, tot1 + tot2)
}

val statsOfCust = productsTotalByKey.combineByKey(
  createComb,
  mergeValues,
  mergeComb,
  new org.apache.spark.HashPartitioner(productsTotalByKey.partitions.size))
Here is the output I got when I ran the code above on the Spark cluster and called first on the resulting RDD:
scala> statsOfCust.first
[Stage 22:> (0 + 1) / 2]18/11/17 21:26:31 WARN TaskSetManager: Lost task 0.0 in stage 22.0 (TID 26, wn01.itversity.com, executor 9): java.lang.ArrayIndexOutOfBoundsException: 5
at $line80.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$createComb$1.apply(<console>:24)
at $line80.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$createComb$1.apply(<console>:23)
at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:189)
at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:188)
at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:144)
at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1358)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.take(RDD.scala:1331)
at org.apache.spark.rdd.RDD$$anonfun$first$1.apply(RDD.scala:1372)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.first(RDD.scala:1371)
... 49 elided
Caused by: java.lang.ArrayIndexOutOfBoundsException: 5
at $anonfun$createComb$1.apply(<console>:24)
at $anonfun$createComb$1.apply(<console>:23)
at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:189)
at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:188)
at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:144)
at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Seems like a problem with the createComb method, where you are assuming that the t array has at least 6 elements.
It is just a quick guess. Let me know if it helps; if not, I will try to investigate it further :)
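One way to confirm this (and to keep the job running) is to inspect or drop the records whose value array is too short before calling combineByKey. A minimal sketch, assuming productsTotalByKey is an RDD of (key, Array[String]) pairs as the code above implies:

    // look at a few offending records to see what the raw data looks like
    productsTotalByKey
      .filter { case (_, t) => t.length < 6 }
      .take(5)
      .foreach { case (k, t) => println(s"$k -> ${t.mkString("|")}") }

    // keep only records with enough fields for t(4) and t(5)
    val wellFormed = productsTotalByKey.filter { case (_, t) => t.length >= 6 }
    val statsOfCust = wellFormed.combineByKey(
      createComb, mergeValues, mergeComb,
      new org.apache.spark.HashPartitioner(wellFormed.partitions.size))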
I have a class named Child2 that I want to serialize and deserialize. The class contains a LocalDateTime attribute, for which I have to write a custom serializer. I tried the two solutions below, but both throw exceptions.
Here is my code.
Solution 1
case class Child2(var str: String, var Num: Int, MyList: List[Int], val myDate: LocalDateTime = LocalDateTime.now()) {
  var number: Int = 555
}

class Message1SerializerDateTime extends Serializer[LocalDateTime] {

  private val LocalDateTimeClass = classOf[LocalDateTime]

  def deserialize(implicit format: Formats): PartialFunction[(TypeInfo, JValue), LocalDateTime] = {
    case (TypeInfo(LocalDateTimeClass, _), json) => json match {
      case JString(dt) => LocalDateTime.parse(dt)
      case x => throw new MappingException("Can't convert " + x + " to LocalDateTime")
    }
  }

  def serialize(implicit format: Formats): PartialFunction[Any, JValue] = {
    case x: LocalDateTime => JString(x.toString)
  }
}

object MessageTest extends App {

  implicit val formats = /*Serialization.formats(NoTypeHints)*/ DefaultFormats + new FieldSerializer[Child2] + new Message1SerializerDateTime

  var c = new Child2("Mary", 5, List(1, 2), LocalDateTime.now())
  c.number = 1
  // println("number" + c.number)

  val ser = write(c)
  println("Child class converted to string" + ser)

  var obj = read[Child2](ser)
  println("object of Child is " + obj)
  println("str" + obj.str)
  println("Num" + obj.Num)
  println("MyList" + obj)
  println("myDate" + obj.myDate)
  println("number" + obj.number)
}
and it throws a MappingException:
Child class converted to string{"number":1,"str":"Mary","Num":5,"MyList":[1,2],"myDate":"2015-07-28T16:45:44.030"}
[error] (run-main-2) net.liftweb.json.MappingException: unknown error
net.liftweb.json.MappingException: unknown error
at net.liftweb.json.Extraction$.extract(Extraction.scala:46)
at net.liftweb.json.JsonAST$JValue.extract(JsonAST.scala:312)
at net.liftweb.json.Serialization$.read(Serialization.scala:58)
at MessageTest$.delayedEndpoint$MessageTest$1(MessageTest.scala:42)
at MessageTest$delayedInit$body.apply(MessageTest.scala:15)
at scala.Function0$class.apply$mcV$sp(Function0.scala:40)
at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:12)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.collection.immutable.List.foreach(List.scala:383)
at scala.collection.generic.TraversableForwarder$class.foreach(TraversableForwarder.scala:35)
at scala.App$class.main(App.scala:76)
at MessageTest$.main(MessageTest.scala:15)
at MessageTest.main(MessageTest.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 49938
at com.thoughtworks.paranamer.BytecodeReadingParanamer$ClassReader.<init>(BytecodeReadingParanamer.java:451)
at com.thoughtworks.paranamer.BytecodeReadingParanamer$ClassReader.<init>(BytecodeReadingParanamer.java:431)
at com.thoughtworks.paranamer.BytecodeReadingParanamer$ClassReader.<init>(BytecodeReadingParanamer.java:492)
at com.thoughtworks.paranamer.BytecodeReadingParanamer$ClassReader.<init>(BytecodeReadingParanamer.java:337)
at com.thoughtworks.paranamer.BytecodeReadingParanamer.lookupParameterNames(BytecodeReadingParanamer.java:100)
at com.thoughtworks.paranamer.CachingParanamer.lookupParameterNames(CachingParanamer.java:75)
at com.thoughtworks.paranamer.CachingParanamer.lookupParameterNames(CachingParanamer.java:68)
at net.liftweb.json.Meta$ParanamerReader$.lookupParameterNames(Meta.scala:89)
at net.liftweb.json.Meta$Reflection$.argsInfo$1(Meta.scala:237)
at net.liftweb.json.Meta$Reflection$.constructorArgs(Meta.scala:253)
at net.liftweb.json.Meta$Reflection$$anonfun$constructors$1.apply(Meta.scala:227)
at net.liftweb.json.Meta$Reflection$$anonfun$constructors$1.apply(Meta.scala:227)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:245)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at net.liftweb.json.Meta$Reflection$.constructors(Meta.scala:227)
at net.liftweb.json.Meta$.net$liftweb$json$Meta$$constructors$1(Meta.scala:97)
at net.liftweb.json.Meta$.mkConstructor$1(Meta.scala:124)
at net.liftweb.json.Meta$.fieldMapping$1(Meta.scala:151)
at net.liftweb.json.Meta$.net$liftweb$json$Meta$$toArg$1(Meta.scala:155)
at net.liftweb.json.Meta$$anonfun$net$liftweb$json$Meta$$constructors$1$1$$anonfun$apply$1.apply(Meta.scala:99)
at net.liftweb.json.Meta$$anonfun$net$liftweb$json$Meta$$constructors$1$1$$anonfun$apply$1.apply(Meta.scala:98)
at scala.collection.immutable.List.map(List.scala:278)
at net.liftweb.json.Meta$$anonfun$net$liftweb$json$Meta$$constructors$1$1.apply(Meta.scala:98)
at net.liftweb.json.Meta$$anonfun$net$liftweb$json$Meta$$constructors$1$1.apply(Meta.scala:97)
at scala.collection.immutable.List.map(List.scala:274)
at net.liftweb.json.Meta$.net$liftweb$json$Meta$$constructors$1(Meta.scala:97)
at net.liftweb.json.Meta$$anonfun$mappingOf$1.apply(Meta.scala:169)
at net.liftweb.json.Meta$$anonfun$mappingOf$1.apply(Meta.scala:161)
at net.liftweb.json.Meta$Memo.memoize(Meta.scala:199)
at net.liftweb.json.Meta$.mappingOf(Meta.scala:161)
at net.liftweb.json.Extraction$.net$liftweb$json$Extraction$$mkMapping$1(Extraction.scala:194)
at net.liftweb.json.Extraction$.net$liftweb$json$Extraction$$extract0(Extraction.scala:199)
at net.liftweb.json.Extraction$.extract(Extraction.scala:43)
at net.liftweb.json.JsonAST$JValue.extract(JsonAST.scala:312)
at net.liftweb.json.Serialization$.read(Serialization.scala:58)
at MessageTest$.delayedEndpoint$MessageTest$1(MessageTest.scala:42)
at MessageTest$delayedInit$body.apply(MessageTest.scala:15)
at scala.Function0$class.apply$mcV$sp(Function0.scala:40)
at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:12)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.collection.immutable.List.foreach(List.scala:383)
at scala.collection.generic.TraversableForwarder$class.foreach(TraversableForwarder.scala:35)
at scala.App$class.main(App.scala:76)
at MessageTest$.main(MessageTest.scala:15)
at MessageTest.main(MessageTest.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
Solution 2
case class Child2(var str: String, var Num: Int, MyList: List[Int], val myDate: LocalDateTime = LocalDateTime.now()) {
  var number: Int = 555
}

class Message1Serializer extends Serializer[Child2] {

  private val IntervalClass = classOf[Child2]

  def deserialize(implicit format: Formats): PartialFunction[(TypeInfo, JValue), Child2] = {
    case (TypeInfo(IntervalClass, _), json) => json match {
      case JObject(
        JField("str", JString(str)) :: JField("Num", JInt(num)) ::
        JField("MyList", JArray(mylist)) :: (JField("myDate", JString(mydate)) ::
        JField("number", JInt(number)) :: Nil)
      ) => {
        val c = Child2(
          str, num.intValue(), mylist.map(_.values.toString.toInt), LocalDateTime.parse(mydate)
        )
        c.number = number.intValue()
        c
      }
      case x => throw new MappingException("Can't convert " + x + " to Interval")
    }
  }

  def serialize(implicit format: Formats): PartialFunction[Any, JValue] = {
    case x: Child2 =>
      JObject(
        JField("str", JString(x.str)) :: JField("Num", JInt(x.Num)) ::
        JField("MyList", JArray(x.MyList.map(JInt(_)))) ::
        JField("myDate", JString(x.myDate.toString)) ::
        JField("number", JInt(x.number)) :: Nil
      )
  }
}

object MessageTest extends App {

  implicit val formats = /*Serialization.formats(NoTypeHints)*/ DefaultFormats + new Message1Serializer

  var c = new Child2("Mary", 5, List(1, 2), LocalDateTime.now())
  c.number = 1
  // println("number" + c.number)

  val ser = write(c)
  println("Child class converted to string" + ser)

  var obj = read[Child2](ser)
  println("object of Child is " + obj)
  println("str" + obj.str)
  println("Num" + obj.Num)
  println("MyList" + obj)
  println("myDate" + obj.myDate)
  println("number" + obj.number)
}
It also throws a MappingException:
Child class converted to string{"str":"Mary","Num":5,"MyList":[1,2],"myDate":"2015-07-28T17:09:31.512","number":1}
[error] (run-main-1) net.liftweb.json.MappingException: unknown error
net.liftweb.json.MappingException: unknown error
at net.liftweb.json.Extraction$.extract(Extraction.scala:46)
at net.liftweb.json.JsonAST$JValue.extract(JsonAST.scala:312)
at net.liftweb.json.Serialization$.read(Serialization.scala:58)
at MessageTest$.delayedEndpoint$MessageTest$1(MessageTest.scala:58)
at MessageTest$delayedInit$body.apply(MessageTest.scala:15)
at scala.Function0$class.apply$mcV$sp(Function0.scala:40)
at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:12)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.collection.immutable.List.foreach(List.scala:383)
at scala.collection.generic.TraversableForwarder$class.foreach(TraversableForwarder.scala:35)
at scala.App$class.main(App.scala:76)
at MessageTest$.main(MessageTest.scala:15)
at MessageTest.main(MessageTest.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 49938
at com.thoughtworks.paranamer.BytecodeReadingParanamer$ClassReader.<init>(BytecodeReadingParanamer.java:451)
at com.thoughtworks.paranamer.BytecodeReadingParanamer$ClassReader.<init>(BytecodeReadingParanamer.java:431)
at com.thoughtworks.paranamer.BytecodeReadingParanamer$ClassReader.<init>(BytecodeReadingParanamer.java:492)
at com.thoughtworks.paranamer.BytecodeReadingParanamer$ClassReader.<init>(BytecodeReadingParanamer.java:337)
at com.thoughtworks.paranamer.BytecodeReadingParanamer.lookupParameterNames(BytecodeReadingParanamer.java:100)
at com.thoughtworks.paranamer.CachingParanamer.lookupParameterNames(CachingParanamer.java:75)
at com.thoughtworks.paranamer.CachingParanamer.lookupParameterNames(CachingParanamer.java:68)
at net.liftweb.json.Meta$ParanamerReader$.lookupParameterNames(Meta.scala:89)
at net.liftweb.json.Meta$Reflection$.argsInfo$1(Meta.scala:237)
at net.liftweb.json.Meta$Reflection$.constructorArgs(Meta.scala:253)
at net.liftweb.json.Meta$Reflection$$anonfun$constructors$1.apply(Meta.scala:227)
at net.liftweb.json.Meta$Reflection$$anonfun$constructors$1.apply(Meta.scala:227)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:245)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at net.liftweb.json.Meta$Reflection$.constructors(Meta.scala:227)
at net.liftweb.json.Meta$.net$liftweb$json$Meta$$constructors$1(Meta.scala:97)
at net.liftweb.json.Meta$.mkConstructor$1(Meta.scala:124)
at net.liftweb.json.Meta$.fieldMapping$1(Meta.scala:151)
at net.liftweb.json.Meta$.net$liftweb$json$Meta$$toArg$1(Meta.scala:155)
at net.liftweb.json.Meta$$anonfun$net$liftweb$json$Meta$$constructors$1$1$$anonfun$apply$1.apply(Meta.scala:99)
at net.liftweb.json.Meta$$anonfun$net$liftweb$json$Meta$$constructors$1$1$$anonfun$apply$1.apply(Meta.scala:98)
at scala.collection.immutable.List.map(List.scala:278)
at net.liftweb.json.Meta$$anonfun$net$liftweb$json$Meta$$constructors$1$1.apply(Meta.scala:98)
at net.liftweb.json.Meta$$anonfun$net$liftweb$json$Meta$$constructors$1$1.apply(Meta.scala:97)
at scala.collection.immutable.List.map(List.scala:274)
at net.liftweb.json.Meta$.net$liftweb$json$Meta$$constructors$1(Meta.scala:97)
at net.liftweb.json.Meta$$anonfun$mappingOf$1.apply(Meta.scala:169)
at net.liftweb.json.Meta$$anonfun$mappingOf$1.apply(Meta.scala:161)
at net.liftweb.json.Meta$Memo.memoize(Meta.scala:199)
at net.liftweb.json.Meta$.mappingOf(Meta.scala:161)
at net.liftweb.json.Extraction$.net$liftweb$json$Extraction$$mkMapping$1(Extraction.scala:194)
at net.liftweb.json.Extraction$.net$liftweb$json$Extraction$$extract0(Extraction.scala:199)
at net.liftweb.json.Extraction$.extract(Extraction.scala:43)
at net.liftweb.json.JsonAST$JValue.extract(JsonAST.scala:312)
at net.liftweb.json.Serialization$.read(Serialization.scala:58)
at MessageTest$.delayedEndpoint$MessageTest$1(MessageTest.scala:58)
at MessageTest$delayedInit$body.apply(MessageTest.scala:15)
at scala.Function0$class.apply$mcV$sp(Function0.scala:40)
at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:12)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.collection.immutable.List.foreach(List.scala:383)
at scala.collection.generic.TraversableForwarder$class.foreach(TraversableForwarder.scala:35)
at scala.App$class.main(App.scala:76)
at MessageTest$.main(MessageTest.scala:15)
at MessageTest.main(MessageTest.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
Please help me resolve this issue. How should I write a serializer for Java 8's LocalDateTime?
I can vaguely remember having a similar issue, and I solved it by parsing with the corresponding DateTimeFormatter:
LocalDateTime.parse(dt, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS"))
Correct the pattern for your case (note that the literal T has to be quoted in the pattern).
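For reference, a quick round trip with that formatter (and with the default ISO parser) on the value printed in the question; a small sketch, not tied to any particular serializer:

    import java.time.LocalDateTime
    import java.time.format.DateTimeFormatter

    val fmt = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS")
    val dt  = LocalDateTime.parse("2015-07-28T16:45:44.030", fmt)
    dt.format(fmt)                                   // "2015-07-28T16:45:44.030"

    // LocalDateTime.parse(s) with no formatter already understands this ISO-8601 form,
    // so the deserializer can also keep using plain LocalDateTime.parse(dt)
    LocalDateTime.parse("2015-07-28T16:45:44.030")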
I'm just trying to install logback-demo, using Maven to package the files, and run the demo for the first time, but I keep getting these odd Java exception messages:
10:12:03.103 [pool-3-thread-1] INFO ch.qos.logback.demo.LoggingTask - Howdydy-diddly-ho - 8499
java.lang.Exception: e
at ch.qos.logback.demo.LoggingTask.run(LoggingTask.java:28) [classes/:na]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471) [na:1.7.0_65]
at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:304) [na:1.7.0_65]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:178) [na:1.7.0_65]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) [na:1.7.0_65]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) [na:1.7.0_65]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) [na:1.7.0_65]
at java.lang.Thread.run(Thread.java:745) [na:1.7.0_65]
10:12:03,104 |-INFO in c.q.l.core.rolling.DefaultTimeBasedFileNamingAndTriggeringPolicy - Elapsed period: Thu May 21 10:11:03 PDT 2015
10:12:03,104 |-INFO in c.q.l.co.rolling.helper.RenameUtil - Renaming file [logFile.log] to [logFile.log4499498230724.tmp]
10:12:03,105 |-INFO in ch.qos.logback.core.rolling.helper.Compressor - ZIP compressing [logFile.log4499498230724.tmp] as [logFile.2015-05-21_10-11.log.zip]
This message repeats over and over for hours. Any ideas on what I am doing wrong?
According to the source code, this seems normal:
public class LoggingTask implements Runnable {
  // ...
  public void run() {
    if (i % 100 == 99) {
      logger.info(HOWDY_MARKER, msg + " - " + (i++), new Exception("e"));
    } else {
      logger.trace("x {]", i++);
    }
  }
}
Also note that no exception is thrown here, it's only logged.
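In other words, the stack trace comes from passing a Throwable to the logger, not from anything being thrown. A tiny SLF4J sketch (hypothetical logger name) that produces the same kind of output while the program keeps running:

    import org.slf4j.LoggerFactory

    object LogDemo extends App {
      private val logger = LoggerFactory.getLogger("demo")

      // the exception is only constructed and handed to the logger, never thrown
      logger.info("Howdydy-diddly-ho - 8499", new Exception("e"))

      println("still running") // proves control flow was not interrupted
    }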
My Spark Java application throws a NotSerializableException on Hadoop writables.
public final class myAPP {

  public static void main(String[] args) throws Exception {
    if (args.length < 1) {
      System.err.println("Usage: myAPP <file>");
      System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("myAPP").setMaster("local");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    Configuration conf = new Configuration();
    JavaPairRDD<LongWritable, Text> lines = ctx.newAPIHadoopFile(args[0], TextInputFormat.class, LongWritable.class, Text.class, conf);
    System.out.println(lines.collect().toString());
    ctx.stop();
  }
}
Running it throws:
java.io.NotSerializableException: org.apache.hadoop.io.LongWritable
Serialization stack:
- object not serializable (class: org.apache.hadoop.io.LongWritable, value: 15227295)
- field (class: scala.Tuple2, name: _1, type: class java.lang.Object)
- object (class scala.Tuple2, (15227295,))
- element of array (index: 0)
- array (class [Lscala.Tuple2;, size 1153163)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:38)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:80)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
15/04/26 16:05:05 ERROR TaskSetManager: Task 0.0 in stage 0.0 (TID 0) had a not serializable result: org.apache.hadoop.io.LongWritable
Serialization stack:
- object not serializable (class: org.apache.hadoop.io.LongWritable, value: 15227295)
- field (class: scala.Tuple2, name: _1, type: class java.lang.Object)
- object (class scala.Tuple2, (15227295,))
- element of array (index: 0)
- array (class [Lscala.Tuple2;, size 1153163); not retrying
15/04/26 16:05:05 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
15/04/26 16:05:05 INFO TaskSchedulerImpl: Cancelling stage 0
15/04/26 16:05:05 INFO DAGScheduler: Job 0 failed: collect at Parser2.java:60, took 0.460181 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 0.0 (TID 0) had a not serializable result: org.apache.hadoop.io.LongWritable
In a Spark Scala program I register the Hadoop writables as below and it works fine:
sparkConf.registerKryoClasses(Array(classOf[org.apache.hadoop.io.LongWritable], classOf[org.apache.hadoop.io.Text]))
Apparently this approach doesn't work with the Spark Java API, so I tried:
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
sparkConf.set("spark.kryo.registrator", LongWritable.class.getName());
and it fails with:
Exception in thread "main" org.apache.spark.SparkException: Failed to register classes with Kryo
at org.apache.spark.serializer.KryoSerializer.newKryo(KryoSerializer.scala:101)
at org.apache.spark.serializer.KryoSerializerInstance.<init>(KryoSerializer.scala:153)
at org.apache.spark.serializer.KryoSerializer.newInstance(KryoSerializer.scala:115)
at org.apache.spark.broadcast.TorrentBroadcast$.blockifyObject(TorrentBroadcast.scala:200)
at org.apache.spark.broadcast.TorrentBroadcast.writeBlocks(TorrentBroadcast.scala:101)
at org.apache.spark.broadcast.TorrentBroadcast.<init>(TorrentBroadcast.scala:84)
at org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:34)
at org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:29)
at org.apache.spark.broadcast.BroadcastManager.newBroadcast(BroadcastManager.scala:62)
at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1051)
at org.apache.spark.rdd.NewHadoopRDD.<init>(NewHadoopRDD.scala:77)
at org.apache.spark.SparkContext.newAPIHadoopFile(SparkContext.scala:848)
at org.apache.spark.api.java.JavaSparkContext.newAPIHadoopFile(JavaSparkContext.scala:488)
at com.nsn.PMParser.Parser2.main(Parser2.java:56)
Caused by: java.lang.ClassCastException: org.apache.hadoop.io.LongWritable cannot be cast to org.apache.spark.serializer.KryoRegistrator
at org.apache.spark.serializer.KryoSerializer$$anonfun$newKryo$3.apply(KryoSerializer.scala:97)
at org.apache.spark.serializer.KryoSerializer$$anonfun$newKryo$3.apply(KryoSerializer.scala:97)
at scala.Option.map(Option.scala:145)
at org.apache.spark.serializer.KryoSerializer.newKryo(KryoSerializer.scala:97)
... 13 more
How do I fix the Hadoop writables NotSerializableException with the Apache Spark Java API?
As of Spark v1.4.0, you can use this Java API to register classes to be serialized using Kryo:
https://spark.apache.org/docs/latest/api/java/org/apache/spark/SparkConf.html#registerKryoClasses(java.lang.Class[])
You pass in an array of Class objects, each of which can be obtained using
http://docs.oracle.com/javase/7/docs/api/java/lang/Class.html#forName(java.lang.String)
such as:
new SparkConf().registerKryoClasses(new Class<?>[]{
    Class.forName("org.apache.hadoop.io.LongWritable"),
    Class.forName("org.apache.hadoop.io.Text")
});
Hope this helps.
Alternatively, use
sparkConf.set("spark.kryo.classesToRegister", "org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text")
or you can simply use
ctx.textFile(args[0]);
to load the RDD as plain strings, with no writables involved.