ClassNotFoundException HadoopMapReduceCommitProtocol - java

I am trying to run a Spark sample in local mode, but am getting the following stack trace:
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/spark/internal/io/HadoopMapReduceCommitProtocol
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClass(ClassLoader.java:763)
at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
at java.net.URLClassLoader.defineClass(URLClassLoader.java:467)
at java.net.URLClassLoader.access$100(URLClassLoader.java:73)
at java.net.URLClassLoader$1.run(URLClassLoader.java:368)
at java.net.URLClassLoader$1.run(URLClassLoader.java:362)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:361)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at org.apache.spark.sql.internal.SQLConf$.<init>(SQLConf.scala:383)
at org.apache.spark.sql.internal.SQLConf$.<clinit>(SQLConf.scala)
at org.apache.spark.sql.internal.StaticSQLConf$$anonfun$buildConf$1.apply(SQLConf.scala:930)
at org.apache.spark.sql.internal.StaticSQLConf$$anonfun$buildConf$1.apply(SQLConf.scala:928)
at org.apache.spark.internal.config.TypedConfigBuilder$$anonfun$createWithDefault$1.apply(ConfigBuilder.scala:122)
at org.apache.spark.internal.config.TypedConfigBuilder$$anonfun$createWithDefault$1.apply(ConfigBuilder.scala:122)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.internal.config.TypedConfigBuilder.createWithDefault(ConfigBuilder.scala:122)
at org.apache.spark.sql.internal.StaticSQLConf$.<init>(SQLConf.scala:937)
at org.apache.spark.sql.internal.StaticSQLConf$.<clinit>(SQLConf.scala)
at org.apache.spark.sql.SparkSession$.org$apache$spark$sql$SparkSession$$sessionStateClassName(SparkSession.scala:962)
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:111)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:109)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$getOrCreate$5.apply(SparkSession.scala:878)
at org.apache.spark.sql.SparkSession$Builder$$anonfun$getOrCreate$5.apply(SparkSession.scala:878)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99)
at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:99)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:230)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40)
at scala.collection.mutable.HashMap.foreach(HashMap.scala:99)
at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:878)
at com.megaport.PipelineExample$.main(PipelineExample.scala:37)
at com.megaport.PipelineExample.main(PipelineExample.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.internal.io.HadoopMapReduceCommitProtocol
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
I can see the class in the GitHub repo, but it is not in the Maven lib, nor in the distro's spark-core_2.11-2.0.2.jar (I have the distro bundled with Hadoop).
The code I am trying to run is taken from the examples in the Spark distro, and it fails at the getOrCreate stage...
// scalastyle:off println
package com.megaport
// $example on$
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
// $example off$
import org.apache.spark.sql.SparkSession
object PipelineExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("My Spark Application") // optional and will be autogenerated if not specified
      .master("local[*]")              // avoid hardcoding the deployment environment
      // .enableHiveSupport()          // self-explanatory, isn't it?
      .getOrCreate

    // $example on$
    // Prepare training documents from a list of (id, text, label) tuples.
    val training = spark.createDataFrame(Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    )).toDF("id", "text", "label")

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.01)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training)

    // Now we can optionally save the fitted pipeline to disk
    model.write.overwrite().save("/tmp/spark-logistic-regression-model")

    // We can also save this unfit pipeline to disk
    pipeline.write.overwrite().save("/tmp/unfit-lr-model")

    // And load it back in during production
    val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model")

    // Prepare test documents, which are unlabeled (id, text) tuples.
    val test = spark.createDataFrame(Seq(
      (4L, "spark i j k"),
      (5L, "l m n"),
      (6L, "mapreduce spark"),
      (7L, "apache hadoop")
    )).toDF("id", "text")

    // Make predictions on test documents.
    model.transform(test)
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }
    // $example off$

    spark.stop()
  }
}

Well, if it's not in your Java library, then you should download the dependent jar and add it to your classpath.
See this SO question for more details:
How to import a jar in Eclipse
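As a quick sanity check (my own suggestion, not part of the answer above), you can test whether the missing class is actually visible to the classloader that runs your application. The class name comes from the stack trace; the ClasspathCheck wrapper is just a hypothetical name:

// Hypothetical diagnostic: try to load the class the NoClassDefFoundError complains about.
// If this fails, the spark-core jar on the classpath does not contain it, which usually
// points at mismatched Spark artifact versions (e.g. spark-sql and spark-core from
// different releases) rather than a missing class in the Spark codebase.
public class ClasspathCheck {
    public static void main(String[] args) {
        try {
            Class<?> c = Class.forName("org.apache.spark.internal.io.HadoopMapReduceCommitProtocol");
            java.security.CodeSource src = c.getProtectionDomain().getCodeSource();
            System.out.println("Found in: " + (src != null ? src.getLocation() : "<unknown location>"));
        } catch (ClassNotFoundException e) {
            System.out.println("Not on the classpath: " + e.getMessage());
        }
    }
}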

Related

azure identity java sdk throwing MutableCoercionConfig error

I'm new to this Java library. I learned that the Azure Java SDK can be used in a Scala environment, so I have tried using the azure-identity library in the Databricks environment:
DBR Version 7.3 LTS
scala 2.12
maven coordinate com.azure:azure-identity:1.3.5
Please help me resolve the error. Here is the code:
import com.azure.identity._;
import com.azure.identity.DefaultAzureCredentialBuilder;
val clientID = dbutils.secrets.get(scope="****",key="****");
val ClientSecret = dbutils.secrets.get(scope="****",key="****");
val tenantID = dbutils.secrets.get(scope="****",key="****");
val endpoint = "****" ;
val clientSecretCredential:ClientSecretCredential = new ClientSecretCredentialBuilder().tenantId(tenantID).clientId(clientID).clientSecret(ClientSecret).build();
Here is the error I face:
java.lang.NoSuchMethodError: com.fasterxml.jackson.dataformat.xml.XmlMapper.coercionConfigDefaults()Lcom/fasterxml/jackson/databind/cfg/MutableCoercionConfig
at com.fasterxml.jackson.dataformat.xml.XmlMapper.<init>(XmlMapper.java:145)
at com.fasterxml.jackson.dataformat.xml.XmlMapper.<init>(XmlMapper.java:127)
at com.fasterxml.jackson.dataformat.xml.XmlMapper.builder(XmlMapper.java:218)
at com.azure.core.util.serializer.JacksonAdapter.<init>(JacksonAdapter.java:137)
at com.azure.core.util.serializer.JacksonAdapter.createDefaultSerializerAdapter(JacksonAdapter.java:189)
at com.azure.identity.implementation.IdentityClient.<clinit>(IdentityClient.java:96)
at com.azure.identity.implementation.IdentityClientBuilder.build(IdentityClientBuilder.java:113)
at com.azure.identity.ClientSecretCredential.<init>(ClientSecretCredential.java:50)
at com.azure.identity.ClientSecretCredentialBuilder.build(ClientSecretCredentialBuilder.java:76)
at linea34e0846b16045d09ff78af8b346393c25.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(command-626790621540495:18)
at linea34e0846b16045d09ff78af8b346393c25.$read$$iw$$iw$$iw$$iw$$iw.<init>(command-626790621540495:68)
at linea34e0846b16045d09ff78af8b346393c25.$read$$iw$$iw$$iw$$iw.<init>(command-626790621540495:70)
at linea34e0846b16045d09ff78af8b346393c25.$read$$iw$$iw$$iw.<init>(command-626790621540495:72)
at linea34e0846b16045d09ff78af8b346393c25.$read$$iw$$iw.<init>(command-626790621540495:74)
at linea34e0846b16045d09ff78af8b346393c25.$read$$iw.<init>(command-626790621540495:76)
at linea34e0846b16045d09ff78af8b346393c25.$read.<init>(command-626790621540495:78)
at linea34e0846b16045d09ff78af8b346393c25.$read$.<init>(command-626790621540495:82)
at linea34e0846b16045d09ff78af8b346393c25.$read$.<clinit>(command-626790621540495)
at linea34e0846b16045d09ff78af8b346393c25.$eval$.$print$lzycompute(<notebook>:7)
at linea34e0846b16045d09ff78af8b346393c25.$eval$.$print(<notebook>:6)
at linea34e0846b16045d09ff78af8b346393c25.$eval.$print(<notebook>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:745)
at scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1021)
at scala.tools.nsc.interpreter.IMain.$anonfun$interpret$1(IMain.scala:574)
at scala.reflect.internal.util.ScalaClassLoader.asContext(ScalaClassLoader.scala:41)
at scala.reflect.internal.util.ScalaClassLoader.asContext$(ScalaClassLoader.scala:37)
at scala.reflect.internal.util.AbstractFileClassLoader.asContext(AbstractFileClassLoader.scala:41)
at scala.tools.nsc.interpreter.IMain.loadAndRunReq$1(IMain.scala:573)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:600)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:570)
at com.databricks.backend.daemon.driver.DriverILoop.execute(DriverILoop.scala:219)
at com.databricks.backend.daemon.driver.ScalaDriverLocal.$anonfun$repl$1(ScalaDriverLocal.scala:204)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.backend.daemon.driver.DriverLocal$TrapExitInternal$.trapExit(DriverLocal.scala:789)
at com.databricks.backend.daemon.driver.DriverLocal$TrapExit$.apply(DriverLocal.scala:742)
at com.databricks.backend.daemon.driver.ScalaDriverLocal.repl(ScalaDriverLocal.scala:204)
at com.databricks.backend.daemon.driver.DriverLocal.$anonfun$execute$10(DriverLocal.scala:431)
at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:239)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:234)
at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:231)
at com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:48)
at com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:276)
at com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:269)
at com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:48)
at com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:408)
at com.databricks.backend.daemon.driver.DriverWrapper.$anonfun$tryExecutingCommand$1(DriverWrapper.scala:653)
at scala.util.Try$.apply(Try.scala:213)
at com.databricks.backend.daemon.driver.DriverWrapper.tryExecutingCommand(DriverWrapper.scala:645)
at com.databricks.backend.daemon.driver.DriverWrapper.getCommandOutputAndError(DriverWrapper.scala:486)
at com.databricks.backend.daemon.driver.DriverWrapper.executeCommand(DriverWrapper.scala:598)
at com.databricks.backend.daemon.driver.DriverWrapper.runInnerLoop(DriverWrapper.scala:391)
at com.databricks.backend.daemon.driver.DriverWrapper.runInner(DriverWrapper.scala:337)
at com.databricks.backend.daemon.driver.DriverWrapper.run(DriverWrapper.scala:219)
at java.lang.Thread.run(Thread.java:748)

No such DSL method in Jenkins Declarative pipeline

I have the following pipeline file, which is meant to be used for creating an instance in OpenStack.
@Library('dev-shared-lib@master') _
pipeline {
    agent any
    stages {
        stage('Create OpenStack Instance') {
            environment {
                NAME_OF_THE_INSTANCE = "dev-setup"
                IMAGE_REFERENCE = "8917nej3"           // Ubuntu-18.04-LTS
                KEYNAME = "user"
                FLAVOR_REFERENCE = "eseeseee68fa1d2d5" // t2.large-80G
                MAX_INSTANCE_COUNT = 1
                MINIMUM_INSTANCE_COUNT = 1
                NETWORK_ID = "i2a4b7428506"            // dev-private-network
                SECURITY_GROUP_NAME = "default"
                REQUEST_JSON = ""
            }
            steps {
                script {
                    try {
                        // Request to create an instance at OpenStack
                        REQUEST_JSON = openStackClient.createAnInstance "${NAME_OF_THE_INSTANCE}", "${IMAGE_REFERENCE}", "${KEYNAME}", "${FLAVOR_REFERENCE}", "${NETWORK_ID}", "${SECURITY_GROUP_NAME}", "${MAX_INSTANCE_COUNT}", "${MINIMUM_INSTANCE_COUNT}"
                        echo "${REQUEST_JSON}"
                    } catch (exception) {
                        log.error(exception)
                    }
                }
            }
        }
    }
}
In the dev-shared-lib@master library used above, I have the openStackClient.groovy file located in the /vars/ directory with the below content:
def createAnInstance(String nameOfTheInstance, String imageReference, String keyName, String flavorReference,
                     String networkId, String securityGroupName, int maxInstanceCount, int minimumInstanceCount) {
    return "Hi "
}
The shared lib is correctly configured in Jenkins, and I have just returned "Hi " to check whether it is working. When the above pipeline is run, the following error is thrown. What am I doing wrong here?
[Pipeline] // stage
[Pipeline] }
[Pipeline] // node
[Pipeline] End of Pipeline
java.lang.NoSuchMethodError: No such DSL method 'createAnInstance' found among steps [ansiColor, archive, bat, build, catchError, checkout, deleteDir, dir, dockerFingerprintFrom, dockerFingerprintRun, echo, emailext, emailextrecipients, envVarsForTool, error, fileExists, findBuildScans, getContext, git, input, isUnix, jiraComment, jiraIssueSelector, jiraSearch, junit, library, libraryResource, load, lock, mail, milestone, node, parallel, powershell, properties, publishHTML, pwd, readFile, readTrusted, resolveScm, retry, script, sh, sleep, stage, stash, step, svn, timeout, timestamps, tm, tool, unarchive, unstable, unstash, validateDeclarativePipeline, waitUntil, warnError, withContext, withCredentials, withDockerContainer, withDockerRegistry, withDockerServer, withEnv, wrap, writeFile, ws] or symbols [all, allOf, always, ant, antFromApache, antOutcome, antTarget, any, anyOf, apiToken, architecture, archiveArtifacts, artifactManager, authorizationMatrix, batchFile, bitbucket, booleanParam, branch, brokenBuildSuspects, brokenTestsSuspects, buildButton, buildDiscarder, buildingTag, caseInsensitive, caseSensitive, certificate, changeRequest, changelog, changeset, checkoutToSubdirectory, choice, choiceParam, cleanWs, clock, cloud, command, credentials, cron, crumb, culprits, defaultView, demand, developers, disableConcurrentBuilds, disableResume, docker, dockerCert, dockerfile, downloadSettings, downstream, dumb, durabilityHint, envVars, environment, equals, expression, file, fileParam, filePath, fingerprint, frameOptions, freeStyle, freeStyleJob, fromScm, fromSource, git, gitHubBranchDiscovery, gitHubBranchHeadAuthority, gitHubForkDiscovery, gitHubSshCheckout, gitHubTagDiscovery, gitHubTrustContributors, gitHubTrustEveryone, gitHubTrustNobody, gitHubTrustPermissions, github, githubPush, gradle, headRegexFilter, headWildcardFilter, hyperlink, hyperlinkToModels, inheriting, inheritingGlobal, installSource, isRestartedRun, jdk, jdkInstaller, jgit, jgitapache, jnlp, jobName, label, lastDuration, lastFailure, lastGrantedAuthorities, lastStable, lastSuccess, legacy, legacySCM, list, local, location, logRotator, loggedInUsersCanDoAnything, masterBuild, maven, maven3Mojos, mavenErrors, mavenMojos, mavenWarnings, modernSCM, myView, newContainerPerStage, node, nodeProperties, nonInheriting, none, not, overrideIndexTriggers, paneStatus, parallelsAlwaysFailFast, parameters, password, pattern, permanent, pipeline-model, pipelineTriggers, plainText, plugin, pollSCM, preserveStashes, projectNamingStrategy, proxy, queueItemAuthenticator, quietPeriod, rateLimitBuilds, recipients, requestor, run, runParam, sSHLauncher, schedule, scmRetryCount, scriptApprovalLink, search, security, shell, skipDefaultCheckout, skipStagesAfterUnstable, slave, sourceRegexFilter, sourceWildcardFilter, ssh, sshUserPrivateKey, stackTrace, standard, status, string, stringParam, swapSpace, tag, text, textParam, tmpSpace, toolLocation, triggeredBy, unsecured, upstream, upstreamDevelopers, userSeed, usernameColonPassword, usernamePassword, viewsTabBar, weather, withAnt, zfs, zip] or globals [currentBuild, docker, env, log, openStackClient, params, pipeline, scm]
at org.jenkinsci.plugins.workflow.cps.DSL.invokeMethod(DSL.java:202)
at org.jenkinsci.plugins.workflow.cps.CpsScript.invokeMethod(CpsScript.java:122)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.codehaus.groovy.reflection.CachedMethod.invoke(CachedMethod.java:93)
at groovy.lang.MetaMethod.doMethodInvoke(MetaMethod.java:325)
at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1213)
at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1022)
at org.codehaus.groovy.runtime.callsite.PogoMetaClassSite.call(PogoMetaClassSite.java:42)
at org.codehaus.groovy.runtime.callsite.CallSiteArray.defaultCall(CallSiteArray.java:48)
at org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:113)
at org.kohsuke.groovy.sandbox.impl.Checker$1.call(Checker.java:160)
at org.kohsuke.groovy.sandbox.GroovyInterceptor.onMethodCall(GroovyInterceptor.java:23)
at org.jenkinsci.plugins.scriptsecurity.sandbox.groovy.SandboxInterceptor.onMethodCall(SandboxInterceptor.java:157)
at org.jenkinsci.plugins.scriptsecurity.sandbox.groovy.SandboxInterceptor.onMethodCall(SandboxInterceptor.java:142)
at org.kohsuke.groovy.sandbox.impl.Checker$1.call(Checker.java:158)
at org.kohsuke.groovy.sandbox.impl.Checker.checkedCall(Checker.java:162)
at com.cloudbees.groovy.cps.sandbox.SandboxInvoker.methodCall(SandboxInvoker.java:17)
at WorkflowScript.run(WorkflowScript:21)
at ___cps.transform___(Native Method)
at com.cloudbees.groovy.cps.impl.ContinuationGroup.methodCall(ContinuationGroup.java:84)
at com.cloudbees.groovy.cps.impl.FunctionCallBlock$ContinuationImpl.dispatchOrArg(FunctionCallBlock.java:113)
at com.cloudbees.groovy.cps.impl.FunctionCallBlock$ContinuationImpl.fixArg(FunctionCallBlock.java:83)
at sun.reflect.GeneratedMethodAccessor225.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.cloudbees.groovy.cps.impl.ContinuationPtr$ContinuationImpl.receive(ContinuationPtr.java:72)
at com.cloudbees.groovy.cps.impl.FunctionCallBlock$ContinuationImpl.dispatchOrArg(FunctionCallBlock.java:107)
at com.cloudbees.groovy.cps.impl.FunctionCallBlock$ContinuationImpl.fixArg(FunctionCallBlock.java:83)
at sun.reflect.GeneratedMethodAccessor225.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.cloudbees.groovy.cps.impl.ContinuationPtr$ContinuationImpl.receive(ContinuationPtr.java:72)
at com.cloudbees.groovy.cps.impl.ContinuationGroup.methodCall(ContinuationGroup.java:87)
at com.cloudbees.groovy.cps.impl.FunctionCallBlock$ContinuationImpl.dispatchOrArg(FunctionCallBlock.java:113)
at com.cloudbees.groovy.cps.impl.FunctionCallBlock$ContinuationImpl.fixArg(FunctionCallBlock.java:83)
at sun.reflect.GeneratedMethodAccessor225.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.cloudbees.groovy.cps.impl.ContinuationPtr$ContinuationImpl.receive(ContinuationPtr.java:72)
at com.cloudbees.groovy.cps.impl.ConstantBlock.eval(ConstantBlock.java:21)
at com.cloudbees.groovy.cps.Next.step(Next.java:83)
at com.cloudbees.groovy.cps.Continuable$1.call(Continuable.java:174)
at com.cloudbees.groovy.cps.Continuable$1.call(Continuable.java:163)
at org.codehaus.groovy.runtime.GroovyCategorySupport$ThreadCategoryInfo.use(GroovyCategorySupport.java:129)
at org.codehaus.groovy.runtime.GroovyCategorySupport.use(GroovyCategorySupport.java:268)
at com.cloudbees.groovy.cps.Continuable.run0(Continuable.java:163)
at org.jenkinsci.plugins.workflow.cps.SandboxContinuable.access$001(SandboxContinuable.java:18)
at org.jenkinsci.plugins.workflow.cps.SandboxContinuable.run0(SandboxContinuable.java:51)
at org.jenkinsci.plugins.workflow.cps.CpsThread.runNextChunk(CpsThread.java:186)
at org.jenkinsci.plugins.workflow.cps.CpsThreadGroup.run(CpsThreadGroup.java:370)
at org.jenkinsci.plugins.workflow.cps.CpsThreadGroup.access$200(CpsThreadGroup.java:93)
at org.jenkinsci.plugins.workflow.cps.CpsThreadGroup$2.call(CpsThreadGroup.java:282)
at org.jenkinsci.plugins.workflow.cps.CpsThreadGroup$2.call(CpsThreadGroup.java:270)
at org.jenkinsci.plugins.workflow.cps.CpsVmExecutorService$2.call(CpsVmExecutorService.java:66)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at hudson.remoting.SingleLaneExecutorService$1.run(SingleLaneExecutorService.java:131)
at jenkins.util.ContextResettingExecutorService$1.run(ContextResettingExecutorService.java:28)
at jenkins.security.ImpersonatingExecutorService$1.run(ImpersonatingExecutorService.java:59)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Finished: FAILURE

stream contents of 1GB file to sqlite table under single column

The implementation below gives an OutOfMemoryError for a 1 GB file with 4 GB of heap space. Files.lines returns a stream, but Collectors.joining materialises the whole file in memory and exhausts the heap.
Can we stream a file with a smaller memory footprint using jOOQ and JDBC, while preserving the original line separators?
Stream<String> lines = Files.lines(path);
dsl.createTable(TABLE1)
.column(COL1, SQLDataType.CLOB)
.column(COL2, SQLDataType.CLOB)
.execute();
dsl.insertInto(TABLE1)
.columns(COL1, COL2)
.values(VAL1, lines
.collect(Collectors.joining(System.lineSeparator())))
.execute();
Error ->
java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOf(Arrays.java:3332) ~[na:1.8.0_141]
at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:124) ~[na:1.8.0_141]
at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:448) ~[na:1.8.0_141]
at java.lang.StringBuilder.append(StringBuilder.java:136) ~[na:1.8.0_141]
at java.lang.StringBuilder.append(StringBuilder.java:76) ~[na:1.8.0_141]
at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:484) ~[na:1.8.0_141]
at java.lang.StringBuilder.append(StringBuilder.java:166) ~[na:1.8.0_141]
at java.util.StringJoiner.add(StringJoiner.java:185) ~[na:1.8.0_141]
at java.util.stream.Collectors$$Lambda$491/1699129225.accept(Unknown Source) ~[na:na]
at java.util.stream.ReduceOps$3ReducingSink.accept(ReduceOps.java:169) ~[na:1.8.0_141]
at java.util.Iterator.forEachRemaining(Iterator.java:116) ~[na:1.8.0_141]
at java.util.Spliterators$IteratorSpliterator.forEachRemaining(Spliterators.java:1801) ~[na:1.8.0_141]
at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:481) ~[na:1.8.0_141]
at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:471) ~[na:1.8.0_141]
at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708) ~[na:1.8.0_141]
at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234) ~[na:1.8.0_141]
at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:499) ~[na:1.8.0_141]
jOOQ's default data type binding for the CLOB data type treats CLOB as an ordinary String, which works well for small to mid-sized LOBs. For larger LOBs, the streaming version of the JDBC API would be more suitable. Ideally, you would create your own data type binding, where you optimise your write operation for streaming:
https://www.jooq.org/doc/latest/manual/code-generation/custom-data-type-bindings
For example:
class StreamingLobBinding implements Binding<String, File> {
    ...

    @Override
    public void set(BindingSetStatementContext<File> ctx) {
        // Ideally: register the input stream somewhere for explicit resource management
        ctx.statement()
           .setBinaryStream(ctx.index(), new FileInputStream(ctx.value()));
    }
}
And then, you can either apply this binding to your column and have the code generator pick it up as documented in the link above, or apply it only for a single usage:
DataType<File> fileType = COL2.getDataType().asConvertedDataType(new StreamingLobBinding());
Field<File> fileCol = DSL.field(COL2.getName(), fileType);
dsl.insertInto(TABLE1)
.columns(COL1, fileCol)
.values(VAL1, DSL.val(theFile, fileType))
.execute();
Note that currently, you may need to register your input stream in some ThreadLocal to remember it and clean it up after statement execution. A future jOOQ version will offer an SPI for handling that: https://github.com/jOOQ/jOOQ/issues/7591
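To illustrate that note, here is a minimal sketch of such ThreadLocal bookkeeping (my own illustration, not from jOOQ; the OpenStreams helper is a hypothetical name, and it assumes your binding calls OpenStreams.register(...) before handing the stream to setBinaryStream):

// Hypothetical helper that remembers streams opened by the binding so the caller
// can close them once the statement has been executed.
final class OpenStreams {
    private static final ThreadLocal<java.util.List<java.io.InputStream>> STREAMS =
            ThreadLocal.withInitial(java.util.ArrayList::new);

    static void register(java.io.InputStream in) {
        STREAMS.get().add(in);
    }

    static void closeAll() {
        for (java.io.InputStream in : STREAMS.get()) {
            try { in.close(); } catch (java.io.IOException ignored) { }
        }
        STREAMS.get().clear();
    }
}

A caller would then wrap the insert in try/finally and invoke OpenStreams.closeAll() after execute(), so the FileInputStream opened in set() is released even if the statement fails.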
Since sqlite-jdbc throws SQLFeatureNotSupportedException for setBinaryStream, there is one more approach, as suggested in the GitHub issue above.
dsl.insertInto(TABLE1)
   .columns(TABLE1.COL1, TABLE1.COL2)
   .values("ABB", null)
   .execute();

BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
StringBuilder lines = new StringBuilder();
int bytesCount = 0;
int value = 0;
Field<String> coalesce = DSL.coalesce(TABLE1.COL2, "");
char[] buffer = new char[100 * 1000 * 1000];
while ((value = reader.read(buffer)) != -1) {
    lines.append(String.valueOf(buffer, 0, value));
    bytesCount += value; // count only the characters read in this iteration
    if (bytesCount >= 100 * 1000 * 1000) {
        // Append the accumulated chunk to the CLOB column
        dsl.update(TABLE1).set(TABLE1.COL2, DSL.concat(coalesce, DSL.val(lines.toString()))).where(TABLE1.COL1.eq("ABB")).execute();
        bytesCount = 0;
        lines.setLength(0);
    }
}
if (lines.length() > 0) {
    dsl.update(TABLE1).set(TABLE1.COL2, DSL.concat(coalesce, DSL.val(lines.toString()))).where(TABLE1.COL1.eq("ABB")).execute();
}
reader.close();

Java Spark GroupByFailure

I'm attempting to use the Java Spark libraries with a cluster running Spark 2.3.0 over Hadoop 3.1.0 (and using those versions of the Java libraries).
I've run into a problem where I simply cannot use groupByKey, and I am at a loss to explain why. Any attempted usage of groupByKey for any reason in any circumstance is returning a java.lang.IllegalArgumentException.
I've boiled this down to about the simplest test I can think of:
package com.failuretest;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
public class TestReport {

    public static void main(String[] args) throws Exception {
        SparkConf conf = new SparkConf().setAppName("TestReport").set("spark.executor.memory", "20G");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> test = sc.parallelize(generateTestData());
        test.saveAsTextFile("/TEST/testfile1");
        test.mapToPair(line -> {
            String[] testParts = line.split(" ");
            return new Tuple2<String, String>(testParts[0], testParts[1]);
        }).groupByKey().saveAsTextFile("/TEST/testfile2");
        sc.close();
    }

    private static List<String> generateTestData() {
        List<String> testList = new ArrayList<String>();
        int keyCount = 0;
        int valCount = 0;
        while (valCount++ < 2000000) {
            if (valCount % 10 == 0) {
                keyCount++;
            }
            testList.add("Key" + keyCount + " " + "Val" + valCount);
        }
        return testList;
    }
}
I'm just programmatically creating an RDD that produces 10 values per key, then creating my JavaPairRDD with a simple split, then attempting groupByKey.
When it runs, I receive the following stack:
Exception in thread "main" java.lang.IllegalArgumentException
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.spark.util.ClosureCleaner$.getClassReader(ClosureCleaner.scala:46)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:449)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:432)
at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:733)
at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:103)
at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:103)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:230)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40)
at scala.collection.mutable.HashMap$$anon$1.foreach(HashMap.scala:103)
at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:732)
at org.apache.spark.util.FieldAccessFinder$$anon$3.visitMethodInsn(ClosureCleaner.scala:432)
at org.apache.xbean.asm5.ClassReader.a(Unknown Source)
at org.apache.xbean.asm5.ClassReader.b(Unknown Source)
at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)
at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:262)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:261)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:261)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:159)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2292)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$combineByKeyWithClassTag$1.apply(PairRDDFunctions.scala:88)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$combineByKeyWithClassTag$1.apply(PairRDDFunctions.scala:77)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.PairRDDFunctions.combineByKeyWithClassTag(PairRDDFunctions.scala:77)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$groupByKey$1.apply(PairRDDFunctions.scala:505)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$groupByKey$1.apply(PairRDDFunctions.scala:498)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.PairRDDFunctions.groupByKey(PairRDDFunctions.scala:498)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$groupByKey$3.apply(PairRDDFunctions.scala:641)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$groupByKey$3.apply(PairRDDFunctions.scala:641)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.PairRDDFunctions.groupByKey(PairRDDFunctions.scala:640)
at org.apache.spark.api.java.JavaPairRDD.groupByKey(JavaPairRDD.scala:559)
at com.failuretest.TestReport.main(TestReport.java:22)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:564)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:879)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:197)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:227)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:136)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
It doesn't get any further than the groupByKey (I'm writing a file above with the results, but it really doesn't matter since it never gets there).
I can run it all day long in my local dev instance, but running spark-submit with a jar containing the above fails every time in the cluster.
I'm really not sure where to go from here - what I am trying to do is a bit of a challenge if I cannot group by key.
Am I messing up? Is this a version conflict somewhere?
Dave
I actually figured this out before posting this, but in the interest of helping others...
I discovered that one of my colleagues had decided to have a play around with Java 10 on this particular cluster. I moved it back to Java 8 (sorry, I didn't try 9) and the problem went away.
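For anyone else debugging this: a quick way to confirm which JVM your spark-submit job actually runs under is to print the runtime properties from the driver (my own diagnostic sketch, not part of the original post). Spark 2.3's closure cleaner uses the bundled ASM 5 (org.apache.xbean.asm5 in the trace above), which rejects the newer class file format with exactly this kind of IllegalArgumentException:

// Hypothetical diagnostic: print the JVM and class file versions the driver sees.
public class VersionCheck {
    public static void main(String[] args) {
        System.out.println("java.version       = " + System.getProperty("java.version"));
        // 52.0 is Java 8, 53.0 is Java 9, 54.0 is Java 10
        System.out.println("java.class.version = " + System.getProperty("java.class.version"));
    }
}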
Dave

FileNotFoundException in hive UDF

My UDF:
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.IntWritable;
public class HoursDiff extends UDF {

    //private = new Text();

    public IntWritable evaluate(String date, String time) {
        String dateStart = "2014-12-01 00:00:00";
        String currentdate = date + " " + time;
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        Date d1 = null;
        Date d2 = null;
        try {
            d1 = format.parse(dateStart);
            d2 = format.parse(currentdate);
            long diff = d2.getTime() - d1.getTime();
            long diffHours = diff / (3600000) % 24;
            long diffDays = diff / (86400000);
            int hours = (int) (diffDays * 24 + diffHours);
            IntWritable hour = new IntWritable(hours);
            return hour;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
I exported it to /home/hadoop/mapreduce/HoursDiff.jar.
I opened the Hive shell:
add jar /home/hadoop/mapreduce/HoursDiff.jar;
create temporary function hoursdiff as 'HoursDiff';
When I try to execute the following command, I get a FileNotFoundException:
select hoursdiff(date,time) as hours from date_test;
STACK TRACE
create temporary function hoursdiff as 'HoursDiff';
OK
Time taken: 0.009 seconds
hive> select hoursdiff(date,time) as hours from date_test;
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
15/10/11 15:17:03 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Execution log at: /tmp/hadoop/hadoop_20151011151616_2c15561f-7cd2-4012-8bd2-b7dfcf488432.log
java.io.FileNotFoundException: File does not exist: hdfs://172.16.253.17:54310/home/hadoop/mapreduce/HoursDiff.jar
at org.apache.hadoop.hdfs.DistributedFileSystem$18.doCall(DistributedFileSystem.java:1122)
at org.apache.hadoop.hdfs.DistributedFileSystem$18.doCall(DistributedFileSystem.java:1114)
at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1114)
at org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.getFileStatus(ClientDistributedCacheManager.java:288)
at org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.getFileStatus(ClientDistributedCacheManager.java:224)
at org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.determineTimestamps(ClientDistributedCacheManager.java:93)
at org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.determineTimestampsAndCacheVisibilities(ClientDistributedCacheManager.java:57)
at org.apache.hadoop.mapreduce.JobSubmitter.copyAndConfigureFiles(JobSubmitter.java:269)
at org.apache.hadoop.mapreduce.JobSubmitter.copyAndConfigureFiles(JobSubmitter.java:390)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:483)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1296)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1293)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1628)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1293)
at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:562)
at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:557)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1628)
at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:557)
at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:548)
at org.apache.hadoop.hive.ql.exec.mr.ExecDriver.execute(ExecDriver.java:420)
at org.apache.hadoop.hive.ql.exec.mr.ExecDriver.main(ExecDriver.java:740)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:601)
at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
Job Submission failed with exception 'java.io.FileNotFoundException(File does not exist: hdfs://172.16.253.17:54310/home/hadoop/mapreduce/HoursDiff.jar)'
Execution failed with exit status: 1
Everything you have done is correct, but the job is searching for the jar on an HDFS path, while you registered it with a local path.
Copy the jar to an HDFS location and register it with the HDFS path.
You presumably opened the Hive terminal as the HDFS user, which is why the path is being resolved against HDFS.
Note: it will also accept a local path when registering the jar.
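For example, something like this (a sketch of the advice above; the target directory /user/hadoop is an assumption, and it presumes your Hive version accepts HDFS URIs in ADD JAR):

hadoop fs -put /home/hadoop/mapreduce/HoursDiff.jar /user/hadoop/

and then, in the Hive shell:

add jar hdfs://172.16.253.17:54310/user/hadoop/HoursDiff.jar;
create temporary function hoursdiff as 'HoursDiff';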
