Colons in Apache Spark application path - java

I'm submitting an Apache Spark application to YARN programmatically:
package application.RestApplication;

import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.deploy.yarn.Client;
import org.apache.spark.deploy.yarn.ClientArguments;

public class App {
    public static void main(String[] args1) {
        String[] args = new String[] {
            "--class", "org.apache.spark.examples.JavaWordCount",
            "--jar", "/opt/spark/examples/jars/spark-examples_2.11-2.0.0.jar",
            "--arg", "hdfs://hadoop-master:9000/input/file.txt"
        };
        Configuration config = new Configuration();
        System.setProperty("SPARK_YARN_MODE", "true");
        SparkConf sparkConf = new SparkConf();
        ClientArguments cArgs = new ClientArguments(args);
        Client client = new Client(cArgs, config, sparkConf);
        client.run();
    }
}
I have a problem with the line "--arg", "hdfs://hadoop-master:9000/input/file.txt", more specifically with the colons:
16/08/29 09:54:16 ERROR yarn.ApplicationMaster: Uncaught exception:
java.lang.NumberFormatException: For input string: "9000/input/plik2.txt"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Integer.parseInt(Integer.java:580)
at java.lang.Integer.parseInt(Integer.java:615)
at scala.collection.immutable.StringLike$class.toInt(StringLike.scala:272)
at scala.collection.immutable.StringOps.toInt(StringOps.scala:29)
at org.apache.spark.util.Utils$.parseHostPort(Utils.scala:935)
at org.apache.spark.deploy.yarn.ApplicationMaster.waitForSparkDriver(ApplicationMaster.scala:547)
at org.apache.spark.deploy.yarn.ApplicationMaster.runExecutorLauncher(ApplicationMaster.scala:405)
at org.apache.spark.deploy.yarn.ApplicationMaster.run(ApplicationMaster.scala:247)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anonfun$main$1.apply$mcV$sp(ApplicationMaster.scala:749)
at org.apache.spark.deploy.SparkHadoopUtil$$anon$1.run(SparkHadoopUtil.scala:71)
at org.apache.spark.deploy.SparkHadoopUtil$$anon$1.run(SparkHadoopUtil.scala:70)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.spark.deploy.SparkHadoopUtil.runAsSparkUser(SparkHadoopUtil.scala:70)
at org.apache.spark.deploy.yarn.ApplicationMaster$.main(ApplicationMaster.scala:747)
at org.apache.spark.deploy.yarn.ExecutorLauncher$.main(ApplicationMaster.scala:774)
at org.apache.spark.deploy.yarn.ExecutorLauncher.main(ApplicationMaster.scala)
How do I pass a file path containing colons as an argument? I have tried various combinations with slashes, backslashes, %3a, etc.

According to Utils#parseHostPort, which gets invoked during that call, Spark treats everything after the last colon as the port:
def parseHostPort(hostPort: String): (String, Int) = {
  // Check cache first.
  val cached = hostPortParseResults.get(hostPort)
  if (cached != null) {
    return cached
  }

  val indx: Int = hostPort.lastIndexOf(':')
  // This is potentially broken - when dealing with ipv6 addresses for example, sigh ...
  // but then hadoop does not support ipv6 right now.
  // For now, we assume that if port exists, then it is valid - not check if it is an int > 0
  if (-1 == indx) {
    val retval = (hostPort, 0)
    hostPortParseResults.put(hostPort, retval)
    return retval
  }

  val retval = (hostPort.substring(0, indx).trim(), hostPort.substring(indx + 1).trim().toInt)
  hostPortParseResults.putIfAbsent(hostPort, retval)
  hostPortParseResults.get(hostPort)
}
As a consequence, the whole string 9000/input/file.txt is treated as a single port number, which suggests you are not supposed to refer to your input file on the HDFS file system this way. I guess someone more skilled in Apache Spark could give you better advice.
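To make the failure concrete, here is a small illustration of mine (mirroring the Scala above, not actual Spark code) of what that parsing does to the argument:

// Everything after the last ':' is handed to Integer.parseInt, which throws
// NumberFormatException for "9000/input/file.txt".
public class ParseHostPortDemo {
    public static void main(String[] args) {
        String hostPort = "hdfs://hadoop-master:9000/input/file.txt";
        int idx = hostPort.lastIndexOf(':');
        String host = hostPort.substring(0, idx).trim();
        String port = hostPort.substring(idx + 1).trim();
        System.out.println("host = " + host);  // hdfs://hadoop-master
        System.out.println("port = " + port);  // 9000/input/file.txt
        Integer.parseInt(port);                // throws NumberFormatException
    }
}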

I changed the program to follow this example: https://github.com/mahmoudparsian/data-algorithms-book/blob/master/src/main/java/org/dataalgorithms/chapB13/client/SubmitSparkPiToYARNFromJavaCode.java
import org.apache.spark.SparkConf;
import org.apache.spark.deploy.yarn.Client;
import org.apache.spark.deploy.yarn.ClientArguments;
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Logger;

public class SubmitSparkAppToYARNFromJavaCode {

    public static void main(String[] args) throws Exception {
        run();
    }

    static void run() throws Exception {
        String sparkExamplesJar = "/opt/spark/examples/jars/spark-examples_2.11-2.0.0.jar";
        final String[] args = new String[]{
            "--jar",
            sparkExamplesJar,
            "--class",
            "org.apache.spark.examples.JavaWordCount",
            "--arg",
            "hdfs://hadoop-master:9000/input/file.txt"
        };
        // ConfigurationManager and SPARK_HOME are defined elsewhere (see the linked example)
        Configuration config = ConfigurationManager.createConfiguration();
        System.setProperty("SPARK_YARN_MODE", "true");
        SparkConf sparkConf = new SparkConf();
        sparkConf.setSparkHome(SPARK_HOME);
        sparkConf.setMaster("yarn");
        sparkConf.setAppName("spark-yarn");
        sparkConf.set("master", "yarn");
        sparkConf.set("spark.submit.deployMode", "cluster");
        ClientArguments clientArguments = new ClientArguments(args);
        Client client = new Client(clientArguments, config, sparkConf);
        client.run();
    }
}
and now it works!

Related

Java NoClassDefFoundError amf

I'm trying to compile a Java snippet into a jar file. I ran into a classic Java runtime exception but I'm unable to solve the problem.
This is the code, borrowed from Markus Wulftange:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Arrays;

import flex.messaging.io.SerializationContext;
import flex.messaging.io.amf.ActionContext;
import flex.messaging.io.amf.ActionMessage;
import flex.messaging.io.amf.AmfMessageDeserializer;
import flex.messaging.io.amf.AmfMessageSerializer;
import flex.messaging.io.amf.MessageBody;

public class Amf3ExternalizableUnicastRef {

    public static void main(String[] args) throws IOException, ClassNotFoundException {
        if (args.length < 2 || (args.length == 3 && !args[0].equals("-d"))) {
            System.err.println("usage: java -jar " + Amf3ExternalizableUnicastRef.class.getSimpleName() + ".jar [-d] <host> <port>");
            return;
        }
        boolean doDeserialize = false;
        if (args.length == 3) {
            doDeserialize = true;
            args = Arrays.copyOfRange(args, 1, args.length);
        }

        // generate the UnicastRef object
        Object unicastRef = generateUnicastRef(args[0], Integer.parseInt(args[1]));

        // serialize object to AMF message
        byte[] amf = serialize(unicastRef);

        // deserialize AMF message
        if (doDeserialize) {
            deserialize(amf);
        } else {
            System.out.write(amf);
        }
    }

    public static Object generateUnicastRef(String host, int port) {
        java.rmi.server.ObjID objId = new java.rmi.server.ObjID();
        sun.rmi.transport.tcp.TCPEndpoint endpoint = new sun.rmi.transport.tcp.TCPEndpoint(host, port);
        sun.rmi.transport.LiveRef liveRef = new sun.rmi.transport.LiveRef(objId, endpoint, false);
        return new sun.rmi.server.UnicastRef(liveRef);
    }

    public static byte[] serialize(Object data) throws IOException {
        MessageBody body = new MessageBody();
        body.setData(data);
        ActionMessage message = new ActionMessage();
        message.addBody(body);
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        AmfMessageSerializer serializer = new AmfMessageSerializer();
        serializer.initialize(SerializationContext.getSerializationContext(), out, null);
        serializer.writeMessage(message);
        return out.toByteArray();
    }

    public static void deserialize(byte[] amf) throws ClassNotFoundException, IOException {
        ByteArrayInputStream in = new ByteArrayInputStream(amf);
        AmfMessageDeserializer deserializer = new AmfMessageDeserializer();
        deserializer.initialize(SerializationContext.getSerializationContext(), in, null);
        deserializer.readMessage(new ActionMessage(), new ActionContext());
    }
}
Using the flex-messaging-core package, located in the same directory, I compiled the class with
javac -cp flex...jar sourcefile.java
Then I packaged it into a jar with
jar -cfm myjar.jar MANIFEST.ML myclass.class
But then, when running it from the shell with the proper arguments,
java -jar myjar.jar -d 127.0.0.1 8000
it throws Exception in thread "main" java.lang.NoClassDefFoundError: flex/messaging/io/amf/MessageBody.
I have googled and tried solutions for two days, but I really can't solve the problem by myself. Can I kindly ask for a little help?
Shouldn't it be MANIFEST.MF, not MANIFEST.ML?
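If the manifest file name is right, another thing worth checking (my own suggestion, not part of the original answer) is that java -jar ignores any -cp setting, so the flex jar has to be referenced from the jar's own manifest via a Class-Path entry. A minimal sketch, assuming the dependency is named flex-messaging-core.jar and sits next to myjar.jar:

Main-Class: Amf3ExternalizableUnicastRef
Class-Path: flex-messaging-core.jar

With that entry in the manifest, java -jar myjar.jar -d 127.0.0.1 8000 can resolve flex/messaging/io/amf/MessageBody at runtime.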

Saving an H2O model directly from Java

I'm trying to create and save a generated model directly from Java. The documentation specifies how to do this in R and Python, but not in Java. A similar question was asked before, but no real answer was provided (beyond linking to H2O doc, which doesn't contain a code example).
It'd be sufficient for my present purpose to get some pointers for translating the following reference code (Python) to Java. I'm mainly looking for guidance on the relevant JAR(s) to pull in from the Maven repository.
import h2o
h2o.init()
path = h2o.system_file("prostate.csv")
h2o_df = h2o.import_file(path)
h2o_df['CAPSULE'] = h2o_df['CAPSULE'].asfactor()
model = h2o.glm(y = "CAPSULE",
x = ["AGE", "RACE", "PSA", "GLEASON"],
training_frame = h2o_df,
family = "binomial")
h2o.download_pojo(model)
I think I've figured out an answer to my question. A self-contained code sample follows. However, I'd still appreciate an answer from the community since I don't know if this is the best/most idiomatic way to do it.
package org.name.company;

import hex.glm.GLMModel;
import water.H2O;
import water.Key;
import water.api.StreamWriter;
import water.api.StreamingSchema;
import water.fvec.Frame;
import water.fvec.NFSFileVec;
import hex.glm.GLMModel.GLMParameters.Family;
import hex.glm.GLMModel.GLMParameters;
import hex.glm.GLM;
import water.util.JCodeGen;

import java.io.*;
import java.util.Map;

public class Launcher {

    public static void initCloud() {
        String[] args = new String[] {"-name", "h2o_test_cloud"};
        H2O.main(args);
        H2O.waitForCloudSize(1, 10 * 1000);
    }

    public static void main(String[] args) throws Exception {
        // Initialize the cloud
        initCloud();

        // Create a Frame object from CSV
        File f = new File("/path/to/data.csv");
        NFSFileVec nfs = NFSFileVec.make(f);
        Key frameKey = Key.make("frameKey");
        Frame fr = water.parser.ParseDataset.parse(frameKey, nfs._key);

        // Create a GLM and output coefficients
        Key modelKey = Key.make("modelKey");
        try {
            GLMParameters params = new GLMParameters();
            params._train = frameKey;
            params._response_column = fr.names()[1];
            params._intercept = true;
            params._lambda = new double[]{0};
            params._family = Family.gaussian;

            GLMModel model = new GLM(params).trainModel().get();
            Map<String, Double> coefs = model.coefficients();
            for (Map.Entry<String, Double> entry : coefs.entrySet()) {
                System.out.format("%s: %f\n", entry.getKey(), entry.getValue());
            }

            String filename = JCodeGen.toJavaId(model._key.toString()) + ".java";
            StreamingSchema ss = new StreamingSchema(model.new JavaModelStreamWriter(false), filename);
            StreamWriter sw = ss.getStreamWriter();
            OutputStream os = new FileOutputStream("/base/path/" + filename);
            sw.writeTo(os);
        } finally {
            if (fr != null) {
                fr.remove();
            }
        }
    }
}
Would something like this do the trick?
public void saveModel(URI uri, Keyed<Frame> model) {
    Persist p = H2O.getPM().getPersistForURI(uri);
    OutputStream os = p.create(uri.toString(), true);
    model.writeAll(new AutoBuffer(os, true)).close();
}
Make sure the URI has a proper form, otherwise H2O will break with an NPE. As for Maven, you should be able to get away with just h2o-core:
<dependency>
    <groupId>ai.h2o</groupId>
    <artifactId>h2o-core</artifactId>
    <version>3.14.0.2</version>
</dependency>

fail to connect to Hbase with java api

Can I use the Java API to connect to HBase in standalone mode (without Hadoop)?
Here is my code; I was wondering how to make it work. Should I set some property on the 'config' variable?
I have these installed locally: HBase 0.98.0 and Hadoop 2.2.0.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

public class MyLittleHBaseClient {
    public static void main(String[] args) throws IOException {
        // maybe I should do some configuration here, but I don't know how
        Configuration config = HBaseConfiguration.create();

        HTable table = new HTable(config, "myLittleHBaseTable");

        Put p = new Put(Bytes.toBytes("myLittleRow"));
        p.add(Bytes.toBytes("myLittleFamily"), Bytes.toBytes("someQualifier"),
                Bytes.toBytes("Some Value"));
        table.put(p);

        Get g = new Get(Bytes.toBytes("myLittleRow"));
        Result r = table.get(g);
        byte[] value = r.getValue(Bytes.toBytes("myLittleFamily"),
                Bytes.toBytes("someQualifier"));
        String valueStr = Bytes.toString(value);
        System.out.println("GET: " + valueStr);

        Scan s = new Scan();
        s.addColumn(Bytes.toBytes("myLittleFamily"), Bytes.toBytes("someQualifier"));
        ResultScanner scanner = table.getScanner(s);
        try {
            for (Result rr = scanner.next(); rr != null; rr = scanner.next()) {
                System.out.println("Found row: " + rr);
            }
        } finally {
            scanner.close();
        }
    }
}
If your hbase-site.xml in standalone mode is empty, you don't have to set anything. If you have overridden anything in hbase-site.xml, it is better to add that hbase-site.xml as a resource instead of setting each parameter separately:
Configuration config = HBaseConfiguration.create();
config.addResource("<HBASE_CONF_DIR_PATH>/hbase-site.xml");
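Alternatively (a suggestion of mine, not from the original answer), the relevant values can be set programmatically; for a local standalone HBase this usually amounts to pointing the client at the local ZooKeeper:

// Hedged sketch: set connection properties in code instead of loading hbase-site.xml.
// "localhost" and "2181" are placeholder values for a default standalone setup.
Configuration config = HBaseConfiguration.create();
config.set("hbase.zookeeper.quorum", "localhost");
config.set("hbase.zookeeper.property.clientPort", "2181");
HTable table = new HTable(config, "myLittleHBaseTable");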

Oozie Java API Kerberos Authentication

I am trying to get an Oozie job's status using the Oozie Java API. Currently it fails with the message:
Exception in thread "main" HTTP error code: 401 : Unauthorized
We are using Kerberos authentication in our cluster with a keytab file.
Please advise on how to implement the authentication.
My current program is:
import org.apache.oozie.client.OozieClient;

public class oozieCheck {

    public static void main(String[] args) throws Exception {
        // get a OozieClient for local Oozie
        OozieClient wc = new OozieClient("http://myserver:11000/oozie");

        System.out.println(wc.getJobInfo(args[1]));
    }
}
I figured out a way to use Kerberos with the Java API.
First obtain a Kerberos TGT.
Then the code below works:
import java.io.BufferedReader;
import java.io.FileReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Properties;

import org.apache.log4j.Logger;
import org.apache.oozie.client.AuthOozieClient;
import org.apache.oozie.client.WorkflowJob.Status;

public class Wrapper {

    private static final Logger logger = Logger.getLogger(Wrapper.class);

    public static AuthOozieClient wc = null;
    public static String OOZIE_SERVER_URL = "http://localhost:11000/oozie";

    public Wrapper(String oozieUrlStr) throws MalformedURLException {
        URL oozieUrl = new URL(oozieUrlStr);
        // get a OozieClient for local Oozie
        wc = new AuthOozieClient(oozieUrl.toString());
    }

    public static void main(String[] args) {
        String lineCommon;
        String jobId = args[0]; // The first argument is the oozie jobid
        try {
            Wrapper client = new Wrapper(OOZIE_SERVER_URL);
            Properties conf = wc.createConfiguration();
            if (wc != null) {
                // get status of jobid from CLA
                try {
                    while (wc.getJobInfo(jobId).getStatus() == Status.RUNNING) {
                        logger.info("Workflow job running ...");
                        logger.info("Workflow job ID:[" + jobId + "]");
                    }
                    if (wc.getJobInfo(jobId).getStatus() == Status.SUCCEEDED) {
                        // print the final status of the workflow job
                        logger.info("Workflow job completed ...");
                        logger.info(wc.getJobInfo(jobId));
                    } else {
                        // print the final status of the workflow job
                        logger.info("Workflow job Failed ...");
                        logger.info(wc.getJobInfo(jobId));
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            } else {
                System.exit(9999);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
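For the "obtain a Kerberos TGT" step, one option (my own sketch, not from the original answer) is to log in from the keytab programmatically with Hadoop's UserGroupInformation before constructing the AuthOozieClient; the principal and keytab path below are placeholders:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;

public class KerberosLogin {
    // Hedged sketch: acquire Kerberos credentials from a keytab so the Oozie
    // client's SPNEGO authentication can succeed.
    public static void loginFromKeytab() throws IOException {
        Configuration conf = new Configuration();
        conf.set("hadoop.security.authentication", "kerberos");
        UserGroupInformation.setConfiguration(conf);
        UserGroupInformation.loginUserFromKeytab("user@EXAMPLE.COM", "/path/to/user.keytab");
    }
}

Running kinit -kt with the same keytab before starting the JVM achieves the same thing from the shell.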
You have to patch the Oozie client if the docs do not mention Kerberos.

How to set amazon ami's hadoop configuration by using java code

I want to set the configuration textinputformat.record.delimiter=; for Hadoop.
Right now I use the following code to run a Pig script on the AMI. Does anyone know how to set this configuration using the code below?
Code:
StepConfig installPig = new StepConfig()
    .withName("Install Pig")
    .withActionOnFailure(ActionOnFailure.TERMINATE_JOB_FLOW.name())
    .withHadoopJarStep(stepFactory.newInstallPigStep());

// [Configure pig script][1]
String[] scriptArgs = new String[] { "-p", input, "-p", output };
StepConfig runPigLatinScript = new StepConfig()
    .withName("Run Pig Script")
    .withActionOnFailure(ActionOnFailure.CANCEL_AND_WAIT.name())
    .withHadoopJarStep(stepFactory.newRunPigScriptStep("s3://pig/script.pig", scriptArgs));

// Configure JobFlow [R1][2], [R3][3]
RunJobFlowRequest request = new RunJobFlowRequest()
    .withName(jobFlowName)
    .withSteps(installPig, runPigLatinScript)
    .withLogUri(logUri)
    .withAmiVersion("2.3.2")
    .withInstances(new JobFlowInstancesConfig()
        .withEc2KeyName(this.ec2KeyName)
        .withInstanceCount(this.count)
        .withKeepJobFlowAliveWhenNoSteps(false)
        .withMasterInstanceType(this.masterType)
        .withSlaveInstanceType(this.slaveType));

// Run JobFlow
RunJobFlowResult runJobFlowResult = this.amazonEmrClient.runJobFlow(request);
What you need to do is create a BootstrapActionConfig and add it to the RunJobFlowRequest being created, which will then add the custom Hadoop configuration to the cluster.
Here is the complete code I wrote for you after editing the code here:
import java.util.ArrayList;
import java.util.List;

import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduceClient;
import com.amazonaws.services.elasticmapreduce.model.BootstrapActionConfig;
import com.amazonaws.services.elasticmapreduce.model.JobFlowInstancesConfig;
import com.amazonaws.services.elasticmapreduce.model.RunJobFlowRequest;
import com.amazonaws.services.elasticmapreduce.model.RunJobFlowResult;
import com.amazonaws.services.elasticmapreduce.model.ScriptBootstrapActionConfig;
import com.amazonaws.services.elasticmapreduce.model.StepConfig;
import com.amazonaws.services.elasticmapreduce.util.StepFactory;

/**
 * @author amar
 */
public class RunEMRJobFlow {

    private static final String CONFIG_HADOOP_BOOTSTRAP_ACTION = "s3://elasticmapreduce/bootstrap-actions/configure-hadoop";

    public static void main(String[] args) {
        String accessKey = "";
        String secretKey = "";
        AWSCredentials credentials = new BasicAWSCredentials(accessKey, secretKey);
        AmazonElasticMapReduceClient emr = new AmazonElasticMapReduceClient(credentials);

        StepFactory stepFactory = new StepFactory();

        StepConfig enabledebugging = new StepConfig().withName("Enable debugging")
                .withActionOnFailure("TERMINATE_JOB_FLOW").withHadoopJarStep(stepFactory.newEnableDebuggingStep());

        StepConfig installHive = new StepConfig().withName("Install Hive").withActionOnFailure("TERMINATE_JOB_FLOW")
                .withHadoopJarStep(stepFactory.newInstallHiveStep());

        List<String> setMappersArgs = new ArrayList<String>();
        setMappersArgs.add("-s");
        setMappersArgs.add("textinputformat.record.delimiter=;");

        BootstrapActionConfig mappersBootstrapConfig = createBootstrapAction("Set Hadoop Config",
                CONFIG_HADOOP_BOOTSTRAP_ACTION, setMappersArgs);

        RunJobFlowRequest request = new RunJobFlowRequest()
                .withBootstrapActions(mappersBootstrapConfig)
                .withName("Hive Interactive")
                .withSteps(enabledebugging, installHive)
                .withLogUri("s3://myawsbucket/")
                .withInstances(
                        new JobFlowInstancesConfig().withEc2KeyName("keypair").withHadoopVersion("0.20")
                                .withInstanceCount(5).withKeepJobFlowAliveWhenNoSteps(true)
                                .withMasterInstanceType("m1.small").withSlaveInstanceType("m1.small"));

        RunJobFlowResult result = emr.runJobFlow(request);
    }

    private static BootstrapActionConfig createBootstrapAction(String bootstrapName, String bootstrapPath,
            List<String> args) {
        ScriptBootstrapActionConfig bootstrapScriptConfig = new ScriptBootstrapActionConfig();
        bootstrapScriptConfig.setPath(bootstrapPath);
        if (args != null) {
            bootstrapScriptConfig.setArgs(args);
        }
        BootstrapActionConfig bootstrapConfig = new BootstrapActionConfig();
        bootstrapConfig.setName(bootstrapName);
        bootstrapConfig.setScriptBootstrapAction(bootstrapScriptConfig);
        return bootstrapConfig;
    }
}
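Applied to the Pig job flow from the question, the only change needed (a sketch that assumes the rest of the question's code stays as it is) is wiring the same bootstrap action into the existing request:

// Hedged sketch: add the configure-hadoop bootstrap action to the question's Pig job flow.
List<String> delimiterArgs = new ArrayList<String>();
delimiterArgs.add("-s");
delimiterArgs.add("textinputformat.record.delimiter=;");
BootstrapActionConfig delimiterBootstrap = createBootstrapAction("Set Hadoop Config",
        CONFIG_HADOOP_BOOTSTRAP_ACTION, delimiterArgs);

RunJobFlowRequest request = new RunJobFlowRequest()
        .withName(jobFlowName)
        .withBootstrapActions(delimiterBootstrap) // this is what adds the custom Hadoop setting
        .withSteps(installPig, runPigLatinScript)
        .withLogUri(logUri)
        .withAmiVersion("2.3.2")
        .withInstances(new JobFlowInstancesConfig()
                .withEc2KeyName(this.ec2KeyName)
                .withInstanceCount(this.count)
                .withKeepJobFlowAliveWhenNoSteps(false)
                .withMasterInstanceType(this.masterType)
                .withSlaveInstanceType(this.slaveType));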
