How to apply Limit and Offset based data select using Bigquery Storage Rad Api ?
Below is the sample I am trying to read data from a BigQuery Table. It is fetching entire table and I can provide filters based on column values. But I want to apply LIMIT and OFFSET and provide Custom SQL for data fetch/read. Is it possible in Storage API ?
import com.google.api.gax.rpc.ServerStream;
import com.google.cloud.bigquery.storage.v1.AvroRows;
import com.google.cloud.bigquery.storage.v1.BigQueryReadClient;
import com.google.cloud.bigquery.storage.v1.CreateReadSessionRequest;
import com.google.cloud.bigquery.storage.v1.DataFormat;
import com.google.cloud.bigquery.storage.v1.ReadRowsRequest;
import com.google.cloud.bigquery.storage.v1.ReadRowsResponse;
import com.google.cloud.bigquery.storage.v1.ReadSession;
import com.google.cloud.bigquery.storage.v1.ReadSession.TableModifiers;
import com.google.cloud.bigquery.storage.v1.ReadSession.TableReadOptions;
import com.google.common.base.Preconditions;
import com.google.protobuf.Timestamp;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DecoderFactory;
public class StorageSample {
/*
* SimpleRowReader handles deserialization of the Avro-encoded row blocks transmitted
* from the storage API using a generic datum decoder.
*/
private static class SimpleRowReader {
private final DatumReader<GenericRecord> datumReader;
// Decoder object will be reused to avoid re-allocation and too much garbage collection.
private BinaryDecoder decoder = null;
// GenericRecord object will be reused.
private GenericRecord row = null;
public SimpleRowReader(Schema schema) {
Preconditions.checkNotNull(schema);
datumReader = new GenericDatumReader<>(schema);
}
/**
* Sample method for processing AVRO rows which only validates decoding.
*
* #param avroRows object returned from the ReadRowsResponse.
*/
public void processRows(AvroRows avroRows) throws IOException {
decoder =
DecoderFactory.get()
.binaryDecoder(avroRows.getSerializedBinaryRows().toByteArray(), decoder);
while (!decoder.isEnd()) {
// Reusing object row
row = datumReader.read(row, decoder);
System.out.println(row.toString());
}
}
}
public static void main(String... args) throws Exception {
// Sets your Google Cloud Platform project ID.
// String projectId = "YOUR_PROJECT_ID";
String projectId = "gcs-test";
Integer snapshotMillis = null;
// if (args.length > 1) {
// snapshotMillis = Integer.parseInt(args[1]);
// }
try (BigQueryReadClient client = BigQueryReadClient.create()) {
String parent = String.format("projects/%s", projectId);
// This example uses baby name data from the public datasets.
String srcTable =
String.format(
"projects/%s/datasets/%s/tables/%s",
"gcs-test", "testdata", "testtable");
// We specify the columns to be projected by adding them to the selected fields,
// and set a simple filter to restrict which rows are transmitted.
TableReadOptions options =
TableReadOptions.newBuilder()
.addSelectedFields("id")
.addSelectedFields("qtr")
.addSelectedFields("sales")
.addSelectedFields("year")
.addSelectedFields("comments")
//.setRowRestriction("state = \"WA\"")
.build();
// Start specifying the read session we want created.
ReadSession.Builder sessionBuilder =
ReadSession.newBuilder()
.setTable(srcTable)
// This API can also deliver data serialized in Apache Avro format.
// This example leverages Apache Avro.
.setDataFormat(DataFormat.AVRO)
.setReadOptions(options);
// Optionally specify the snapshot time. When unspecified, snapshot time is "now".
if (snapshotMillis != null) {
Timestamp t =
Timestamp.newBuilder()
.setSeconds(snapshotMillis / 1000)
.setNanos((int) ((snapshotMillis % 1000) * 1000000))
.build();
TableModifiers modifiers = TableModifiers.newBuilder().setSnapshotTime(t).build();
sessionBuilder.setTableModifiers(modifiers);
}
// Begin building the session creation request.
CreateReadSessionRequest.Builder builder =
CreateReadSessionRequest.newBuilder()
.setParent(parent)
.setReadSession(sessionBuilder)
.setMaxStreamCount(1);
// Request the session creation.
ReadSession session = client.createReadSession(builder.build());
SimpleRowReader reader =
new SimpleRowReader(new Schema.Parser().parse(session.getAvroSchema().getSchema()));
// Assert that there are streams available in the session. An empty table may not have
// data available. If no sessions are available for an anonymous (cached) table, consider
// writing results of a query to a named table rather than consuming cached results directly.
Preconditions.checkState(session.getStreamsCount() > 0);
// Use the first stream to perform reading.
String streamName = session.getStreams(0).getName();
ReadRowsRequest readRowsRequest =
ReadRowsRequest.newBuilder().setReadStream(streamName).build();
// Process each block of rows as they arrive and decode using our simple row reader.
ServerStream<ReadRowsResponse> stream = client.readRowsCallable().call(readRowsRequest);
for (ReadRowsResponse response : stream) {
Preconditions.checkState(response.hasAvroRows());
reader.processRows(response.getAvroRows());
}
}
}
}
With the BigQuery Storage read API functionality, LIMIT is effectively just a case of stopping row reading after you've processed the desired number of elements.
Applying the notion of an OFFSET clause is a bit more nuanced, as it implies ordering. If you're reading a table via multiple streams for improved throughput, you're either disregarding ordering entirely, or you're re-ordering data after you've read it from the API.
If you read a table as a single string, you preserve whatever ordering the input table had, and can specify the offset field in the ReadRowsRequest to start at a given offset.
Related
pom.xml
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-bom</artifactId>
<version>1.11.256</version>
<type>pom</type>
<scope>import</scope>
</dependency>
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-dynamodb</artifactId>
</dependency>
Client Code
public static AmazonDynamoDB getDynamoDBClient() {
return AmazonDynamoDBClientBuilder.standard().withRegion("ap-southeast-1").build();
}
Now i am trying to execute a normal query having few records but it is taking long time to fetch the result.
For first time it is fetching the records in around 5-6 seconds on multiple requests in reduces by half. 2-3 seconds is still large time for fetching only few items.
Already checked the tuning of dynamo DB using different client configurations (connection timeout, request timeout, retry etc.) but not giving results as expected.
Also checked with SDK version 2 URLConnectionHTTPClient config but same results came there too.
One possible cause can be the credentials fetch time for dynamo DB client but not having any credentials caching reference in java. Can any one suggest possible configuration to improve this latency.
You are using a very old API and is not best practice anymore. To use best practice with Java, use the AWS SDK for Java v2.
The AWS SDK for Java 2.x is a major rewrite of the version 1.x code base. It’s built on top of Java 8+ and adds several frequently requested features. These include support for non-blocking I/O and the ability to plug in a different HTTP implementation at run time.
The POM for v2 is:
<dependency>
<groupId>software.amazon.awssdk</groupId>
<artifactId>bom</artifactId>
<version>2.17.190</version>
<type>pom</type>
<scope>import</scope>
</dependency>
To retrieve records using the AWS SDK for Java v2, you have three choices.
Use the DynamoDbClient.
Use the Enhanced Client (that maps Java objects to tables)
Use PartiQL (uses SQL like syntax)
I will show you all ways. All examples can be found in the Amazon Github repo.
Use the DynamoDbClient
Code is:
package com.example.dynamodb;
// snippet-start:[dynamodb.java2.get_item.import]
import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.dynamodb.model.DynamoDbException;
import software.amazon.awssdk.services.dynamodb.DynamoDbClient;
import software.amazon.awssdk.services.dynamodb.model.AttributeValue;
import software.amazon.awssdk.services.dynamodb.model.GetItemRequest;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
// snippet-end:[dynamodb.java2.get_item.import]
/**
* Before running this Java V2 code example, set up your development environment, including your credentials.
*
* For more information, see the following documentation topic:
*
* https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/get-started.html
*
* To get an item from an Amazon DynamoDB table using the AWS SDK for Java V2, its better practice to use the
* Enhanced Client, see the EnhancedGetItem example.
*/
public class GetItem {
public static void main(String[] args) {
final String usage = "\n" +
"Usage:\n" +
" <tableName> <key> <keyVal>\n\n" +
"Where:\n" +
" tableName - The Amazon DynamoDB table from which an item is retrieved (for example, Music3). \n" +
" key - The key used in the Amazon DynamoDB table (for example, Artist). \n" +
" keyval - The key value that represents the item to get (for example, Famous Band).\n" ;
if (args.length != 3) {
System.out.println(usage);
System.exit(1);
}
String tableName = "Customer" ; //args[0];
String key = "id" ; //args[1];
String keyVal = "id101" ; //args[2];
System.out.format("Retrieving item \"%s\" from \"%s\"\n", keyVal, tableName);
ProfileCredentialsProvider credentialsProvider = ProfileCredentialsProvider.create();
Region region = Region.US_EAST_1;
DynamoDbClient ddb = DynamoDbClient.builder()
.credentialsProvider(credentialsProvider)
.region(region)
.build();
getDynamoDBItem(ddb, tableName, key, keyVal);
ddb.close();
}
// snippet-start:[dynamodb.java2.get_item.main]
public static void getDynamoDBItem(DynamoDbClient ddb,String tableName,String key,String keyVal ) {
HashMap<String,AttributeValue> keyToGet = new HashMap<>();
keyToGet.put(key, AttributeValue.builder()
.s(keyVal).build());
GetItemRequest request = GetItemRequest.builder()
.key(keyToGet)
.tableName(tableName)
.build();
try {
Map<String,AttributeValue> returnedItem = ddb.getItem(request).item();
if (returnedItem != null) {
Set<String> keys = returnedItem.keySet();
System.out.println("Amazon DynamoDB table attributes: \n");
for (String key1 : keys) {
System.out.format("%s: %s\n", key1, returnedItem.get(key1).toString());
}
} else {
System.out.format("No item found with the key %s!\n", key);
}
} catch (DynamoDbException e) {
System.err.println(e.getMessage());
System.exit(1);
}
}
// snippet-end:[dynamodb.java2.get_item.main]
}
Enhanced Client
Code is:
package com.example.dynamodb;
// snippet-start:[dynamodb.java2.mapping.getitem.import]
import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider;
import software.amazon.awssdk.enhanced.dynamodb.DynamoDbEnhancedClient;
import software.amazon.awssdk.enhanced.dynamodb.DynamoDbTable;
import software.amazon.awssdk.enhanced.dynamodb.Key;
import software.amazon.awssdk.enhanced.dynamodb.TableSchema;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.dynamodb.DynamoDbClient;
import software.amazon.awssdk.services.dynamodb.model.DynamoDbException;
// snippet-end:[dynamodb.java2.mapping.getitem.import]
/*
* Before running this code example, create an Amazon DynamoDB table named Customer with these columns:
* - id - the id of the record that is the key
* - custName - the customer name
* - email - the email value
* - registrationDate - an instant value when the item was added to the table
*
* Also, ensure that you have set up your development environment, including your credentials.
*
* For information, see this documentation topic:
*
* https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/get-started.html
*/
public class EnhancedGetItem {
public static void main(String[] args) {
ProfileCredentialsProvider credentialsProvider = ProfileCredentialsProvider.create();
Region region = Region.US_EAST_1;
DynamoDbClient ddb = DynamoDbClient.builder()
.credentialsProvider(credentialsProvider)
.region(region)
.build();
DynamoDbEnhancedClient enhancedClient = DynamoDbEnhancedClient.builder()
.dynamoDbClient(ddb)
.build();
getItem(enhancedClient);
ddb.close();
}
// snippet-start:[dynamodb.java2.mapping.getitem.main]
public static void getItem(DynamoDbEnhancedClient enhancedClient) {
try {
DynamoDbTable<Customer> table = enhancedClient.table("Customer", TableSchema.fromBean(Customer.class));
Key key = Key.builder()
.partitionValue("id120")
.build();
// Get the item by using the key.
Customer result = table.getItem(r->r.key(key));
System.out.println("******* The description value is "+result.getCustName());
} catch (DynamoDbException e) {
System.err.println(e.getMessage());
System.exit(1);
}
}
// snippet-end:[dynamodb.java2.mapping.getitem.main]
}
PartiQL
Code is
public static void getItem(DynamoDbClient ddb) {
String sqlStatement = "SELECT * FROM MoviesPartiQ where year=? and title=?";
List<AttributeValue> parameters = new ArrayList<>();
AttributeValue att1 = AttributeValue.builder()
.n("2012")
.build();
AttributeValue att2 = AttributeValue.builder()
.s("The Perks of Being a Wallflower")
.build();
parameters.add(att1);
parameters.add(att2);
try {
ExecuteStatementResponse response = executeStatementRequest(ddb, sqlStatement, parameters);
System.out.println("ExecuteStatement successful: "+ response.toString());
} catch (DynamoDbException e) {
System.err.println(e.getMessage());
System.exit(1);
}
}
All of these ways are recommend using over AWS SDK for Java V1. If you are not familiar with the V2 API - including creds and setting up your dev environment, see:
Developer guide - AWS SDK for Java 2.x
For first time it is fetching the records in around 5-6 seconds on multiple requests in reduces by half. 2-3 seconds is still large time for fetching only few items.
This is expected even with dynamo at production because first time you fetch the following sequence of operations happens:-
Look up for the partition ID your requested data belongs to from the route table.
Caches the Key to Partiton
Then refers to the partition for fetching the data.
As you see there is caching in step 2, so the next time requests go, the partition id is fetched from the cache thus the latency decreases. (will share the source once I have it)
Can any one suggest possible configuration to improve this latency.
Also, please don't use local dynamo for performance benchmarking, because it internally uses SQLite, i.e. SQL DB for storing.
refer:- https://www.dbi-services.com/blog/aws-dynamodb-local/#:~:text=Yes%2C%20this%20NoSQL%20database%20is,Local%20engine%2C%20embedded%20in%20Java.
I understand this has been asked for multiple times, but I am really stuck here and if it is fairly easy, please help me.
I have a sample java program and a jar file.
Here is what is inside of the java program (WriterSample.java).
// (c) Copyright 2014. TIBCO Software Inc. All rights reserved.
package com.spotfire.samples;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Date;
import java.util.Random;
import com.spotfire.sbdf.BinaryWriter;
import com.spotfire.sbdf.ColumnMetadata;
import com.spotfire.sbdf.FileHeader;
import com.spotfire.sbdf.TableMetadata;
import com.spotfire.sbdf.TableMetadataBuilder;
import com.spotfire.sbdf.TableWriter;
import com.spotfire.sbdf.ValueType;
/**
* This example is a simple command line tool that writes a simple SBDF file
* with random data.
*/
public class WriterSample {
public static void main(String[] args) throws IOException {
// The command line application requires one argument which is supposed to be
// the name of the SBDF file to write.
if (args.length != 1)
{
System.out.println("Syntax: WriterSample output.sbdf");
return;
}
String outputFile = args[0];
// First we just open the file as usual and then we need to wrap the stream
// in a binary writer.
OutputStream outputStream = new FileOutputStream(outputFile);
BinaryWriter writer = new BinaryWriter(outputStream);
// When writing an SBDF file you first need to write the file header.
FileHeader.writeCurrentVersion(writer);
// The second part of the SBDF file is the metadata, in order to create
// the table metadata we need to use the builder class.
TableMetadataBuilder tableMetadataBuilder = new TableMetadataBuilder();
// The table can have metadata properties defined. Here we add a custom
// property indicating the producer of the file. This will be imported as
// a table property in Spotfire.
tableMetadataBuilder.addProperty("GeneratedBy", "WriterSample.exe");
// All columns in the table needs to be defined and added to the metadata builder,
// the required information is the name of the column and the data type.
ColumnMetadata col1 = new ColumnMetadata("Category", ValueType.STRING);
tableMetadataBuilder.addColumn(col1);
// Similar to tables, columns can also have metadata properties defined. Here
// we add another custom property. This will be imported as a column property
// in Spotfire.
col1.addProperty("SampleProperty", "col1");
ColumnMetadata col2 = new ColumnMetadata("Value", ValueType.DOUBLE);
tableMetadataBuilder.addColumn(col2);
col2.addProperty("SampleProperty", "col2");
ColumnMetadata col3 = new ColumnMetadata("TimeStamp", ValueType.DATETIME);
tableMetadataBuilder.addColumn(col3);
col3.addProperty("SampleProperty", "col3");
// We need to call the build function in order to get an object that we can
// write to the file.
TableMetadata tableMetadata = tableMetadataBuilder.build();
tableMetadata.write(writer);
int rowCount = 10000;
Random random = new Random();
// Now that we have written all the metadata we can start writing the actual data.
// Here we use a TableWriter to write the data, remember to close the table writer
// otherwise you will not generate a correct SBDF file.
TableWriter tableWriter = new TableWriter(writer, tableMetadata);
for (int i = 0; i < rowCount; ++i) {
// You need to perform one addValue call for each column, for each row in the
// same order as you added the columns to the table metadata object.
// In this example we just generate some random values of the appropriate types.
// Here we write the first string column.
String[] col1Values = new String[] {"A", "B", "C", "D", "E"};
tableWriter.addValue(col1Values[random.nextInt(5)]);
// Next we write the second double column.
double doubleValue = random.nextDouble();
if (doubleValue < 0.5) {
// Note that if you want to write a null value you shouldn't send null to
// addValue, instead you should use theInvalidValue property of the columns
// ValueType.
tableWriter.addValue(ValueType.DOUBLE.getInvalidValue());
} else {
tableWriter.addValue(random.nextDouble());
}
// And finally the third date time column.
tableWriter.addValue(new Date());
}
// Finally we need to close the file and write the end of table marker.
tableWriter.writeEndOfTable();
writer.close();
outputStream.close();
System.out.print("Wrote file: ");
System.out.println(outputFile);
}
}
The jar file is sbdf.jar, which is in the same directory as the java file.
I can now compile with:
javac -cp "sbdf.jar" WriterSample.java
This will generate a WriterSample.class file.
The problem is that when I try to execute the program by
java -cp .:./sbdf.jar WriterSample
I got an error message:
Error: Could not find or load main class WriterSample
What should I do? Thanks!
You should use the fully qualified name of the WriterSample, which is com.spotfire.samples.WriterSample and the correct java command is:
java -cp .:././sbdf.jar com.spotfire.samples.WriterSample
I've been reading the H2O documentation for a while, and I haven't found a clear example of how to load model trained and saved using the Python API. I was following the next example.
import h2o
from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator
model = H2ONaiveBayesEstimator()
h2o_df = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
model.train(y = "IsDepDelayed", x = ["Year", "Origin"],
training_frame = h2o_df,
family = "binomial",
lambda_search = True,
max_active_predictors = 10)
h2o.save_model(model, path=models)
But if you check the official documentation it states that you have to download the model as a POJO from the flow UI. Is it the only way? or, may I achieve the same result via python? Just for information, I show the doc's example below. I need some guidance.
import java.io.*;
import hex.genmodel.easy.RowData;
import hex.genmodel.easy.EasyPredictModelWrapper;
import hex.genmodel.easy.prediction.*;
public class main {
private static String modelClassName = "gbm_pojo_test";
public static void main(String[] args) throws Exception {
hex.genmodel.GenModel rawModel;
rawModel = (hex.genmodel.GenModel) Class.forName(modelClassName).newInstance();
EasyPredictModelWrapper model = new EasyPredictModelWrapper(rawModel);
//
// By default, unknown categorical levels throw PredictUnknownCategoricalLevelException.
// Optionally configure the wrapper to treat unknown categorical levels as N/A instead:
//
// EasyPredictModelWrapper model = new EasyPredictModelWrapper(
// new EasyPredictModelWrapper.Config()
// .setModel(rawModel)
// .setConvertUnknownCategoricalLevelsToNa(true));
RowData row = new RowData();
row.put("Year", "1987");
row.put("Month", "10");
row.put("DayofMonth", "14");
row.put("DayOfWeek", "3");
row.put("CRSDepTime", "730");
row.put("UniqueCarrier", "PS");
row.put("Origin", "SAN");
row.put("Dest", "SFO");
BinomialModelPrediction p = model.predictBinomial(row);
System.out.println("Label (aka prediction) is flight departure delayed: " + p.label);
System.out.print("Class probabilities: ");
for (int i = 0; i < p.classProbabilities.length; i++) {
if (i > 0) {
System.out.print(",");
}
System.out.print(p.classProbabilities[i]);
}
System.out.println("");
}
}
h2o.save_model will save the binary model to the provided file system, however, looking at the Java application above it seems you want to use model into a Java based scoring application.
Because of that you should be using h2o.download_pojo API to save the model to local file system along with genmodel jar file. The API is documented as below:
download_pojo(model, path=u'', get_jar=True)
Download the POJO for this model to the directory specified by the path; if the path is "", then dump to screen.
:param model: the model whose scoring POJO should be retrieved.
:param path: an absolute path to the directory where POJO should be saved.
:param get_jar: retrieve the h2o-genmodel.jar also.
Once you have download POJO, you can use the above sample application to perform the scoring and make sure the POJO class name and the "modelClassName" are same along with model type.
I'm trying to pass a .arff file to LinearRegression object and while doing so it gives me this exception Cannot handle multi-valued nominal class!.
What actually happening is i'm performing Attribute selection using CFSSubsetEval evaluater and search as GreedyStepwise after doing so, passing those attributes to LinearRegression as follows
LinearRegression rl=new LinearRegression(); rl.buildClassifier(data);
data is the Instance object which has the data from .arff file which previously converted to nominal values using weka only. Am i doing anything wrong here? I was trying to search for this error on google but couldn't find one.
Code
package com.attribute;
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.Random;
import weka.attributeSelection.AttributeSelection;
import weka.attributeSelection.CfsSubsetEval;
import weka.attributeSelection.GreedyStepwise;
import weka.classifiers.Evaluation;
import weka.classifiers.functions.LinearRegression;
import weka.classifiers.meta.AttributeSelectedClassifier;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.Utils;
import weka.filters.supervised.attribute.NominalToBinary;
/**
* performs attribute selection using CfsSubsetEval and GreedyStepwise
* (backwards) and trains J48 with that. Needs 3.5.5 or higher to compile.
*
* #author FracPete (fracpete at waikato dot ac dot nz)
*/
public class AttributeSelectionTest2 {
/**
* uses the meta-classifier
*/
protected static void useClassifier(Instances data) throws Exception {
System.out.println("\n1. Meta-classfier");
AttributeSelectedClassifier classifier = new AttributeSelectedClassifier();
CfsSubsetEval eval = new CfsSubsetEval();
GreedyStepwise search = new GreedyStepwise();
search.setSearchBackwards(true);
J48 base = new J48();
classifier.setClassifier(base);
classifier.setEvaluator(eval);
classifier.setSearch(search);
Evaluation evaluation = new Evaluation(data);
evaluation.crossValidateModel(classifier, data, 10, new Random(1));
System.out.println(evaluation.toSummaryString());
}
/**
* uses the low level approach
*/
protected static void useLowLevel(Instances data) throws Exception {
System.out.println("\n3. Low-level");
AttributeSelection attsel = new AttributeSelection();
CfsSubsetEval eval = new CfsSubsetEval();
GreedyStepwise search = new GreedyStepwise();
search.setSearchBackwards(true);
attsel.setEvaluator(eval);
attsel.setSearch(search);
attsel.SelectAttributes(data);
int[] indices = attsel.selectedAttributes();
System.out.println("selected attribute indices (starting with 0):\n"
+ Utils.arrayToString(indices));
useLinearRegression(indices, data);
}
protected static void useLinearRegression(int[] indices, Instances data) throws Exception{
System.out.println("\n 4. Linear-Regression on above selected attributes");
BufferedReader reader = new BufferedReader(new FileReader(
"C:/Entertainement/MS/Fall 2014/spdb/project 4/healthcare.arff"));
Instances data1 = new Instances(reader);
data.setClassIndex(data.numAttributes() - 1);
/*NominalToBinary nb = new NominalToBinary();
for(int i=0;i<=20; i++){
//Still coding left here, create an Instance variable to store the data from 'data' variable for given indices
Instances data_lr=data1.
}*/
LinearRegression rl=new LinearRegression(); //Creating a LinearRegression Object to pass data1
rl.buildClassifier(data1);
}
/**
* takes a dataset as first argument
*
* #param args
* the commandline arguments
* #throws Exception
* if something goes wrong
*/
public static void main(String[] args) throws Exception {
// load data
System.out.println("\n0. Loading data");
BufferedReader reader = new BufferedReader(new FileReader(
"C:/Entertainement/MS/Fall 2014/spdb/project 4/healthcare.arff"));
Instances data = new Instances(reader);
if (data.classIndex() == -1)
data.setClassIndex(data.numAttributes() - 14);
// 1. meta-classifier
useClassifier(data);
// 2. filter
//useFilter(data);
// 3. low-level
useLowLevel(data);
}
}
NOTE : As i have not written code to build an instance variable with 'indices' attributes, i'm (for the sake of program to run) loading data from the same original file.
I don't know how to upload a file for sample data, but it looks something like this. [link] (https://scontent-a-dfw.xx.fbcdn.net/hphotos-xfa1/t31.0-8/p552x414/10496920_756438941076936_8448023649960186530_o.jpg)
Based on your data, it appears that your last attribute is a nominal data type (Contains mostly numbers, but there are some strings as well). LinearRegression will not allow the prediction of nominal classes.
What you could potentially do to ensure that your given dataset works is run it through the Weka Explorer with Linear Regression and see if the desired outcome is generated. Following this, there is a good chance that the data will work correctly in your code.
Hope this Helps!
Here is example of dataset for LinearRegression (source)
#RELATION house
#ATTRIBUTE houseSize NUMERIC
#ATTRIBUTE lotSize NUMERIC
#ATTRIBUTE bedrooms NUMERIC
#ATTRIBUTE granite NUMERIC
#ATTRIBUTE bathroom NUMERIC
#ATTRIBUTE sellingPrice NUMERIC
#DATA
3529,9191,6,0,0,205000
3247,10061,5,1,1,224900
4032,10150,5,0,1,197900
2397,14156,4,1,0,189900
2200,9600,4,0,1,195000
3536,19994,6,1,1,325000
2983,9365,5,0,1,230000
I am looking for a SQL Library that will parse an SQL statement and return some sort of Object representation of the SQL statement. My main objective is actually to be able to parse the SQL statement and retrieve the list of table names present in the SQL statement (including subqueries, joins and unions).
I am looking for a free library with a license business friendly (e.g. Apache license). I am looking for a library and not for an SQL Grammar. I do not want to build my own parser.
The best I could find so far was JSQLParser, and the example they give is actually pretty close to what I am looking for. However it fails parsing too many good queries (DB2 Database) and I'm hoping to find a more reliable library.
I doubt you'll find anything prewritten that you can just use. The problem is that ISO/ANSI SQL is a very complicated grammar — something like more than 600 production rules IIRC.
Terence Parr's ANTLR parser generator (Java, but can generate parsers in any one of a number of target languages) has several SQL grammars available, including a couple for PL/SQL, one for a SQL Server SELECT statement, one for mySQL, and one for ISO SQL.
No idea how complete/correct/up-to-date they are.
http://www.antlr.org/grammar/list
You needn't reinvent the wheel, there is already such a reliable SQL parser library there, (it's commerical, not free), and this article shows how to retrieve the list of table names present in the SQL statement (including subqueries, joins and unions) that is exactly what you are looking for.
http://www.dpriver.com/blog/list-of-demos-illustrate-how-to-use-general-sql-parser/get-columns-and-tables-in-sql-script/
This SQL parser library supports Oracle, SQL Server, DB2, MySQL, Teradata and ACCESS.
You need the ultra light, ultra fast library to extract table names from SQL (Disclaimer: I am the owner)
Just add the following in your pom
<dependency>
<groupId>com.github.mnadeem</groupId>
<artifactId>sql-table-name-parser</artifactId>
<version>0.0.1</version>
And do the following
new TableNameParser(sql).tables()
For more details, refer the project
Old question, but I think this project contains what you need:
Data Tools Project - SQL Development Tools
Here's the documentation for the SQL Query Parser.
Also, here's a small sample program. I'm no Java programmer so use with care.
package org.lala;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.List;
import org.eclipse.datatools.modelbase.sql.query.QuerySelectStatement;
import org.eclipse.datatools.modelbase.sql.query.QueryStatement;
import org.eclipse.datatools.modelbase.sql.query.TableReference;
import org.eclipse.datatools.modelbase.sql.query.ValueExpressionColumn;
import org.eclipse.datatools.modelbase.sql.query.helper.StatementHelper;
import org.eclipse.datatools.sqltools.parsers.sql.SQLParseErrorInfo;
import org.eclipse.datatools.sqltools.parsers.sql.SQLParserException;
import org.eclipse.datatools.sqltools.parsers.sql.SQLParserInternalException;
import org.eclipse.datatools.sqltools.parsers.sql.query.SQLQueryParseResult;
import org.eclipse.datatools.sqltools.parsers.sql.query.SQLQueryParserManager;
import org.eclipse.datatools.sqltools.parsers.sql.query.SQLQueryParserManagerProvider;
public class SQLTest {
private static String readFile(String path) throws IOException {
FileInputStream stream = new FileInputStream(new File(path));
try {
FileChannel fc = stream.getChannel();
MappedByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY, 0,
fc.size());
/* Instead of using default, pass in a decoder. */
return Charset.defaultCharset().decode(bb).toString();
} finally {
stream.close();
}
}
/**
* #param args
* #throws IOException
*/
public static void main(String[] args) throws IOException {
try {
// Create an instance the Parser Manager
// SQLQueryParserManagerProvider.getInstance().getParserManager
// returns the best compliant SQLQueryParserManager
// supporting the SQL dialect of the database described by the given
// database product information. In the code below null is passed
// for both the database and version
// in which case a generic parser is returned
SQLQueryParserManager parserManager = SQLQueryParserManagerProvider
.getInstance().getParserManager("DB2 UDB", "v9.1");
// Sample query
String sql = readFile("c:\\test.sql");
// Parse
SQLQueryParseResult parseResult = parserManager.parseQuery(sql);
// Get the Query Model object from the result
QueryStatement resultObject = parseResult.getQueryStatement();
// Get the SQL text
String parsedSQL = resultObject.getSQL();
System.out.println(parsedSQL);
// Here we have the SQL code parsed!
QuerySelectStatement querySelect = (QuerySelectStatement) parseResult
.getSQLStatement();
List columnExprList = StatementHelper
.getEffectiveResultColumns(querySelect);
Iterator columnIt = columnExprList.iterator();
while (columnIt.hasNext()) {
ValueExpressionColumn colExpr = (ValueExpressionColumn) columnIt
.next();
// DataType dataType = colExpr.getDataType();
System.out.println("effective result column: "
+ colExpr.getName());// + " with data type: " +
// dataType.getName());
}
List tableList = StatementHelper.getTablesForStatement(resultObject);
// List tableList = StatementHelper.getTablesForStatement(querySelect);
for (Object obj : tableList) {
TableReference t = (TableReference) obj;
System.out.println(t.getName());
}
} catch (SQLParserException spe) {
// handle the syntax error
System.out.println(spe.getMessage());
#SuppressWarnings("unchecked")
List<SQLParseErrorInfo> syntacticErrors = spe.getErrorInfoList();
Iterator<SQLParseErrorInfo> itr = syntacticErrors.iterator();
while (itr.hasNext()) {
SQLParseErrorInfo errorInfo = (SQLParseErrorInfo) itr.next();
// Example usage of the SQLParseErrorInfo object
// the error message
String errorMessage = errorInfo.getParserErrorMessage();
String expectedText = errorInfo.getExpectedText();
String errorSourceText = errorInfo.getErrorSourceText();
// the line numbers of error
int errorLine = errorInfo.getLineNumberStart();
int errorColumn = errorInfo.getColumnNumberStart();
System.err.println("Error in line " + errorLine + ", column "
+ errorColumn + ": " + expectedText + " "
+ errorMessage + " " + errorSourceText);
}
} catch (SQLParserInternalException spie) {
// handle the exception
System.out.println(spie.getMessage());
}
System.exit(0);
}
}