I've trained a neural network in NetBeans and saved it as neural_network.ser using Java serialization (all of the classes implement Serializable). Now I want to use it in my Android application, but when loading the network a ClassNotFoundException is raised:
java.lang.ClassNotFoundException: neural_network.BackPropagation
Here are the classes:
BackPropagation class:
public class BackPropagation extends Thread implements Serializable
{
private static final String TAG = "NetworkMessage";
private static final long serialVersionUID = -8862858027413741101L;
private double OverallError;
// The minimum Error Function defined by the user
private double MinimumError;
// The user-defined expected output pattern for a set of samples
private double ExpectedOutput[][];
// The user-defined input pattern for a set of samples
private double Input[][];
// User defined learning rate - used for updating the network weights
private double LearningRate;
// User-defined momentum - used for updating the network weights
private double Momentum;
// Number of layers in the network
private int NumberOfLayers;
// Number of training sets
private int NumberOfSamples;
// Current training set/sample that is used to train network
private int SampleNumber;
// Maximum number of epochs before training stops
private long MaximumNumberOfIterations;
// Public Variables
public LAYER Layer[];
public double ActualOutput[][];
long delay = 0;
boolean die = false;
// Calculate the node activations
public void FeedForward()
{
int i,j;
// Since no weights contribute to the output
// vector from the input layer,
// assign the input vector from the input layer
// to all the nodes in the first hidden layer
for (i = 0; i < Layer[0].Node.length; i++)
Layer[0].Node[i].Output = Layer[0].Input[i];
Layer[1].Input = Layer[0].Input;
for (i = 1; i < NumberOfLayers; i++)
{
Layer[i].FeedForward();
// Unless we have reached the last layer, assign layer i's output vector
// to the (i+1) layer's input vector
if (i != NumberOfLayers-1)
Layer[i+1].Input = Layer[i].OutputVector();
}
}
// FeedForward()
// Back-propagate the network output error through
// the network to update the weight values
public void UpdateWeights()
{
CalculateSignalErrors();
BackPropagateError();
}
private void CalculateSignalErrors()
{
int i,j,k,OutputLayer;
double Sum;
OutputLayer = NumberOfLayers-1;
// Calculate all output signal error
for (i = 0; i < Layer[OutputLayer].Node.length; i++)
{
Layer[OutputLayer].Node[i].SignalError =
(ExpectedOutput[SampleNumber][i] -Layer[OutputLayer].Node[i].Output) *
Layer[OutputLayer].Node[i].Output *
(1-Layer[OutputLayer].Node[i].Output);
}
// Calculate signal error for all nodes in the hidden layer
// (back propagate the errors)
for (i = NumberOfLayers-2; i > 0; i--)
{
for (j = 0; j < Layer[i].Node.length; j++)
{
Sum = 0;
for (k = 0; k < Layer[i+1].Node.length; k++)
Sum = Sum + Layer[i+1].Node[k].Weight[j] *
Layer[i+1].Node[k].SignalError;
Layer[i].Node[j].SignalError = Layer[i].Node[j].Output*(1 -
Layer[i].Node[j].Output)*Sum;
}
}
}
private void BackPropagateError()
{
int i,j,k;
// Update Weights
for (i = NumberOfLayers-1; i > 0; i--)
{
for (j = 0; j < Layer[i].Node.length; j++)
{
// Calculate Bias weight difference to node j
Layer[i].Node[j].ThresholdDiff = LearningRate *
Layer[i].Node[j].SignalError +
Momentum*Layer[i].Node[j].ThresholdDiff;
// Update Bias weight to node j
Layer[i].Node[j].Threshold =
Layer[i].Node[j].Threshold +
Layer[i].Node[j].ThresholdDiff;
// Update Weights
for (k = 0; k < Layer[i].Input.length; k++)
{
// Calculate weight difference between node j and k
Layer[i].Node[j].WeightDiff[k] =
LearningRate *
Layer[i].Node[j].SignalError * Layer[i-1].Node[k].Output +
Momentum*Layer[i].Node[j].WeightDiff[k];
// Update weight between node j and k
Layer[i].Node[j].Weight[k] =
Layer[i].Node[j].Weight[k] +
Layer[i].Node[j].WeightDiff[k];
}
}
}
}
private void CalculateOverallError()
{
int i,j;
OverallError = 0;
for (i = 0; i < NumberOfSamples; i++)
for (j = 0; j < Layer[NumberOfLayers-1].Node.length; j++)
{
OverallError = OverallError +
0.5 * ( Math.pow(ExpectedOutput[i][j] - ActualOutput[i][j], 2) );
}
}
public BackPropagation(int NumberOfNodes[],
double InputSamples[][],
double OutputSamples[][],
double LearnRate,
double Moment,
double MinError,
long MaxIter
)
{
int i,j;
// Initiate variables
NumberOfSamples = InputSamples.length;
MinimumError = MinError;
LearningRate = LearnRate;
Momentum = Moment;
NumberOfLayers = NumberOfNodes.length;
MaximumNumberOfIterations = MaxIter;
// Create network layers
Layer = new LAYER[NumberOfLayers];
// Assign the number of nodes to the input layer
Layer[0] = new LAYER(NumberOfNodes[0],NumberOfNodes[0]);
// Assign number of nodes to each layer
for (i = 1; i < NumberOfLayers; i++)
Layer[i] = new LAYER(NumberOfNodes[i],NumberOfNodes[i-1]);
Input = new double[NumberOfSamples][Layer[0].Node.length];
ExpectedOutput = new double[NumberOfSamples][Layer[NumberOfLayers-1].Node.length];
ActualOutput = new double[NumberOfSamples][Layer[NumberOfLayers-1].Node.length];
// Assign input set
for (i = 0; i < NumberOfSamples; i++)
for (j = 0; j < Layer[0].Node.length; j++)
Input[i][j] = InputSamples[i][j];
// Assign output set
for (i = 0; i < NumberOfSamples; i++)
for (j = 0; j < Layer[NumberOfLayers-1].Node.length; j++)
ExpectedOutput[i][j] = OutputSamples[i][j];
}
public void TrainNetwork()
{
int i,j;
long k=0;
do
{
// For each pattern
for (SampleNumber = 0; SampleNumber < NumberOfSamples; SampleNumber++)
{
for (i = 0; i < Layer[0].Node.length; i++)
Layer[0].Input[i] = Input[SampleNumber][i];
FeedForward();
// Assign calculated output vector from network to ActualOutput
for (i = 0; i < Layer[NumberOfLayers-1].Node.length; i++)
ActualOutput[SampleNumber][i] = Layer[NumberOfLayers-1].Node[i].Output;
UpdateWeights();
// if we've been told to stop training, then
// stop thread execution
if (die){
return;
}
// if
}
k++;
// Calculate Error Function
CalculateOverallError();
System.out.println("OverallError =
"+Double.toString(OverallError)+"\n");
System.out.print("Epoch = "+Long.toString(k)+"\n");
} while ((OverallError > MinimumError) &&(k < MaximumNumberOfIterations));
}
public LAYER[] get_layers() { return Layer; }
// called when testing the network.
public double[] test(double[] input)
{
int winner = 0;
NODE[] output_nodes;
for (int j = 0; j < Layer[0].Node.length; j++)
{ Layer[0].Input[j] = input[j];}
FeedForward();
// get the last layer of nodes (the outputs)
output_nodes = (Layer[Layer.length - 1]).get_nodes();
double[] actual_output = new double[output_nodes.length];
for (int k=0; k < output_nodes.length; k++)
{
actual_output[k]=output_nodes[k].Output;
} // for
return actual_output;
}//test()
public double get_error()
{
CalculateOverallError();
return OverallError;
} // get_error()
// to change the delay in the network
public void set_delay(long time)
{
if (time >= 0) {
delay = time;
} // if
}
//save the trained network
public void save(String FileName)
{
try{
FileOutputStream fos = new FileOutputStream (new File(FileName), true);
// Serialize data object to a file
ObjectOutputStream os = new ObjectOutputStream(fos);
os.writeObject(this);
os.close();
fos.close();
System.out.println("Network Saved!!!!");
}
catch (IOException E){System.out.println(E.toString());}
catch (Exception e){System.out.println(e.toString());}
}
public BackPropagation load(String FileName)
{
BackPropagation myclass= null;
try
{
//File patternDirectory = new File(Environment.getExternalStorageDirectory().getAbsolutePath().toString()+"INDIAN_NUMBER_RECOGNITION.data");
//patternDirectory.mkdirs();
FileInputStream fis = new FileInputStream(new File(FileName));
//FileInputStream fis =context.openFileInput(FileName);
ObjectInputStream is = new ObjectInputStream(fis);
myclass = (BackPropagation) is.readObject();
System.out.println("Error After Reading = "+Double.toString(myclass.get_error())+"\n");
is.close();
fis.close();
return myclass;
}
catch (Exception e){System.out.println(e.toString());}
return myclass;
}
// needed to implement threading.
public void run() {
TrainNetwork();
File Net_File = new File(Environment.getExternalStorageDirectory(),"Number_Recognition_1.ser");
save(Net_File.getAbsolutePath());
System.out.println( "DONE TRAINING :) ^_^ ^_^ :) !\n");
System.out.println("With Network ERROR = "+Double.toString(get_error())+"\n");
} // run()
// to notify the network to stop training.
public void kill() { die = true; }
}
Layer Class:
public class LAYER implements Serializable
{
private double Net;
public double Input[];
// Vector of inputs signals from previous
// layer to the current layer
public NODE Node[];
// Vector of nodes in current layer
// The FeedForward function is called so that
// the outputs for all the nodes in the current
// layer are calculated
public void FeedForward() {
for (int i = 0; i < Node.length; i++) {
Net = Node[i].Threshold;
for (int j = 0; j < Node[i].Weight.length; j++)
{Net = Net + Input[j] * Node[i].Weight[j];
System.out.println("Net = "+Double.toString(Net)+"\n");
}
Node[i].Output = Sigmoid(Net);
System.out.println("Node["+Integer.toString(i)+".Output = "+Double.toString(Node[i].Output)+"\n");
}
}
// The Sigmoid function calculates the
// activation/output from the current node
private double Sigmoid (double Net) {
return 1/(1+Math.exp(-Net));
}
// Return the output from all nodes in the layer
// in a vector form
public double[] OutputVector() {
double Vector[];
Vector = new double[Node.length];
for (int i=0; i < Node.length; i++)
Vector[i] = Node[i].Output;
return (Vector);
}
public LAYER (int NumberOfNodes, int NumberOfInputs) {
Node = new NODE[NumberOfNodes];
for (int i = 0; i < NumberOfNodes; i++)
Node[i] = new NODE(NumberOfInputs);
Input = new double[NumberOfInputs];
}
// added by DSK
public NODE[] get_nodes() { return Node; }
}
Node Class:
public class NODE implements Serializable
{
public double Output;
// Output signal from current node
public double Weight[];
// Vector of weights from previous nodes to current node
public double Threshold;
// Node Threshold /Bias
public double WeightDiff[];
// Weight difference between the nth and the (n-1) iteration
public double ThresholdDiff;
// Threshold difference between the nth and the (n-1) iteration
public double SignalError;
// Output signal error
// InitialiseWeights function assigns a randomly
// generated number, between -1 and 1, to the
// Threshold and Weights to the current node
private void InitialiseWeights() {
Threshold = -1+2*Math.random();
// Initialise threshold nodes with a random
// number between -1 and 1
ThresholdDiff = 0;
// Initially, ThresholdDiff is assigned to 0 so
// that the Momentum term can work during the 1st
// iteration
for(int i = 0; i < Weight.length; i++) {
Weight[i]= -1+2*Math.random();
// Initialise all weight inputs with a
// random number between -1 and 1
WeightDiff[i] = 0;
// Initially, WeightDiff is assigned to 0
// so that the Momentum term can work during
// the 1st iteration
}
}
public NODE (int NumberOfNodes) {
Weight = new double[NumberOfNodes];
// Create an array of Weight with the same
// size as the vector of inputs to the node
WeightDiff = new double[NumberOfNodes];
// Create an array of weightDiff with the same
// size as the vector of inputs to the node
InitialiseWeights();
// Initialise the Weights and Thresholds to the node
}
public double[] get_weights() { return Weight; }
public double get_output() { return Output; }
}
I wrote the code in NetBeans exactly like this; it differs only in the save method, i.e. in where the file is saved.
How can I load the file correctly so I don't get this exception?
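For what it's worth, this exception typically means the Android project has no class with the exact serialized name neural_network.BackPropagation (for example because the classes live under a different package there). If the class definitions are otherwise identical, one option is to remap the name while deserializing with a custom ObjectInputStream; this is only a minimal sketch, and com.example.myapp is a placeholder for the real Android package:
FileInputStream fis = new FileInputStream(new File(FileName));
ObjectInputStream is = new ObjectInputStream(fis) {
    @Override
    protected Class<?> resolveClass(ObjectStreamClass desc)
            throws IOException, ClassNotFoundException {
        // Map the desktop package onto the Android package before resolving.
        String name = desc.getName().replace("neural_network.", "com.example.myapp.");
        return Class.forName(name, false, getClass().getClassLoader());
    }
};
BackPropagation myclass = (BackPropagation) is.readObject();
is.close();
fis.close();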
I solved this by saving the network to an XML file and then loading that again in Android, without any serialization problems; that way it took just two hours of training instead of days. Loading the XML still took some time, so I serialized the loaded network again to neural_network.ser so that it loads much faster afterwards.
I know it's not the best solution, but that's what I've done.
Here is the code:
public void SaveToXML(String FileName)throws
ParserConfigurationException, FileNotFoundException,
TransformerException, TransformerConfigurationException
{
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder parser = factory.newDocumentBuilder();
Document doc = parser.newDocument();
Element root = doc.createElement("neuralNetwork");
Element layers = doc.createElement("structure");
layers.setAttribute("numberOfLayers",Integer.toString(this.NumberOfLayers));
for (int il=0; il<this.NumberOfLayers; il++){
Element layer = doc.createElement("layer");
layer.setAttribute("index",Integer.toString(il));
layer.setAttribute("numberOfNeurons",Integer.toString(this.Layer[il].Node.length));
if(il==0)
{
for(int in=0;in<this.Layer[il].Node.length;in++)
{
Element neuron = doc.createElement("neuron");
neuron.setAttribute("index",Integer.toString(in));
neuron.setAttribute("NumberOfInputs",Integer.toString(1));
neuron.setAttribute("threshold",Double.toString(this.Layer[il].Node[in].Threshold));
Element input = doc.createElement("input");
double[] weights = this.Layer[il].Node[in].get_weights();
input.setAttribute("index",Integer.toString(in));
input.setAttribute("weight",Double.toString(weights[in]));
neuron.appendChild(input);
layer.appendChild(neuron);
}
layers.appendChild(layer);
}
else
{
for (int in=0; in<this.Layer[il].Node.length;in++){
Element neuron = doc.createElement("neuron");
neuron.setAttribute("index",Integer.toString(in));
neuron.setAttribute("NumberOfInputs",Integer.toString(this.Layer[il].Node[in].Weight.length));
neuron.setAttribute("threshold",Double.toString(this.Layer[il].Node[in].Threshold));
for (int ii=0; ii<this.Layer[il].Node[in].Weight.length;ii++) {
double[] weights = this.Layer[il].Node[in].get_weights();
Element input = doc.createElement("input");
input.setAttribute("index",Integer.toString(ii));
input.setAttribute("weight",Double.toString(weights[ii]));
neuron.appendChild(input);
}
layer.appendChild(neuron);
layers.appendChild(layer);
}
}
}
root.appendChild(layers);
doc.appendChild(root);
File xmlOutputFile = new File(FileName);
FileOutputStream fos;
Transformer transformer;
fos = new FileOutputStream(xmlOutputFile);
TransformerFactory transformerFactory = TransformerFactory.newInstance();
transformer = transformerFactory.newTransformer();
DOMSource source = new DOMSource(doc);
StreamResult result = new StreamResult(fos);
transformer.setOutputProperty("encoding","iso-8859-2");
transformer.setOutputProperty("indent","yes");
transformer.transform(source, result);
}
LoadFromXML Function:
public BackPropagation LoadFromXML(String FileName)throws
ParserConfigurationException, SAXException, IOException, ParseException
{
BackPropagation myclass= new BackPropagation();
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder parser = factory.newDocumentBuilder();
File source = new File(FileName);
Document doc = parser.parse(source);
Node nodeNeuralNetwork = doc.getDocumentElement();
if (!nodeNeuralNetwork.getNodeName().equals("neuralNetwork")) throw new ParseException("[Error] NN-Load: Parse error in XML file, neural network couldn't be loaded.",0);
NodeList nodeNeuralNetworkContent = nodeNeuralNetwork.getChildNodes();
System.out.print("<neuralNetwork>\n");
for (int innc=0; innc<nodeNeuralNetworkContent.getLength(); innc++)
{
Node nodeStructure = nodeNeuralNetworkContent.item(innc);
if (nodeStructure.getNodeName().equals("structure"))
{
System.out.print("<stucture nuumberOfLayers = ");
myclass.NumberOfLayers = Integer.parseInt(((Element)nodeStructure).getAttribute("numberOfLayers"));
myclass.Layer = new LAYER[myclass.NumberOfLayers];
System.out.print(Integer.toString(myclass.NumberOfLayers)+">\n");
NodeList nodeStructureContent = nodeStructure.getChildNodes();
for (int isc=0; isc<nodeStructureContent.getLength();isc++)
{
Node nodeLayer = nodeStructureContent.item(isc);
if (nodeLayer.getNodeName().equals("layer"))
{
int index = Integer.parseInt(((Element)nodeLayer).getAttribute("index"));
System.out.print("<layer index = "+Integer.toString(index)+" numberOfNeurons = ");
int number_of_N = Integer.parseInt(((Element)nodeLayer).getAttribute("numberOfNeurons"));
System.out.print(Integer.toString(number_of_N)+">\n");
if(index==0)
{
myclass.Layer[0]=new LAYER(number_of_N,800);
}
else
{
int j=index-1;
myclass.Layer[index]=new LAYER(number_of_N,myclass.Layer[j].Node.length);
}
NodeList nodeLayerContent = nodeLayer.getChildNodes();
for (int ilc=0; ilc<nodeLayerContent.getLength();ilc++)
{
Node nodeNeuron = nodeLayerContent.item(ilc);
if (nodeNeuron.getNodeName().equals("neuron"))
{
System.out.print("<neuron index = ");
int neuron_index = Integer.parseInt(((Element)nodeNeuron).getAttribute("index"));
myclass.Layer[index].Node[neuron_index].Threshold = Double.parseDouble(((Element)nodeNeuron).getAttribute("threshold"));
System.out.print(Integer.toString(neuron_index)+" threshold = "+Double.toString(myclass.Layer[index].Node[neuron_index].Threshold)+">\n");
NodeList nodeNeuronContent = nodeNeuron.getChildNodes();
for (int inc=0; inc < nodeNeuronContent.getLength();inc++)
{
Node nodeNeuralInput = nodeNeuronContent.item(inc);
if (nodeNeuralInput.getNodeName().equals("input"))
{
System.out.print("<input index = ");
int index_input = Integer.parseInt(((Element)nodeNeuralInput).getAttribute("index"));
myclass.Layer[index].Node[neuron_index].Weight[index_input] = Double.parseDouble(((Element)nodeNeuralInput).getAttribute("weight"));
System.out.print(Integer.toString(index_input)+" weight = "+Double.toString(myclass.Layer[index].Node[neuron_index].Weight[index_input])+">\n");
}
}
}
}
}
}
System.out.print("</structure");
}
}
return myclass;
}
Related
I am trying to do a multithreading simulation in Java. I have managed to do it with a queue, but the execution time is high. Any ideas on how I could optimize this? Can using recursion save time?
The input has to be like this:
2 5          This means that there are two threads (workers) for 5 jobs.
1 2 3 4 5    These are the jobs; each integer is the time cost of processing that job. The output will then be:
0 0          The two threads try to take jobs from the list simultaneously; thread 0 takes the first job and starts working on it at moment 0.
1 0          Thread 1 takes the second job and also starts working on it at moment 0.
0 1 After 1 second, thread 0 is done with the first job and takes the third job from the list, and starts processing it immediately at time 1.
1 2 One second later, thread 1 is done with the second job and takes the fourth job from the list, and starts processing it immediately at time 2
0 4 Finally, after 2 more seconds, thread 0 is done with the third job and takes the fifth job from the list, and starts processing it immediately at time 4
This is the code:
import java.io.*;
import java.util.HashMap;
import java.util.HashSet;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.StringTokenizer;
public class JobQueue {
private int numWorkers;
private int[] jobs;
private int[] assignedWorker;
private long[] startTime;
private FastScanner in;
private PrintWriter out;
public static void main(String[] args) throws IOException {
new JobQueue().solve();
}
private void readData() throws IOException {
numWorkers = in.nextInt();
int m = in.nextInt();
jobs = new int[m];
for (int i = 0; i < m; ++i) {
jobs[i] = in.nextInt();
}
}
private void writeResponse() {
for (int i = 0; i < jobs.length; ++i) {
out.println(assignedWorker[i] + " " + startTime[i]);
}
}
private void assignJobs() {
// TODO: replace this code with a faster algorithm.
assignedWorker = new int[jobs.length];
startTime = new long[jobs.length];
PriorityQueue<Integer> nextTimesQueue = new PriorityQueue<Integer>();
HashMap<Integer, Set<Integer>> workersReadyAtTimeT = new HashMap<Integer,Set<Integer>>();
long[] nextFreeTime = new long[numWorkers];
int duration = 0;
int bestWorker = 0;
for (int i = 0; i < jobs.length; i++) {
duration = jobs[i];
if(i<numWorkers) {
bestWorker = i;
nextTimesQueue.add(duration);
addToSet(workersReadyAtTimeT, duration, i,0);
}else {
int currentTime = nextTimesQueue.poll();
Set<Integer> workersReady = workersReadyAtTimeT.get(currentTime);
if (workersReady.size()>1) {
bestWorker = workersReady.iterator().next();
workersReady.remove(bestWorker);
workersReadyAtTimeT.remove(currentTime);
workersReadyAtTimeT.put(currentTime,workersReady);
nextTimesQueue.add(currentTime);
} else {
bestWorker = workersReady.iterator().next();
workersReadyAtTimeT.remove(currentTime);
nextTimesQueue.add(currentTime+duration);
addToSet(workersReadyAtTimeT, duration, bestWorker, currentTime);
}
}
assignedWorker[i] = bestWorker;
startTime[i] = nextFreeTime[bestWorker];
nextFreeTime[bestWorker] += duration;
}
}
private void addToSet(HashMap<Integer, Set<Integer>> workersReadyAtTimeT, int duration, int worker, int current) {
if(workersReadyAtTimeT.get(current+duration)==null) {
HashSet<Integer> s = new HashSet<Integer>();
s.add(worker);
workersReadyAtTimeT.put(current+duration, s);
}else {
Set<Integer> s = workersReadyAtTimeT.get(current+duration);
s.add(worker);
workersReadyAtTimeT.put(current+duration,s);
}
}
public void solve() throws IOException {
in = new FastScanner();
out = new PrintWriter(new BufferedOutputStream(System.out));
readData();
assignJobs();
writeResponse();
out.close();
}
static class FastScanner {
private BufferedReader reader;
private StringTokenizer tokenizer;
public FastScanner() {
reader = new BufferedReader(new InputStreamReader(System.in));
tokenizer = null;
}
public String next() throws IOException {
while (tokenizer == null || !tokenizer.hasMoreTokens()) {
tokenizer = new StringTokenizer(reader.readLine());
}
return tokenizer.nextToken();
}
public int nextInt() throws IOException {
return Integer.parseInt(next());
}
}
}
It seems to me that your jobsList object is completely redundant: everything it contains is also in the jobs array, and when you take the front element you get the item at jobs[i]. To speed things up a little you could take the constructors of the ints out of the loop and just assign new values to them. Another optimization would be to not search during the first numWorkers jobs, because you know you still have idle workers until you have exhausted your pool. And once you have found a good worker you don't have to keep looking, so you can continue out of your for-loop.
public class JobQueue {
private int numWorkers;
private int[] jobs;
private int[] assignedWorker;
private long[] startTime;
private void readData() throws IOException {
numWorkers = in.nextInt();
int m = in.nextInt();
jobs = new int[m];
for (int i = 0; i < m; ++i) {
jobs[i] = in.nextInt();
}
}
private void assignJobs() {
assignedWorker = new int[jobs.length];
startTime = new long[jobs.length];
long[] nextFreeTime = new long[numWorkers];
int duration = 0;
int bestWorker = 0;
for (int i = 0; i < jobs.length; i++) {
duration = jobs[i];
bestWorker = 0;
if (i< numWorkers){
bestWorker= i;
} else{
for (int j = 0; j < numWorkers; ++j) {
if (nextFreeTime[j] < nextFreeTime[bestWorker])
bestWorker = j;
continue;
}
}
assignedWorker[i] = bestWorker;
startTime[i] = nextFreeTime[bestWorker];
nextFreeTime[bestWorker] += duration;
}
}
However, both your solution and this slightly trimmed-down one take 2 milliseconds to run. I also looked at having a HashMap maintain a NextWorker marker, but at some point you catch up with it and end up searching for the next one every time, so you don't win much.
You could try an ordered list/queue, but then you have expensive inserts instead of expensive searches, and you have to keep track of the time slice. A version like that could look like this:
private void assignJobs() {
assignedWorker = new int[jobs.length];
startTime = new long[jobs.length];
PriorityQueue<Integer> nextTimesQueue = new PriorityQueue<Integer>();
HashMap<Integer, Set<Integer>> workersReadyAtTimeT = new HashMap<Integer,Set<Integer>>();
long[] nextFreeTime = new long[numWorkers];
int duration = 0;
int bestWorker = 0;
for (int i = 0; i < jobs.length; i++) {
duration = jobs[i];
if(i<numWorkers) {
bestWorker = i;
nextTimesQueue.add(duration);
addToSet(workersReadyAtTimeT, duration, i,0);
}else {
int currentTime = nextTimesQueue.poll();
Set<Integer> workersReady = workersReadyAtTimeT.get(currentTime);
if (workersReady.size()>1) {
bestWorker = workersReady.iterator().next();
workersReady.remove(bestWorker);
workersReadyAtTimeT.remove(currentTime);
workersReadyAtTimeT.put(currentTime,workersReady);
nextTimesQueue.add(currentTime);
} else {
bestWorker = workersReady.iterator().next();
workersReadyAtTimeT.remove(currentTime);
nextTimesQueue.add(currentTime+duration);
addToSet(workersReadyAtTimeT, duration, bestWorker, currentTime);
}
}
assignedWorker[i] = bestWorker;
startTime[i] = nextFreeTime[bestWorker];
nextFreeTime[bestWorker] += duration;
}
}
private void addToSet(HashMap<Integer, Set<Integer>> workersReadyAtTimeT, int duration, int worker, int current) {
if(workersReadyAtTimeT.get(current+duration)==null) {
HashSet<Integer> s = new HashSet<Integer>();
s.add(worker);
workersReadyAtTimeT.put(current+duration, s);
}else {
Set<Integer> s = workersReadyAtTimeT.get(current+duration);
s.add(worker);
workersReadyAtTimeT.put(current+duration,s);
}
}
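A more direct variant of the ordered-queue idea keeps a single priority queue of workers ordered by (next free time, worker index), which drops the per-timestamp HashMap bookkeeping entirely. This is only a sketch; the method name and the array-of-two-longs encoding are illustrative, not taken from the code above:
import java.util.PriorityQueue;

static void assignJobs(int numWorkers, int[] jobs, int[] assignedWorker, long[] startTime) {
    // Each entry is {nextFreeTime, workerIndex}: earliest free worker first,
    // ties broken by the smaller worker index (as the expected output requires).
    PriorityQueue<long[]> free = new PriorityQueue<>((a, b) ->
            a[0] != b[0] ? Long.compare(a[0], b[0]) : Long.compare(a[1], b[1]));
    for (int w = 0; w < numWorkers; w++) free.add(new long[] {0L, w});
    for (int i = 0; i < jobs.length; i++) {
        long[] worker = free.poll();       // worker that becomes available first
        assignedWorker[i] = (int) worker[1];
        startTime[i] = worker[0];
        worker[0] += jobs[i];              // busy until this job finishes
        free.add(worker);
    }
}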
I want to get a probability score for the extracted names using NameFinderME, but the provided model gives very bad probabilities from the probs function.
For example, "Scott F. Fitzgerald" gets a score around 0.5 (averaging log probabilities, and taking an exponent), while "North Japan" and "Executive Vice President, Corporate Relations and Chief Philanthropy Officer" both get a score higher than 0.9...
I have more than 2 million first names and another 2 million last names (with their frequency counts), and I want to synthetically create a huge dataset from the outer product of first names x middle names (drawn from the first-names pool) x last names.
The problem is that I don't even get to go over all the last names once (even when discarding the frequency counts and using each name only once) before I get a GC overhead limit exceeded exception...
I'm implementing an ObjectStream and giving it to the train function:
public class OpenNLPNameStream implements ObjectStream<NameSample> {
private List<Map<String, Object>> firstNames = null;
private List<Map<String, Object>> lastNames = null;
private int firstNameIdx = 0;
private int firstNameCountIdx = 0;
private int middleNameIdx = 0;
private int middleNameCountIdx = 0;
private int lastNameIdx = 0;
private int lastNameCountIdx = 0;
private int firstNameMaxCount = 0;
private int middleNameMaxCount = 0;
private int lastNameMaxCount = 0;
private int firstNameKBSize = 0;
private int lastNameKBSize = 0;
Span span[] = new Span[1];
String fullName[] = new String[3];
String partialName[] = new String[2];
private void increaseFirstNameCountIdx()
{
firstNameCountIdx++;
if (firstNameCountIdx == firstNameMaxCount) {
firstNameIdx++;
if (firstNameIdx == firstNameKBSize)
return; //no need to update anything - this is the end of the run...
firstNameMaxCount = getFirstNameMaxCount(firstNameIdx);
firstNameCountIdx = 0;
}
}
private void increaseMiddleNameCountIdx()
{
lastNameCountIdx++;
if (middleNameCountIdx == middleNameMaxCount) {
if (middleNameIdx == firstNameKBSize) {
resetMiddleNameIdx();
increaseFirstNameCountIdx();
} else {
middleNameMaxCount = getMiddleNameMaxCount(middleNameIdx);
middleNameCountIdx = 0;
}
}
}
private void increaseLastNameCountIdx()
{
lastNameCountIdx++;
if (lastNameCountIdx == lastNameMaxCount) {
lastNameIdx++;
if (lastNameIdx == lastNameKBSize) {
resetLastNameIdx();
increaseMiddleNameCountIdx();
}
else {
lastNameMaxCount = getLastNameMaxCount(lastNameIdx);
lastNameCountIdx = 0;
}
}
}
private void resetLastNameIdx()
{
lastNameIdx = 0;
lastNameMaxCount = getLastNameMaxCount(0);
lastNameCountIdx = 0;
}
private void resetMiddleNameIdx()
{
middleNameIdx = 0;
middleNameMaxCount = getMiddleNameMaxCount(0);
middleNameCountIdx = 0;
}
private int getFirstNameMaxCount(int i)
{
return 1; //compromised on using just
//String occurences = (String) firstNames.get(i).get("occurences");
//return Integer.parseInt(occurences);
}
private int getMiddleNameMaxCount(int i)
{
return 3; //compromised on using just
//String occurences = (String) firstNames.get(i).get("occurences");
//return Integer.parseInt(occurences);
}
private int getLastNameMaxCount(int i)
{
return 1;
//String occurences = (String) lastNames.get(i).get("occurences");
//return Integer.parseInt(occurences);
}
@Override
public NameSample read() throws IOException {
if (firstNames == null) {
firstNames = CSVFileTools.readFileFromInputStream("namep_first_name_idf.csv", new ClassPathResource("namep_first_name_idf.csv").getInputStream());
firstNameKBSize = firstNames.size();
firstNameMaxCount = getFirstNameMaxCount(0);
middleNameMaxCount = getFirstNameMaxCount(0);
}
if (lastNames == null) {
lastNames = CSVFileTools.readFileFromInputStream("namep_last_name_idf.csv",new ClassPathResource("namep_last_name_idf.csv").getInputStream());
lastNameKBSize = lastNames.size();
lastNameMaxCount = getLastNameMaxCount(0);
}
increaseLastNameCountIdx();
if (firstNameIdx == firstNameKBSize)
return null; //we've finished iterating over all permutations!
String [] sentence;
if (firstNameCountIdx < firstNameMaxCount / 3)
{
span[0] = new Span(0,2,"Name");
sentence = partialName;
sentence[0] = (String)firstNames.get(firstNameIdx).get("first_name");
sentence[1] = (String)lastNames.get(lastNameIdx).get("last_name");
}
else
{
span[0] = new Span(0,3,"name");
sentence = fullName;
sentence[0] = (String)firstNames.get(firstNameIdx).get("first_name");
sentence[2] = (String)lastNames.get(lastNameIdx).get("last_name");
if (firstNameCountIdx < 2*firstNameCountIdx/3) {
sentence[1] = (String)firstNames.get(middleNameIdx).get("first_name");
}
else {
sentence[1] = ((String)firstNames.get(middleNameIdx).get("first_name")).substring(0,1) + ".";
}
}
return new NameSample(sentence,span,true);
}
@Override
public void reset() throws IOException, UnsupportedOperationException {
firstNameIdx = 0;
firstNameCountIdx = 0;
middleNameIdx = 0;
middleNameCountIdx = 0;
lastNameIdx = 0;
lastNameCountIdx = 0;
firstNameMaxCount = 0;
middleNameMaxCount = 0;
lastNameMaxCount = 0;
}
@Override
public void close() throws IOException {
reset();
firstNames = null;
lastNames = null;
}
}
And
TokenNameFinderModel model = NameFinderME.train("en","person",new OpenNLPNameStream(),TrainingParameters.defaultParams(),new TokenNameFinderFactory());
model.serialize(new FileOutputStream("trainedNames.bin",false));
I get the following error after a few minutes of running:
java.lang.OutOfMemoryError: GC overhead limit exceeded
at opennlp.tools.util.featuregen.WindowFeatureGenerator.createFeatures(WindowFeatureGenerator.java:112)
at opennlp.tools.util.featuregen.AggregatedFeatureGenerator.createFeatures(AggregatedFeatureGenerator.java:79)
at opennlp.tools.util.featuregen.CachedFeatureGenerator.createFeatures(CachedFeatureGenerator.java:69)
at opennlp.tools.namefind.DefaultNameContextGenerator.getContext(DefaultNameContextGenerator.java:118)
at opennlp.tools.namefind.DefaultNameContextGenerator.getContext(DefaultNameContextGenerator.java:37)
at opennlp.tools.namefind.NameFinderEventStream.generateEvents(NameFinderEventStream.java:113)
at opennlp.tools.namefind.NameFinderEventStream.createEvents(NameFinderEventStream.java:137)
at opennlp.tools.namefind.NameFinderEventStream.createEvents(NameFinderEventStream.java:36)
at opennlp.tools.util.AbstractEventStream.read(AbstractEventStream.java:62)
at opennlp.tools.util.AbstractEventStream.read(AbstractEventStream.java:27)
at opennlp.tools.util.AbstractObjectStream.read(AbstractObjectStream.java:32)
at opennlp.tools.ml.model.HashSumEventStream.read(HashSumEventStream.java:46)
at opennlp.tools.ml.model.HashSumEventStream.read(HashSumEventStream.java:29)
at opennlp.tools.ml.model.TwoPassDataIndexer.computeEventCounts(TwoPassDataIndexer.java:130)
at opennlp.tools.ml.model.TwoPassDataIndexer.<init>(TwoPassDataIndexer.java:83)
at opennlp.tools.ml.AbstractEventTrainer.getDataIndexer(AbstractEventTrainer.java:74)
at opennlp.tools.ml.AbstractEventTrainer.train(AbstractEventTrainer.java:91)
at opennlp.tools.namefind.NameFinderME.train(NameFinderME.java:337)
Edit: After increasing the memory of the JVM to 8GB, I still don't get past the first 2 million last names, but now the Exception is:
java.lang.OutOfMemoryError: Java heap space
at java.util.HashMap.resize(HashMap.java:703)
at java.util.HashMap.putVal(HashMap.java:662)
at java.util.HashMap.put(HashMap.java:611)
at opennlp.tools.ml.model.AbstractDataIndexer.update(AbstractDataIndexer.java:141)
at opennlp.tools.ml.model.TwoPassDataIndexer.computeEventCounts(TwoPassDataIndexer.java:134)
at opennlp.tools.ml.model.TwoPassDataIndexer.<init>(TwoPassDataIndexer.java:83)
at opennlp.tools.ml.AbstractEventTrainer.getDataIndexer(AbstractEventTrainer.java:74)
at opennlp.tools.ml.AbstractEventTrainer.train(AbstractEventTrainer.java:91)
at opennlp.tools.namefind.NameFinderME.train(NameFinderME.java:337)
It seems the problem stems from the fact that I'm creating a new NameSample along with new Spans and Strings at every read call... But I can't reuse Spans or NameSamples, since they're immutable.
Should I just write my own language model? Is there a better Java library for doing this sort of thing (I'm only interested in the probability that the extracted text is actually a name)? Are there parameters I should tweak for the model I'm training?
Any advice would be appreciated.
I am trying to create some nodes in Neo4j through a Maven Java application and to create relationships between those nodes. Specifically, I want to create 16807 nodes and 17210368 relationships. I read a file and get the row variable, which holds the number of nodes I must create, and I also have a list with 34420736 elements (= 17210368 * 2). I want to create a relationship from node[element 0 of list] to node[element 1 of list], from node[element 2 of list] to node[element 3 of list], etc. The maximum element of the list is 16807. I create an ArrayList<Node> so that the nodes are created dynamically, because I want the program to run with different files (and with different row values).
Here is my code:
GraphDatabaseFactory dbFactory = new GraphDatabaseFactory();
GraphDatabaseService graphDb = dbFactory.newEmbeddedDatabase("C:\Users\....\default.graphdb");
Transaction tx = graphDb.beginTx();
try {
final RelationshipType type2 = DynamicRelationshipType.withName("KNOW");
ArrayList<Node> nodelist = new ArrayList<Node>();
for (int k = 0; k < row; k++) { //row=16807
nodelist.add(graphDb.createNode());
nodelist.get(k).setProperty("Name", "ListNode " + k);
}
int count=0;
for (int j = 0; j < list.size() ; j++) { //list.size()=34420736
nodelist.get(list.get(count)).createRelationshipTo(nodelist.get(list.get(count+1)), type2);
count=count+2;
}
tx.success();
}
finally {
tx.close();
}
graphDb.shutdown();
If I run the code without trying to create relationships, it creates the nodes and runs correctly. When I add the for loop that creates the relationships, it throws the following error:
Exception in thread "main" org.neo4j.graphdb.TransactionFailureException: Unable to rollback transaction
at org.neo4j.kernel.TopLevelTransaction.close(TopLevelTransaction.java:131)
at com.mycompany.traverse_test.traverse_main.main(traverse_main.java:138)
Caused by: java.lang.IllegalStateException: No RelationshipState for added relationship!
at org.neo4j.kernel.api.txstate.RelationshipChangeVisitorAdapter$1.visit(RelationshipChangeVisitorAdapter.java:132)
at org.neo4j.kernel.api.txstate.RelationshipChangeVisitorAdapter.visitAddedRelationship(RelationshipChangeVisitorAdapter.java:83)
at org.neo4j.kernel.api.txstate.RelationshipChangeVisitorAdapter.visitAdded(RelationshipChangeVisitorAdapter.java:106)
at org.neo4j.kernel.api.txstate.RelationshipChangeVisitorAdapter.visitAdded(RelationshipChangeVisitorAdapter.java:47)
at org.neo4j.kernel.impl.util.diffsets.DiffSets.accept(DiffSets.java:76)
at org.neo4j.kernel.impl.api.state.TxState.accept(TxState.java:156)
at org.neo4j.kernel.impl.api.KernelTransactionImplementation.rollback(KernelTransactionImplementation.java:542)
at org.neo4j.kernel.impl.api.KernelTransactionImplementation.close(KernelTransactionImplementation.java:404)
at org.neo4j.kernel.TopLevelTransaction.close(TopLevelTransaction.java:112)
... 1 more
Any ideas??
Neo4j is trying to roll back your transaction due to a bug in your code. The fact that it is failing to roll back may be a bug in neo4j, but that is really not your main problem.
Looking at your code, it looks like you are iterating through your list too many times. That is, the code in the list loop is using up 2 list elements at a time, so you should only be looping list.size()/2 times.
Here is code that should fix that bug, and it also makes a few other improvements.
GraphDatabaseFactory dbFactory = new GraphDatabaseFactory();
GraphDatabaseService graphDb = dbFactory.newEmbeddedDatabase("C:\Users\....\default.graphdb");
Transaction tx = graphDb.beginTx();
try {
final RelationshipType type2 = DynamicRelationshipType.withName("KNOW");
ArrayList<Node> nodelist = new ArrayList<Node>();
for (int k = 0; k < row; k++) { //row=16807
Node node = graphDb.createNode();
node.setProperty("Name", "ListNode " + k);
nodelist.add(node);
}
for (int j = 0; j < list.size() ; j += 2) { //list.size()=34420736
nodelist.get(list.get(j)).createRelationshipTo(
nodelist.get(list.get(j+1)), type2);
}
tx.success();
} catch(Throwable e) {
e.printStackTrace();
// You may want to re-throw the exception, rather than just eating it here...
} finally {
tx.close();
}
graphDb.shutdown();
[EDITED]
However, the above code can still run out of memory, since it is trying to create so many resources (16K nodes and 17M relationships) in a single transaction.
The following example code does the work in multiple transactions (one for creating the nodes and node list, and multiple transactions for the relationships).
NUM_RELS_PER_CHUNK specifies the maximum number of relationships to be created in each transaction. The createRelEndpointList() method must be modified to fill in the list of relationship endpoint (node) indices (each index is the 0-origin position of a node in nodeList).
public class MyCode {
private static final int NODE_COUNT = 16807;
private static final int NUM_RELS_PER_CHUNK = 1000000;
public static void main(String[] args) {
doIt();
}
private static void doIt() {
GraphDatabaseFactory dbFactory = new GraphDatabaseFactory();
GraphDatabaseService graphDb = dbFactory.newEmbeddedDatabase(new File("C:\\Users\\....\\default.graphdb"));
try {
RelationshipType type = DynamicRelationshipType.withName("KNOW");
List<Node> nodeList = createNodes(graphDb, NODE_COUNT);
List<Integer> list = createRelEndpointList();
final int numRels = list.size() / 2;
final int numChunks = (numRels + NUM_RELS_PER_CHUNK - 1)/NUM_RELS_PER_CHUNK;
int startRelIndex = 0, endRelIndexPlus1;
for (int i = numChunks; --i >= 0 && startRelIndex < numRels; ) {
endRelIndexPlus1 = (i > 0) ? startRelIndex + NUM_RELS_PER_CHUNK : numRels;
createRelationships(graphDb, nodeList, list, startRelIndex, endRelIndexPlus1, type);
startRelIndex = endRelIndexPlus1;
}
} finally {
graphDb.shutdown();
}
}
private static List<Node> createNodes(GraphDatabaseService graphDb, int rowCount) {
ArrayList<Node> nodeList = new ArrayList<Node>(rowCount);
Transaction tx = graphDb.beginTx();
try {
final StringBuilder sb = new StringBuilder("ListNode ");
final int initLength = sb.length();
for (int k = 0; k < rowCount; k++) {
Node node = graphDb.createNode();
sb.setLength(initLength);
sb.append(k);
node.setProperty("Name", sb.toString());
nodeList.add(node);
}
tx.success();
System.out.println("Created nodes.");
} catch(Exception e) {
e.printStackTrace();
tx.failure();
return null;
} finally {
tx.close();
}
return nodeList;
}
private static List<Integer> createRelEndpointList() {
final List<Integer> list = new ArrayList<Integer>();
// Fill
// list
// ...
return list;
}
private static void createRelationships(GraphDatabaseService graphDb, List<Node> nodeList, List<Integer> list, int startRelIndex, int endRelIndexPlus1, RelationshipType type) {
Transaction tx = graphDb.beginTx();
try {
final int endPlus2 = endRelIndexPlus1 * 2;
for (int j = startRelIndex * 2; j < endPlus2; ) {
Node from = nodeList.get(list.get(j++));
Node to = nodeList.get(list.get(j++));
from.createRelationshipTo(to, type);
}
tx.success();
System.out.println("Created rels. Start: " + startRelIndex + ", count: " + (endRelIndexPlus1 - startRelIndex));
} catch(Exception e) {
e.printStackTrace();
tx.failure();
// You may want to re-throw the exception, rather than just eating it here...
} finally {
tx.close();
}
}
}
The current code is single-threaded. It reads data from a file, generates random numbers, and checks whether those numbers belong to the given intervals.
import java.io.*;
import java.util.*;
class Generator {
private double mean;
private double variance;
private long amountOfNumbersToGenerate;
public Generator(double mean, double variance, long amountOfNumbersToGenerate) {
this.mean = mean;
this.variance = variance;
this.amountOfNumbersToGenerate = amountOfNumbersToGenerate;
}
double getMean() {
return mean;
}
double getVariance() {
return variance;
}
long getAmountOfNumbersToGenerate() {
return amountOfNumbersToGenerate;
}
}
class Interval {
private double start;
private double end;
public Interval(double start, double end) {
this.start = start;
this.end = end;
}
double getStart() {
return start;
}
double getEnd() {
return end;
}
}
class ParsedData {
private Vector<Generator> generators;
private Vector<Interval> intervals;
public ParsedData(Vector<Generator> generators, Vector<Interval> intervals) {
this.generators = generators;
this.intervals = intervals;
}
Vector<Generator> getGenerators() {
return generators;
}
Vector<Interval> getIntervals() {
return intervals;
}
}
class Worker extends Thread {
public Worker() {
}
}
class Start {
static ParsedData readDataFromFile(String path) throws IOException {
File file = new File(path);
BufferedReader br = new BufferedReader(new FileReader(file));
String line;
line = br.readLine();
String delimiter = "\\s+";
// generators
long generatorSize = Long.parseLong(line);
Vector<Generator> generators = new Vector<Generator>();
for(long i =0; i < generatorSize; i++) {
line = br.readLine();
Scanner f = new Scanner(line);
f.useLocale(Locale.US); //without this line the program wouldn't work on machines with different locales
f.useDelimiter(delimiter);
Generator g = new Generator(f.nextDouble(), f.nextDouble(), f.nextInt());
generators.add(g);
}
line = br.readLine();
long intervalSize = Long.parseLong(line);
Vector<Interval> intervals = new Vector<Interval>();
for(long i = 0; i < intervalSize; i++) {
line = br.readLine();
System.out.println(line);
Scanner f = new Scanner(line);
f.useLocale(Locale.US); //without this line the program wouldn't work on machines with different locales
f.useDelimiter(delimiter);
Interval interval = new Interval(f.nextDouble(), f.nextDouble());
intervals.add(interval);
}
br.close();
return new ParsedData(generators, intervals);
}
static double boxMullerMarsagliaPolarRand(double mean, double variance) {
double micro = mean;
double sigma = Math.sqrt(variance);
double y, x, omega;
Random random = new Random();
do {
x = random.nextDouble();
y = random.nextDouble();
omega = x * x + y * y;
} while (!(0.0 < omega && omega < 1.0));
double sigma_sqrt = sigma * Math.sqrt(-2.0 * Math.log(omega) / omega);
double g = micro + x * sigma_sqrt;
// float h = micro + y * sigma_sqrt;
return g;
}
/////////////////////////////////////////
// TODO: refactor code into multithreaded
static Vector<Double> generateRandomNumbers(ParsedData parsedData) {
Vector<Double> generatedNumbers = new Vector<Double>();
for(int i = 0; i < parsedData.getGenerators().size(); i++) {
Generator g = parsedData.getGenerators().get(i);
for(long j = 0; j < g.getAmountOfNumbersToGenerate(); j++) {
double random = boxMullerMarsagliaPolarRand(g.getMean(), g.getVariance());
generatedNumbers.add(random);
}
}
return generatedNumbers;
}
/////////////////////////////////////////
// TODO: refactor code into multithreaded
static int[] checkIntervals(ParsedData parsedData, Vector<Double> generatedNumbers) {
int[] numberOfHits = new int[parsedData.getIntervals().size()];
for(int j = 0; j < parsedData.getIntervals().size(); j++) {
Interval interval = parsedData.getIntervals().get(j);
for(int i = 0; i < generatedNumbers.size(); i++) {
if (interval.getStart() < generatedNumbers.get(i) && generatedNumbers.get(i) < interval.getEnd()) {
numberOfHits[j]++;
}
}
}
return numberOfHits;
}
public static void main(String args[]) {
int amountOfThreads = Integer.parseInt(args[0]);
String path = System.getProperty("user.dir") + "/input.dat";
ParsedData parsedData = null;
try {
parsedData = readDataFromFile(path);
} catch (IOException e) {
e.printStackTrace();
}
System.out.println(parsedData.getGenerators().size());
System.out.println(parsedData.getIntervals().size());
Vector<Double> generatedNumbers = generateRandomNumbers(parsedData);
int[] numberOfHits = checkIntervals(parsedData, generatedNumbers);
for (int i = 0; i < numberOfHits.length; i++) {
Interval interval = parsedData.getIntervals().get(i);
System.out.println("" + (i+1) + " " + interval.getStart() + " " + interval.getEnd() + " " + numberOfHits[i]);
}
System.out.println(generatedNumbers.size());
}
}
I don't expect anyone to write or refactor the code for me.
But I don't know how to make these methods multi-threaded:
/////////////////////////////////////////
// TODO: refactor code into multithreaded
static Vector<Double> generateRandomNumbers(ParsedData parsedData) {
Vector<Double> generatedNumbers = new Vector<Double>();
for(int i = 0; i < parsedData.getGenerators().size(); i++) {
Generator g = parsedData.getGenerators().get(i);
for(long j = 0; j < g.getAmountOfNumbersToGenerate(); j++) {
double random = boxMullerMarsagliaPolarRand(g.getMean(), g.getVariance());
generatedNumbers.add(random);
}
}
return generatedNumbers;
}
/////////////////////////////////////////
// TODO: refactor code into multithreaded
static int[] checkIntervals(ParsedData parsedData, Vector<Double> generatedNumbers) {
int[] numberOfHits = new int[parsedData.getIntervals().size()];
for(int j = 0; j < parsedData.getIntervals().size(); j++) {
Interval interval = parsedData.getIntervals().get(j);
for(int i = 0; i < generatedNumbers.size(); i++) {
if (interval.getStart() < generatedNumbers.get(i) && generatedNumbers.get(i) < interval.getEnd()) {
numberOfHits[j]++;
}
}
}
return numberOfHits;
}
The easiest way to make this multithreaded is to use a producer-consumer pattern, with one producer reading the data and sending it to a BlockingQueue, and the consumers reading the data from the BlockingQueue (using take) and processing it using your two static methods. This way you need to do minimal refactoring - the static methods are already re-entrant / thread-safe (assuming that the Vector and ParsedData parameters aren't shared), so they don't need to be modified at all.
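As a rough illustration of that wiring for the generation step (it reuses the Generator, ParsedData, Vector and boxMullerMarsagliaPolarRand names from the question; the poison-pill sentinel and the thread handling are my own choices, not part of the original code):
import java.util.Vector;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

static Vector<Double> generateRandomNumbersParallel(ParsedData parsedData, int threads)
        throws InterruptedException {
    final BlockingQueue<Generator> queue = new LinkedBlockingQueue<>();
    final Generator POISON = new Generator(0, 0, 0);  // sentinel that tells a consumer to stop
    final Vector<Double> generated = new Vector<>();   // Vector is already synchronized
    Thread[] consumers = new Thread[threads];
    for (int t = 0; t < threads; t++) {
        consumers[t] = new Thread(() -> {
            try {
                for (Generator g = queue.take(); g != POISON; g = queue.take()) {
                    for (long j = 0; j < g.getAmountOfNumbersToGenerate(); j++) {
                        generated.add(boxMullerMarsagliaPolarRand(g.getMean(), g.getVariance()));
                    }
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        consumers[t].start();
    }
    // Producer: hand each generator to the queue, then one sentinel per consumer.
    for (Generator g : parsedData.getGenerators()) queue.put(g);
    for (int t = 0; t < threads; t++) queue.put(POISON);
    for (Thread c : consumers) c.join();
    return generated;
}
checkIntervals can be parallelized the same way, with intervals on the queue and a per-interval hit count as the result.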
I am testing BerkeleyDB Java Edition to understand whether I can use it in my project.
I've created a very simple program which works with an object of class com.sleepycat.je.Database:
writes N records of 5-15kb each, with keys generated like Integer.toString(random.nextInt());
reads these records fetching them with method Database#get in the same order they were created;
reads the same number of records with method Database#get in random order.
And now I see a strange thing: the execution time for the third test grows very non-linearly as the number of records increases.
N=80000, write=55sec, sequential fetch=17sec, random fetch=3sec
N=100000, write=60sec, sequential fetch=20sec, random fetch=7sec
N=120000, write=68sec, sequential fetch=27sec, random fetch=11sec
N=140000, write=82sec, sequential fetch=32sec, random fetch=47sec
(I've run tests several times, of course.)
I suppose I am doing something quite wrong. Here is the source for reference (sorry, it is a bit long); the methods are called in the same order:
private Environment env;
private Database db;
private Random random = new Random();
private List<String> keys = new ArrayList<String>();
private int seed = 113;
public boolean dbOpen() {
EnvironmentConfig ec = new EnvironmentConfig();
DatabaseConfig dc = new DatabaseConfig();
ec.setAllowCreate(true);
dc.setAllowCreate(true);
env = new Environment(new File("mydbenv"), ec);
db = env.openDatabase(null, "moe", dc);
return true;
}
public int storeRecords(int i) {
int j;
long size = 0;
DatabaseEntry key = new DatabaseEntry();
DatabaseEntry val = new DatabaseEntry();
random.setSeed(seed);
for (j = 0; j < i; j++) {
String k = Long.toString(random.nextLong());
byte[] data = new byte[5000 + random.nextInt(10000)];
keys.add(k);
size += data.length;
random.nextBytes(data);
key.setData(k.getBytes());
val.setData(data);
db.put(null, key, val);
}
System.out.println("GENERATED SIZE: " + size);
return j;
}
public int fetchRecords(int i) {
int j, res;
DatabaseEntry key = new DatabaseEntry();
DatabaseEntry val = new DatabaseEntry();
random.setSeed(seed);
res = 0;
for (j = 0; j < i; j++) {
String k = Long.toString(random.nextLong());
byte[] data = new byte[5000 + random.nextInt(10000)];
random.nextBytes(data);
key.setData(k.getBytes());
db.get(null, key, val, null);
if (Arrays.equals(data, val.getData())) {
res++;
} else {
System.err.println("FETCH differs: " + j);
System.err.println(data.length + " " + val.getData().length);
}
}
return res;
}
public int fetchRandom(int i) {
DatabaseEntry key = new DatabaseEntry();
DatabaseEntry val = new DatabaseEntry();
for (int j = 0; j < i; j++) {
String k = keys.get(random.nextInt(keys.size()));
key.setData(k.getBytes());
db.get(null, key, val, null);
}
return i;
}
Performance degradation is non-linear for two reasons:
BDB-JE data structure is a b-tree, which has O(log(n)) performance for retrieving one record. Retrieving all via the get method is O(n*log(n)).
Large data sets don't fit into RAM, and so disk access slows everything down. Random access has very poor cache locality.
Note that you can improve write performance by giving up some durability: ec.setTxnWriteNoSync(true);
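For reference, those EnvironmentConfig knobs look roughly like this (the 500 MB cache matches the comparison below; treat the exact values as placeholders):
EnvironmentConfig ec = new EnvironmentConfig();
ec.setAllowCreate(true);
ec.setCacheSize(500_000_000L);   // keep more of the B-tree resident in RAM
ec.setTxnWriteNoSync(true);      // trade some durability for write throughput
Environment env = new Environment(new File("mydbenv"), ec);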
You might also want to try Tupl, an open source BerkeleyDB replacement I've been working on. It's still in the alpha stage, but you can find it on SourceForge.
For a fair comparison between BDB-JE and Tupl, I set the cache size to 500M and an explicit checkpoint is performed at the end of the store method.
With BDB-JE:
N=80000, write=11.0sec, fetch=5.3sec
N=100000, write=13.6sec, fetch=7.0sec
N=120000, write=16.4sec, fetch=29.5sec
N=140000, write=18.8sec, fetch=35.9sec
N=160000, write=21.5sec, fetch=41.3sec
N=180000, write=23.9sec, fetch=46.4sec
With Tupl:
N=80000, write=21.7sec, fetch=4.4sec
N=100000, write=27.6sec, fetch=6.3sec
N=120000, write=30.2sec, fetch=8.4sec
N=140000, write=35.4sec, fetch=12.2sec
N=160000, write=39.9sec, fetch=17.4sec
N=180000, write=45.4sec, fetch=22.8sec
BDB-JE is faster at writing entries, because of its log-based format. Tupl is faster at reading, however. Here's the source to the Tupl test:
import java.io.*;
import java.util.*;
import org.cojen.tupl.*;
public class TuplTest {
public static void main(final String[] args) throws Exception {
final RandTupl rt = new RandTupl();
rt.dbOpen(args[0]);
{
long start = System.currentTimeMillis();
rt.storeRecords(Integer.parseInt(args[1]));
long end = System.currentTimeMillis();
System.out.println("store duration: " + (end - start));
}
{
long start = System.currentTimeMillis();
rt.fetchRecords(Integer.parseInt(args[1]));
long end = System.currentTimeMillis();
System.out.println("fetch duration: " + (end - start));
}
}
private Database db;
private Index ix;
private Random random = new Random();
private List<String> keys = new ArrayList<String>();
private int seed = 113;
public boolean dbOpen(String home) throws Exception {
DatabaseConfig config = new DatabaseConfig();
config.baseFile(new File(home));
config.durabilityMode(DurabilityMode.NO_FLUSH);
config.minCacheSize(500000000);
db = Database.open(config);
ix = db.openIndex("moe");
return true;
}
public int storeRecords(int i) throws Exception {
int j;
long size = 0;
random.setSeed(seed);
for (j = 0; j < i; j++) {
String k = Long.toString(random.nextLong());
byte[] data = new byte[5000 + random.nextInt(10000)];
keys.add(k);
size += data.length;
random.nextBytes(data);
ix.store(null, k.getBytes(), data);
}
System.out.println("GENERATED SIZE: " + size);
db.checkpoint();
return j;
}
public int fetchRecords(int i) throws Exception {
int j, res;
random.setSeed(seed);
res = 0;
for (j = 0; j < i; j++) {
String k = Long.toString(random.nextLong());
byte[] data = new byte[5000 + random.nextInt(10000)];
random.nextBytes(data);
byte[] val = ix.load(null, k.getBytes());
if (Arrays.equals(data, val)) {
res++;
} else {
System.err.println("FETCH differs: " + j);
System.err.println(data.length + " " + val.length);
}
}
return res;
}
public int fetchRandom(int i) throws Exception {
for (int j = 0; j < i; j++) {
String k = keys.get(random.nextInt(keys.size()));
ix.load(null, k.getBytes());
}
return i;
}
}