How can I improve this search algorithm's runtime? - java

I'm trying to solve an interview problem I was given a few years ago in preparation for upcoming interviews. The problem is outlined in a pdf here. I wrote a simple solution using DFS that works fine for the example outlined in the document, but I haven't been able to get the program to meet the criteria of
Your code should produce correct answers in under a second for a
10,000 x 10,000 Geo GeoBlock containing 10,000 occupied Geos.
To test this I generated a CSV file with 10,000 random entries, and when I run the code against it, it averages just over 2 seconds to find the largest geo block. I'm not sure what improvements could be made to my approach to cut the runtime by more than half, other than running it on a faster laptop. From my investigation, the search itself only seems to take about 8 ms, so perhaps the way I load the data into memory is the inefficient part?
I'd greatly appreciate any advice on how this could be improved. See the code below:
GeoBlockAnalyzer
package analyzer.block.geo.main;
import analyzer.block.geo.model.Geo;
import analyzer.block.geo.result.GeoResult;
import java.awt.*;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.List;
import java.util.*;
public class GeoBlockAnalyzer {
private static final DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd");
private final int width;
private final int height;
private final String csvFilePath;
private GeoResult result = new GeoResult();
// Map of the geo id and respective geo object
private final Map<Integer, Geo> geoMap = new HashMap<>();
// Map of coordinates to each geo in the grid
private final Map<Point, Geo> coordMap = new HashMap<>();
/**
* Constructs a geo grid of the given width and height, populated with the geo data provided in
* the csv file
*
* @param width the width of the grid
* @param height the height of the grid
* @param csvFilePath the csv file containing the geo data
* @throws IOException
*/
public GeoBlockAnalyzer(final int width, final int height, final String csvFilePath)
throws IOException {
if (!Files.exists(Paths.get(csvFilePath)) || Files.isDirectory(Paths.get(csvFilePath))) {
throw new FileNotFoundException(csvFilePath);
}
if (width <= 0 || height <= 0) {
throw new IllegalArgumentException("Input height or width is 0 or smaller");
}
this.width = width;
this.height = height;
this.csvFilePath = csvFilePath;
populateGeoGrid();
populateCoordinatesMap();
calculateGeoNeighbours();
// printNeighbours();
}
/** @return the largest geo block in the input grid */
public GeoResult getLargestGeoBlock() {
for (final Geo geo : this.geoMap.values()) {
final List<Geo> visited = new ArrayList<>();
search(geo, visited);
}
return this.result;
}
/**
* Iterative DFS implementation to find largest geo block.
*
* @param geo the geo to be evaluated
* @param visited list of visited geos
*/
private void search(Geo geo, final List<Geo> visited) {
final Deque<Geo> stack = new LinkedList<>();
stack.push(geo);
while (!stack.isEmpty()) {
geo = stack.pop();
if (visited.contains(geo)) {
continue;
}
visited.add(geo);
final List<Geo> neighbours = geo.getNeighbours();
for (int i = neighbours.size() - 1; i >= 0; i--) {
final Geo g = neighbours.get(i);
if (!visited.contains(g)) {
stack.push(g);
}
}
}
if (this.result.getSize() < visited.size()) {
this.result = new GeoResult(visited);
}
}
/**
* Creates a map of the geo grid from the csv file data
*
* @throws IOException
*/
private void populateGeoGrid() throws IOException {
try (final BufferedReader br = Files.newBufferedReader(Paths.get(this.csvFilePath))) {
int lineNumber = 0;
String line = "";
while ((line = br.readLine()) != null) {
lineNumber++;
final String[] geoData = line.split(",");
LocalDate dateOccupied = null;
// Handle for empty csv cells
for (int i = 0; i < geoData.length; i++) {
// Remove leading and trailing whitespace
geoData[i] = geoData[i].replace(" ", "");
if (geoData[i].isEmpty() || geoData.length > 3) {
throw new IllegalArgumentException(
"There is missing data in the csv file at line: " + lineNumber);
}
}
try {
dateOccupied = LocalDate.parse(geoData[2], formatter);
} catch (final DateTimeParseException e) {
throw new IllegalArgumentException("The input date is invalid on line: " + lineNumber);
}
this.geoMap.put(
Integer.parseInt(geoData[0]),
new Geo(Integer.parseInt(geoData[0]), geoData[1], dateOccupied));
}
}
}
/** Create a map of each coordinate in the grid to its respective geo */
private void populateCoordinatesMap() {
// Using the geo id, calculate its point on the grid
for (int i = this.height - 1; i >= 0; i--) {
int blockId = (i * this.width);
for (int j = 0; j < this.width; j++) {
if (this.geoMap.containsKey(blockId)) {
final Geo geo = this.geoMap.get(blockId);
geo.setCoordinates(i, j);
this.coordMap.put(geo.getCoordinates(), geo);
}
blockId++;
}
}
}
private void calculateGeoNeighbours() {
for (final Geo geo : this.geoMap.values()) {
addNeighboursToGeo(geo);
}
}
private void addNeighboursToGeo(final Geo geo) {
final int x = geo.getCoordinates().x;
final int y = geo.getCoordinates().y;
final Point[] possibleNeighbours = {
new Point(x, y + 1), new Point(x - 1, y), new Point(x + 1, y), new Point(x, y - 1)
};
Geo g;
for (final Point p : possibleNeighbours) {
if (this.coordMap.containsKey(p)) {
g = this.coordMap.get(p);
if (g != null) {
geo.getNeighbours().add(g);
}
}
}
}
private void printNeighbours() {
for (final Geo geo : this.geoMap.values()) {
System.out.println("Geo " + geo.getId() + " has the following neighbours: ");
for (final Geo g : geo.getNeighbours()) {
System.out.println(g.getId());
}
}
}
}
GeoResult
package analyzer.block.geo.result;
import analyzer.block.geo.model.Geo;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
public class GeoResult {
private final List<Geo> geosInBlock = new ArrayList<>();
public GeoResult() {
}
public GeoResult(final List<Geo> geosInBlock) {
this.geosInBlock.addAll(geosInBlock);
}
public List<Geo> getGeosInBlock() {
this.geosInBlock.sort(Comparator.comparingInt(Geo::getId));
return this.geosInBlock;
}
public int getSize() {
return this.geosInBlock.size();
}
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
sb.append("The geos in the largest cluster of occupied Geos for this GeoBlock are: \n");
for(final Geo geo : this.geosInBlock) {
sb.append(geo.toString()).append("\n");
}
return sb.toString();
}
}
Geo
package analyzer.block.geo.model;
import java.awt.Point;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
public class Geo {
private final int id;
private final String name;
private final LocalDate dateOccupied;
private final Point coordinate;
private final List<Geo> neighbours = new ArrayList<>();
public Geo (final int id, final String name, final LocalDate dateOccupied) {
this.id = id;
this.name = name;
this.dateOccupied = dateOccupied;
this.coordinate = new Point();
}
public int getId() {
return this.id;
}
public String getName() {
return this.name;
}
public LocalDate getDateOccupied() {
return this.dateOccupied;
}
public void setCoordinates(final int x, final int y) {
this.coordinate.setLocation(x, y);
}
public Point getCoordinates() {
return this.coordinate;
}
public String toString() {
return this.id + ", " + this.name + ", " + this.dateOccupied;
}
public List<Geo> getNeighbours() {
return this.neighbours;
}
@Override
public int hashCode() {
return Objects.hash(this.id, this.name, this.dateOccupied);
}
@Override
public boolean equals(final Object obj) {
if(this == obj) {
return true;
}
if(obj == null || this.getClass() != obj.getClass()) {
return false;
}
final Geo geo = (Geo) obj;
return this.id == geo.getId() &&
this.name.equals(geo.getName()) &&
this.dateOccupied.equals(geo.getDateOccupied());
}
}

The major optimization available here is a conceptual one. Unfortunately, this type of optimization is not easy to teach, nor look up in a reference somewhere. The principle being used here is:
It's (almost always) cheaper to use an analytic formula to compute a known result than to (pre)compute it. [1]
It's clear from your code & the definition of your problem that you are not taking advantage of this principle and the problem specification. In particular, one of the key points taken directly from the problem specification is this:
Your code should produce correct answers in under a second for a 10,000 x 10,000 Geo GeoBlock containing 10,000 occupied Geos.
When you read this statement a few things should be going through your mind (when thinking about runtime efficiency):
10,000^2 is a much larger number than 10,000 (exactly 10,000 times larger!) There is a clear efficiency gain if you can maintain an algorithm that is O(n) as opposed to O(n^2) (in the expected case because of the use of hashing.)
touching (i.e. computing any O(1) operation) for the entire grid is going to immediately yield a O(n^2) algorithm; clearly, this is something that must be avoided if possible
from the problem statement, we should never expect O(n^2) geos that need to be touched. This should be a major hint as to what the person who wrote the problem is looking for. BFS or DFS is an O(N+M) algorithm where N,M are the number of nodes and edges touched. Thus, we should be expecting an O(n) search.
based on the above points, it is clear that the solution being looked for here should be O(10,000) for a problem input with grid size 10,000 x 10,000 and 10,000 geos
The solution you provided is O(n^2) because:
You use visited.contains where visited is a List. This is not showing up in your testing as a problem area because I suspect you are using small geo clusters. Try using a large geo cluster (one with 10,000 geos.) You should see a major slow down compared to, say, the largest cluster having 3 geos. The solution here is to use an efficient data structure for visited; some that come to mind are a bit set (unknown to me if Java has any available, but any decent language should) or a hash set (clearly Java has some available.) Because you did not notice this in testing, this suggests to me you are not vetting/testing your code well enough with enough varied examples of the corner cases you expect. This should have come up immediately in any thorough testing/profiling of your code. As per my comment, I would have liked to have seen this type of groundwork/profiling done before the question was posted.
You touch the entire 10,000 x 10,000 grid in the function/member populateCoordinatesMap. This is clearly already O(n^2) where n=10,000. Notice that the only location where coordMap is used outside of populateCoordinatesMap is in addNeighboursToGeo. This is a major bottleneck, and needlessly so: addNeighboursToGeo can be computed in O(1) time without the need for a coordMap. However, we can still use your code as is with a minor modification, given below.
I hope it is obvious how to fix (1); a minimal sketch is given below, after the fix for (2). To fix (2), replace populateCoordinatesMap with:
/** Map each occupied geo's coordinates to its respective geo */
private void populateCoordinatesMap() {
    for (final Map.Entry<Integer, Geo> entry : this.geoMap.entrySet()) {
        final int key = entry.getKey();
        final Geo value = entry.getValue();
        final int x = key % this.width;
        final int y = key / this.width;
        value.setCoordinates(x, y);
        this.coordMap.put(value.getCoordinates(), value);
    }
}
Notice the principle being put to use here. Instead of iterating over the entire grid as you were doing before (O(n^2) immediately), this iterates only over the occupied Geos, and uses the analytic formula for indexing a 2D array (as opposed to doing copious computation to compute the same thing.) Effectively, this change improves populateCoordinatesMap from being O(n^2) to being O(n).
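For completeness, here is a minimal sketch of fix (1), in the same style as the posted class (my own illustration, untested against the rest of the code): make visited a HashSet so membership checks are expected O(1), and have getLargestGeoBlock pass in a new HashSet<>() instead of an ArrayList. It assumes the java.util imports for Set, HashSet and ArrayDeque are added.
private void search(Geo geo, final Set<Geo> visited) {
    final Deque<Geo> stack = new ArrayDeque<>();
    stack.push(geo);
    while (!stack.isEmpty()) {
        final Geo current = stack.pop();
        if (!visited.add(current)) { // add() returns false if the geo was already visited
            continue;
        }
        for (final Geo neighbour : current.getNeighbours()) {
            if (!visited.contains(neighbour)) {
                stack.push(neighbour);
            }
        }
    }
    if (this.result.getSize() < visited.size()) {
        this.result = new GeoResult(new ArrayList<>(visited));
    }
}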
Some general & opinionated comments below:
Overall, I strongly disagree with using an object-oriented approach over a procedural one for this problem. I think the OO approach is completely unjustified for how simple this code should be, but I understand that the interviewer wanted to see it.
This is a very simple problem you are trying to solve, and I think the object-oriented approach you took here confounds it so much that you could not see the forest for the trees (or perhaps the trees for the forest.) A much simpler approach could have been taken in how this algorithm was implemented, even using an object-oriented approach.
It's clear from the points above that you could benefit from knowing the available tools in the language you are working in. By this I mean you should know what containers are readily available and what the trade-offs are for using each operation on each container. You should also know at least one decent profiling tool for the language you are working with if you are going to be looking into optimizing code. Given that you failed to post a profiling summary, even after I asked for it, it suggests to me you do not know of such a tool for Java. Learn one.
[1] I provide no reference for this principle because it is a first principle, and can be explained by the fact that running fewer constant time operations is cheaper than running many. The assumption here is that the known analytic form requires less computation. There are occasional exceptions to this rule. But it should be stressed that such exceptions are almost always because of hardware limitations or advantages. For example, when computing the hamming distance it is cheaper to use a precomputed LUT for computing the population count on a hardware architecture without access to SSE registers/operations.
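As an aside, here is what that LUT idea looks like in Java; this is only my own illustration of the footnote, not anything taken from the problem or the code above.
final class PopCount {
    // popcounts of all 8-bit values, built once up front
    private static final int[] LUT = new int[256];
    static {
        for (int i = 1; i < 256; i++) {
            LUT[i] = LUT[i >> 1] + (i & 1);
        }
    }
    // population count of a 32-bit int as four table lookups
    static int popCount(int x) {
        return LUT[x & 0xFF] + LUT[(x >>> 8) & 0xFF]
                + LUT[(x >>> 16) & 0xFF] + LUT[x >>> 24];
    }
}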

Without testing, it seems to me that the main bottleneck here is the literal creation of the map, which could have up to 100,000,000 cells. There would be no need for that if we instead labelled each CSV entry and had a function getNeighbours(id, width, height) that returned the list of possible neighbour IDs (think modular arithmetic). As we iterate over each CSV entry in turn: (1) if neighbour IDs were already seen and they all had the same label, we'd label the new ID with that label; (2) if no neighbours were seen, we'd use a new label for the new ID; and (3) if two or more different labels existed among the seen neighbour IDs, we'd combine them into one label (say the minimal label) by having a hash that maps a label to its "final" label. Also store the sum and size for each label. Your current solution is O(n), where n is width x height. The idea here would be O(n), where n is the number of occupied Geos.
Here's something really crude in Python that I wouldn't expect to handle all scenarios, but it could hopefully give you an idea (sorry, I don't know Java):
def get_neighbours(id, width, height):
    neighbours = []
    if id % width != 0:
        neighbours.append(id - 1)
    if (id + 1) % width != 0:
        neighbours.append(id + 1)
    if id - width >= 0:
        neighbours.append(id - width)
    if id + width < width * height:
        neighbours.append(id + width)
    return neighbours

def f(data, width, height):
    ids = {}
    labels = {}
    current_label = 0
    for line in data:
        [idx, name, dt] = line.split(",")
        idx = int(idx)
        this_label = None
        neighbours = get_neighbours(idx, width, height)
        no_neighbour_was_seen = True
        for n in neighbours:
            # A neighbour was seen
            if n in ids:
                no_neighbour_was_seen = False
                # We have yet to assign a label to this ID
                if this_label is None:
                    this_label = ids[n]["label"]
                    ids[idx] = {"label": this_label, "data": name + " " + dt}
                    final_label = labels[this_label]["label"]
                    labels[final_label]["size"] += 1
                    labels[final_label]["sum"] += idx
                    labels[final_label]["IDs"] += [idx]
                # This neighbour has yet to be connected
                elif ids[n]["label"] != this_label:
                    old_label = ids[n]["label"]
                    old_obj = labels[old_label]
                    final_label = labels[this_label]["label"]
                    ids[n]["label"] = final_label
                    labels[final_label]["size"] += old_obj["size"]
                    labels[final_label]["sum"] += old_obj["sum"]
                    labels[final_label]["IDs"] += old_obj["IDs"]
                    del labels[old_label]
        if no_neighbour_was_seen:
            this_label = current_label
            current_label += 1
            ids[idx] = {"label": this_label, "data": name + " " + dt}
            labels[this_label] = {"label": this_label, "size": 1, "sum": idx, "IDs": [idx]}
    for i in ids:
        print i, ids[i]["label"], ids[i]["data"]
    print ""
    for i in labels:
        print i
        print labels[i]
    return labels, ids
data = [
"4, Tom, 2010-10-10",
"5, Katie, 2010-08-24",
"6, Nicole, 2011-01-09",
"11, Mel, 2011-01-01",
"13, Matt, 2010-10-14",
"15, Mel, 2011-01-01",
"17, Patrick, 2011-03-10",
"21, Catherine, 2011-02-25",
"22, Michael, 2011-02-25"
]
f(data, 4, 7)
print ""
f(data, 7, 4)
Output:
"""
4 0 Tom 2010-10-10
5 0 Katie 2010-08-24
6 0 Nicole 2011-01-09
11 1 Mel 2011-01-01
13 2 Matt 2010-10-14
15 1 Mel 2011-01-01
17 2 Patrick 2011-03-10
21 2 Catherine 2011-02-25
22 2 Michael 2011-02-25
0
{'sum': 15, 'size': 3, 'IDs': [4, 5, 6], 'label': 0}
1
{'sum': 26, 'size': 2, 'IDs': [11, 15], 'label': 1}
2
{'sum': 73, 'size': 4, 'IDs': [13, 17, 21, 22], 'label': 2}
---
4 0 Tom 2010-10-10
5 0 Katie 2010-08-24
6 0 Nicole 2011-01-09
11 0 Mel 2011-01-01
13 0 Matt 2010-10-14
15 3 Mel 2011-01-01
17 2 Patrick 2011-03-10
21 3 Catherine 2011-02-25
22 3 Michael 2011-02-25
0
{'sum': 39, 'size': 5, 'IDs': [4, 5, 6, 11, 13], 'label': 0}
2
{'sum': 17, 'size': 1, 'IDs': [17], 'label': 2}
3
{'sum': 58, 'size': 3, 'IDs': [21, 22, 15], 'label': 3}
"""

Related

Optimizing item order in an ArrayList

I have an ArrayList of colors and their frequency of appearance. My program should calculate a reordering of those items that maximizes the minimum distance between two equal bricks.
For example, given input consisting of 4*brick 1 (x), 3*brick 2 (y), and 5*brick 3 (z), one correct output would be: z y x z x z y x z x y.
My code does not produce good solutions. In particular, sometimes there are 2 equal bricks at the end, which is the worst case.
import java.util.ArrayList;
import java.util.Collections;
public class Calc {
// private ArrayList<Brick> w = new ArrayList<Brick>();
private String bChain = "";
public String bestOrder(ArrayList<Brick> w) {
    while (!w.isEmpty()) {
        if (w.get(0).getFrequency() > 0) {
            bChain += w.get(0).getColor() + "|";
            Brick brick = new Brick(w.get(0).getVariant(), w.get(0).getFrequency() - 1);
            w.remove(0);
            w.add(brick);
            // bestOrder(w);
        } else {
            w.remove(0);
        }
        bestOrder(w);
    }
    return bChain;
}
public int Solutions(ArrayList<Brick> w) {
ArrayList<Brick> tmp = new ArrayList<Brick>(w);
int l = 1;
int counter = (int) w.stream().filter(c -> Collections.max(tmp).getFrequency() == c.getFrequency()).count();
l = (int) (fakultaet(counter) * fakultaet((tmp.size() - counter)));
return l;
}
public static long fakultaet(int n) {
return n == 0 ? 1 : n * fakultaet(n - 1);
}
}
How can I make my code choose an optimal order?
We will not perform your exercise for you, but we will give you some advice.
Consider your current approach: it operates by filling the result string by cycling through the bricks, choosing one item from each brick in turn as long as any items remain in that brick. But this approach is certain to fail when one brick contains at least two items more than any other, because then only that brick remains at the end, and all its remaining items have to be inserted one after the other.
That is, the problem is not that your code is buggy per se, but rather that your whole strategy is incorrect for the problem. You need something different.
Now, consider the problem itself. Which items will appear at the shortest distance apart in a correct ordering? Those having the highest frequency, of course. And you can compute that minimum distance based on the frequency and total number of items.
Suppose you arrange these most-constrained items first, at the known best distance.
What's left to do at this point? Well, you potentially have some more bricks with lesser frequency, and some more slots in which to accommodate their items. If you ignore the occupied slots altogether, you can treat this as a smaller version of the same problem you had before.

Find Survivor when n people are sitting in a circle

Hi, I came across this problem and am trying to solve it.
Take a second to imagine that you are in a room with 100 chairs arranged in a circle. These chairs are numbered sequentially from One to One Hundred.
At some point in time, the person in chair #1 will be told to leave the room. The person in chair #2 will be skipped, and the person in chair #3 will be told to leave. Next to go is the person in chair #6. In other words, 1 person will be skipped initially, and then 2, 3, 4.. and so on. This pattern of skipping will keep going around the circle until there is only one person remaining.. the survivor. Note that the chair is removed when the person leaves the room. Write a program to figure out which chair the survivor is sitting in.
I made good progress but am stuck with an issue after the count reaches 100; I'm not sure how to iterate from here. Can anyone help me? This is my code:
import java.util.ArrayList;
public class FindSurvivor {
public static void main(String[] args) {
System.out.println(getSurvivorNumber(10));
}
private static int getSurvivorNumber(int numChairs) {
// Handle bad input
if (numChairs < 1) {
return -1;
}
// Populate chair array list
ArrayList<Integer> chairs = new ArrayList<Integer>();
for (int i = 0; i < numChairs; i++) {
chairs.add(i + 1);
}
int chairIndex = 0;
int lr =0;
while (chairs.size() > 1) {
chairs.remove(lr);
chairIndex+=1;
System.out.println(lr+" lr, size "+chairs.size()+" index "+chairIndex);
if(lr==chairs.size()||lr==chairs.size()-1)
lr=0;
lr = lr+chairIndex;
printChair(chairs);
System.out.println();
}
return chairs.get(0);
}
public static void printChair(ArrayList<Integer> chairs){
for(int i : chairs){
System.out.print(i);
}
}
}
The answer is 31. Here are three different implementations:
var lastSurvivor = function(skip, count, chairs) {
//base case checks to see if there is a lone survivor
if (chairs.length === 1)
return chairs[0];
//remove chairs when they are left/become dead
chairs.splice(skip, 1);
//increment the skip count so we know which chair
//to leave next.
skip = (skip + 1 + count) % chairs.length;
count++;
//recursive call
return lastSurvivor(skip, count, chairs);
};
/** TESTS *******************************************************************
----------------------------------------------------------------------------*/
var chairs = [];
for (var i = 1; i <= 100; i++)
chairs.push(i);
var result = lastSurvivor(0, 0, chairs);
console.log('The lone survivor is located in chair #', result);
// The lone survivor is located in chair # 31
/** ALTERNATE IMPLEMENTATIONS ***********************************************
-----------------------------------------------------------------------------
/* Implementation 2
-----------------*/
var lastSurvivor2 = function(chairs, skip) {
skip++;
if (chairs === 1)
return 1;
else
return ((lastSurvivor2(chairs - 1, skip) + skip - 1) % chairs) + 1;
};
/** Tests 2 *******************************************************************/
var result = lastSurvivor2(100, 0);
console.log('The lone survivor is located in chair #', result);
// The lone survivor is located in chair # 31
/* Implementation 3
------------------*/
var chairs2 = [];
for (var i = 1; i <= 100; i++)
chairs2.push(i);
var lastSurvivor3 = function(chairs, skip) {
var count = 0;
while (chairs.length > 1) {
chairs.splice(skip, 1);
skip = (skip + 1 + count) % chairs.length;
count++;
}
return chairs[0];
};
/** Tests 3 *******************************************************************/
var result = lastSurvivor3(chairs2, 0);
console.log('The lone survivor is located in chair #', result);
// The lone survivor is located in chair # 31
I'm not sure what your removal pattern is but I'd probably implement this as a circular linked list where the 100th seat holder will connect back to the 1st seat holder. If you use an array, you will have to worry about re-organizing the seats after every removal.
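To make that concrete, here is a rough sketch of the circular-linked-list idea (my own code, assuming the incremental-skip reading of the problem: remove one person, skip 1, remove, skip 2, remove, skip 3, and so on):
class CircularSurvivor {
    private static final class Node {
        final int chair;
        Node next;
        Node(int chair) { this.chair = chair; }
    }

    static int survivor(int numChairs) {
        // build the circle 1..numChairs
        Node first = new Node(1);
        Node tail = first;
        for (int i = 2; i <= numChairs; i++) {
            tail.next = new Node(i);
            tail = tail.next;
        }
        tail.next = first; // close the circle

        Node prev = tail;  // the node just before the next person to remove
        int remaining = numChairs;
        int skip = 1;      // skip 1 after the first removal, then 2, 3, ...
        while (remaining > 1) {
            prev.next = prev.next.next; // unlink the current victim
            remaining--;
            for (int i = 0; i < skip; i++) {
                prev = prev.next;       // walk past the skipped people
            }
            skip++;
        }
        return prev.next.chair;
    }

    public static void main(String[] args) {
        System.out.println(survivor(100)); // should agree with the simulations above (31)
    }
}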
There is an elegant analytical solution:
Let's change the numbering of the people: #2 -> #1, #3 -> #2, ..., #1 -> #100 (in the end we just need to subtract 1 to "fix" the result). Now the first person remains instead of leaving. Suppose there are only 64 people in the circle. It's easy to see that after the first elimination pass 32 people will remain and the numbering will start again from #1. So in the end only #1 will remain.
We have 100 people. After 36 people leave the circle we will end up with 64 people, and we know how to solve that. For each person that leaves the room one person remains, so the circle of 64 people will start from 1 + 2*36 = #73 (the new #1). Because of the change of indexes in the first step, the final answer will be #72.
In the general case res = 2*(N - closest_smaller_pow_2) = 2*N - closest_larger_pow_2. The code is trivial:
public static long remaining(long total) {
long pow2 = 1;
while (pow2 < total) {
pow2 *= 2;
}
return 2*total - pow2;
}
Also, this algorithm has O(log(N)) complexity instead of O(N), so it's possible to calculate the function for huge inputs (it can easily be adapted to use BigInteger instead of long).
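For what it's worth, a BigInteger adaptation of that formula could look like this (my own sketch; it assumes java.math.BigInteger is imported):
public static BigInteger remaining(BigInteger total) {
    BigInteger pow2 = BigInteger.ONE;
    while (pow2.compareTo(total) < 0) {
        pow2 = pow2.shiftLeft(1); // closest power of two that is >= total
    }
    return total.shiftLeft(1).subtract(pow2); // 2*N - closest_larger_pow_2
}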
First, let's assume the chairs are numbered from 0. We'll switch the numbering back at the end -- but usually things are simpler when items are enumerated from 0 rather than 1.
Now, if you've got n people and you start eliminating at chair x (x is 0 or 1) then in a single pass through you're going to eliminate half the people. Then you've got a problem of roughly half the size (possibly plus one), and if you solve that, you can construct the solution to the original problem by multiplying that sub-result by 2 and maybe adding one.
To code this, it's simply a matter of getting the 4 cases (n odd or even vs x 0 or 1) right. Here's a version that gets the 4 cases right by using bitwise trickery.
public static long j2(long n, long x) {
if (n == 1) return 0;
return 2*j2(n/2 + (n&x), (n&1)^x) + 1 - x;
}
A solution with chairs numbered from 1 and without the extra argument can now be written:
public static long remaining(long n) {
return 1 + j2(n, 0);
}
This runs in O(log n) time and uses O(log n) memory.
If your step is incremental you can use the following code:
int cur = 0;
int step = 0;
while (chairs.size() > 1) {
chairs.remove(cur);
cur += ++step;
cur %= chairs.size();
}
return chairs.get(0);
If your step is fixed to 1 then, based on the explanation provided by @Jarlax, you can solve the problem with one line of code in O(log n) time:
//for long values
public static long remaining(long numChairs) {
return (numChairs << 1) - (long)Math.pow(2,Long.SIZE - Long.numberOfLeadingZeros(numChairs));
}
//for BigInteger values
public static BigInteger remaining(BigInteger numChairs) {
return numChairs.shiftLeft(1).subtract(new BigInteger("2").pow(numChairs.bitLength()));
}
However, if you stick with ArrayLists, no extra variables are required in your code. Always remove the first element, then remove the next one and add it back at the end of the list. This is, however, O(n).
while (chairs.size() > 1) {
chairs.remove(0);
chairs.add(chairs.remove(0));
}
return chairs.get(0);

Dijkstra's Dilemma: How can I make my Algorithm abstract?

I was doing Codeforces problems and wanted to implement Dijkstra's Shortest Path Algorithm for a directed graph using Java with an adjacency matrix, but I'm having difficulty making it work for sizes other than the one it is coded to handle.
Here is my working code
int max = Integer.MAX_VALUE;//substitute for infinity
int[][] points={//I used -1 to denote non-adjacency/edges
//0, 1, 2, 3, 4, 5, 6, 7
{-1,20,-1,80,-1,-1,90,-1},//0
{-1,-1,-1,-1,-1,10,-1,-1},//1
{-1,-1,-1,10,-1,50,-1,20},//2
{-1,-1,-1,-1,-1,-1,20,-1},//3
{-1,50,-1,-1,-1,-1,30,-1},//4
{-1,-1,10,40,-1,-1,-1,-1},//5
{-1,-1,-1,-1,-1,-1,-1,-1},//6
{-1,-1,-1,-1,-1,-1,-1,-1} //7
};
int [] record = new int [8];//keeps track of the distance from start to each node
Arrays.fill(record,max);
int sum =0;int q1 = 0;int done =0;
ArrayList<Integer> Q1 = new ArrayList<Integer>();//nodes to transverse
ArrayList<Integer> Q2 = new ArrayList<Integer>();//nodes collected while transversing
Q1.add(0);//starting point
q1= Q1.get(0);
while(done<9) {// <<< My Problem
for(int q2 = 1; q2<8;q2++) {//skips over the first/starting node
if(points[q1][q2]!=-1) {//if node is connected by an edge
if(record[q1] == max)//never visited before
sum=0;
else
sum=record[q1];//starts from where it left off
int total = sum+points[q1][q2];//total distance of route
if(total < record[q2])//connected node distance
record[q2]=total;//if smaller
Q2.add(q2);//colleceted node
}
}
done++;
Q1.remove(0);//removes the first node because it has just been used
if(Q1.size()==0) {//if there are no more nodes to transverse
Q1=Q2;//Pours all the collected connecting nodes to Q1
Q2= new ArrayList<Integer>();
q1=Q1.get(0);
}
else//
q1=Q1.get(0);//sets starting point
}
However, my version of the algorithm only works because I set the while loop to the solved answer. So in other words, it only works for this problem/graph because I solved it by hand first.
How could I make it work for graphs of all sizes?
Here is the pictorial representation of the example graph my problem was based on:
I think the main answer you are looking for is that you should let the while-loop run until Q1 is empty. What you're doing is essentially a best-first search. There are more changes required though, since your code is a bit unorthodox.
Commonly, Dijkstra's algorithm is used with a priority queue. Q1 is your "todo list" as I understand from your code. The specification of Dijkstra's says that the vertex that is closest to the starting vertex should be explored next, so rather than an ArrayList, you should use a PriorityQueue for Q1 that sorts vertices according to which is closest to the starting vertex. The most common Java implementation uses the PriorityQueue together with a tuple class: An internal class which stores a reference to a vertex and a "distance" to the starting vertex. The specification for Dijkstra's also specifies that if a new edge is discovered that makes a vertex closer to the start, the DecreaseKey operation should then be used on the entry in the priority queue to make the vertex come up earlier (since it is now closer). However, since PriorityQueue doesn't support that operation, a completely new entry is just added to the queue. If you have a good implementation of a heap that supports this operation (I made one myself, here) then decreaseKey can significantly increase efficiency as you won't need to create those tuples any more either then.
So I hope that is a sufficient answer then: Make a proper 'todo' list instead of Q1, and to make the algorithm generic, let that while-loop run until the todo list is empty.
Edit: I made you an implementation based on your format, that seems to work:
public void run() {
final int[][] points = { //I used -1 to denote non-adjacency/edges
//0, 1, 2, 3, 4, 5, 6, 7
{-1,20,-1,80,-1,-1,90,-1}, //0
{-1,-1,-1,-1,-1,10,-1,-1}, //1
{-1,-1,-1,10,-1,50,-1,20}, //2
{-1,-1,-1,-1,-1,-1,20,-1}, //3
{-1,50,-1,-1,-1,-1,30,-1}, //4
{-1,-1,10,40,-1,-1,-1,-1}, //5
{-1,-1,-1,-1,-1,-1,-1,-1}, //6
{-1,-1,-1,-1,-1,-1,-1,-1} //7
};
final int[] result = dijkstra(points,0);
System.out.print("Result:");
for(final int i : result) {
System.out.print(" " + i);
}
}
public int[] dijkstra(final int[][] points,final int startingPoint) {
final int[] record = new int[points.length]; //Keeps track of the distance from start to each vertex.
final boolean[] explored = new boolean[points.length]; //Keeps track of whether we have completely explored every vertex.
Arrays.fill(record,Integer.MAX_VALUE);
final PriorityQueue<VertexAndDistance> todo = new PriorityQueue<>(points.length); //Vertices left to traverse.
todo.add(new VertexAndDistance(startingPoint,0)); //Starting point (and distance 0).
record[startingPoint] = 0; //We already know that the distance to the starting point is 0.
while(!todo.isEmpty()) { //Continue until we have nothing left to do.
final VertexAndDistance next = todo.poll(); //Take the next closest vertex.
final int q1 = next.vertex;
if(explored[q1]) { //We have already done this one, don't do it again.
continue; //...with the next vertex.
}
for(int q2 = 1;q2 < points.length;q2++) { //Find connected vertices.
if(points[q1][q2] != -1) { //If the vertices are connected by an edge.
final int distance = record[q1] + points[q1][q2];
if(distance < record[q2]) { //And it is closer than we've seen so far.
record[q2] = distance;
todo.add(new VertexAndDistance(q2,distance)); //Explore it later.
}
}
}
explored[q1] = true; //We're done with this vertex now.
}
return record;
}
private class VertexAndDistance implements Comparable<VertexAndDistance> {
private final int distance;
private final int vertex;
private VertexAndDistance(final int vertex,final int distance) {
this.vertex = vertex;
this.distance = distance;
}
/**
* Compares two {@code VertexAndDistance} instances by their distance.
* @param other The instance with which to compare this instance.
* @return A positive integer if this distance is more than the distance
* of the specified object, a negative integer if it is less, or
* {@code 0} if they are equal.
*/
@Override
public int compareTo(final VertexAndDistance other) {
return Integer.compare(distance,other.distance);
}
}
Output: 0 20 40 50 2147483647 30 70 60

Clauset-Newman-Moore community detection implementation

I am trying to implement the above community detection algorithm in Java, and while I have access to C++ code and the original paper, I can't make it work at all. My major issue is that I don't understand the purpose of the code - i.e. how the algorithm works. In practical terms, my code gets stuck in what seems to be an infinite loop at mergeBestQ: the list heap seems to be getting larger on each iteration (as I would expect from the code), but the value of topQ always returns the same value.
The graph I am testing this on is quite large (300,000 nodes, 650,000 edges). The original code I am using for my implementation is from the SNAP library (https://github.com/snap-stanford/snap/blob/master/snap-core/cmty.cpp). What would be great is if someone could explain to me the intuition of the algorithm. It seems to initially set each node to be in its own community, then record the modularity value (whatever that is) of each pair of connected nodes in the graph, then find the pair of nodes which have the highest modularity and move them to the same community. In addition, if someone could provide some mid-level pseudocode, that would be great. Here is my implementation thus far; I have tried to keep it in one file for the sake of brevity, however CommunityGraph and CommunityNode are elsewhere (and should not be required). The graph maintains a list of all nodes and each node maintains a list of its connections to other nodes. When running it never gets past the line while(this.mergeBestQ()){}
UPDATE - found several bugs in my code after a thorough review. The code now completes VERY quickly, but doesn't fully implement the algorithm; for example, of the 300,000 nodes in the graph, it states there are approximately 299,000 communities (i.e. roughly 1 node per community). I have listed the updated code below.
/// Clauset-Newman-Moore community detection method.
/// At every step two communities that contribute maximum positive value to global modularity are merged.
/// See: Finding community structure in very large networks, A. Clauset, M.E.J. Newman, C. Moore, 2004
public class CNMMCommunityMetric implements CommunityMetric{
private static class DoubleIntInt implements Comparable<DoubleIntInt>{
public double val1;
public int val2;
public int val3;
DoubleIntInt(double val1, int val2, int val3){
this.val1 = val1;
this.val2 = val2;
this.val3 = val3;
}
@Override
public int compareTo(DoubleIntInt o) {
//int this_sum = this.val2 + this.val3;
//int oth_sum = o.val2 + o.val3;
if(this.equals(o)){
return 0;
}
else if(val1 < o.val1 || (val1 == o.val1 && val2 < o.val2) || (val1 == o.val1 && val2 == o.val2 && val3 < o.val3)){
return 1;
}
else{
return -1;
}
//return this.val1 < o.val1 ? 1 : (this.val1 > o.val1 ? -1 : this_sum - oth_sum);
}
@Override
public boolean equals(Object o){
return this.val2 == ((DoubleIntInt)o).val2 && this.val3 == ((DoubleIntInt)o).val3;
}
@Override
public int hashCode() {
int hash = 3;
hash = 79 * hash + this.val2;
hash = 79 * hash + this.val3;
return hash;
}
}
private static class CommunityData {
double DegFrac;
TIntDoubleHashMap nodeToQ = new TIntDoubleHashMap();
int maxQId;
CommunityData(){
maxQId = -1;
}
CommunityData(double nodeDegFrac, int outDeg){
DegFrac = nodeDegFrac;
maxQId = -1;
}
void addQ(int NId, double Q) {
nodeToQ.put(NId, Q);
if (maxQId == -1 || nodeToQ.get(maxQId) < Q) {
maxQId = NId;
}
}
void updateMaxQ() {
maxQId=-1;
int[] nodeIDs = nodeToQ.keys();
double maxQ = nodeToQ.get(maxQId);
for(int i = 0; i < nodeIDs.length; i++){
int id = nodeIDs[i];
if(maxQId == -1 || maxQ < nodeToQ.get(id)){
maxQId = id;
maxQ = nodeToQ.get(maxQId);
}
}
}
void delLink(int K) {
int NId=getMxQNId();
nodeToQ.remove(K);
if (NId == K) {
updateMaxQ();
}
}
int getMxQNId() {
return maxQId;
}
double getMxQ() {
return nodeToQ.get(maxQId);
}
};
private TIntObjectHashMap<CommunityData> communityData = new TIntObjectHashMap<CommunityData>();
private TreeSet<DoubleIntInt> heap = new TreeSet<DoubleIntInt>();
private HashMap<DoubleIntInt,DoubleIntInt> set = new HashMap<DoubleIntInt,DoubleIntInt>();
private double Q = 0.0;
private UnionFind uf = new UnionFind();
@Override
public double getCommunities(CommunityGraph graph) {
init(graph);
//CNMMCommunityMetric metric = new CNMMCommunityMetric();
//metric.getCommunities(graph);
// maximize modularity
while (this.mergeBestQ(graph)) {
}
// reconstruct communities
HashMap<Integer, ArrayList<Integer>> IdCmtyH = new HashMap<Integer, ArrayList<Integer>>();
Iterator<CommunityNode> ns = graph.getNodes();
int community = 0;
TIntIntHashMap communities = new TIntIntHashMap();
while(ns.hasNext()){
CommunityNode n = ns.next();
int r = uf.find(n);
if(!communities.contains(r)){
communities.put(r, community++);
}
n.setCommunity(communities.get(r));
}
System.exit(0);
return this.Q;
}
private void init(Graph graph) {
double M = 0.5/graph.getEdgesList().size();
Iterator<Node> ns = graph.getNodes();
while(ns.hasNext()){
Node n = ns.next();
uf.add(n);
int edges = n.getEdgesList().size();
if(edges == 0){
continue;
}
CommunityData dat = new CommunityData(M * edges, edges);
communityData.put(n.getId(), dat);
Iterator<Edge> es = n.getConnections();
while(es.hasNext()){
Edge e = es.next();
Node dest = e.getStart() == n ? e.getEnd() : e.getStart();
double dstMod = 2 * M * (1.0 - edges * dest.getEdgesList().size() * M);//(1 / (2 * M)) - ((n.getEdgesList().size() * dest.getEdgesList().size()) / ((2 * M) * (2 * M)));// * (1.0 - edges * dest.getEdgesList().size() * M);
dat.addQ(dest.getId(), dstMod);
}
Q += -1.0 * (edges*M) * (edges*M);
if(n.getId() < dat.getMxQNId()){
addToHeap(createEdge(dat.getMxQ(), n.getId(), dat.getMxQNId()));
}
}
}
void addToHeap(DoubleIntInt o){
heap.add(o);
}
DoubleIntInt createEdge(double val1, int val2, int val3){
DoubleIntInt n = new DoubleIntInt(val1, val2, val3);
if(set.containsKey(n)){
DoubleIntInt n1 = set.get(n);
heap.remove(n1);
if(n1.val1 < val1){
n1.val1 = val1;
}
n = n1;
}
else{
set.put(n, n);
}
return n;
}
void removeFromHeap(Collection<DoubleIntInt> col, DoubleIntInt o){
//set.remove(o);
col.remove(o);
}
DoubleIntInt findMxQEdge() {
while (true) {
if (heap.isEmpty()) {
break;
}
DoubleIntInt topQ = heap.first();
removeFromHeap(heap, topQ);
//heap.remove(topQ);
if (!communityData.containsKey(topQ.val2) || ! communityData.containsKey(topQ.val3)) {
continue;
}
if (topQ.val1 != communityData.get(topQ.val2).getMxQ() && topQ.val1 != communityData.get(topQ.val3).getMxQ()) {
continue;
}
return topQ;
}
return new DoubleIntInt(-1.0, -1, -1);
}
boolean mergeBestQ(Graph graph) {
DoubleIntInt topQ = findMxQEdge();
if (topQ.val1 <= 0.0) {
return false;
}
// joint communities
int i = topQ.val3;
int j = topQ.val2;
uf.union(i, j);
Q += topQ.val1;
CommunityData datJ = communityData.get(j);
CommunityData datI = communityData.get(i);
datI.delLink(j);
datJ.delLink(i);
int[] datJData = datJ.nodeToQ.keys();
for(int _k = 0; _k < datJData.length; _k++){
int k = datJData[_k];
CommunityData datK = communityData.get(k);
double newQ = datJ.nodeToQ.get(k);
//if(datJ.nodeToQ.containsKey(i)){
// newQ = datJ.nodeToQ.get(i);
//}
if (datI.nodeToQ.containsKey(k)) {
newQ = newQ + datI.nodeToQ.get(k);
datK.delLink(i);
} // K connected to I and J
else {
newQ = newQ - 2 * datI.DegFrac * datK.DegFrac;
} // K connected to J not I
datJ.addQ(k, newQ);
datK.addQ(j, newQ);
addToHeap(createEdge(newQ, Math.min(j, k), Math.max(j, k)));
}
int[] datIData = datI.nodeToQ.keys();
for(int _k = 0; _k < datIData.length; _k++){
int k = datIData[_k];
if (!datJ.nodeToQ.containsKey(k)) { // K connected to I not J
CommunityData datK = communityData.get(k);
double newQ = datI.nodeToQ.get(k) - 2 * datJ.DegFrac * datK.DegFrac;
datJ.addQ(k, newQ);
datK.delLink(i);
datK.addQ(j, newQ);
addToHeap(createEdge(newQ, Math.min(j, k), Math.max(j, k)));
}
}
datJ.DegFrac += datI.DegFrac;
if (datJ.nodeToQ.isEmpty()) {
communityData.remove(j);
} // isolated community (done)
communityData.remove(i);
return true;
}
}
UPDATE: the currently listed code is fairly quick, and has half the memory usage compared to the "quickest" solution, while only being ~5% slower. The difference is in the use of a HashMap + TreeSet vs a priority queue, and in ensuring that only a single object exists for a given i, j pair at any time.
So here's the original paper, a neat lil' six pages, only two of which are about the design & implementation. Here's the cliffnotes version:
For a partition of a given graph, the authors define the modularity, Q, of the partition to be the ratio of the number of edges within each community to the number of edges between each community, minus the ratio you'd expect from a completely random partition.
So it's effectively "how much better is this partition at defining communities than a completely random one?"
Given two communities i and j of a partition, they then define deltaQ_ij to be how much the modularity of the partition would change if communities i and j were merged. So if deltaQ_ij > 0, merging i and j will improve the modularity of the partition.
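In symbols (paraphrasing the paper): Q = sum_i (e_ii - a_i^2), where e_ii is the fraction of edges that fall inside community i and a_i is the fraction of edge ends attached to community i; for two communities i and j, deltaQ_ij = e_ij + e_ji - 2*a_i*a_j, where e_ij is the fraction of edges running between them.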
Which leads to a simple greedy algorithm: start with every node in its own community. Calculate deltaQ_ij for every pair of communities. Whichever two communities i, j have the largest deltaQ_ij, merge those two. Repeat.
You'll get maximum modularity when the deltaQ_ij all turn negative, but in the paper the authors let the algorithm run until there's only one community left.
That's pretty much it for understanding the algorithm. The details are in how to compute deltaQ_ij quickly and store the information efficiently.
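To make that loop concrete, here is a deliberately naive sketch in Java (my own illustration, not the paper's implementation and not the SNAP code). It recomputes every deltaQ from scratch on each pass, so it is only usable on toy graphs, but it shows exactly what gets merged and when the greedy loop stops:
import java.util.*;

public class NaiveCNM {
    // edges: undirected edge list over nodes 0..n-1; returns a community label per node
    public static int[] communities(int n, int[][] edges) {
        final int m = edges.length;
        final int[] comm = new int[n];
        for (int i = 0; i < n; i++) comm[i] = i; // every node starts in its own community

        while (true) {
            // e_ij (fraction of edges between communities i and j) and a_i (fraction of edge ends in i)
            final Map<Long, Double> e = new HashMap<>();
            final double[] a = new double[n];
            for (final int[] edge : edges) {
                final int c1 = comm[edge[0]], c2 = comm[edge[1]];
                a[c1] += 1.0 / (2 * m);
                a[c2] += 1.0 / (2 * m);
                if (c1 != c2) {
                    final long key = Math.min(c1, c2) * (long) n + Math.max(c1, c2);
                    e.merge(key, 1.0 / (2 * m), Double::sum);
                }
            }
            // pick the connected pair with the largest deltaQ_ij = 2 * (e_ij - a_i * a_j)
            double bestDq = 0;
            int bestC1 = -1, bestC2 = -1;
            for (final Map.Entry<Long, Double> entry : e.entrySet()) {
                final int c1 = (int) (entry.getKey() / n), c2 = (int) (entry.getKey() % n);
                final double dq = 2 * (entry.getValue() - a[c1] * a[c2]);
                if (dq > bestDq) { bestDq = dq; bestC1 = c1; bestC2 = c2; }
            }
            if (bestC1 < 0) {
                break; // no merge improves modularity any further
            }
            for (int i = 0; i < n; i++) {
                if (comm[i] == bestC2) comm[i] = bestC1; // merge community c2 into c1
            }
        }
        return comm;
    }

    public static void main(String[] args) {
        // two triangles joined by one edge; the triangles should end up as two communities
        final int[][] edges = {{0,1},{1,2},{2,0},{3,4},{4,5},{5,3},{2,3}};
        System.out.println(Arrays.toString(communities(6, edges)));
    }
}
Note that this sketch stops as soon as no merge improves modularity, whereas the paper lets the merging run until only one community remains.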
Edit: Data structure time!
So first off, I think the implementation you're referencing does things in a different way to the paper. I'm not quite sure how, because the code is impenetrable, but it seems to use union-find and hashsets in place of the authors' binary trees and multiple heaps. Not a clue why they do it a different way. You might want to email the guy who wrote it and ask.
Anyway, the algorithm in the paper needs several things from the format deltaQ is stored in:
First, it needs to be able to recover the largest value in dQ quickly.
Second, it needs to be able to remove all deltaQ_ik and deltaQ_ki for a fixed i quickly.
Third, it needs to be able to update all deltaQ_kj and deltaQ_jk for a fixed j quickly.
The solution the authors come up with for this is as follows:
For each community i, each non-zero deltaQ_ik is stored in a balanced binary tree, indexed by k (so elements can be found easily), and in a heap (so the maximum for that community can be found easily).
The maximum deltaQ_ik from each community i's heap is then stored in another heap, so that the overall maximums can be found easily.
When community i is merged with community j, several things happen to the binary trees:
First, each element from the ith community is added to the jth community's binary tree. If an element with the same index k already exists, you sum the old and new values.
Second, we update all the remaining "old" values in the jth community's binary tree to reflect the fact that the jth community has just increased in size.
And for each other community's binary tree k, we update any deltaQ_kj.
Finally, the tree for community i is thrown away.
And similarly, several things must happen to the heaps:
First, the heap for community i is thrown away.
Then the heap for community j is rebuilt from scratch using the elements from the community's balanced binary tree.
And for each other community k's heap, the position of entry deltaQ_kj is updated.
Finally, the entry for community i in the overall heap is thrown away (causing bubbling) and the entries for community j and each community k connected to i or j are updated.
Strangely, when two communities are merged there's no reference in the paper as to removing deltaQ_ki values from the kth community's heap or tree. I think this might be dealt with by the setting of a_i = 0, but I don't understand the algorithm well enough to be sure.
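If it helps to see the shape of that state, here is one possible Java layout (my own assumption based on the description above, not code from the paper or from SNAP); TreeMap stands in for the balanced binary tree and PriorityQueue for the heaps:
import java.util.*;

class CommunityState {
    // deltaQ.get(i) maps neighbour community k -> deltaQ_ik (the balanced binary tree)
    final Map<Integer, TreeMap<Integer, Double>> deltaQ = new HashMap<>();

    // per-community heap of [deltaQ_ik, k], largest deltaQ first
    final Map<Integer, PriorityQueue<double[]>> communityHeaps = new HashMap<>();

    // global heap of each community's current best entry, stored as [deltaQ_ik, i, k]
    final PriorityQueue<double[]> globalHeap =
            new PriorityQueue<>((x, y) -> Double.compare(y[0], x[0]));

    // the a_i values from the paper
    final Map<Integer, Double> a = new HashMap<>();

    PriorityQueue<double[]> heapFor(int i) {
        return communityHeaps.computeIfAbsent(
                i, k -> new PriorityQueue<>((x, y) -> Double.compare(y[0], x[0])));
    }
}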
Edit: Trying to decipher the implementation you linked. Their primary datastructures are
CmtyIdUF, a union-find structure that keeps track of which nodes are in which community (something that's neglected in the paper, but seems necessary unless you want to reconstruct community membership from a trace of the merge or something),
MxQHeap, a heap to keep track of which deltaQ_ij is largest overall. Strangely, when they update the value of a TFltIntIntTr in the heap, they don't ask the heap to re-heapify itself. This is worrying. Does it do it automatically or something?
CmtyQH, a hashmap that maps a community ID i to a structure TCmtyDat which holds what looks like a heap of the deltaQ_ik for that community. I think. Strangely though, the UpdateMaxQ of the TCmtyDat structure takes linear time, obviating any need for a heap. What's more, the UpdateMaxQ method only appears to be called when an element of the heap is deleted. It should definitely also be getting called when the value of any element in the heap is updated.

Graph colouring algorithm: typical scheduling problem

I'm practising code problems like UVa, and I have this one in which I have to, given a set of n exams and k students enrolled in the exams, find whether it is possible to schedule all exams in two time slots.
Input
Several test cases. Each one starts with a line containing 1 < n < 200 of different examinations to be scheduled.
The 2nd line has the number of cases k in which there exist at least 1 student enrolled in 2 examinations. Then, k lines will follow, each containing 2 numbers that specify the pair of examinations for each case above.
(An input with n = 0 means the end of the input and is not to be processed.)
Output:
You have to decide whether the examination plan is possible or not for 2 time slots.
Example:
Input:
3
3
0 1
1 2
2 0
9
8
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0
Output:
NOT POSSIBLE.
POSSIBLE.
I think the general approach is graph colouring, but I'm really a newb and I must confess that I had some trouble understanding the problem.
Anyway, I'm trying to do it and then submit it.
Could someone please help me doing some code for this problem?
I will have to handle and understand this algo now in order to use it later, over and over.
I prefer C or C++, but if you want, Java is fine to me ;)
Thanks in advance
You are correct that this is a graph coloring problem. Specifically, you need to determine if the graph is 2-colorable. This is trivial: do a DFS on the graph, coloring alternating black and white nodes. If you find a conflict, then the graph is not 2-colorable, and the scheduling is impossible.
possible = true
for all vertex V
color[V] = UNKNOWN
for all vertex V
if color[V] == UNKNOWN
colorify(V, BLACK, WHITE)
procedure colorify(V, C1, C2)
color[V] = C1
for all edge (V, V2)
if color[V2] == C1
possible = false
if color[V2] == UNKNOWN
colorify(V2, C2, C1)
This runs in O(|V| + |E|) with adjacency list.
In practice the question is whether you can partition the n examinations into two subsets A and B (two timeslots) such that, for every pair (a, b) in the list of k examination pairs, either a belongs to A and b belongs to B, or a belongs to B and b belongs to A.
You are right that it is a 2-coloring problem; it's a graph with n vertices and there's an undirected arc between vertices a and b iff the pair (a, b) appears in the list. Then the question is about the graph's 2-colorability, the two colors denoting the partition into timeslots A and B.
A 2-colorable graph is a "bipartite graph". You can test for bipartiteness easily, see http://en.wikipedia.org/wiki/Bipartite_graph.
I've translated polygenelubricant's pseudocode to Java in order to provide a solution to my problem. We have a submission platform (like UVa/ACM contests), so I know it passed even on the biggest and hardest cases.
Here it is:
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.Scanner;
/**
*
* @author newba
*/
public class GraphProblem {
class Edge {
int v1;
int v2;
public Edge(int v1, int v2) {
this.v1 = v1;
this.v2 = v2;
}
}
public GraphProblem () {
Scanner cin = new Scanner(System.in);
while (cin.hasNext()) {
int num_exams = cin.nextInt();
if (num_exams == 0)
break;
int k = cin.nextInt();
Hashtable<Integer,String> exams = new Hashtable<Integer, String>();
ArrayList<Edge> edges = new ArrayList<Edge>();
for (int i = 0; i < k; i++) {
int v1 = cin.nextInt();
int v2 = cin.nextInt();
exams.put(v1,"UNKNOWN");
exams.put(v2,"UNKNOWN");
//add the edge from A->B and B->A
edges.add(new Edge(v1, v2));
edges.add(new Edge(v2, v1));
}
boolean possible = true;
for (Integer key: exams.keySet()){
if (exams.get(key).equals("UNKNOWN")){
if (!colorify(edges, exams,key, "BLACK", "WHITE")){
possible = false;
break;
}
}
}
if (possible)
System.out.println("POSSIBLE.");
else
System.out.println("NOT POSSIBLE.");
}
}
public boolean colorify (ArrayList<Edge> edges,Hashtable<Integer,String> verticesHash,Integer node, String color1, String color2){
verticesHash.put(node,color1);
for (Edge edge : edges){
if (edge.v1 == (int) node) {
if (verticesHash.get(edge.v2).equals(color1)){
return false;
}
if (verticesHash.get(edge.v2).equals("UNKNOWN")){
    if (!colorify(edges, verticesHash, edge.v2, color2, color1)){
        return false;
    }
}
}
}
return true;
}
public static void main(String[] args) {
new GraphProblem();
}
}
I haven't optimized it yet; I don't have the time right now, but if you want, we can discuss it here.
Hope you enjoy it! ;)
