Java BFS Webcrawler produces duplicated website links - java

I am tasked with creating a Java BFS Algorithm without using the built-in LinkedList and Dynamic ArrayList.
I managed to find 2 examples that seem to achieve the result that I am looking for. They can be found below. When I compare my results to the examples I have found my results seem to have duplicated links.
I suspect it has something to do with my contains() method however having tried many different options that I found from Java: Implement String method contains() without built-in method contains() the issue still persist.
Could someone pls help me with this? Thank you so much in advance!!
Examples
https://github.com/theexplorist/WebCrawler
https://www.youtube.com/watch?v=lyVjfz2Tuck&ab_channel=SylvainSaurel (The code in the video can be found below)
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class WebCrawler {
public static Queue<String> queue = new LinkedList<>();
public static Set<String> marked = new HashSet<>();
public static String regex = "http[s]*://(\\w+\\.)*(\\w+)";
public static void bfsAlgorithm(String root) throws IOException{
queue.add(root);
BufferedReader br = null;
while (!queue.isEmpty()){
String crawledUrl = queue.poll();
System.out.println("\n=== Site crawled : " + crawledUrl + " ===");
if(marked.size() > 100)
return;
boolean ok = false;
URL url = null;
while(!ok){
try{
url = new URL(crawledUrl);
br = new BufferedReader(new InputStreamReader(url.openStream()));
ok = true;
} catch (MalformedURLException e) {
System.out.println("*** Maformed URL : " + crawledUrl);
crawledUrl = queue.poll();
ok = false;
}
}
StringBuilder sb = new StringBuilder();
String tmp = null;
while((tmp = br.readLine()) != null){
sb.append(tmp);
}
tmp = sb.toString();
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(tmp);
while (matcher.find()){
String w = matcher.group();
if(!marked.contains(w)){
marked.add(w);
System.out.println("Sited added for crawling : " + w);
queue.add(w);
}
}
}
if(br != null){
br.close();
}
}
public static void showResults(){
System.out.println("\n\nResults : ");
System.out.println("Web sites crawled : " + marked.size() + "\n");
for (String s: marked){
System.out.println("* " + s);
}
}
public static void main(String[] args){
try{
bfsAlgorithm("https://www.tesla.com/");
showResults();
} catch (IOException e) {
}
}
}
Below are the results for the root url: https://en.wikipedia.org/
I edited the code to only show the first 20 links.
Result from https://github.com/theexplorist/WebCrawler
Result from the Youtube Video
Result from my code
If you look at my result you will see that there are duplicated links.
Pls find my code below
Main.java
public static void main(String[] args) {
WebCrawler crawler = new WebCrawler();
String rootUrl = "https://en.wikipedia.org/";
crawler.discoverWeb(rootUrl);
}
}
DA.java (Dynamic Array)
class DA{
int size;
int capacity = 10;
Object[] nameofda;
public DA(){
this.nameofda = new Object[capacity];
}
public DA(int capacity){
this.capacity = capacity;
this.nameofda = new Object[capacity];
}
public void add(Object anything){
if(size >= capacity){
grow();
}
nameofda[size] = anything;
size++;
}
public void insert(int index, Object anything){
if(size >= capacity){
grow();
}
for (int i = size; i > index; i--){
nameofda[i] = nameofda[i - 1];
}
nameofda[index] = anything;
size++;
}
public void delete(Object anything){
for(int i = 0; i < size; i++){
if(nameofda[i] == anything){
for(int j = 0; j < (size - i - 1); j++){
nameofda[i + j] = nameofda[i + j + 1];
}
nameofda[size - 1] = null;
size--;
if(size <=(int)(capacity/3)){
shrink();
}
break;
}
}
}
public boolean contains(Object anything){
for(int i = 0; i < size; i++){
if (nameofda[i] == anything){
return true;
}
}
return false;
}
private void grow(){
int newcap = (int)(capacity *2);
Object[] newnameofda = new Object[newcap];
for(int i = 0; i < size; i++){
newnameofda[i] = nameofda[i];
}
capacity = newcap;
nameofda = newnameofda;
}
private void shrink(){
int newcap = (int)(capacity / 2);
Object[] newnameofda = new Object[newcap];
for(int i = 0; i < size; i++){
newnameofda[i] = nameofda[i];
}
capacity = newcap;
nameofda = newnameofda;
}
public boolean isEmpty(){
return size == 0;
}
public String toString(){
String nameofstring = "";
for(int i = 0; i < size; i++){
nameofstring += nameofda[i] + ", ";
}
if(nameofstring != ""){
nameofstring = "[" + nameofstring.substring(0, nameofstring.length() - 2) + "]";
}
else {
nameofstring = "[]";
}
return nameofstring;
}
Queue.java (LinkedList)
public class Queue<T> {
private Node<T> front;
private Node<T> rear;
private int length;
private static class Node<T> {
private final T data;
private Node<T> next;
public Node(T data) {
this.data = data;
}
}
public void enQueue(T item) {
if (front == null) {
rear = new Node<T>(item);
front = rear;
} else {
rear.next = new Node<T>(item);
rear = rear.next;
}
length++;
}
public T deQueue() {
if (front != null) {
T item = front.data;
front = front.next;
length--;
return item;
}
return null;
}
public int size() {
return length;
}
public boolean isEmpty(){
return length == 0;
}
public void displayQueue() {
Node<T> currentNode = front;
while (currentNode != null) {
System.out.print(currentNode.data+" ");
currentNode = currentNode.next;
}
}
}
WebCrawler.java
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class WebCrawler {
private Queue<String> queue;
DA discoveredWebsitesList = new DA(5);
public WebCrawler() {
this.queue = new Queue<>();
}
public void discoverWeb(String root) {
this.queue.enQueue(root);
this.discoveredWebsitesList.add(root);
while (!queue.isEmpty()) {
String v = this.queue.deQueue();
StringBuilder rawHtml = readUrl(v);
String regexe = "https://(\\w+\\.)*(\\w+)";
Pattern pattern = Pattern.compile(regexe);
Matcher matcher = pattern.matcher(rawHtml);
while(matcher.find()){
String actualUrl = matcher.group();
if(!this.discoveredWebsitesList.contains(actualUrl)){
this.discoveredWebsitesList.add(actualUrl);
System.out.println("website has been found with URL :" + actualUrl);
this.queue.enQueue(actualUrl);
//System.out.println("Size is: " + queue.size());
if(queue.size() == 20){
System.exit(0);
}
}
}
}
}
public StringBuilder readUrl(String v) {
StringBuilder rawHtml = new StringBuilder() ;
URL ur;
try {
ur = new URL(v);
BufferedReader br = new BufferedReader(new InputStreamReader(ur.openStream()));
String inputLine = "";
while((inputLine = br.readLine()) != null){
rawHtml.append(inputLine);
}br.close();
} catch (Exception e) {
e.printStackTrace();
}
return rawHtml;
}
}

Related

TreeSet wont add edges

i have to code the Dijkstra algorithm. We got a blueprint for this project. Meaning we were told the classes, field variables and methods we have to use.
We have to read the adjacency matrix from a csv file and then use the Dijkstra algorithm.
My problem already begins in in filling the TreeSet edges...
The problem occurs in Graph.class on line 45 when i try to add the Edges.
Example for the csv :
;A;B;C;D;E;F;G;H
A;;1;3;1;;;;
B;1;;;;3;3;;
C;3;;;1;;;1;
D;1;;1;;1;;2;
E;;3;;1;;1;;5
F;;3;;;1;;;1
G;;;1;2;;;;1
H;;;;;5;1;1;
=>
A -> (B,1), (C,3), (D,1)
B -> (A,1), (E,3), (F,3)
C -> (A,3), (D,1), (G,1)
D -> (A,1), (C,1), (E,1), (G,2)
E -> (B,3), (D,1), (F,1), (H,5)
F -> (B,3), (E,1), (H,1)
G -> (C,1), (D,2), (H,1)
H -> (E,5), (F,1), (G,1)
Could somebody look where my problem is ? My indices are correct i checked them with some sout.
Just need help with filling in the TreeSet! I want to try the Dijkstra part myself.
public class Edge implements Comparable<Edge>{
private int distance;
private Node neighbour;
public Edge(int distance, Node neighbour) {
this.distance = distance;
this.neighbour = neighbour;
}
public int getDistance() {
return distance;
}
public void setDistance(int distance) {
this.distance = distance;
}
public Node getNeighbour() {
return neighbour;
}
public void setNeighbour(Node neighbour) {
this.neighbour = neighbour;
}
#Override
public int compareTo(Edge o) {
if (this.neighbour.getId().equals(o.neighbour.getId())){
return 0;
}else{
return -1;
}
}
}
import java.util.TreeSet;
public class Node {
private String id;
private TreeSet<Edge> edges;
private int distance;
private Node previous;
private boolean isVisited;
public Node(String id) {
this.id = id;
this.edges = new TreeSet<>();
}
public Node(String id, int distance){
this.id = id;
this.distance = distance;
}
#Override
public String toString() {
return "Node{" +
"id='" + id + '\'' +
", edges=" + edges +
", distance=" + distance +
", previous=" + previous +
", isVisited=" + isVisited +
'}';
}
public String getPath(){
return null;
}
public void addEdge(Edge e){
edges.add(e);
}
public void init(){
}
public void setStartNode(Node n){
}
public void visit(Node n){
}
public String getId() {
return id;
}
}
import java.io.File;
import java.io.FileNotFoundException;
import java.nio.file.Path;
import java.util.*;
public class Graph {
private PriorityQueue pq;
private ArrayList<Node> nodes;
public Graph(){
this.pq = new PriorityQueue();
this.nodes = new ArrayList();
}
public void readGraphFromAdjacencyMatrixFile(Path file) throws FileNotFoundException {
Scanner sc = new Scanner(new File(String.valueOf(file)));
ArrayList<String> s = new ArrayList<>();
ArrayList<Character> nodesCharacter = new ArrayList<Character>();
while (sc.hasNext()){
s.add(sc.next());
}
sc.close();
for(char ch: s.get(0).toCharArray()){
if (ch != ';' && ch != ',') {
nodes.add(new Node(Character.toString(ch)));
nodesCharacter.add(ch);
}
}
ArrayList<Node> nodes2 = getNodes();
String node = "";
int index = 0;
for (int i = 1; i < s.size(); i++){
int cnt = -2;
char[] chArray = s.get(i).toCharArray();
for (int j = 0; j < chArray.length; j++){
if(j == 0){
node = String.valueOf(chArray[j]);
index = indexOfNode(String.valueOf((chArray[j])));
}
else if (j >= 2){
if (Character.isDigit(chArray[j])){
int neighbourIndex = indexOfNode("" + nodesCharacter.get(cnt));
Edge e = new Edge(Character.getNumericValue(chArray[j]), nodes.get(neighbourIndex));
nodes.get(index).addEdge(e);
cnt--;
}
}
cnt ++;
}
}
}
public String getAllPAths(){
return null;
}
public void calcWithDijkstra(String startNodeId){
}
public ArrayList<Node> getNodes() {
return nodes;
}
public int indexOfNode(String id){
int cnt = 0;
for (int i = 0; i < nodes.size(); i++){
if (id.equals(nodes.get(i).getId())){
return cnt;
}
cnt++;
}
return -1;
}
}

Can't access the object within the GraphNode

I have a graph that contains objects of type GraphNodes. These nodes contain an object City that has properties if It's infected or not. I want to loop through all the nodes and check if a city is infected or not. I have a generic method getInfo which returns an object of type E in my case City. But when i try to chain another method or to get property i can't see them as if they are not available. All the classes in the code are from college so i can't add/remove methods. I've tried with foreach but I still can't get the methods.
Code:
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.LinkedList;
import java.util.Stack;
import java.util.StringTokenizer;
import java.util.LinkedList;
class City {
String osnovna_granka;
boolean zarazen;
City(String osnovna_granka, boolean zarazen) {
this.osnovna_granka = osnovna_granka;
this.zarazen = zarazen;
}
#Override
public String toString() {
if (zarazen == true) {
return osnovna_granka + " zarazen";
} else {
return osnovna_granka + " nezarazen";
}
}
}
class Graph {
int num_nodes;
GraphNode<City> adjList[];
#SuppressWarnings("unchecked")
public Graph(int num_nodes) {
this.num_nodes = num_nodes;
adjList = (GraphNode<City>[]) new GraphNode[num_nodes];
}
int adjacent(int x, int y) {
// proveruva dali ima vrska od jazelot so
// indeks x do jazelot so indeks y
return (adjList[x].containsNeighbor(adjList[y])) ? 1 : 0;
}
void addEdge(int x, int y) {
// dodava vrska od jazelot so indeks x do jazelot so indeks y
if (!adjList[x].containsNeighbor(adjList[y])) {
adjList[x].addNeighbor(adjList[y]);
}
}
void deleteEdge(int x, int y) {
adjList[x].removeNeighbor(adjList[y]);
}
#Override
public String toString() {
String ret = new String();
for (int i = 0; i < this.num_nodes; i++) {
ret += i + ": " + adjList[i] + "\n";
}
return ret;
}
}
class GraphNode<E> {
private int index;//index (reden broj) na temeto vo grafot
private E info;
private LinkedList<GraphNode<E>> neighbors;
public GraphNode(int index, E info) {
this.index = index;
this.info = info;
neighbors = new LinkedList<GraphNode<E>>();
}
boolean containsNeighbor(GraphNode<E> o) {
return neighbors.contains(o);
}
void addNeighbor(GraphNode<E> o) {
neighbors.add(o);
}
void removeNeighbor(GraphNode<E> o) {
if (neighbors.contains(o)) {
neighbors.remove(o);
}
}
#Override
public String toString() {
String ret = "INFO:" + info + " SOSEDI:";
for (int i = 0; i < neighbors.size(); i++) {
ret += neighbors.get(i).info + " ";
}
return ret;
}
#Override
public boolean equals(Object obj) {
#SuppressWarnings("unchecked")
GraphNode<E> pom = (GraphNode<E>) obj;
return (pom.info.equals(this.info));
}
public int getIndex() {
return index;
}
public void setIndex(int index) {
this.index = index;
}
public E getInfo() {
return info;
}
public void setInfo(E info) {
this.info = info;
}
public LinkedList<GraphNode<E>> getNeighbors() {
return neighbors;
}
public void setNeighbors(LinkedList<GraphNode<E>> neighbors) {
this.neighbors = neighbors;
}
}
public class Main {
public static void main(String[] args) throws Exception {
int i, j, k;
BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
int N = Integer.parseInt(br.readLine());
Graph g = new Graph(N);
for (i = 0; i < N; i++) {
StringTokenizer st = new StringTokenizer(br.readLine());
st.nextToken();
String osnovna_granka = st.nextToken();
String str_zarazen = st.nextToken();
if (str_zarazen.equals("zarazen")) {
g.adjList[i] = new GraphNode(i, new City(osnovna_granka, true));
} else {
g.adjList[i] = new GraphNode(i, new City(osnovna_granka, false));
}
}
int M = Integer.parseInt(br.readLine());
for (i = 0; i < M; i++) {
StringTokenizer st = new StringTokenizer(br.readLine());
int a = Integer.parseInt(st.nextToken());
int b = Integer.parseInt(st.nextToken());
g.addEdge(a, b);
g.addEdge(b, a);
}
br.close();
Stack<GraphNode> stack = new Stack<>();
int counter = 0;
// vasiot kod ovde;
for(GraphNode gn: g.adjList) {
gn.getInfo().// Here the properties of City should show up
}
}
}
GraphNode is a generic type and you have not specified the type, the IDE cannot infer the type so no methods can be suggested. in the for loop you need to specify the type of the GraphNode.
for(GraphNode<City> gn: g.adjList)

Combinations of a matrix (rows and columns)

Good afternoon, I'm having problems finding all the combinations of a matrix.
I am testing with a 4x4 matrix but I can not get the approximate 2000 possible combinations of its elements, but I only generate 256 combinations. The idea is to take into account the repetitions to reach this number.
The problem is that my algorithm does not create the combinations of the rows. Explained graphically with a 2x2 matrix:
Correct solution
The correct solution would be the image above, but it only generates 4 combinations.
Solution thrown by the algorithm
My code is:
package ia;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
public class Nodo implements Iterator<String> {
Nodo anterior = null;
Nodo raiz = null;
int posicion = 0;
List<String> columnas = new LinkedList<>();
int indicesTotales = 0;
public Nodo(List<String> columnas, Nodo anterior, Nodo raiz) {
this.columnas = columnas;
this.anterior = anterior;
this.raiz = raiz;
this.posicion = 0;
this.indicesTotales = columnas.size() - 1;
}
#Override
public boolean hasNext() {
try {
if (this.raiz != null && this.raiz.posicion <= this.raiz.indicesTotales) {
return true;
} else if (this.posicion > this.indicesTotales) {
return false;
} else {
return false;
}
} catch (Exception exception) {
System.out.println(exception);
}
throw new UnsupportedOperationException("Aún no es compatible.");
}
#Override
public String next() {
if (this.posicion >= this.indicesTotales) {
if (this.anterior != null) {
this.posicion = 0;
this.anterior.next();
} else {
++this.posicion;
}
} else {
++this.posicion;
}
return null;
}
public String actual() {
return columnas.get(this.posicion);
}
#Override
public void remove() {
throw new UnsupportedOperationException("Not supported yet.");
}
}
2.
package ia;
import java.util.Arrays;
import java.util.LinkedList;
public class Combinaciones {
private Nodo iterador = null;
public Combinaciones(String[][] lista) {
Nodo raiz = null;
Nodo aux = null;
for (String[] columnas : lista) {
aux = new Nodo(new LinkedList<>(Arrays.asList(columnas)), aux, raiz);
if (raiz == null) {
raiz = aux;
}
}
iterador = aux;
}
public int generar() {
int contador = 0;
while (iterador.hasNext()) {
Nodo aux = iterador;
System.out.print(aux.actual());
while (aux.anterior != null) {
System.out.print(" " + aux.anterior.actual());
aux = aux.anterior;
}
iterador.next();
System.out.println();
contador++;
}
return contador;
}
}
3.
package ia;
public class Prueba {
public static void main(String[] args) {
Combinaciones lista = new Combinaciones(
new String[][] { { "1", "2" }, { "3", "4" } } );
System.out.println("\n" + lista.generar());
}
}
I would appreciate if someone could help me solve the error.
Tranks.

Why is my thread not ending?

Im pretty new to Java and to thread-programming especially. This code is mostly out of a pretty old book (2001) with samples and examples for a search-engine.
But its just not working
Now i don't know if i am making a mistake or if the author made it or if there are incompatibilities with different versions of java...i really have no clue! The oddest thing about it is that it works 1 out of 100 times ...
After hours of debugging i would appreciate any help!
SearchEngine.java:
import java.util.Vector;
import parsing.SourceElement;
import parsing.WebParserWrapper;
import query.Filter;
public class SearchEngine implements Runnable {
private Vector linkHistory = new Vector();
private int currentLink;
private String beginAt = null;
private SearchHandler searchHandler = null;
private boolean searchInProgress = false;
private boolean stopPending = false;
boolean firstTime = true;
public boolean searchInProgress() {
return searchInProgress;
}
public boolean stopPending() {
return stopPending;
}
#SuppressWarnings("unchecked")
public void followLinks(String url) {
if (stopPending)
return;
try {
boolean drillDown = false;
WebParserWrapper webParser = new WebParserWrapper();
Vector sortedElements = webParser.getElements(url, "", "WITHGET");
Vector contentElements = Filter.getFilteredElements(sortedElements, Filter.CONTENT, "matches", "*");
for (int i = 0; i < contentElements.size(); i++) {
SourceElement thisElement = (SourceElement) contentElements.elementAt(i);
String thisKey = (String) thisElement.getKey();
String thisContent = (String) thisElement.getContent();
boolean goodHit = searchHandler.handleElement(url, thisKey, thisContent);
if (goodHit) {
drillDown = true;
}
}
System.out.println(url + " -- DrillDown " + ((drillDown) ? "positive" : "negative"));
if (drillDown) {
Vector linkElements = Filter.getFilteredElements(sortedElements, Filter.KEY, "matches",
"*a[*].#href[*]");
for (int i = 0; i < linkElements.size(); i++) {
SourceElement thisElement = (SourceElement) linkElements.elementAt(i);
String thisContent = (String) thisElement.getContent();
if (!linkHistory.contains(thisContent)) {
linkHistory.add(thisContent);
System.out.println("Collected: " + thisContent);
}
}
}
}
catch (Exception e) {}
if (currentLink < linkHistory.size()) {
String nextLink = (String) linkHistory.elementAt(currentLink++);
if (nextLink != null) {
followLinks(nextLink);
}
}
}
public boolean startSearch(String url, SearchHandler searchHandler) {
if (searchInProgress)
return false;
beginAt = url;
this.searchHandler = searchHandler;
this.linkHistory = new Vector();
this.currentLink = 0;
Thread searchThread = new Thread(this);
searchThread.start();
return true;
}
public void stopSearch() {
stopPending = true;
}
#Override
public void run() {
searchInProgress = true;
followLinks(beginAt);
searchInProgress = false;
stopPending = false;
}
}
SimpleSearcher.java
import java.util.Enumeration;
import java.util.Hashtable;
public class SimpleSearcher implements SearchHandler {
private SearchEngine searchEngine;
private String keyword;
private String startURL;
private Hashtable hits = new Hashtable();
public boolean handleElement(String url, String key, String content) {
boolean goodHit = false;
int keywordCount = 0;
int pos = -1;
while ((pos = content.toLowerCase().indexOf(keyword, pos + 1)) >= 0){
keywordCount++;
}
if (keywordCount > 0) {
Integer count = (Integer) hits.get(url);
if (count == null){
hits.put(url, new Integer(1));
}
else {
hits.remove(url);
hits.put(url, new Integer(count.intValue() + keywordCount));
}
goodHit = true;
}
if (hits.size() >= 3)
searchEngine.stopSearch();
return goodHit;
}
public Hashtable search(String startURL, String keyword) {
searchEngine = new SearchEngine();
this.startURL = startURL;
this.keyword = keyword;
searchEngine.startSearch(startURL, this);
try {Thread.sleep(1000);}catch (Exception e){e.printStackTrace();}
while (searchEngine.searchInProgress());
return this.hits;
}
public static void main(String[] args) {
SimpleSearcher searcher = new SimpleSearcher();
String url = "http://www.nzz.ch/";
String compareWord = "der";
Hashtable hits = searcher.search(url, compareWord);
System.out.println("URLs=" + hits.size());
for (Enumeration keys = hits.keys(); keys.hasMoreElements();) {
String thisKey = (String) keys.nextElement();
int thisCount = ((Integer) hits.get(thisKey)).intValue();
System.out.println(thisCount + " hits at " + thisKey);
}
}
}
SearchHandler.java
public interface SearchHandler {
public boolean handleElement(String url, String key, String content);
}

Counting distinct words with Threads

The objective is to count distinct words from a file.
UPDATE: Previous Code was successfully finished. Now I have to do the same but using threads (Oh man, I hate them...) and in addition I want to make it with semaphores for better flow.
Code contains some extra stuff left out from previous attempts, I'm trying to figure out what can be used..
I can read one word at a time but mostly I get a "null" in the container. So until I get anything from the container all the time I can't test the Sorter class and so on...
The new addition to the program is WordContainer class to store one word to pass it from reader to sorter:
package main2;
import java.util.ArrayList;
public class WordContainer
{
private ArrayList<String> words;
public synchronized String take()
{
String nextWord = null;
while (words.isEmpty())
{
try
{
wait();
}
catch (InterruptedException e)
{
}
}
nextWord = words.remove(0);
notify();
return nextWord;
}
public synchronized void put(String word)
{
while (words.size() > 999)
{
try
{
wait();
}
catch (InterruptedException e)
{
}
}
words.add(word);
notify();
}
}
DataSet Class combined with Sorter method resulting in Sorter Class:
package main2;
import java.util.concurrent.Semaphore;
public class Sorter extends Thread
{
private WordContainer wordContainer;
private int top;
private String[] elements;
private boolean stopped;
private Semaphore s;
private Semaphore s2;
public Sorter(WordContainer wordContainer, Semaphore s, Semaphore s2)
{
this.wordContainer = wordContainer;
elements = new String[1];
top = 0;
stopped = false;
this.s = s;
this.s2 = s2;
}
public void run()
{
String nextWord = wordContainer.take();
while (nextWord != null)
{
try
{
s.acquire();
}
catch (InterruptedException e)
{
e.printStackTrace();
}
nextWord = wordContainer.take();
s2.release();
add(nextWord);
}
}
public void startSorting()
{
start();
}
public void stopSorting()
{
stopped = true;
}
public boolean member(String target)
{
if (top > 0)
{
return binarySearch(target, 0, top);
}
else
{
return false;
}
}
private boolean binarySearch(String target, int from, int to)
{
if (from == to - 1)
{
return elements[from].equals(target);
}
int middle = (to - from) / 2 + from;
if (elements[from].equals(target))
{
return true;
}
if (elements[middle].compareTo(target) > 0)
{
// search left
return binarySearch(target, from, middle);
}
else
{
// search right
return binarySearch(target, middle, to);
}
}
public void add(String nextElement)
{
if (top < elements.length)
{
elements[top++] = nextElement;
System.out.println("[" + top + "] " + nextElement);
sort();
}
else
{
String[] newArray = new String[elements.length * 2];
for (int i = 0; i < elements.length; i++)
{
newArray[i] = elements[i];
}
elements = newArray;
add(nextElement);
}
}
private void sort()
{
int index = 0;
while (index < top - 1)
{
if (elements[index].compareTo(elements[index + 1]) < 0)
{
index++;
}
else
{
String temp = elements[index];
elements[index] = elements[index + 1];
elements[index + 1] = temp;
if (index > 0)
{
index--;
}
}
}
}
public int size()
{
return top;
}
public String getSortedWords()
{
String w = "";
for (int i = 0; i < elements.length; i++)
{
w += elements[i] + ", ";
}
return w;
}
public int getNumberOfDistinctWords()
{
return top;
}
}
Reader Class now looks like this:
package main2;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.concurrent.Semaphore;
public class Reader extends Thread
{
private static final int whitespace = 45;
private static final int word = 48;
private static final int finished = -1;
private WordContainer wordContainer;
private Semaphore s;
private Semaphore s2;
private String[] wordsR;
private int state;
private BufferedReader reader;
private int nextFreeIndex;
public Reader(File words, WordContainer wordContainer, Semaphore s,
Semaphore s2)
{
state = whitespace;
try
{
reader = new BufferedReader(new FileReader(words));
}
catch (FileNotFoundException e)
{
e.printStackTrace();
}
nextFreeIndex = 0;
wordsR = new String[1];
this.wordContainer = wordContainer;
this.s = s;
this.s2 = s;
}
public void startReading()
{
start();
}
public void run()
{
String nextWord = readNext();
while (nextWord != null)
{
nextWord = readNext();
wordContainer.put(nextWord);
s.release();
try
{
s2.acquire();
}
catch (InterruptedException e)
{
e.printStackTrace();
}
}
}
public String readNext()
{
int next;
StringBuffer nextWord = new StringBuffer();
while (true)
{
try
{
next = reader.read();
}
catch (IOException e)
{
next = -1;
}
char nextChar = (char) next;
switch (state)
{
case whitespace:
if (isWhiteSpace(nextChar))
{
state = whitespace;
}
else if (next == -1)
{
state = finished;
}
else
{
nextWord.append(nextChar);
state = word;
}
break;
case word:
if (isWhiteSpace(nextChar))
{
state = whitespace;
return nextWord.toString();
}
else if (next == -1)
{
state = finished;
return nextWord.toString();
}
else
{
nextWord.append(nextChar);
state = word;
}
break;
case finished:
return null;
}
}
}
private boolean isWhiteSpace(char nextChar)
{
switch (nextChar)
{
case '-':
case '"':
case ':':
case '\'':
case ')':
case '(':
case '!':
case ']':
case '?':
case '.':
case ',':
case ';':
case '[':
case ' ':
case '\t':
case '\n':
case '\r':
return true;
}
return false;
}
public void close()
{
try
{
reader.close();
}
catch (IOException e)
{
}
}
public String getWords()
{
return wordContainer.take();
}
}
Test Class
package test;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.Semaphore;
import main2.Reader;
import main2.Sorter;
import main2.WordContainer;
import junit.framework.Assert;
import junit.framework.TestCase;
public class TestDistinctWordsWithThreads extends TestCase
{
public void test() throws IOException, InterruptedException
{
File words = new File("resources" + File.separator + "AV1611Bible.txt");
if (!words.exists())
{
System.out.println("File [" + words.getAbsolutePath()
+ "] does not exist");
Assert.fail();
}
WordContainer container = new WordContainer();
Semaphore s = new Semaphore(0);
Semaphore s2 = new Semaphore(0);
Reader reader = new Reader(words, container, s, s2);
Sorter sorter = new Sorter(container, s, s2);
reader.startReading();
sorter.startSorting();
reader.join();
sorter.join();
System.out.println(reader.getWords());
Assert.assertTrue(sorter.getNumberOfDistinctWords() == 14720);
/*
* String bible = reader.getWords(); System.out.println(bible); String[]
* bible2 = sorter.getSortedWords(); System.out.println(bible2);
* assertTrue(bible2.length < bible.length());
*/
}
}
Why don't you sinply try something like:
public int countWords(File file) {
Scanner sc = new Scanner(file);
Set<String> allWords = new HashSet<String>();
while(sc.hasNext()) {
allWords.add(sc.next());
}
return allWords.size();
}

Categories