java- removing substring in a list of strings - java

Consider the case of a list of strings
example: list = ['apple', 'bat', 'cow', 'dog', 'applebat', 'cowbat', 'dogbark', 'help']
The Java code must check whether any element of the list is a substring of another element; if it is, the larger (containing) string must be removed.
So in this case the strings 'applebat', 'cowbat' and 'dogbark' are removed.
The approach I have taken was to take two lists and iterate over them in the following way,
// Two snapshots of the input: `list1` supplies the candidate substrings and is
// never modified; `list2` is pruned in place and holds the answer at the end.
ArrayList<String> list1 = new ArrayList<String>(strings);
ArrayList<String> list2 = new ArrayList<String>(strings);
for (String needle : list1) {
    // Drop every entry that strictly contains `needle`; an entry equal to it stays.
    list2.removeIf(candidate -> candidate.contains(needle) && !candidate.equals(needle));
}
IMPORTANT I have lists with the sizes of 200K to 400K elements.I would like to find a way to improve performance. I even tried hashsets but they were not much help.I am facing issues with the time taken by the program.
Can any one suggest any improvements to my code or any other approaches in java to improve performance??

import java.util.ArrayList;
import java.util.*;
// our main class becomes a file but the main method is still found
/**
 * Demo: drops every word that contains another word of the list as a proper substring.
 */
public class HelloWorld
{
    /**
     * Returns a copy of {@code words} without the entries that strictly contain some
     * other entry of the list. Surviving entries keep their original relative order.
     *
     * <p>Every pair is compared, so the result does not depend on whether the short
     * word appears before or after the long one. Note that an empty string is a
     * substring of everything: if present, it removes all non-empty entries.
     *
     * @param words the candidate words; not modified
     * @return a new mutable list holding the surviving words
     */
    static List<String> removeSuperstrings(List<String> words)
    {
        List<String> kept = new ArrayList<String>(words);
        for (String needle : words)
        {
            kept.removeIf(candidate -> candidate.contains(needle) && !candidate.equals(needle));
        }
        return kept;
    }

    /** Prints each sample word, then the list with all "containing" words removed. */
    public static void main(String[] args)
    {
        String[] strings = {"apple","bat","cow","dog","applebat","cowbat","dogbark","help"};
        List<String> words = Arrays.asList(strings);
        // Bounds come from the data itself rather than a hard-coded 8.
        for (String word : words)
        {
            System.out.println(word);
        }
        System.out.println(removeSuperstrings(words));
    }
}

For full performance boost of huge list of words, I would think a combination of sort and a string searching algorithm, such as the Aho–Corasick algorithm, is what you need, assuming you're willing to implement such complex logic.
First, sort the words by length.
Then build up the Aho–Corasick Dictionary, in word length order. For each word, first check if a substring exists in the dictionary. If it does, skip the word, otherwise add the word to the dictionary.
When done, dump the dictionary, or the parallel-maintained list if dictionary is not easy/possible to dump.

I suppose set will be faster here.
You can easy do that with java8 stream api.
Try that:
/**
 * Builds a fixed sample set and strips every entry that strictly contains another entry.
 * The loops walk a snapshot, so the set being pruned is never the one being iterated.
 *
 * @return the sample set without the "containing" strings
 */
private Set<String> delete() {
    Set<String> survivors = new HashSet<>(Arrays.asList("a", "b", "c", "d", "ab", "bc", "ce", "fg"));
    Set<String> snapshot = new HashSet<>(survivors);
    for (String inner : snapshot) {
        for (String outer : snapshot) {
            boolean strictlyContains = outer.contains(inner) && !inner.equals(outer);
            if (strictlyContains) {
                survivors.remove(outer);
            }
        }
    }
    return survivors;
}
Do not delete any elements from set you are iterating for or you will have ConcurrentModificationException.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Random;
/**
 * Compares two equivalent ways (stream-based and plain loops) of removing every string
 * that contains another string of the same list.
 */
public class SubStrRmove {
    /**
     * Generates random lowercase words.
     *
     * @param size number of words to generate
     * @return a list of {@code size} strings, each 2 to 4 letters long
     */
    public static List<String> randomList(int size) {
        final String BASE = "abcdefghijklmnopqrstuvwxyz";
        Random random = new Random();
        List<String> list = new ArrayList<>();
        for (int i = 0; i < size; i++) {
            int length = random.nextInt(3) + 2;
            // StringBuilder: no cross-thread sharing here, so no need for StringBuffer's locking.
            StringBuilder sb = new StringBuilder(length);
            for (int j = 0; j < length; j++) {
                sb.append(BASE.charAt(random.nextInt(BASE.length())));
            }
            list.add(sb.toString());
        }
        return list;
    }

    /** Returns the words as an array sorted by ascending length (stable for equal lengths). */
    private static String[] sortedByLength(List<String> args) {
        String[] input = args.toArray(new String[0]);
        // Integer.compare instead of subtraction: the idiomatic, overflow-proof comparator.
        Arrays.parallelSort(input, (s1, s2) -> Integer.compare(s1.length(), s2.length()));
        return input;
    }

    /**
     * Stream-based variant: keeps a word only if no already-kept (shorter or equal length)
     * word occurs inside it. Duplicates collapse to a single entry.
     *
     * @param args the input words; not modified
     * @return the kept words, ordered by ascending length
     */
    public static List<String> removeListSubStr(List<String> args) {
        List<String> result = new ArrayList<>(args.size());
        for (String candidate : sortedByLength(args)) {
            if (result.stream().noneMatch(candidate::contains)) {
                result.add(candidate);
            }
        }
        return result;
    }

    /**
     * Loop-based variant of {@link #removeListSubStr(List)}; produces the same result.
     *
     * @param args the input words; not modified
     * @return the kept words, ordered by ascending length
     */
    public static List<String> removeListSubStr2(List<String> args) {
        List<String> result = new ArrayList<>(args.size());
        for (String candidate : sortedByLength(args)) {
            boolean isDiff = true;
            for (String kept : result) {
                if (candidate.contains(kept)) {
                    isDiff = false;
                    break;
                }
            }
            if (isDiff) {
                result.add(candidate);
            }
        }
        return result;
    }

    /** Times both variants on 20000 random words and prints timings and results. */
    public static void main(String[] args) throws InterruptedException {
        List<String> list = randomList(20000);
        long start1 = System.currentTimeMillis();
        List<String> listLambda = removeListSubStr(list);
        long end1 = System.currentTimeMillis();
        long start2 = System.currentTimeMillis();
        List<String> listFor = removeListSubStr2(list);
        long end2 = System.currentTimeMillis();
        System.out.println("mothod Labbda:" + (end1 - start1) + "ms");
        System.out.println("mothod simple:" + (end2 - start2) + "ms");
        System.out.println("" + listLambda.size() + listLambda);
        System.out.println("" + listFor.size() + listFor);
    }
}

I have tested it on small data and hope it helps you to find solution...
import java.util.ArrayList;
import java.util.Arrays;
/**
 * Demo: removes duplicates and every string that contains another string of the array,
 * then prints the elapsed time and the surviving strings.
 */
public class Main {
    public static void main(String[] args){
        String[] list = {"apple","bat","cow","dog","applebat","cowbat","dogbark","help","helpless","cows"};
        System.out.println(Arrays.toString(list));
        long pretime = System.nanoTime();
        boolean[] removed = markRedundant(list);
        long protime = System.nanoTime();
        long time = (protime - pretime);
        System.out.println(time + "ns");
        ArrayList<String> survivors = new ArrayList<>();
        for (int i = 0; i < list.length; i++) {
            if (!removed[i]) {
                survivors.add(list[i]);
            }
        }
        System.out.println(survivors.toString());
    }

    /**
     * Flags every entry that is a later duplicate of another entry or that contains
     * another entry as a proper substring.
     *
     * <p>A separate boolean mask is used instead of overwriting entries with a "0"
     * sentinel, so the input array is left intact and a genuine "0" value is not
     * mistaken for a deleted slot.
     *
     * @param list the words to examine; not modified
     * @return a mask parallel to {@code list}; {@code true} means "drop this entry"
     */
    private static boolean[] markRedundant(String[] list) {
        boolean[] removed = new boolean[list.length];
        for (int i = 0; i < list.length; i++) {
            if (removed[i]) {
                continue;
            }
            String x = list[i];
            for (int j = i + 1; j < list.length; j++) {
                if (removed[j]) {
                    continue;
                }
                String y = list[j];
                if (y.equals(x) || y.contains(x)) {
                    // y is a duplicate of x or strictly contains it: y goes.
                    removed[j] = true;
                } else if (x.contains(y)) {
                    // x strictly contains y: x goes, nothing more to compare it with.
                    removed[i] = true;
                    break;
                }
            }
        }
        return removed;
    }

    /**
     * Prints the entries of {@code list} that are not the legacy "0" deletion marker.
     * Kept for callers that still mark deleted slots with "0".
     *
     * @param list the array to print, possibly containing "0" markers
     */
    public static void UpdateArray(String[] list){
        ArrayList<String> arrayList = new ArrayList<>();
        for (String entry : list) {
            if (!entry.equals("0")) {
                arrayList.add(entry);
            }
        }
        System.out.println(arrayList.toString());
    }
}
Output :
[apple, bat, cow, dog, applebat, cowbat, dogbark, help, helpless, cows]
time elapsed : 47393ns
[apple, bat, cow, dog, help]

Related

How can i extract trend words from given dataset (Java)? [duplicate]

How to generate an n-gram of a string like:
String Input="This is my car."
I want to generate n-gram with this input:
Input Ngram size = 3
Output should be:
This
is
my
car
This is
is my
my car
This is my
is my car
Give some idea in Java, how to implement that or if any library is available for it.
I am trying to use this NGramTokenizer but its giving n-gram's of character sequence and I want n-grams of word sequence.
I believe this would do what you want:
import java.util.*;
/** Word-level n-gram demo: splits a sentence on single spaces and joins consecutive words. */
public class Test {
    /**
     * Lists all runs of {@code n} consecutive words of {@code str}, in order of appearance.
     *
     * @param n   number of words per n-gram
     * @param str sentence whose words are separated by single spaces
     * @return the n-grams, each joined by single spaces
     */
    public static List<String> ngrams(int n, String str) {
        String[] words = str.split(" ");
        List<String> grams = new ArrayList<String>();
        int limit = words.length - n + 1;
        int start = 0;
        while (start < limit) {
            grams.add(concat(words, start, start + n));
            start++;
        }
        return grams;
    }

    /**
     * Joins {@code words[start..end)} with single spaces.
     *
     * @param words source tokens
     * @param start index of the first word (inclusive)
     * @param end   index after the last word (exclusive)
     * @return the joined words, or an empty string when the range is empty
     */
    public static String concat(String[] words, int start, int end) {
        StringBuilder joined = new StringBuilder();
        int idx = start;
        while (idx < end) {
            if (idx != start) {
                joined.append(" ");
            }
            joined.append(words[idx]);
            idx++;
        }
        return joined.toString();
    }

    /** Prints the 1-, 2- and 3-grams of a sample sentence, one blank line after each group. */
    public static void main(String[] args) {
        int n = 1;
        while (n <= 3) {
            List<String> grams = ngrams(n, "This is my car.");
            for (String gram : grams) {
                System.out.println(gram);
            }
            System.out.println();
            n++;
        }
    }
}
Output:
This
is
my
car.
This is
is my
my car.
This is my
is my car.
An "on-demand" solution implemented as an Iterator:
/**
 * Lazily produces the word n-grams of a sentence, one per call to {@link #next()}.
 * Words are obtained by splitting the sentence on single spaces.
 */
class NgramIterator implements Iterator<String> {
    String[] words;
    int pos = 0, n;

    /**
     * @param n   number of words per n-gram
     * @param str sentence whose words are separated by single spaces
     */
    public NgramIterator(int n, String str) {
        this.n = n;
        words = str.split(" ");
    }

    /** @return {@code true} while at least {@code n} words remain from the current position */
    @Override
    public boolean hasNext() {
        return pos < words.length - n + 1;
    }

    /**
     * @return the next n-gram, words joined by single spaces
     * @throws java.util.NoSuchElementException if the iteration is exhausted
     */
    @Override
    public String next() {
        // Honour the Iterator contract instead of failing with an array index error.
        if (!hasNext()) {
            throw new java.util.NoSuchElementException("no more " + n + "-grams");
        }
        StringBuilder sb = new StringBuilder();
        for (int i = pos; i < pos + n; i++) {
            if (i > pos) {
                sb.append(' ');
            }
            sb.append(words[i]);
        }
        pos++;
        return sb.toString();
    }

    /** Removal is not supported. */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}
You are looking for ShingleFilter.
Update: The link points to version 3.0.2. This class may be in different package in newer version of Lucene.
This code returns an array of all Strings of the given length:
/**
 * Returns every run of {@code len} consecutive words of {@code s}.
 *
 * @param s   sentence whose words are separated by single spaces
 * @param len number of words per n-gram
 * @return the n-grams in order of appearance; an empty array when the sentence has
 *         fewer than {@code len} words
 */
public static String[] ngrams(String s, int len) {
    String[] parts = s.split(" ");
    // Clamp at zero: asking for longer n-grams than there are words yields no n-grams
    // rather than a NegativeArraySizeException.
    int count = Math.max(0, parts.length - len + 1);
    String[] result = new String[count];
    for (int i = 0; i < count; i++) {
        StringBuilder sb = new StringBuilder();
        for (int k = 0; k < len; k++) {
            if (k > 0) {
                sb.append(' ');
            }
            sb.append(parts[i + k]);
        }
        result[i] = sb.toString();
    }
    return result;
}
E.g.
System.out.println(Arrays.toString(ngrams("This is my car", 2)));
//--> [This is, is my, my car]
System.out.println(Arrays.toString(ngrams("This is my car", 3)));
//--> [This is my, is my car]
/**
 * Generates every contiguous word n-gram of size 1 up to {@code maxGramSize}.
 *
 * <p>The text is split on runs of non-word characters, so punctuation and repeated
 * separators never produce empty tokens. For each word (left to right) the result
 * lists the word itself, then the 2-gram ending at it, then the 3-gram, and so on.
 *
 * @param str the sentence to tokenize; should contain at least one word
 * @param maxGramSize the largest n-gram size to emit; values below 1 behave like 1
 * @return the n-grams in the order described above
 */
public static List<String> generateNgramsUpto(String str, int maxGramSize) {
    // "\\W+" (one or more non-word chars); the former "[\\W+]" was a character class
    // matching a single separator, which yielded empty tokens for input like "a, b".
    List<String> words = new ArrayList<String>();
    for (String token : str.split("\\W+")) {
        if (!token.isEmpty()) { // a leading separator still produces one empty token
            words.add(token);
        }
    }
    List<String> ngrams = new ArrayList<String>();
    for (int end = 0; end < words.size(); end++) {
        // 1- add the word itself
        StringBuilder sb = new StringBuilder(words.get(end));
        ngrams.add(words.get(end));
        // 2- keep prepending earlier words until the size limit or the sentence start
        for (int start = end - 1; start >= 0 && end - start < maxGramSize; start--) {
            sb.insert(0, ' ').insert(0, words.get(start));
            ngrams.add(sb.toString());
        }
    }
    return ngrams;
}
Call:
long startTime = System.currentTimeMillis();
ngrams = ToolSet.generateNgramsUpto("This is my car.", 3);
long stopTime = System.currentTimeMillis();
System.out.println("My time = "+(stopTime-startTime)+" ms with ngramsize = "+ngrams.size());
System.out.println(ngrams.toString());
Output:
My time = 1 ms with ngramsize = 9 [This, is, This is, my, is my, This
is my, car, my car, is my car]
/**
 * Builds a word n-gram model (sizes 2 and 3) from the given sentences and prints the
 * n-grams that remain after the cutoff is applied.
 *
 * Relies on OpenNLP types (NGramModel, POSTaggerME, ...) that are declared elsewhere.
 *
 * @param list   input texts; each entry is read line by line
 * @param cutoff lower bound passed to NGramModel.cutoff — presumably the minimum
 *               occurrence count an n-gram needs to be kept; confirm against the OpenNLP docs
 */
public static void CreateNgram(ArrayList<String> list, int cutoff) {
try
{
NGramModel ngramModel = new NGramModel();
// The POS model file is resolved relative to the working directory.
POSModel model = new POSModelLoader().load(new File("en-pos-maxent.bin"));
PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
POSTaggerME tagger = new POSTaggerME(model);
perfMon.start();
for(int i = 0; i<list.size(); i++)
{
String inputString = list.get(i);
// NOTE(review): lineStream is never closed — confirm whether ObjectStream needs close().
ObjectStream<String> lineStream = new PlainTextByLineStream(new StringReader(inputString));
String line;
while ((line = lineStream.read()) != null)
{
String whitespaceTokenizerLine[] = WhitespaceTokenizer.INSTANCE.tokenize(line);
// The tags are only used to build the POSSample; the n-grams below are made of the words.
String[] tags = tagger.tag(whitespaceTokenizerLine);
POSSample sample = new POSSample(whitespaceTokenizerLine, tags);
perfMon.incrementCounter();
String words[] = sample.getSentence();
if(words.length > 0)
{
// k = 2 and k = 3: add the bigrams, then the trigrams, of this line.
for(int k = 2; k< 4; k++)
{
ngramModel.add(new StringList(words), k, k);
}
}
}
}
ngramModel.cutoff(cutoff, Integer.MAX_VALUE);
Iterator<StringList> it = ngramModel.iterator();
while(it.hasNext())
{
StringList strList = it.next();
System.out.println(strList.toString());
}
perfMon.stopAndPrintFinalResult();
}catch(Exception e)
{
// Any failure (model loading, reading, tagging) is only reported on stdout; nothing is rethrown.
System.out.println(e.toString());
}
}
Here is my code to create n-grams. In this case, n = 2, 3. Word-sequence n-grams that occur fewer times than the cutoff value are dropped from the result set. The input is a list of sentences, which are parsed using the OpenNLP tools.
/** Prints the n-grams of a sample sentence for step sizes 0, 1 and 2, one blank line after each group. */
public static void main(String[] args) {
    String[] words = "This is my car.".split(" ");
    int stepSize = 0;
    while (stepSize < 3) {
        List<String> grams = ngrams(stepSize, words);
        for (int idx = 0; idx < grams.size(); idx++) {
            System.out.println(grams.get(idx));
        }
        System.out.println();
        stepSize++;
    }
}
/**
 * Lists the word n-grams made of {@code stepSize + 1} consecutive words.
 *
 * @param stepSize number of words that follow the first word of each n-gram
 *                 (0 gives single words, 1 gives bigrams, ...)
 * @param words    the tokenized sentence
 * @return the n-grams in order of appearance, words joined by single spaces with
 *         no leading or trailing space
 */
public static List<String> ngrams(int stepSize, String[] words) {
    List<String> ngrams = new ArrayList<String>();
    for (int i = 0; i < words.length - stepSize; i++) {
        StringBuilder gram = new StringBuilder();
        int last = i + stepSize;
        for (int k = i; k <= last && k < words.length; k++) {
            // Separator goes between words only; the old code prefixed every word,
            // so each n-gram started with a stray space.
            if (k > i) {
                gram.append(' ');
            }
            gram.append(words[k]);
        }
        ngrams.add(gram.toString());
    }
    return ngrams;
}
Check this out:
/** Prints all n-grams of a sample sentence, from the longest (whole sentence) down to single words. */
public static void main(String[] args) {
    NGram nGram = new NGram();
    String[] tokens = "this is my car".split(" ");
    List<String> ngrams = new ArrayList<>();
    for (int size = tokens.length; size >= 1; size--) {
        List<String> ofThisSize = nGram.getNGram(tokens, size, new ArrayList<>());
        ngrams.addAll(ofThisSize);
    }
    System.out.println(ngrams);
}
/**
 * Appends to {@code ngrams} every run of {@code n} consecutive tokens, recursing on
 * the token array with its first element dropped until fewer than {@code n} remain.
 *
 * @param tokens the remaining tokens
 * @param n      number of tokens per n-gram
 * @param ngrams accumulator; mutated and returned
 * @return the same {@code ngrams} list
 */
private List<String> getNGram(String[] tokens, int n, List<String> ngrams) {
    // Base case: not enough tokens left for another n-gram.
    if (n > tokens.length) {
        return ngrams;
    }
    StringBuilder gram = new StringBuilder();
    int idx = 0;
    while (idx < n) {
        gram.append(tokens[idx]);
        gram.append(" ");
        idx++;
    }
    // trim() removes the separator appended after the last token.
    ngrams.add(gram.toString().trim());
    String[] tail = Arrays.copyOfRange(tokens, 1, tokens.length);
    return getNGram(tail, n, ngrams);
}
Simple recursive function, better running time.

Sort ArrayList using predefined list of indices

I am trying to sort an ArrayList using a predefined array of indices.
My current example uses a copy of the original ArrayList for sorting and therefore is not scalable for ArrayLists of larger objects
package sortExample;
import java.awt.List;
import java.util.ArrayList;
import java.util.Arrays;
/**
 * Demo: reorders a list so that the element at position k ends up at position
 * {@code indices[k]}, then prints the original and the reordered list.
 */
public class sortExample {
    public static void main(String[] args) {
        String [] str = new String[] {"a","b","c","d"};
        ArrayList<String> arr1 = new ArrayList<String>(Arrays.asList(str));
        int [] indices = {3,1,2,0};
        // Scatter into a plain array first: no "0" placeholder fill and no raw ArrayList.
        String[] placed = new String[arr1.size()];
        for (int k = 0; k < indices.length; k++) {
            placed[indices[k]] = arr1.get(k);
        }
        ArrayList<String> arr2 = new ArrayList<String>(Arrays.asList(placed));
        System.out.println(arr1.toString());
        System.out.println(arr2.toString());
    }
}
For reusing same data, please see my solution:
/** Reorders a sample array in place and prints the (now identity) indices and the result. */
public static void main(String[] args) {
    String[] strs = new String[]{"a", "b", "c", "d"};
    int[] indices = {3, 1, 2, 0};
    reorderInPlace(strs, indices);
    for (int i : indices) {
        System.out.print(i + " ");
    }
    System.out.println();
    for (String str : strs) {
        System.out.print(str + " ");
    }
}

/**
 * Moves each element {@code strs[k]} to position {@code indices[k]} without a second array.
 *
 * <p>Each permutation cycle is followed to its end. A single swap per index, as in the
 * earlier version, loses information for cycles of length four or more
 * (e.g. {1,2,3,0} produced "b a d c" instead of "d a b c").
 *
 * @param strs    the array to reorder; modified in place
 * @param indices target position of every element; must be a permutation of
 *                0..n-1 and is reset to the identity {0,1,...,n-1} as a side effect
 * @throws IllegalArgumentException if the lengths differ or {@code indices} is not a permutation
 */
static void reorderInPlace(String[] strs, int[] indices) {
    if (strs.length != indices.length) {
        throw new IllegalArgumentException(
                "length mismatch: " + strs.length + " elements, " + indices.length + " indices");
    }
    // Validate up front: a repeated or out-of-range target would make the cycle walk loop forever or crash.
    boolean[] seen = new boolean[indices.length];
    for (int target : indices) {
        if (target < 0 || target >= seen.length || seen[target]) {
            throw new IllegalArgumentException("indices is not a permutation: bad target " + target);
        }
        seen[target] = true;
    }
    for (int i = 0; i < strs.length; i++) {
        while (indices[i] != i) {
            int target = indices[i];
            // Send the element at i to its final slot; slot i now holds the element
            // that used to live at target, together with that element's destination.
            String tmp = strs[i];
            strs[i] = strs[target];
            strs[target] = tmp;
            indices[i] = indices[target];
            indices[target] = target;
        }
    }
}
Output is:
0 1 2 3
d b c a
Alternate reorder in place based on cycles. Note that indices will be changed to {0,1,2,3}. I don't have Java installed (yet), so I converted working C++ code to what I think is proper Java syntax.
// In-place reorder of arr1 by following permutation cycles: after the loop, slot k holds
// the element that was originally at arr1[indices[k]], and indices is reset to the identity.
for (int i = 0; i < arr1.size(); i++) {
if(i != indices[i]) {
// Save the element that opens the cycle; it is written back when the cycle closes.
String st = arr1.get(i);
// NOTE(review): t is never read in this snippet.
int t = indices[i];
int k = i;
int j;
// Walk the cycle: pull the element from slot j = indices[k] into slot k until the walk returns to i.
while(i != (j = indices[k])){
arr1.set(k, arr1.get(j));
// Mark slot k as done so the outer loop skips it later.
indices[k] = k;
k = j;
}
// k is now the last slot of the cycle; it receives the saved first element.
arr1.set(k, st);
indices[k] = k;
}
}
For this specific case {3,1,2,0}, all this does is swap 0 and 3. The longest cycle occurs when the indices are rotated, such as {3 0 1 2}, in which case st=arr1[0], arr1[0] = arr1[3], arr[3] = arr1[2], arr1[2] = arr1[1], arr1[1] = st.
There is a (a little bit) more simple solution:
// Gather: slot i of the new list receives the element found at arr1[indices[i]].
// NOTE(review): the question's code scatters instead (arr2.set(indices[k], arr1.get(k))).
// Both give the same result for {3,1,2,0}, but presumably differ for permutations that
// are not their own inverse — confirm which meaning is wanted.
int [] indices = {3,1,2,0};
ArrayList<String> arr2 = new ArrayList<String>();
// Assumes indices has at least arr1.size() entries.
for (int i = 0; i < arr1.size(); i++) {
arr2.add(arr1.get(indices[i]));
}
At the below, just use "indices" for a new array.
/** Demo: places each string at the position given by the parallel index array and prints the result. */
public class Sorting {
    public static void main(String[] args) {
        String[] str = {"a", "b", "c", "d"};
        int[] indices = {3, 1, 2, 0};
        String[] sorted = new String[str.length];
        // Element src goes to slot indices[src].
        for (int src = 0; src < str.length; src++) {
            sorted[indices[src]] = str[src];
        }
        // Build the whole line first; every entry is followed by one space.
        StringBuilder line = new StringBuilder();
        for (int slot = 0; slot < sorted.length; slot++) {
            line.append(sorted[slot]).append(" ");
        }
        System.out.print(line.toString());
    }
}
prints: d b c a

Java array I need your thought

Given an array of strings, return another array containing all of its longest strings.
For (String [] x = {"serm", "aa", "sazi", "vcd", "aba","kart"};)
output will be
{"serm", "sazi" , "kart"}.
The following code is wrong, What can I do to fix it.
/** Finds all strings of maximal length in an array. */
public class Tester {
    public static void main(String[] args) {
        Tester all = new Tester();
        String [] x = {"serm", "aa", "sazi", "vcd", "aba","kart"};
        String [] y = all.allLongestStrings(x);
        // Print the contents; println(y) would only show the array's identity string.
        System.out.println(Arrays.toString(y));
    }

    /**
     * Returns every string whose length equals the maximum length found in {@code input},
     * in their original order.
     *
     * @param input the candidate strings; may be empty
     * @return the longest strings, or an empty array for empty input
     */
    String[] allLongestStrings(String[] input) {
        ArrayList<String> answer = new ArrayList<String>();
        int longest = -1; // below any real length, so the first string always starts a new group
        for (String candidate : input) {
            int length = candidate.length();
            if (length > longest) {
                // Found a longer string: everything collected so far is too short.
                longest = length;
                answer.clear();
            }
            if (length == longest) {
                answer.add(candidate);
            }
        }
        return answer.toArray(new String[0]);
    }
}
I will give you a solution, but as it is homework, it will only be pseudocode.
The problem with your solution is that you are not finding the longest strings, but the strings that are the same size as or longer than the first element.
let helper = []
let maxLength = 0;
for each string in array
if (len(string) >maxLength){
maxLength = len(string);
clear(helper)
}
if (len(string) == maxLength)
helper.add(string)
}
return helper;
You can try below code
/**
 * Returns all strings of maximal length from {@code inputArray}, skipping null entries.
 *
 * @param inputArray candidate strings; individual entries may be null
 * @return the longest strings in their original order
 */
private static String[] solution(String[] inputArray) {
    int maxLen = 0;
    List<String> candidates = new ArrayList<>();
    // First pass: remember every string that is at least as long as the running maximum.
    for (String entry : inputArray) {
        if (entry == null || entry.length() < maxLen) {
            continue;
        }
        maxLen = entry.length();
        candidates.add(entry);
    }
    // Second pass: early candidates may be shorter than the final maximum; drop them.
    final int longest = maxLen;
    return candidates.stream()
            .filter(candidate -> candidate.length() >= longest)
            .toArray(String[]::new);
}

permutations of a string using iteration

I'm trying to find permutation of a given string, but I want to use iteration. The recursive solution I found online and I do understand it, but converting it to an iterative solution is really not working out. Below I have attached my code. I would really appreciate the help:
/**
 * Prints every permutation of the characters of {@code s}, one per line, without recursion.
 *
 * <p>Works on an array of character positions and repeatedly advances it to the next
 * permutation in lexicographic order, so the first line is {@code s} itself. Repeated
 * characters are treated as distinct positions, so "aa" is printed twice.
 *
 * @param s the string to permute; an empty string prints one empty line
 */
public static void combString(String s) {
    int n = s.length();
    int[] order = new int[n];
    for (int i = 0; i < n; i++) {
        order[i] = i;
    }
    char[] line = new char[n];
    while (true) {
        for (int i = 0; i < n; i++) {
            line[i] = s.charAt(order[i]);
        }
        System.out.println(new String(line));

        // Find the rightmost position that can still be increased.
        int pivot = n - 2;
        while (pivot >= 0 && order[pivot] > order[pivot + 1]) {
            pivot--;
        }
        if (pivot < 0) {
            return; // order is fully descending: that was the last permutation
        }
        // Swap it with the smallest larger value to its right ...
        int successor = n - 1;
        while (order[successor] < order[pivot]) {
            successor--;
        }
        int held = order[pivot];
        order[pivot] = order[successor];
        order[successor] = held;
        // ... and put the tail back into ascending order.
        for (int lo = pivot + 1, hi = n - 1; lo < hi; lo++, hi--) {
            held = order[lo];
            order[lo] = order[hi];
            order[hi] = held;
        }
    }
}
Following up on my related question comment, here's a Java implementation that does what you want using the Counting QuickPerm Algorithm:
/**
 * Prints the permutations of {@code s} iteratively (counting QuickPerm): the input is
 * printed first, then one line after every swap.
 *
 * @param s the string whose characters are permuted
 */
public static void combString(String s) {
    // The unmodified string is the first permutation; later lines come from swaps only.
    System.out.println(s);
    char[] chars = s.toCharArray();
    // One counter per position, all zero initially.
    int[] weights = new int[chars.length];
    int upper = 1; // upper index of the pair to swap
    while (upper < chars.length) {
        if (weights[upper] >= upper) {
            // This position has used up its swaps: reset it and move one position right.
            weights[upper] = 0;
            upper++;
            continue;
        }
        // Even upper index swaps with position 0, odd with the position stored in its counter.
        int lower = (upper % 2 == 0) ? 0 : weights[upper];
        char held = chars[upper];
        chars[upper] = chars[lower];
        chars[lower] = held;
        System.out.println(new String(chars));
        weights[upper]++;
        upper = 1; // after a swap, start again from the shortest prefix
    }
}
/**
 * Converts a character array into a string.
 *
 * @param a the characters to join
 * @return a string made of the characters of {@code a}, in order
 */
private static String join(char[] a) {
    return String.valueOf(a);
}
/**
 * Exchanges the characters at positions {@code i} and {@code j} of {@code a} in place.
 *
 * @param a the array to modify
 * @param i first position
 * @param j second position
 */
private static void swap(char[] a, int i, int j) {
    char first = a[i];
    char second = a[j];
    a[i] = second;
    a[j] = first;
}
// Builds all permutations of test_str iteratively: start from the first character and,
// for each further character, insert it at every position of every permutation built so far.
List<String> results = new ArrayList<String>();
String test_str = "abcd";
char[] chars = test_str.toCharArray();
// NOTE(review): chars[0] fails with an index error if test_str is empty.
results.add(new String("" + chars[0]));
for(int j=1; j<chars.length; j++) {
char c = chars[j];
// Snapshot of the size: only the entries present before this round are expanded.
int cur_size = results.size();
// create new permutations combining char 'c' with each of the existing permutations
// Walking backwards keeps the indices of the not-yet-processed old entries valid,
// because removals happen above them and new strings are appended at the end.
for(int i=cur_size-1; i>=0; i--) {
String str = results.remove(i);
// l runs up to and including str.length(), so c is also appended after the last character.
for(int l=0; l<=str.length(); l++) {
results.add(str.substring(0,l) + c + str.substring(l));
}
}
}
System.out.println("Number of Permutations: " + results.size());
System.out.println(results);
Example:
if we have 3 character string e.g. "abc", we can form permuations as below.
1) construct a string with first character e.g. 'a' and store that in results.
char[] chars = test_str.toCharArray();
results.add(new String("" + chars[0]));
2) Now take next character in string (i.e. 'b') and insert that in all possible positions of previously contsructed strings in results. Since we have only one string in results ("a") at this point, doing so gives us 2 new strings 'ba', 'ab'. Insert these newly constructed strings in results and remove "a".
for(int i=cur_size-1; i>=0; i--) {
String str = results.remove(i);
for(int l=0; l<=str.length(); l++) {
results.add(str.substring(0,l) + c + str.substring(l));
}
}
3) Repeat 2) for every character in the given string.
for(int j=1; j<chars.length; j++) {
char c = chars[j];
....
....
}
This gives us "cba", "bca", "bac" from "ba" and "cab", "acb" and "abc" from "ab"
Work queue allows us to create an elegant iterative solution for this problem.
/**
 * Builds all permutations of {@code string} with an explicit FIFO work queue instead of recursion.
 *
 * @param string the characters to permute
 * @return the permutations in the order they are completed; empty for an empty input
 */
static List<String> permutations(String string) {
    List<String> finished = new LinkedList<>();
    Deque<WorkUnit> pending = new LinkedList<>();
    // Initial unit: everything still to place, nothing placed yet.
    pending.add(new WorkUnit(string, ""));
    for (WorkUnit unit = pending.poll(); unit != null; unit = pending.poll()) {
        String remaining = unit.todo;
        // Extend the prefix by each remaining character in turn.
        for (int pick = 0; pick < remaining.length(); pick++) {
            String prefix = unit.done + remaining.charAt(pick);
            if (prefix.length() != string.length()) {
                // Not complete yet: queue the prefix together with the unused characters.
                String unused = remaining.substring(0, pick) + remaining.substring(pick + 1);
                pending.add(new WorkUnit(unused, prefix));
            } else {
                finished.add(prefix);
            }
        }
    }
    return finished;
}
A helper class to hold partial results is very simple.
/**
 * Immutable pair describing a partially built permutation:
 * the characters still to place and the prefix placed so far.
 */
class WorkUnit {
    /** Characters that have not been used yet. */
    final String todo;

    /** Prefix assembled so far. */
    final String done;

    /**
     * @param todo characters still to place
     * @param done prefix already built
     */
    WorkUnit(String todo, String done) {
        this.done = done;
        this.todo = todo;
    }
}
You can test the above piece of code by wrapping them in this class.
import java.util.*;
public class AllPermutations {
public static void main(String... args) {
String str = args[0];
System.out.println(permutations(str));
}
static List<String> permutations(String string) {
...
}
}
class WorkUnit {
...
}
Try it by compiling and running.
$ javac AllPermutations.java; java AllPermutations abcd
The below implementation can also be easily tweaked to return a list of permutations in reverse order by using a LIFO stack of work instead of a FIFO queue.
import java.util.List;
import java.util.Set;
import java.util.ArrayList;
import java.util.HashSet;
/** Prints every distinct arrangement of the characters of a sample string. */
public class Anagrams{
    public static void main(String[] args)
    {
        String inpString = "abcd";
        Set<String> combs = getAllCombs(inpString);
        for(String comb : combs)
        {
            System.out.println(comb);
        }
    }

    /**
     * Builds all distinct permutations of {@code inpString} iteratively: start with the
     * first character and insert each further character at every position of every
     * string built so far.
     *
     * @param inpString the characters to permute; may be null or empty
     * @return the distinct permutations; an empty set for null or empty input
     */
    private static Set<String> getAllCombs(String inpString)
    {
        Set<String> combs = new HashSet<String>();
        // Short-circuit "||": with "|" the isEmpty() call ran even for null and threw an NPE.
        if (inpString == null || inpString.isEmpty())
        {
            return combs;
        }
        combs.add(inpString.substring(0,1));
        for(char a : inpString.substring(1).toCharArray())
        {
            Set<String> widened = new HashSet<String>();
            for(String comb : combs)
            {
                widened.addAll(getCombs(comb,a));
            }
            combs = widened;
        }
        return combs;
    }

    /**
     * Inserts {@code a} at every position of {@code comb}, including the end.
     *
     * @param comb the string to extend
     * @param a    the character to insert
     * @return the distinct strings obtained
     */
    private static Set<String> getCombs(String comb, char a) {
        Set<String> combs = new HashSet<String>();
        for(int i = 0 ; i <= comb.length(); i++)
        {
            combs.add(comb.substring(0, i) + a + comb.substring(i));
        }
        return combs;
    }
}
Just posting my approach to the problem:
import java.util.ArrayDeque;
import java.util.Queue;
/** Prints all permutations of a sample string, built iteratively with queues. */
public class PermutationIterative {
    public static void main(String[] args) {
        permutationIterative("abcd");
    }

    /**
     * Builds the permutations one character at a time: each new character is inserted
     * at every position of every string produced in the previous round, then the final
     * queue is printed. Prints "null" for an empty input because no queue is ever created.
     */
    private static void permutationIterative(String str) {
        Queue<String> current = null;
        char[] letters = str.toCharArray();
        for (int idx = 0; idx < letters.length; idx++) {
            char letter = letters[idx];
            if (current == null) {
                // First character: the only permutation is the character itself.
                current = new ArrayDeque<>(1);
                current.add(String.valueOf(letter));
                continue;
            }
            // Strings of the previous round have idx characters, hence idx + 1 insertion slots.
            int slots = idx + 1;
            int pending = current.size();
            Queue<String> next = new ArrayDeque<>(pending * slots);
            while (pending-- > 0) {
                String base = current.remove();
                for (int at = 0; at < slots; at++) {
                    next.add(base.substring(0, at) + letter + base.substring(at));
                }
            }
            current = next;
        }
        System.out.println(current);
    }
}
package vishal villa;
import java.util.Scanner;
/** Reads a line from standard input and prints all permutations of its characters. */
public class Permutation {
    /**
     * Recursively prints every permutation of {@code st} prefixed by {@code ans};
     * each printed line ends with a single space.
     *
     * @param st  characters still to be placed
     * @param ans prefix built so far
     */
    static void result(String st, String ans)
    {
        if (st.isEmpty())
        {
            // Nothing left to place: the prefix is a complete permutation.
            System.out.println(ans + " ");
            return;
        }
        int pick = 0;
        while (pick < st.length())
        {
            // Use the picked character next and recurse on the remaining ones.
            String rest = st.substring(0, pick) + st.substring(pick + 1);
            result(rest, ans + st.charAt(pick));
            pick++;
        }
    }

    public static void main(String[] args)
    {
        Scanner Sc = new Scanner(System.in);
        System.out.println("enter the string");
        String st = Sc.nextLine();
        result(st, "");
    }
}
// Java program to print all permutations of a
// given string.
/** Prints all permutations of a sample string by recursive swapping. */
public class Permutation
{
    public static void main(String[] args)
    {
        String str = "ABC";
        int n = str.length();
        Permutation permutation = new Permutation();
        permutation.permute(str, 0, n-1);
    }

    /**
     * Prints every permutation of {@code str} that keeps the characters before
     * index {@code s} fixed.
     *
     * @param str string to calculate permutations for
     * @param s starting index (first position that may still change)
     * @param e end index (last position of the string)
     */
    private void permute(String str, int s, int e)
    {
        if (s == e)
        {
            System.out.println(str);
            return;
        }
        // Try every character from s to the END (e) in position s. The previous bound
        // "i <= s" never looked past s, and the swap referenced an undefined variable.
        for (int i = s; i <= e; i++)
        {
            // swap() returns a new string, so str itself is untouched and needs no swap back.
            permute(swap(str, s, i), s + 1, e);
        }
    }

    /**
     * Swaps the characters at two positions.
     *
     * @param a string value
     * @param i position 1
     * @param j position 2
     * @return a copy of {@code a} with the two characters exchanged
     */
    public String swap(String a, int i, int j)
    {
        char[] charArray = a.toCharArray();
        char temp = charArray[i];
        charArray[i] = charArray[j];
        charArray[j] = temp;
        return String.valueOf(charArray);
    }
}

N-gram generation from a sentence

How to generate an n-gram of a string like:
String Input="This is my car."
I want to generate n-gram with this input:
Input Ngram size = 3
Output should be:
This
is
my
car
This is
is my
my car
This is my
is my car
Give some idea in Java, how to implement that or if any library is available for it.
I am trying to use this NGramTokenizer but its giving n-gram's of character sequence and I want n-grams of word sequence.
I believe this would do what you want:
import java.util.*;
/** Word-level n-gram demo: splits a sentence on single spaces and joins consecutive words. */
public class Test {
    /**
     * Lists all runs of {@code n} consecutive words of {@code str}, in order of appearance.
     *
     * @param n   number of words per n-gram
     * @param str sentence whose words are separated by single spaces
     * @return the n-grams, each joined by single spaces
     */
    public static List<String> ngrams(int n, String str) {
        String[] words = str.split(" ");
        List<String> grams = new ArrayList<String>();
        int limit = words.length - n + 1;
        int start = 0;
        while (start < limit) {
            grams.add(concat(words, start, start + n));
            start++;
        }
        return grams;
    }

    /**
     * Joins {@code words[start..end)} with single spaces.
     *
     * @param words source tokens
     * @param start index of the first word (inclusive)
     * @param end   index after the last word (exclusive)
     * @return the joined words, or an empty string when the range is empty
     */
    public static String concat(String[] words, int start, int end) {
        StringBuilder joined = new StringBuilder();
        int idx = start;
        while (idx < end) {
            if (idx != start) {
                joined.append(" ");
            }
            joined.append(words[idx]);
            idx++;
        }
        return joined.toString();
    }

    /** Prints the 1-, 2- and 3-grams of a sample sentence, one blank line after each group. */
    public static void main(String[] args) {
        int n = 1;
        while (n <= 3) {
            List<String> grams = ngrams(n, "This is my car.");
            for (String gram : grams) {
                System.out.println(gram);
            }
            System.out.println();
            n++;
        }
    }
}
Output:
This
is
my
car.
This is
is my
my car.
This is my
is my car.
An "on-demand" solution implemented as an Iterator:
/**
 * Lazily produces the word n-grams of a sentence, one per call to {@link #next()}.
 * Words are obtained by splitting the sentence on single spaces.
 */
class NgramIterator implements Iterator<String> {
    String[] words;
    int pos = 0, n;

    /**
     * @param n   number of words per n-gram
     * @param str sentence whose words are separated by single spaces
     */
    public NgramIterator(int n, String str) {
        this.n = n;
        words = str.split(" ");
    }

    /** @return {@code true} while at least {@code n} words remain from the current position */
    @Override
    public boolean hasNext() {
        return pos < words.length - n + 1;
    }

    /**
     * @return the next n-gram, words joined by single spaces
     * @throws java.util.NoSuchElementException if the iteration is exhausted
     */
    @Override
    public String next() {
        // Honour the Iterator contract instead of failing with an array index error.
        if (!hasNext()) {
            throw new java.util.NoSuchElementException("no more " + n + "-grams");
        }
        StringBuilder sb = new StringBuilder();
        for (int i = pos; i < pos + n; i++) {
            if (i > pos) {
                sb.append(' ');
            }
            sb.append(words[i]);
        }
        pos++;
        return sb.toString();
    }

    /** Removal is not supported. */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}
You are looking for ShingleFilter.
Update: The link points to version 3.0.2. This class may be in different package in newer version of Lucene.
This code returns an array of all Strings of the given length:
/**
 * Returns every run of {@code len} consecutive words of {@code s}.
 *
 * @param s   sentence whose words are separated by single spaces
 * @param len number of words per n-gram
 * @return the n-grams in order of appearance; an empty array when the sentence has
 *         fewer than {@code len} words
 */
public static String[] ngrams(String s, int len) {
    String[] parts = s.split(" ");
    // Clamp at zero: asking for longer n-grams than there are words yields no n-grams
    // rather than a NegativeArraySizeException.
    int count = Math.max(0, parts.length - len + 1);
    String[] result = new String[count];
    for (int i = 0; i < count; i++) {
        StringBuilder sb = new StringBuilder();
        for (int k = 0; k < len; k++) {
            if (k > 0) {
                sb.append(' ');
            }
            sb.append(parts[i + k]);
        }
        result[i] = sb.toString();
    }
    return result;
}
E.g.
System.out.println(Arrays.toString(ngrams("This is my car", 2)));
//--> [This is, is my, my car]
System.out.println(Arrays.toString(ngrams("This is my car", 3)));
//--> [This is my, is my car]
/**
 * Generates every contiguous word n-gram of size 1 up to {@code maxGramSize}.
 *
 * <p>The text is split on runs of non-word characters, so punctuation and repeated
 * separators never produce empty tokens. For each word (left to right) the result
 * lists the word itself, then the 2-gram ending at it, then the 3-gram, and so on.
 *
 * @param str the sentence to tokenize; should contain at least one word
 * @param maxGramSize the largest n-gram size to emit; values below 1 behave like 1
 * @return the n-grams in the order described above
 */
public static List<String> generateNgramsUpto(String str, int maxGramSize) {
    // "\\W+" (one or more non-word chars); the former "[\\W+]" was a character class
    // matching a single separator, which yielded empty tokens for input like "a, b".
    List<String> words = new ArrayList<String>();
    for (String token : str.split("\\W+")) {
        if (!token.isEmpty()) { // a leading separator still produces one empty token
            words.add(token);
        }
    }
    List<String> ngrams = new ArrayList<String>();
    for (int end = 0; end < words.size(); end++) {
        // 1- add the word itself
        StringBuilder sb = new StringBuilder(words.get(end));
        ngrams.add(words.get(end));
        // 2- keep prepending earlier words until the size limit or the sentence start
        for (int start = end - 1; start >= 0 && end - start < maxGramSize; start--) {
            sb.insert(0, ' ').insert(0, words.get(start));
            ngrams.add(sb.toString());
        }
    }
    return ngrams;
}
Call:
long startTime = System.currentTimeMillis();
ngrams = ToolSet.generateNgramsUpto("This is my car.", 3);
long stopTime = System.currentTimeMillis();
System.out.println("My time = "+(stopTime-startTime)+" ms with ngramsize = "+ngrams.size());
System.out.println(ngrams.toString());
Output:
My time = 1 ms with ngramsize = 9 [This, is, This is, my, is my, This
is my, car, my car, is my car]
/**
 * Builds a word n-gram model (sizes 2 and 3) from the given sentences and prints the
 * n-grams that remain after the cutoff is applied.
 *
 * Relies on OpenNLP types (NGramModel, POSTaggerME, ...) that are declared elsewhere.
 *
 * @param list   input texts; each entry is read line by line
 * @param cutoff lower bound passed to NGramModel.cutoff — presumably the minimum
 *               occurrence count an n-gram needs to be kept; confirm against the OpenNLP docs
 */
public static void CreateNgram(ArrayList<String> list, int cutoff) {
try
{
NGramModel ngramModel = new NGramModel();
// The POS model file is resolved relative to the working directory.
POSModel model = new POSModelLoader().load(new File("en-pos-maxent.bin"));
PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
POSTaggerME tagger = new POSTaggerME(model);
perfMon.start();
for(int i = 0; i<list.size(); i++)
{
String inputString = list.get(i);
// NOTE(review): lineStream is never closed — confirm whether ObjectStream needs close().
ObjectStream<String> lineStream = new PlainTextByLineStream(new StringReader(inputString));
String line;
while ((line = lineStream.read()) != null)
{
String whitespaceTokenizerLine[] = WhitespaceTokenizer.INSTANCE.tokenize(line);
// The tags are only used to build the POSSample; the n-grams below are made of the words.
String[] tags = tagger.tag(whitespaceTokenizerLine);
POSSample sample = new POSSample(whitespaceTokenizerLine, tags);
perfMon.incrementCounter();
String words[] = sample.getSentence();
if(words.length > 0)
{
// k = 2 and k = 3: add the bigrams, then the trigrams, of this line.
for(int k = 2; k< 4; k++)
{
ngramModel.add(new StringList(words), k, k);
}
}
}
}
ngramModel.cutoff(cutoff, Integer.MAX_VALUE);
Iterator<StringList> it = ngramModel.iterator();
while(it.hasNext())
{
StringList strList = it.next();
System.out.println(strList.toString());
}
perfMon.stopAndPrintFinalResult();
}catch(Exception e)
{
// Any failure (model loading, reading, tagging) is only reported on stdout; nothing is rethrown.
System.out.println(e.toString());
}
}
Here is my code to create n-grams. In this case, n = 2, 3. Word-sequence n-grams that occur fewer times than the cutoff value are dropped from the result set. The input is a list of sentences, which are parsed using the OpenNLP tools.
/** Prints the n-grams of a sample sentence for step sizes 0, 1 and 2, one blank line after each group. */
public static void main(String[] args) {
    String[] words = "This is my car.".split(" ");
    int stepSize = 0;
    while (stepSize < 3) {
        List<String> grams = ngrams(stepSize, words);
        for (int idx = 0; idx < grams.size(); idx++) {
            System.out.println(grams.get(idx));
        }
        System.out.println();
        stepSize++;
    }
}
/**
 * Lists the word n-grams made of {@code stepSize + 1} consecutive words.
 *
 * @param stepSize number of words that follow the first word of each n-gram
 *                 (0 gives single words, 1 gives bigrams, ...)
 * @param words    the tokenized sentence
 * @return the n-grams in order of appearance, words joined by single spaces with
 *         no leading or trailing space
 */
public static List<String> ngrams(int stepSize, String[] words) {
    List<String> ngrams = new ArrayList<String>();
    for (int i = 0; i < words.length - stepSize; i++) {
        StringBuilder gram = new StringBuilder();
        int last = i + stepSize;
        for (int k = i; k <= last && k < words.length; k++) {
            // Separator goes between words only; the old code prefixed every word,
            // so each n-gram started with a stray space.
            if (k > i) {
                gram.append(' ');
            }
            gram.append(words[k]);
        }
        ngrams.add(gram.toString());
    }
    return ngrams;
}
Check this out:
/** Prints all n-grams of a sample sentence, from the longest (whole sentence) down to single words. */
public static void main(String[] args) {
    NGram nGram = new NGram();
    String[] tokens = "this is my car".split(" ");
    List<String> ngrams = new ArrayList<>();
    for (int size = tokens.length; size >= 1; size--) {
        List<String> ofThisSize = nGram.getNGram(tokens, size, new ArrayList<>());
        ngrams.addAll(ofThisSize);
    }
    System.out.println(ngrams);
}
/**
 * Appends to {@code ngrams} every run of {@code n} consecutive tokens, recursing on
 * the token array with its first element dropped until fewer than {@code n} remain.
 *
 * @param tokens the remaining tokens
 * @param n      number of tokens per n-gram
 * @param ngrams accumulator; mutated and returned
 * @return the same {@code ngrams} list
 */
private List<String> getNGram(String[] tokens, int n, List<String> ngrams) {
    // Base case: not enough tokens left for another n-gram.
    if (n > tokens.length) {
        return ngrams;
    }
    StringBuilder gram = new StringBuilder();
    int idx = 0;
    while (idx < n) {
        gram.append(tokens[idx]);
        gram.append(" ");
        idx++;
    }
    // trim() removes the separator appended after the last token.
    ngrams.add(gram.toString().trim());
    String[] tail = Arrays.copyOfRange(tokens, 1, tokens.length);
    return getNGram(tail, n, ngrams);
}
Simple recursive function, better running time.

Categories