Tokenize devnagari words into letters

Tokenize devnagari words into letters - java

I have something like
a = "बिक्रम मेरो नाम हो"
I want to achieve something like in Java
a[0] = बि
a[1] = क्र
a[3] = म

Java internally stores each character of any language in UTF-16(2 bytes) so you can safely access the characters individually.

Try This:
String a = "बिक्रम मेरो नाम हो";
int strLen = a.length();
char array[] = new char[strLen];
String strArray1[] = new String[strLen];
for (int i=0 ; i< strLen ; i++)
{
array[i] = a.charAt(i);
strArray1[i] = Character.toString(a.charAt(i));
System.out.println ("Index = " + i + "* Char = " +array[i] + "** String =" +strArray1[i] );
}
Output:
Index = 0* Char = ब** String =ब
Index = 1* Char = ि** String =ि
Index = 2* Char = क** String =क
Index = 3* Char = ्** String =्
Index = 4* Char = र** String =र
Index = 5* Char = म** String =म
Index = 6* Char = ** String =
Index = 7* Char = म** String =म
Index = 8* Char = े** String =े
Index = 9* Char = र** String =र
Index = 10* Char = ो** String =ो
Index = 11* Char = ** String =
Index = 12* Char = न** String =न
Index = 13* Char = ा** String =ा
Index = 14* Char = म** String =म
Index = 15* Char = ** String =
Index = 16* Char = ह** String =ह
Index = 17* Char = ो** String =ो
Note:
In order to allow eclipse to allow you to save your java program with foreign characters(Hindi alphabets), do the following:
Go to:
"Windows > Preferences > General > Content Types > Text > {Choose file type}
{Selected file type} > Default encoding > UTF-8" and click Update.

Did you try icu4j?
BreakIterator character instance has a possibility to split String to characters

My code is not at all optimized, sorry about that but it works!
Just change the path of the file in which you are going to enter the devnagri sentence and it should work.
public static void main(String[] args) throws IOException
{
BufferedReader br = new BufferedReader(new FileReader("/home/ubuntu/Documents/trainforjava.txt")); //PLEASE ENTER PATH HERE
String[] devFull = new String[]{
"अ","आ", "इ", "ई", "उ", "ऊ", "ऋ"
, "ऌ" ,"ऍ", "ए", "ऐ", "ऑ", "ओ", "औ",
"क", "ख", "ग", "घ" ,"ङ",
"च" ,"छ" ,"ज"," झ"," ञ",
"ट","ठ", "ड"," ढ"," ण",
"त", "थ", "द", "ध", "न",
"प", "फ", "ब"," भ","म",
"य", "र", "ल", "ळ",
"व", "श" ,"ष","स" ,"ह"
};
String[] uniDev = new String[]
{
"905","906","907","908","909","90a","90b",
"90c","90d","90f","910","911","913","914",
"915","916","917","918","919",
"91a","91b","91c","91d","91e",
"91f","920","921","922","923",
"924","925","926","927","928",
"92a","92b","92c","92d","92e",
"92f","930","932","933",
"935","936","937","938","939"
};
String[] devHalf = new String[]
{
"$़","ऽ","$ा","$ि" ,
"$ी", "$ ु","$ू","$ृ","$ॄ","$ॅ",
"$े","$ै","$ॉ",
"$ो","$ौ"
};
String[] gujHalf = new String[]
{
"$઼","ઽ","$ા","$િ" ,
"$ી","$ુ","$ૂ","$ૃ","$ૄ","$ૅ",
"$ે","$ૈ","$ૉ",
"$ો","$ૌ"
};
try
{
StringBuilder sb = new StringBuilder();
String line = br.readLine();
while( (line = br.readLine() ) != null)
{
line=line.replaceAll(" ", ""); //remove white spaces if any
System.out.println();
//System.out.println(line);
int strLength = line.length();
// String a = "बिक्रम मेरो नाम हो";
int strLen = line.length();
char array[] = new char[strLen];
String strArray1[] = new String[strLen];
int mark[] = new int[strLen+1];
String unis[]=new String[strLen];
int cnt=0;
String newCharD[]=new String [strLen];
String newCharG[]=new String [strLen];
String tempD=null;
String tempG=null;
String arr = null;
String next =null;
String temp=null;
String uniNext=null;
int hold=0;
int j=0;
for (int i=0 ; i< strLen ; i++)
{
j=i+1;
array[i] = line.charAt(i);
strArray1[i] = Character.toString(line.charAt(i));
if(i<(strLen-1))
{
char nbit = line.charAt(j);
next=Character.toString(line.charAt(j));
uniNext=Integer.toHexString(nbit);
//System.out.print("\nUninext:\t"+uniNext);
}
unis[i]=Integer.toHexString(array[i]);
mark[strLen]=1;
if((Arrays.asList(devFull).contains(Character.toString(array[i]))) && (!uniNext.equalsIgnoreCase("94d")) )
{
mark[i]=1;
}
else
{
mark[i]=0;
}
//
//System.out.println();
//System.out.println ("Index = " + i + "* Char = " +array[i] + "** String =" +strArray1[i]+ "Unicode="+unis[i]+"Mark="+mark[i]);
//System.out.print(unis[i].toString());
}
int start=0;
start=0;
for(int l1=0;l1<=strLen;l1++)
{
//start=0;
if(l1==0)
{
temp=Character.toString(array[l1]);
}
else
{
if(mark[l1]==0)
{
temp=temp+Character.toString(array[l1]);
}
else
{
System.out.print(" "+temp);
newCharD[start]=temp;
start++;
temp=null;
if(l1!=strLen)
{
temp=Character.toString(array[l1]);
}
}
}
}
/* for(int s=0;s<start;s++)
{
System.out.print(" "+newCharD[s]);
}*/
for(int s=0;s<start;s++)
{
}
}
}
finally {
br.close();
}
//PrintStream out = new PrintStream(new //FileOutputStream("/home/ubuntu/Documents/trainforjavaoutput.txt"));
//System.setOut(out);
}

Try this for Hindi :-
import java.io.*;
import java.text.BreakIterator;
import java.util.Locale;
public class Test {
public static void main(String[] args) throws IOException
{
String text = "बिक्रम मेरो नाम हो";
Locale hindi = new Locale("hi", "IN");
BreakIterator breaker = BreakIterator.getCharacterInstance(hindi);
breaker.setText(text);
int start = breaker.first();
for (int end = breaker.next();
end != BreakIterator.DONE;
start = end, end = breaker.next()) {
System.out.println(text.substring(start,end));
}
}
}
OUTPUT:-
बि
क्र
म
मे
रो
ना
म
हो
BreakIterator Java Documentation: https://docs.oracle.com/javase/tutorial/i18n/text/about.html

In order to split the string by letters rather than characters, going by dvasanth's suggestion, you can try below:
String x = "बिक्रम मेरो नाम हो";
x=x.replaceAll(" ", ""); // Remove all spaces
int strLength = x.length();
String [] letterArray = new String (strLength /2);
String combined = "";
for (int i=0, j=0; i < strLength ; i=i+2,j++)
{
strArray1[i] = Character.toString(x.charAt(i));
if (i+1 < strLength)
{
strArray1[i+1] = Character.toString(x.charAt(i+1));
combined = strArray1[i]+strArray1[i+1]; // This line provides the letters.
// Assumption is that each letter is 2 unicode characters long.
}
else
{
combined = strArray1[i];
}
letterArray [j] = combined;
System.out.println("Split string by letters is : "+combined);
System.out.println("Split string by letters in array is : "+letterArray [j]);
}
Output is:
Split string by letters is : बि
Split string by letters is : क्
Split string by letters is : रम
Split string by letters is : मे
Split string by letters is : रो
Split string by letters is : ना
Split string by letters is : मह
Split string by letters is : ो
Note:
In order to allow eclipse to allow you to save your java program with foreign characters(Hindi alphabets), do the following:
Go to:
"Windows > Preferences > General > Content Types > Text > {Choose file type}
{Selected file type} > Default encoding > UTF-8" and click Update.

Related

How to remove letter in space for One Time Pad cipher?

I have found this code somewhere on the web for reference yet this bothers me. The result is correct but somehow there is something wrong with the process.
import java.io.*;
import java.util.*;
import java.util.Scanner;
public class onetime{
public static void main(String[] args){
System.out.println("Type Message: " );
Scanner sc = new Scanner(System.in);
String text = sc.nextLine();
String key = RandomAlpha(text.length());
String enc = OTPEncryption(text,key);
System.out.println("Plaintext : " +text);
System.out.println("Key: "+key);
System.out.println("Encrypted : "+enc);
System.out.println("Decrypted : "+OTPDecryption(enc,key));
}
public static String RandomAlpha(int len){
Random r = new Random();
String key = "";
for(int x=0;x<len;x++)
key = key + (char) (r.nextInt(26) + 'A');
return key;
}
public static String OTPEncryption(String text,String key){
String alphaU = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
String alphaL = "abcdefghijklmnopqrstuvwxyz";
int len = text.length();
String sb = "";
for(int x=0;x<len;x++){
char get = text.charAt(x);
char keyget = key.charAt(x);
if(Character.isUpperCase(get)){
int index = alphaU.indexOf(get);
int keydex = alphaU.indexOf(Character.toUpperCase(keyget));
int total = (index + keydex) % 26;
sb = sb+ alphaU.charAt(total);
}
else if(Character.isLowerCase(get)){
int index = alphaL.indexOf(get);
int keydex = alphaU.indexOf(Character.toLowerCase(keyget));
int total = (index + keydex) % 26;
sb = sb+ alphaL.charAt(total);
}
else{
sb = sb + get;
}
}
return sb;
}
public static String OTPDecryption(String text,String key){
String alphaU = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
String alphaL = "abcdefghijklmnopqrstuvwxyz";
int len = text.length();
String sb = "";
for(int x=0;x<len;x++){
char get = text.charAt(x);
char keyget = key.charAt(x);
if(Character.isUpperCase(get)){
int index = alphaU.indexOf(get);
int keydex = alphaU.indexOf(Character.toUpperCase(keyget));
int total = (index - keydex) % 26;
total = (total<0)? total + 26 : total;
sb = sb+ alphaU.charAt(total);
}
else if(Character.isLowerCase(get)){
int index = alphaL.indexOf(get);
int keydex = alphaU.indexOf(Character.toLowerCase(keyget));
int total = (index - keydex) % 26;
total = (total<0)? total + 26 : total;
sb = sb+ alphaL.charAt(total);
}
else{
sb = sb + get;
}
}
return sb;
}
}
Output
Plain Text: help me
Key: QMMNQAB
Encrypted: gdko ld
Decrypt: help me
so as you can here in the output the second 'Q' of the key isn't supposed to be there because it's supposed to be a space. How can I remove it?
Your answer is highly appreciated thank you :)

How to Convert String to Char and Integer in Java ?

I have the following :
String contain 5* ;
char opr ;
int data ;
I would like to store the 5 in the data and * in the opr
any idea how to do it ?
as i know if the string contain only Integer then I will split and the parse it but in this case the String contain int and char
Input :
String input = 5* ;
I want the Output will be like this :
this.opr = *
this.data = 5

A simple way to achieve that (if you don't want to use Regex) is to do something like this:
String temp ="";
// read every char in the input String
for(char c: input.toCharArray()){
// if it's a digit
if(Character.isDigit(c)){
temp +=c; // append it
}
else{ // at the end parse the temp String
data = Integer.parseInt(temp);
opr = c;
break;
}
}
//test
System.out.println("Input: " + input
+ "\t Data: " + data
+ "\t Opr: " + opr);
Test
Input: 5* Data: 5 Opr: *
Input: 123* Data: 123 Opr: *

Under the dubious assumption that the issue is parsing a String in the form of number followed by op, this code will achieve the desired result.
public static void main(String[] args)
{
final String input = "5*";
int data = -1;
String op = "";
Pattern pat = Pattern.compile("([\\d]+)[\\s]*(.*)");
Matcher m = pat.matcher(input);
if (m.matches() && m.groupCount() == 2) {
data = Integer.parseInt(m.group(1));
op = m.group(2).trim();
}
System.out.printf("%2d with op %s%n", data, op);
}
Output:
5 with op *

String input = 5*;
char[] splitted = input.toCharArray();
char opr = splitted[1];
int data = Character.getNumericValue(splitted[0]);

you may also try this with 3 different inputs as below. It will print 0 for incorrect DATA and blank for incorrect OPERAND.
String input = "5*";
//String input = "109/";
//String input = "109b3*";
int data = 0;
char opr = ' ';
int len = input.length();
if ( len > 1 ) {
String data_s = input.substring(0, len - 1);
try {
data = Integer.valueOf(data_s).intValue();
} catch(Exception e) {
}
opr = input.substring(len - 1). charAt(0);
if ( opr != '+' && opr != '-' && opr != '*' && opr != '/' ) {
opr = ' ';
}
}
System.out.println("Input:" + input + " Data:" + data + " Opr:" + opr);

Reverse a string as per length of first word

I am new to Java Strings.
Actually I have the code to reverse words:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
public class Test2 {
public static void main(String[] args) throws IOException
{
System.out.println("enter a sentence");
BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
String rev =br.readLine();
String [] bread = rev.split(" ");
for(int z =bread.length-1;z>=0;z--)
{
System.out.println(bread[z]);
}
}
}
For the above code I get:
Input :Bangalore is a city
Output: City is a Bangalore
But I want the output to be like below:
Input: Bangalore is a city
Output:cityaisba ng a lore
Another Example:
Input: Hello Iam New To Java.Java is object Oriented language.
Output: langu age Ori en tedo bjec ti sjava. javaToNe wIamolleH
Please help me out

Here is one way you could do it:
String rev = br.readLine();
String [] bread = rev.split(" ");
int revCounter = 0;
for(int z = bread.length - 1; z >= 0; z--)
{
String word = bread[z];
for(int i = 0; i < word.length(); i++)
{
// If char at current position in 'rev' was a space then
// just print space. Otherwise, print char from current word.
if(rev.charAt(revCounter) == ' ')
{
System.out.print(' ');
i--;
}
else
System.out.print(word.charAt(i));
revCounter++;
}
}

When I run your code I get following result:
city
a
is
Bangalore
So to have it in a single line, why don't you add a space and print a single line?
System.out.println("enter a sentence");
BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
String rev = br.readLine();
String[] bread = rev.split(" ");
for (int z = bread.length - 1; z >= 0; z--) {
System.out.print(bread[z] + " ");
}

I didn't check the validity of your code like GHajba did. But if you want spaces to remain on specific places it might be an option to remove all spaces and put them back according to their index in the original String.
Remove all
newBread = newBread.replace(" ", "");
Put them back
StringBuilder str = new StringBuilder(newBread);
for (int index = oldBread.indexOf(" ") ;
index >= 0 ;
index = oldBread.indexOf(" ", index + 1))
{
str.insert(index, ' ');
}
newBread = str.toString();
I came up with this quick and there might be better ways to do this, maybe without StringBuilder, but this might help you until you find a better way.

Try with this (i've used a string as input):
String original = "Bangalore is a city";
System.out.println("Original : "+original);
StringBuilder inverted = new StringBuilder();
StringBuilder output = new StringBuilder();
String temp = "";
String[] split = original.split("\\s+");
for (int i = split.length - 1; i >= 0; i--) {
inverted.append(split[i]);
}
temp = inverted.toString();
for (String string : split) {
int currLenght = string.length();
String substring = temp.substring(0,currLenght);
temp = temp.replaceAll(substring, "");
output.append(substring).append(" ");
}
System.out.println("Converted : "+output.toString());

Append the reversed words without the spaces into a StringBuffer.
StringBuffer b = new StringBuffer();
for (int i = bread.length-1; i >= 0 ; i--) {
b.append(bread[i]);
}
Then insert the spaces of the original String into the StringBuffer.
int spaceIndex, prevIndex = 0;
while ((spaceIndex = rev.indexOf(" ", prevIndex + 1)) != -1) {
b.insert(spaceIndex, ' ');
prevIndex = spaceIndex;
}

Java - Add numbers to matching words

I'm trying to add a count number for matching words, like this:
Match word: "Text"
Input: Text Text Text TextText ExampleText
Output: Text1 Text2 Text3 Text4Text5 ExampleText6
I have tried this:
String text = "Text Text Text TextText ExampleText";
String match = "Text";
int i = 0;
while(text.indexOf(match)!=-1) {
text = text.replaceFirst(match, match + i++);
}
Doesn't work because it would loop forever, the match stays in the string and IndexOf will never stop.
What would you suggest me to do?
Is there a better way doing this?

Here is one with a StringBuilder but no need to split:
public static String replaceWithNumbers( String text, String match ) {
int matchLength = match.length();
StringBuilder sb = new StringBuilder( text );
int index = 0;
int i = 1;
while ( ( index = sb.indexOf( match, index )) != -1 ) {
String iStr = String.valueOf(i++);
sb.insert( index + matchLength, iStr );
// Continue searching from the end of the inserted text
index += matchLength + iStr.length();
}
return sb.toString();
}

first take one stringbuffer i.e. result,Then spilt the source with the match(destination).
It results in an array of blanks and remaining words except "Text".
then check condition for isempty and depending on that replace the array position.
String text = "Text Text Text TextText ExampleText";
String match = "Text";
StringBuffer result = new StringBuffer();
String[] split = text.split(match);
for(int i=0;i<split.length;){
if(split[i].isEmpty())
result.append(match+ ++i);
else
result.append(split[i]+match+ ++i);
}
System.out.println("Result is =>"+result);
O/P
Result is => Text1 Text2 Text3 Text4Text5 ExampleText6

Try this solution is tested
String text = "Text Text Text TextText Example";
String match = "Text";
String lastWord=text.substring(text.length() -match.length());
boolean lastChar=(lastWord.equals(match));
String[] splitter=text.split(match);
StringBuilder sb = new StringBuilder();
for(int i=0;i<splitter.length;i++)
{
if(i!=splitter.length-1)
splitter[i]=splitter[i]+match+Integer.toString(i);
else
splitter[i]=(lastChar)?splitter[i]+match+Integer.toString(i):splitter[i];
sb.append(splitter[i]);
if (i != splitter.length - 1) {
sb.append("");
}
}
String joined = sb.toString();
System.out.print(joined+"\n");

One possible solution could be
String text = "Text Text Text TextText ExampleText";
String match = "Text";
StringBuilder sb = new StringBuilder(text);
int occurence = 1;
int offset = 0;
while ((offset = sb.indexOf(match, offset)) != -1) {
// fixed this after comment from #RealSkeptic
String insertOccurence = Integer.toString(occurence);
sb.insert(offset + match.length(), insertOccurence);
offset += match.length() + insertOccurence.length();
occurence++;
}
System.out.println("result: " + sb.toString());

This will work for you :
public static void main(String[] args) {
String s = "Text Text Text TextText ExampleText";
int count=0;
while(s.contains("Text")){
s=s.replaceFirst("Text", "*"+ ++count); // replace each occurrence of "Text" with some place holder which is not in your main String.
}
s=s.replace("*","Text");
System.out.println(s);
}
O/P:
Text1 Text2 Text3 Text4Text5 ExampleText6

I refactored #DeveloperH 's code to this:
public class Snippet {
public static void main(String[] args) {
String matchWord = "Text";
String input = "Text Text Text TextText ExampleText";
String output = addNumbersToMatchingWords(matchWord, input);
System.out.print(output);
}
private static String addNumbersToMatchingWords(String matchWord, String input) {
String[] inputsParts = input.split(matchWord);
StringBuilder outputBuilder = new StringBuilder();
int i = 0;
for (String inputPart : inputsParts) {
outputBuilder.append(inputPart);
outputBuilder.append(matchWord);
outputBuilder.append(i);
if (i != inputsParts.length - 1)
outputBuilder.append(" ");
i++;
}
return outputBuilder.toString();
}
}

We can solve this by using stringbuilder, it provides simplest construct to insert character in a string. Following is the code
String text = "Text Text Text TextText ExampleText";
String match = "Text";
StringBuilder sb = new StringBuilder(text);
int beginIndex = 0, i =0;
int matchLength = match.length();
while((beginIndex = sb.indexOf(match, beginIndex))!=-1) {
i++;
sb.insert(beginIndex+matchLength, i);
beginIndex++;
}
System.out.println(sb.toString());

Reverse every word in a string and capitalize the start of the word

Sample input:
abc def ghi
Sample output:
Cba Fed Ihg
This is my code:
import java.util.Stack;
public class StringRev {
static String output1 = new String();
static Stack<Character> stack = new Stack<Character>();
public static void ReverseString(String input) {
input = input + " ";
for (int i = 0; i < input.length(); i++) {
boolean cap = true;
if (input.charAt(i) == ' ') {
while (!stack.isEmpty()) {
if (cap) {
char c = stack.pop().charValue();
c = Character.toUpperCase(c);
output1 = output1 + c;
cap = false;
} else
output1 = output1 + stack.pop().charValue();
}
output1 += " ";
} else {
stack.push(input.charAt(i));
}
}
System.out.print(output1);
}
}
Any better solutions?

Make use of
StringBuilder#reverse()
Then without adding any third party libraries,
String originalString = "abc def ghi";
StringBuilder resultBuilder = new StringBuilder();
for (String string : originalString.split(" ")) {
String revStr = new StringBuilder(string).reverse().toString();
revStr = Character.toUpperCase(revStr.charAt(0))
+ revStr.substring(1);
resultBuilder.append(revStr).append(" ");
}
System.out.println(resultBuilder.toString()); //Cba Fed Ihg
Have a Demo

You can use the StringBuffer to reverse() a string.
And then use the WordUtils#capitalize(String) method to make first letter of the string capitalized.
String str = "abc def ghi";
StringBuilder sb = new StringBuilder();
for (String s : str.split(" ")) {
String revStr = new StringBuffer(s).reverse().toString();
sb.append(WordUtils.capitalize(revStr)).append(" ");
}
String strReversed = sb.toString();

public static String reverseString(final String input){
if(null == input || isEmpty(input))
return "";
String result = "";
String[] items = input.split(" ");
for(int i = 0; i < items.length; i++){
result += new StringBuffer(items[i]).reverse().toString();
}
return result.substring(0,1).toupperCase() + result.substring(1);
}

1) Reverse the String with this
StringBuffer a = new StringBuffer("Java");
a.reverse();
2) To make First letter capital use
StringUtils class in apache commons lang package org.apache.commons.lang.StringUtils
It makes first letter capital
capitalise(String);
Hope it helps.

Edited
Reverse the string first and make the first character to uppercase
String string="hello jump";
StringTokenizer str = new StringTokenizer(string," ") ;
String finalString ;
while (str.hasMoreTokens()) {
String input = str.nextToken() ;
String reverse = new StringBuffer(input).reverse().toString();
System.out.println(reverse);
String output = reverse .substring(0, 1).toUpperCase() + reverse .substring(1);
finalString=finalString+" "+output ;
}
System.out.println(finalString);

import java.util.*;
public class CapatiliseAndReverse {
public static void reverseCharacter(String input) {
String result = "";
StringBuilder revString = null;
String split[] = input.split(" ");
for (int i = 0; i < split.length; i++) {
revString = new StringBuilder(split[i]);
revString = revString.reverse();
for (int index = 0; index < revString.length(); index++) {
char c = revString.charAt(index);
if (Character.isLowerCase(revString.charAt(0))) {
revString.setCharAt(0, Character.toUpperCase(c));
}
if (Character.isUpperCase(c)) {
revString.setCharAt(index, Character.toLowerCase(c));
}
}
result = result + " " + revString;
}
System.out.println(result.trim());
}
public static void main(String[] args) {
//System.out.println(reverseCharacter("old is GlOd"));
Scanner sc = new Scanner(System.in);
reverseCharacter(sc.nextLine());
}
}

We Keep Coding

Java is a programming language and computing platform first released by Sun Microsystems in 1995.

Tokenize devnagari words into letters - java

I have something like a = "बिक्रम मेरो नाम हो" I want to achieve something like in Java a[0] = बि a[1] = क्र a[3] = म

Java internally stores each character of any language in UTF-16(2 bytes) so you can safely access the characters individually.

Did you try icu4j? BreakIterator character instance has a possibility to split String to characters

Related

How to remove letter in space for One Time Pad cipher?

How to Convert String to Char and Integer in Java ?

Reverse a string as per length of first word

Java - Add numbers to matching words

Reverse every word in a string and capitalize the start of the word

Categories

Resources