How to properly import the libphonelib to the java project? - java

I'm trying to use the PhoneNumberMatcher from the libphonenumber library. After adding the jar file to my project and setting the BuildPath, I was able to import classes to my project:
import com.google.i18n.phonenumbers.*;
Inside the lib, there is a class named PhoneNumberMatcher.class. I've been trying to reach it but this class name isn't included in suggestions I normally get when I press Ctrl + Space.
If I insist and write the name class, eclipse will underline the name and the message "The type PhoneNumberMatcher is not visible".
Newly I realized that the class has a small blue flag icon is in the project explorer.
It's not the only one that has such a blue flag, then I try the other classes and I realized that all classes with this blue flag isn't accessible. That's why I think these classes probably are private, or for internal use of the lib.
I'm trying to create a tool to extract phone numbers out of a text and I read this lib is exactly for it.
How do I use the PhonNumberMatcher class in my java project, please?

CharSequence text = "Call me at +1 425 882-8080 for details.";
String country = "US";
PhoneNumberUtil util = PhoneNumberUtil.getInstance();
// Find the first phone number match:
PhoneNumberMatch m = util.findNumbers(text, country).iterator().next();
// rawString() contains the phone number as it appears in the text.
"+1 425 882-8080".equals(m.rawString());
// start() and end() define the range of the matched subsequence.
CharSequence subsequence = text.subSequence(m.start(), m.end());
"+1 425 882-8080".contentEquals(subsequence);
// number() returns the the same result as PhoneNumberUtil.parse()
// invoked on rawString().
util.parse(m.rawString(), country).equals(m.number());
https://javadoc.io/doc/com.googlecode.libphonenumber/libphonenumber

Thank you for your answer Chana.
I did actually was able to use the library, but then I realized that the lib was to complicated to use for me, so I did write my own code to extract German phone numbers, IBANs, Postcodes and money amounts and then classify then:
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class NumberExtractorAndClassifier {
public static final String REGEXPhoneNumbers = "(0|0049\\s?|\\+49\\s?|\\(\\+49\\)\\s?|\\(0\\)\\s?){1}([0-9]{2,4})([ \\-\\/]?[0-9]{1,10})+";
public static final String REGEXIbanNumbers = "([A-Z][ -]?){2}([0-9]([ -]?)*){12,30}";
public static final String REGEXAmounts = "(\\d+([.,]?)\\d{0,}([.,]?)\\d*(\\s*)(\\$|€|EUR|USD|Euro|Dollar))|((\\$|€|EUR|USD|Euro|Dollar)(\\s*)\\d+([.,]?\\d{0,}([.,]?)\\d*))";
public static final String REGEXPostCode = "\\b((?:0[1-46-9]\\d{3})|(?:[1-357-9]\\d{4})|(?:[4][0-24-9]\\d{3})|(?:[6][013-9]\\d{3}))\\b";
public static String TextToAnalyze;
public static String CopyOfText = "";
public static int ExaminatedIndex = 0;
public static List<String> ExtractedPhoneNumbers = new ArrayList<String>();
public static List<String> ExtractedIbanNumbers = new ArrayList<String>();
public static List<String> ExtractedAmounts = new ArrayList<String>();
public static List<String> ExtractedPostCodes = new ArrayList<String>();
public static final String EMPTY_STRING = "";
/**
* #brief Constructor: initializes the needed variables and call the different methods in order to find an classify the Numbers
*
* #param Text: is the input text that need to be analyzed
*/
public NumberExtractorAndClassifier(String Text) {
TextToAnalyze = Text; //- This variable is going to have our complete text
CopyOfText = Text; //- This variable is going to have the missing text to analyze
//- We extract the amounts first in order to do not confuse them later with a phone number, IBAN or post-code
ExtractedAmounts = ExtractAmounts();
for (String Amount : ExtractedAmounts)
{
//- We cut them out of the text in order to do not confuse them later with a IBAN, phone number or post-code
String safeToUseInReplaceAllString = Pattern.quote(Amount);
CopyOfText = CopyOfText.replaceAll(safeToUseInReplaceAllString, "");
System.out.println("Found amount -------> " + Amount);
}
//- We extract the IBAN secondly in order to do not confuse them later with a phone number or post-code
ExtractedIbanNumbers = ExtracIbanNumbers();
for (String Iban : ExtractedIbanNumbers)
{
//- We cut them out of the text in order to do not confuse them later with a phone number, or post-code
String safeToUseInReplaceAllString = Pattern.quote(Iban);
CopyOfText = CopyOfText.replaceAll(safeToUseInReplaceAllString, "");
System.out.println("Found IBAN ---------> " + Iban);
}
//- We extract the phone numbers thirdly in order to do not confuse them later with a post-code
ExtractedPhoneNumbers = ExtractPhoneNumbers();
for( String number : ExtractedPhoneNumbers )
{
//- We cut them out of the text in order to do not confuse them later with a post-code
String safeToUseInReplaceAllString = Pattern.quote(number);
CopyOfText = CopyOfText.replaceAll(safeToUseInReplaceAllString, "");
System.out.println("Found number -------> " + number);
}
ExtractedPostCodes = ExtractPostCodes();
for( String PostCode : ExtractedPostCodes)
{
System.out.println("Found post code ----> " + PostCode);
}
}
/**
* #Brief Method extracts phone numbers out of the text with help of REGEXPhoneNumbers
*
* #return List of strings with all the found numbers.
*/
public static List<String> ExtractPhoneNumbers(){
//Initializing our variables
List<String> FoundNumbers = new ArrayList<String>();
boolean LineContainsNumber = true;
Pattern pattern = Pattern.compile(REGEXPhoneNumbers);
Matcher matcher = pattern.matcher(CopyOfText);
while (LineContainsNumber) {
if (matcher.find()) {
String NumberFoundByTheMatcher = matcher.group(0);
FoundNumbers.add(NumberFoundByTheMatcher);
}
else{LineContainsNumber = false;}
}
return FoundNumbers;
}
/**
* #Brief Method extracts IBAN numbers out of the text with help of REGEXIbanNumbers
*
* #return List of strings with all the found IBANS numbers.
*/
public static List<String> ExtracIbanNumbers(){
//Initializing our variables
List<String> FoundIbans = new ArrayList<String>();
boolean LineContainsIban = true;
Pattern pattern = Pattern.compile(REGEXIbanNumbers);
Matcher matcher = pattern.matcher(CopyOfText);
while (LineContainsIban) {
if (matcher.find()) {
String NumberFoundByTheMatcher = matcher.group(0);
FoundIbans.add(NumberFoundByTheMatcher);
}
else{LineContainsIban = false;}
}
return FoundIbans;
}
/**
* #Brief Method extracts Amounts out of the text with help of REGEXAmounts
*
* #return List of strings with all the found amounts.
*/
public static List<String> ExtractAmounts(){
//Initializing our variables
List<String> FoundAmounts = new ArrayList<String>();
boolean LineContainsAmount = true;
Pattern pattern = Pattern.compile(REGEXAmounts);
Matcher matcher = pattern.matcher(CopyOfText);
while (LineContainsAmount) {
if (matcher.find()) {
String NumberFoundByTheMatcher = matcher.group(0);
FoundAmounts.add(NumberFoundByTheMatcher);
}
else{LineContainsAmount = false;}
}
return FoundAmounts;
}
/**
* #Brief Method extracts post codes out of the text with help of REGEXPostCode
*
* #return List of strings with all the found post codes.
*/
public static List<String> ExtractPostCodes(){
List<String> FoundPostCodes = new ArrayList<String>();
boolean LineContainsPostCode = true;
Pattern pattern = Pattern.compile(REGEXPostCode);
Matcher matcher = pattern.matcher(CopyOfText);
while(LineContainsPostCode) {
if(matcher.find()) {
String PostCodeFoundByMatcher = matcher.group(0);
FoundPostCodes.add(PostCodeFoundByMatcher);
}
else {
LineContainsPostCode = false;
}
}
return FoundPostCodes;
}
}

Related

Regex for converting xml tags key to camel case

I am looking for the simplest method in java which takes a XML string, and converts all tags (not their contents) to camel case, such as
<HeaderFirst>
<HeaderTwo>
<ID>id1</ID>
<TimeStamp>2016-11-04T02:46:34Z</TimeStamp>
<DetailedDescription>
<![CDATA[di]]>
</DetailedDescription>
</HeaderTwo>
</HeaderFirst>
will be converted to
<headerFirst>
<headerTwo>
<id>id1</id>
<timeStamp>2016-11-04T02:46:34Z</timeStamp>
<detailedDescription>
<![CDATA[di]]>
</detailedDescription>
</headerTwo>
</headerFirst>
Try something like this:
public void tagToCamelCase(String input){
char[] inputArray = input.toCharArray();
for (int i = 0; i < inputArray.length-2; i++){
if (inputArray[i] == '<'){
if(inputArray[i+1]!= '/')
inputArray[i+1] = Character.toLowerCase(inputArray[i+1]);
else
inputArray[i+2] = Character.toLowerCase(inputArray[i+2]);
}
}
System.out.println(new String(inputArray));
}
Note: the tag ID will be iD and not id. Hope this helps.
Here is a solution that is based on splitting the string on the ">" character and then processing the tokens in three different cases: CDATA, open tag, and close tag
The following code should work (see the program output below). There is, however, a problem with the tag "ID" -- how do we know that its camel case should be "id" not "iD"? This needs a dictionary to capture that knowledge. So the following routine convert() has two modes -- useDictionary being true or false. See if the following solution satisfies your requirement.
To use the "useDictionary" mode you also need to maintain a proper dictionary (the hashmap called "dict" in the program, right now there is only one entry in the dictionary "ID" should be camel-cased to "id"). Note that the dictionary can be ramped up incrementally -- you only need to add the special cases to the dictionary (e.g. the camel-case of "ID" is "id" not "iD")
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CamelCase {
private static Map<String, String> dict = new HashMap<>();
static {
dict.put("ID", "id");
}
public static void main(String[] args) {
String input = "<HeaderFirst> "
+ "\n <HeaderTwo>"
+ "\n <ID>id1</ID>"
+ "\n <TimeStamp>2016-11-04T02:46:34Z</TimeStamp>"
+ "\n <DetailedDescription>"
+ "\n <![CDATA[di]]>"
+ "\n </DetailedDescription>"
+ "\n </HeaderTwo> "
+ "\n</HeaderFirst>";
System.out.println("===== output without using a dictionary =====");
System.out.println(convert(input, false /* useDictionary */));
System.out.println("===== output using a dictionary =====");
System.out.println(convert(input, true /* useDictionary */));
}
private static String convert(String input, boolean useDictionary) {
String splitter = ">";
String[] tokens = input.split(splitter);
StringBuilder sb = new StringBuilder();
Pattern cdataPattern = Pattern.compile("([^<]*)<!\\[CDATA\\[([^\\]]*)\\]\\]");
Pattern oTagPattern = Pattern.compile("([^<]*)<(\\w+)");
Pattern cTagPattern = Pattern.compile("([^<]*)</(\\w+)");
String prefix;
String tag;
String newTag;
for (String token : tokens) {
Matcher cdataMatcher = cdataPattern.matcher(token);
Matcher oTagMatcher = oTagPattern.matcher(token);
Matcher cTagMatcher = cTagPattern.matcher(token);
if (cdataMatcher.find()) { // CDATA - do not change
sb.append(token);
} else if (oTagMatcher.find()) {// open tag - change first char to lower case
prefix = oTagMatcher.group(1);
tag = oTagMatcher.group(2);
newTag = camelCaseOneTag(tag, useDictionary);
sb.append(prefix + "<" + newTag);
} else if (cTagMatcher.find()) {// close tag - change first char to lower case
prefix = cTagMatcher.group(1);
tag = cTagMatcher.group(2);
newTag = camelCaseOneTag(tag, useDictionary);
sb.append(prefix + "<" + newTag);
}
sb.append(splitter);
}
return sb.toString();
}
private static String camelCaseOneTag(String tag, boolean useDictionary) {
String newTag;
if (useDictionary
&& dict.containsKey(tag)) {
newTag = dict.get(tag);
} else {
newTag = tag.substring(0, 1).toLowerCase()
+ tag.substring(1);
}
return newTag;
}
}
The output of this program is this:
===== output without using a dictionary =====
<headerFirst>
<headerTwo>
<iD>id1<iD>
<timeStamp>2016-11-04T02:46:34Z<timeStamp>
<detailedDescription>
<![CDATA[di]]>
<detailedDescription>
<headerTwo>
<headerFirst>
===== output using a dictionary =====
<headerFirst>
<headerTwo>
<id>id1<id>
<timeStamp>2016-11-04T02:46:34Z<timeStamp>
<detailedDescription>
<![CDATA[di]]>
<detailedDescription>
<headerTwo>
<headerFirst>

Matcher can't match

I have the following code. I need to check the text for existing any of the words from some list of banned words. But even if this word exists in the text matcher doesn't see it. here is the code:
final ArrayList<String> regexps = config.getProperty(property);
for (String regexp: regexps){
Pattern pt = Pattern.compile("(" + regexp + ")", Pattern.CASE_INSENSITIVE);
Matcher mt = pt.matcher(plainText);
if (mt.find()){
result = result + "message can't be processed because it doesn't satisfy the rule " + property;
reason = false;
System.out.println("reason" + mt.group() + regexp);
}
}
What is wrong? This code can'f find regexp в[ыy][шs]лит[еe], which is regexp in the plainText = "Вышлите пожалуйста новый счет на оплату на Санг, пока согласовывали, уже
прошли его сроки. Лиценз...". I also tried another variants of the regexp but everything is useless
The trouble is elsewhere.
import java.util.regex.*;
public class HelloWorld {
public static void main(String []args) {
Pattern pt = Pattern.compile("(qwer)");
Matcher mt = pt.matcher("asdf qwer zxcv");
System.out.println(mt.find());
}
}
This prints out true. You may want to use word boundary as delimiter, though:
import java.util.regex.*;
public class HelloWorld {
public static void main(String []args) {
Pattern pt = Pattern.compile("\\bqwer\\b");
Matcher mt = pt.matcher("asdf qwer zxcv");
System.out.println(mt.find());
mt = pt.matcher("asdfqwer zxcv");
System.out.println(mt.find());
}
}
The parenthesis are useless unless you need to capture the keyword in a group. But you already have it to begin with.
Use ArrayList's built in functions indexOf(Object o) and contains(Object o) to check if a String exists anywhere in the Array and where.
e.g.
ArrayList<String> keywords = new ArrayList<String>();
keywords.add("hello");
System.out.println(keywords.contains("hello"));
System.out.println(keywords.indexOf("hello"));
outputs:
true
0
Try this to filter out messages which contain banned words using the following regex which uses OR operator.
private static void findBannedWords() {
final ArrayList<String> keywords = new ArrayList<String>();
keywords.add("f$%k");
keywords.add("s!#t");
keywords.add("a$s");
String input = "what the f$%k";
String bannedRegex = "";
for (String keyword: keywords){
bannedRegex = bannedRegex + ".*" + keyword + ".*" + "|";
}
Pattern pt = Pattern.compile(bannedRegex.substring(0, bannedRegex.length()-1));
Matcher mt = pt.matcher(input);
if (mt.matches()) {
System.out.println("message can't be processed because it doesn't satisfy the rule ");
}
}

regex pattern to match particular uri from list of urls

I have a list of urls (lMapValues ) with wild cards like as mentioned in the code below
I need to match uri against this list to find matching url.
In below code I should get matching url as value of d in the map m.
That means if part of uri is matching in the list of urls, that particular url should be picked.
I tried splitting uri in tokens and then checking each token in list lMapValues .However its not giving me correct result.Below is code for that.
public class Matcher
{
public static void main( String[] args )
{
Map m = new HashMap();
m.put("a","https:/abc/eRControl/*");
m.put("b","https://abc/xyz/*");
m.put("c","https://work/Mypage/*");
m.put("d","https://cr/eRControl/*");
m.put("e","https://custom/MyApp/*");
List lMapValues = new ArrayList(m.values());
List tokens = new ArrayList();
String uri = "cr/eRControl/work/custom.jsp";
StringTokenizer st = new StringTokenizer(uri,"/");
while(st.hasMoreTokens()) {
String token = st.nextToken();
tokens.add(token);
}
for(int i=0;i<lMapValues.size();i++) {
String value = (String)lMapValues.get(i);
String patternString = "\\b(" + StringUtils.join(tokens, "|") + ")\\b";
Pattern pattern = Pattern.compile(patternString);
java.util.regex.Matcher matcher = pattern.matcher(value);
while (matcher.find()) {
System.out.println(matcher.group(1));
System.out.println(value);
}
}
}
}
Please help me with regex pattern to achieve above objective.
Any help will be appreciated.
It's much simpler to check if a string starts with a certain value with String.indexOf().
String[] urls = {
"abc/eRControl",
"abc/xyz",
"work/Mypage",
"cr/eRControl",
"custom/MyApp"
};
String uri = "cr/eRControl/work/custom.jsp";
for (String url : urls) {
if (uri.indexOf(url) == 0) {
System.out.println("Matched: " + url);
}else{
System.out.println("Not matched: " + url);
}
}
Also. There is no need to store the scheme into the map if you are never going to match against it.
if I understand your goal correctly, you might not even need regular expressions here.
Try this...
package test;
import java.util.HashSet;
import java.util.Set;
public class PartialURLMapper {
private static final Set<String> PARTIAL_URLS = new HashSet<String>();
static {
PARTIAL_URLS.add("cr/eRControl");
// TODO add more partial Strings to check against input
}
public static String getPartialStringIfMatching(final String input) {
if (input != null && !input.isEmpty()) {
for (String partial: PARTIAL_URLS) {
// this will be case-sensitive
if (input.contains(partial)) {
return partial;
}
}
}
// no partial match found, we return an empty String
return "";
}
// main method just to add example
public static void main(String[] args) {
System.out.println(PartialURLMapper.getPartialStringIfMatching("cr/eRControl/work/custom.jsp"));
}
}
... it will return:
cr/eRControl
The problem is that i is acting as a key not as an index on
String value = (String)lMapValues.get(i);
you will be better served exchanging the map for a list, and using the for each loop.
List<String> patterns = new ArrayList<String>();
...
for (String pattern : patterns) {
....
}

Java string matching expression String Array

I am using Java, i need your opinion on how to write better code for the following task.
I have following String value
String testStr = "INCLUDES(ABC) EXCLUDES(ABC) EXCLUDES(ABC) INCLUDES(ABC) INCLUDES(ABC)"
I want to manipulate Strings and want to combine all INCLUDES statements into one INCLUDES and the result should be similar to the following:
INCLUDES(ABC,ABC, ABC) EXCLUDES(ABC, ABC)
i would break the initial string into new strings using this class:
http://docs.oracle.com/javase/6/docs/api/java/util/StringTokenizer.html
and put them in an array
start a new string, and then use the tokenizer to break out the part within the parentheses (you can set it to use ( and ) as delimiters) and loop through the array and concatenate them into the new string.
be aware though that any wrongly placed spaces (like INCLUDES( abc )) will mess it up
This seems like a reasonable approach:
Split the testStr using the StringUtils.split method; use " " or null as the token.
Create a Map<String, List<String>>. I will refer to that as theMap
For each string in the returned array perform the following:
Split the string using "()" as the token.
The returned array should have 2 elements. The first element (index 0) is key for theMap and the second element (index 1) is the value to add to the list.
Once you are done with the array returned from splitting testStr, build a new string by using the key value in theMap and appending the elements from the associated list into a string.
Apache StringUtils
I wrote a piece of code for this issue but i don't know if it's good or not
according to your format ,you can split testStr by using " " ,the output will be like this: INCLUDES(ABC)
check if this string contain INCLUDES or EXCLUDES
then split it by using ( )
Like this :
String testStr = "INCLUDES(ABC) EXCLUDES(C) EXCLUDES(ABC) INCLUDES(AC) INCLUDES(AB)";
String s[] = testStr.split(" ");
String INCLUDES = "INCLUDES( ";
String EXCLUDES = "EXCLUDES ( ";
for (int i = 0; i < s.length; i++) {
if (s[i].contains("INCLUDES")) {
INCLUDES += (s[i].substring(s[i].indexOf("(") + 1, s[i].indexOf(")"))) + " ";
}
else if (s[i].contains("EXCLUDES")) {
EXCLUDES += (s[i].substring(s[i].indexOf("(") + 1, s[i].indexOf(")"))) + " ";
}
}
INCLUDES = INCLUDES + ")";
EXCLUDES = EXCLUDES + ")";
System.out.println(INCLUDES);
System.out.println(EXCLUDES);
I have wrote down small utility result as following
if text = "EXCLUDES(ABC) EXCLUDES(ABC) INCLUDES(BMG) INCLUDES(EFG) INCLUDES(IJK)";
output = EXCLUDES(ABC) EXCLUDES(ABC) INCLUDES(BMG & EFG & IJK)
Following is my java codeas following please take a look and if any one can improve it please feel free.
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.sun.xml.internal.ws.util.StringUtils;
/**
* Created by IntelliJ IDEA.
* User: fkhan
* Date: Aug 31, 2012
* Time: 1:36:45 PM
* To change this template use File | Settings | File Templates.
*/
public class TestClass {
public static void main(String args[]) throws Exception {
//String text = "INCLUDES(ABC) EXCLUDES(ABC) EXCLUDES(ABC) INCLUDES(EFG) INCLUDES(IJK)";
String text = "EXCLUDES(ABC) EXCLUDES(ABC) INCLUDES(BMG) INCLUDES(EFG) INCLUDES(IJK)";
List<String> matchedList = findMatchPhrase("INCLUDES", text);
String query = combinePhrase(text, "INCLUDES", matchedList);
System.out.println(query);
}
/**
* This method takes query combine and & multiple phrases
* #param expression
* #param keyword
* #param matchedItemList
* #return
*/
private static String combinePhrase(String expression, String keyword, List<String> matchedItemList) {
//if only one phrase found return value
if(matchedItemList.isEmpty() || matchedItemList.size() ==1){
return expression;
}
//do not remove first match
String matchedItem = null;
for (int index = 1; index < matchedItemList.size(); index++) {
matchedItem = matchedItemList.get(index);
//remove match items from string other then first match
expression = expression.replace(matchedItem, "");
}
StringBuffer textBuffer = new StringBuffer(expression);
//combine other matched strings in first matched item
StringBuffer combineStrBuf = new StringBuffer();
if (matchedItemList.size() > 1) {
for (int index = 1; index < matchedItemList.size(); index++) {
String str = matchedItemList.get(index);
combineStrBuf.append((parseValue(keyword, str)));
combineStrBuf.append(" & ");
}
combineStrBuf.delete(combineStrBuf.lastIndexOf(" & "), combineStrBuf.length());
}
// Inject created phrase into first phrase
//append in existing phrase
return injectInPhrase(keyword, textBuffer, combineStrBuf.toString());
}
/**
*
* #param keyword
* #param textBuffer
* #param injectStr
*/
private static String injectInPhrase(String keyword, StringBuffer textBuffer, String injectStr) {
Matcher matcher = getMatcher(textBuffer.toString());
while (matcher.find()) {
String subStr = matcher.group();
if (subStr.startsWith(keyword)) {
textBuffer.insert(matcher.end()-1, " & ".concat(injectStr));
break;
}
}
return textBuffer.toString();
}
/**
* #param expression
* #param keyword
* #return
*/
private static String parseValue(String keyword, String expression) {
String parsStr = "";
if (expression.indexOf(keyword) > -1) {
parsStr = expression.replace(keyword, "").replace("(", "").replace(")", "");
}
return parsStr;
}
/**
* This method creates matcher object
* and return for further processing
* #param expression
* #return
*/
private static Matcher getMatcher(String expression){
String patternString = "(\\w+)\\((.*?)\\)";
Pattern pattern = Pattern.compile(patternString);
return pattern.matcher(expression);
}
/**
* This method find find matched items by keyword
* and return as list
* #param keyword
* #param expression
* #return
*/
private static List<String> findMatchPhrase(String keyword, String expression) {
List<String> matchList = new ArrayList<String>(3);
keyword = StringUtils.capitalize(keyword);
Matcher matcher = getMatcher(expression);
while (matcher.find()) {
String subStr = matcher.group();
if (subStr.startsWith(keyword)) {
matchList.add(subStr);
}
}
return matchList;
}
}

Filter words from string

I want to filter a string.
Basically when someone types a message, I want certain words to be filtered out, like this:
User types: hey guys lol omg -omg mkdj*Omg*ndid
I want the filter to run and:
Output: hey guys lol - mkdjndid
And I need the filtered words to be loaded from an ArrayList that contains several words to filter out. Now at the moment I am doing if(message.contains(omg)) but that doesn't work if someone types zomg or -omg or similar.
Use replaceAll with a regex built from the bad word:
message = message.replaceAll("(?i)\\b[^\\w -]*" + badWord + "[^\\w -]*\\b", "");
This passes your test case:
public static void main( String[] args ) {
List<String> badWords = Arrays.asList( "omg", "black", "white" );
String message = "hey guys lol omg -omg mkdj*Omg*ndid";
for ( String badWord : badWords ) {
message = message.replaceAll("(?i)\\b[^\\w -]*" + badWord + "[^\\w -]*\\b", "");
}
System.out.println( message );
}
try:
input.replaceAll("(\\*?)[oO][mM][gG](\\*?)", "").split(" ")
Dave gave you the answer already, but I will emphasize the statement here. You will face a problem if you implement your algorithm with a simple for-loop that just replaces the occurrence of the filtered word. As an example, if you filter the word ass in the word 'classic' and replace it with 'butt', the resultant word will be 'clbuttic' which doesn't make any sense. Thus, I would suggest using a word list,like the ones stored in Linux under /usr/share/dict/ directory, to check if the word is valid or it needs filtering.
I don't quite get what you are trying to do.
I ran into this same problem and solved it in the following way:
1) Have a google spreadsheet with all words that I want to filter out
2) Directly download the google spreadsheet into my code with the loadConfigs method (see below)
3) Replace all l33tsp33k characters with their respective alphabet letter
4) Replace all special characters but letters from the sentence
5) Run an algorithm that checks all the possible combinations of words within a string against the list efficiently, note that this part is key - you don't want to loop over your ENTIRE list every time to see if your word is in the list. In my case, I found every combination within the string input and checked it against a hashmap (O(1) runtime). This way the runtime grows relatively to the string input, not the list input.
6) Check if the word is not used in combination with a good word (e.g. bass contains *ss). This is also loaded through the spreadsheet
6) In our case we are also posting the filtered words to Slack, but you can remove that line obviously.
We are using this in our own games and it's working like a charm. Hope you guys enjoy.
https://pimdewitte.me/2016/05/28/filtering-combinations-of-bad-words-out-of-string-inputs/
public static HashMap<String, String[]> words = new HashMap<String, String[]>();
public static void loadConfigs() {
try {
BufferedReader reader = new BufferedReader(new InputStreamReader(new URL("https://docs.google.com/spreadsheets/d/1hIEi2YG3ydav1E06Bzf2mQbGZ12kh2fe4ISgLg_UBuM/export?format=csv").openConnection().getInputStream()));
String line = "";
int counter = 0;
while((line = reader.readLine()) != null) {
counter++;
String[] content = null;
try {
content = line.split(",");
if(content.length == 0) {
continue;
}
String word = content[0];
String[] ignore_in_combination_with_words = new String[]{};
if(content.length > 1) {
ignore_in_combination_with_words = content[1].split("_");
}
words.put(word.replaceAll(" ", ""), ignore_in_combination_with_words);
} catch(Exception e) {
e.printStackTrace();
}
}
System.out.println("Loaded " + counter + " words to filter out");
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* Iterates over a String input and checks whether a cuss word was found in a list, then checks if the word should be ignored (e.g. bass contains the word *ss).
* #param input
* #return
*/
public static ArrayList<String> badWordsFound(String input) {
if(input == null) {
return new ArrayList<>();
}
// remove leetspeak
input = input.replaceAll("1","i");
input = input.replaceAll("!","i");
input = input.replaceAll("3","e");
input = input.replaceAll("4","a");
input = input.replaceAll("#","a");
input = input.replaceAll("5","s");
input = input.replaceAll("7","t");
input = input.replaceAll("0","o");
ArrayList<String> badWords = new ArrayList<>();
input = input.toLowerCase().replaceAll("[^a-zA-Z]", "");
for(int i = 0; i < input.length(); i++) {
for(int fromIOffset = 1; fromIOffset < (input.length()+1 - i); fromIOffset++) {
String wordToCheck = input.substring(i, i + fromIOffset);
if(words.containsKey(wordToCheck)) {
// for example, if you want to say the word bass, that should be possible.
String[] ignoreCheck = words.get(wordToCheck);
boolean ignore = false;
for(int s = 0; s < ignoreCheck.length; s++ ) {
if(input.contains(ignoreCheck[s])) {
ignore = true;
break;
}
}
if(!ignore) {
badWords.add(wordToCheck);
}
}
}
}
for(String s: badWords) {
Server.getSlackManager().queue(s + " qualified as a bad word in a username");
}
return badWords;
}

Categories