Java Spark GroupByKey failure

I'm attempting to use the Java Spark libraries with a cluster running Spark 2.3.0 over Hadoop 3.1.0 (and using those versions of the Java libraries).
I've run into a problem where I simply cannot use groupByKey, and I am at a loss to explain why. Every use of groupByKey, for any reason and in any circumstance, throws a java.lang.IllegalArgumentException.
I've boiled this down to about the simplest test I can think of:
package com.failuretest;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
public class TestReport {

    public static void main(String[] args) throws Exception {
        SparkConf conf = new SparkConf().setAppName("TestReport").set("spark.executor.memory", "20G");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> test = sc.parallelize(generateTestData());
        test.saveAsTextFile("/TEST/testfile1");
        test.mapToPair(line -> {
            String[] testParts = line.split(" ");
            return new Tuple2<String, String>(testParts[0], testParts[1]);
        }).groupByKey().saveAsTextFile("/TEST/testfile2");
        sc.close();
    }

    private static List<String> generateTestData() {
        List<String> testList = new ArrayList<String>();
        int keyCount = 0;
        int valCount = 0;
        while (valCount++ < 2000000) {
            if (valCount % 10 == 0) {
                keyCount++;
            }
            testList.add("Key" + keyCount + " " + "Val" + valCount);
        }
        return testList;
    }
}
I'm just programmatically creating an RDD that produces 10 values per key, then creating my JavaPairRDD with a simple split, then attempting groupByKey.
When it runs, I receive the following stack:
Exception in thread "main" java.lang.IllegalArgumentException
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.xbean.asm5.ClassReader.<init>(Unknown Source)
at org.apache.spark.util.ClosureCleaner$.getClassReader(ClosureCleaner.scala:46)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:449)
at org.apache.spark.util.FieldAccessFinder$$anon$3$$anonfun$visitMethodInsn$2.apply(ClosureCleaner.scala:432)
at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:733)
at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:103)
at scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:103)
at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:230)
at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40)
at scala.collection.mutable.HashMap$$anon$1.foreach(HashMap.scala:103)
at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:732)
at org.apache.spark.util.FieldAccessFinder$$anon$3.visitMethodInsn(ClosureCleaner.scala:432)
at org.apache.xbean.asm5.ClassReader.a(Unknown Source)
at org.apache.xbean.asm5.ClassReader.b(Unknown Source)
at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)
at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:262)
at org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:261)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:261)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:159)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2292)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$combineByKeyWithClassTag$1.apply(PairRDDFunctions.scala:88)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$combineByKeyWithClassTag$1.apply(PairRDDFunctions.scala:77)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.PairRDDFunctions.combineByKeyWithClassTag(PairRDDFunctions.scala:77)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$groupByKey$1.apply(PairRDDFunctions.scala:505)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$groupByKey$1.apply(PairRDDFunctions.scala:498)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.PairRDDFunctions.groupByKey(PairRDDFunctions.scala:498)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$groupByKey$3.apply(PairRDDFunctions.scala:641)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$groupByKey$3.apply(PairRDDFunctions.scala:641)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.PairRDDFunctions.groupByKey(PairRDDFunctions.scala:640)
at org.apache.spark.api.java.JavaPairRDD.groupByKey(JavaPairRDD.scala:559)
at com.failuretest.TestReport.main(TestReport.java:22)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:564)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:879)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:197)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:227)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:136)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
It never gets any further than the groupByKey (I'm writing the results to a file in the code above, but that really doesn't matter since execution never reaches it).
I can run it all day long in my local dev instance, but running spark-submit with a jar containing the above fails every time in the cluster.
I'm really not sure where to go from here - what I am trying to do is a bit of a challenge if I cannot group by key.
Am I messing up? Is this a version conflict somewhere?
Dave

I actually figured this out before posting this, but in the interests of helping others...
I discovered that one of my colleagues had decided to have a play around with Java 10 on this particular cluster. Moved it back to Java 8 (sorry - didn't try 9) and the problem went away. That fits the stack trace: Spark 2.3's ClosureCleaner uses a shaded ASM 5 (org.apache.xbean.asm5), and ASM 5's ClassReader constructor throws a bare IllegalArgumentException when handed class files compiled for Java 9 or later, which is exactly the top frame above.
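For anyone else debugging this, a quick way to confirm which JVM is actually in use is to print the Java version on the driver and (if the job gets that far) on the executors. This is only a hypothetical diagnostic sketch, not part of the original job:
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
public class JavaVersionCheck {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JavaVersionCheck");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Driver-side JVM
        System.out.println("Driver Java: " + System.getProperty("java.version"));
        // Executor-side JVM (this map may itself fail on an unsupported JVM,
        // which is informative in its own right)
        String executorJava = sc.parallelize(Arrays.asList(1))
                .map(x -> System.getProperty("java.version"))
                .first();
        System.out.println("Executor Java: " + executorJava);
        sc.close();
    }
}
The Spark UI's Environment tab also reports the Java version the cluster JVMs were started with, and pinning JAVA_HOME to a Java 8 installation in conf/spark-env.sh on each node is one way to keep someone's experiments from leaking into your jobs.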
Dave

Related

azure identity java sdk throwing MutableCoercionConfig error

I'm new to the Java library. I learned that the Azure Java SDK can be used in a Scala environment, so I have tried using the azure-identity library in the Databricks environment.
DBR version: 7.3 LTS
Scala 2.12
Maven coordinate: com.azure:azure-identity:1.3.5
Please help me resolve the error. Here is the code:
import com.azure.identity._;
import com.azure.identity.DefaultAzureCredentialBuilder;
val clientID = dbutils.secrets.get(scope="****",key="****");
val ClientSecret = dbutils.secrets.get(scope="****",key="****");
val tenantID = dbutils.secrets.get(scope="****",key="****");
val endpoint = "****" ;
val clientSecretCredential:ClientSecretCredential = new ClientSecretCredentialBuilder().tenantId(tenantID).clientId(clientID).clientSecret(ClientSecret).build();
Here is the error I face:
java.lang.NoSuchMethodError: com.fasterxml.jackson.dataformat.xml.XmlMapper.coercionConfigDefaults()Lcom/fasterxml/jackson/databind/cfg/MutableCoercionConfig
at com.fasterxml.jackson.dataformat.xml.XmlMapper.<init>(XmlMapper.java:145)
at com.fasterxml.jackson.dataformat.xml.XmlMapper.<init>(XmlMapper.java:127)
at com.fasterxml.jackson.dataformat.xml.XmlMapper.builder(XmlMapper.java:218)
at com.azure.core.util.serializer.JacksonAdapter.<init>(JacksonAdapter.java:137)
at com.azure.core.util.serializer.JacksonAdapter.createDefaultSerializerAdapter(JacksonAdapter.java:189)
at com.azure.identity.implementation.IdentityClient.<clinit>(IdentityClient.java:96)
at com.azure.identity.implementation.IdentityClientBuilder.build(IdentityClientBuilder.java:113)
at com.azure.identity.ClientSecretCredential.<init>(ClientSecretCredential.java:50)
at com.azure.identity.ClientSecretCredentialBuilder.build(ClientSecretCredentialBuilder.java:76)
at linea34e0846b16045d09ff78af8b346393c25.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(command-626790621540495:18)
at linea34e0846b16045d09ff78af8b346393c25.$read$$iw$$iw$$iw$$iw$$iw.<init>(command-626790621540495:68)
at linea34e0846b16045d09ff78af8b346393c25.$read$$iw$$iw$$iw$$iw.<init>(command-626790621540495:70)
at linea34e0846b16045d09ff78af8b346393c25.$read$$iw$$iw$$iw.<init>(command-626790621540495:72)
at linea34e0846b16045d09ff78af8b346393c25.$read$$iw$$iw.<init>(command-626790621540495:74)
at linea34e0846b16045d09ff78af8b346393c25.$read$$iw.<init>(command-626790621540495:76)
at linea34e0846b16045d09ff78af8b346393c25.$read.<init>(command-626790621540495:78)
at linea34e0846b16045d09ff78af8b346393c25.$read$.<init>(command-626790621540495:82)
at linea34e0846b16045d09ff78af8b346393c25.$read$.<clinit>(command-626790621540495)
at linea34e0846b16045d09ff78af8b346393c25.$eval$.$print$lzycompute(<notebook>:7)
at linea34e0846b16045d09ff78af8b346393c25.$eval$.$print(<notebook>:6)
at linea34e0846b16045d09ff78af8b346393c25.$eval.$print(<notebook>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:745)
at scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1021)
at scala.tools.nsc.interpreter.IMain.$anonfun$interpret$1(IMain.scala:574)
at scala.reflect.internal.util.ScalaClassLoader.asContext(ScalaClassLoader.scala:41)
at scala.reflect.internal.util.ScalaClassLoader.asContext$(ScalaClassLoader.scala:37)
at scala.reflect.internal.util.AbstractFileClassLoader.asContext(AbstractFileClassLoader.scala:41)
at scala.tools.nsc.interpreter.IMain.loadAndRunReq$1(IMain.scala:573)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:600)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:570)
at com.databricks.backend.daemon.driver.DriverILoop.execute(DriverILoop.scala:219)
at com.databricks.backend.daemon.driver.ScalaDriverLocal.$anonfun$repl$1(ScalaDriverLocal.scala:204)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.backend.daemon.driver.DriverLocal$TrapExitInternal$.trapExit(DriverLocal.scala:789)
at com.databricks.backend.daemon.driver.DriverLocal$TrapExit$.apply(DriverLocal.scala:742)
at com.databricks.backend.daemon.driver.ScalaDriverLocal.repl(ScalaDriverLocal.scala:204)
at com.databricks.backend.daemon.driver.DriverLocal.$anonfun$execute$10(DriverLocal.scala:431)
at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:239)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:234)
at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:231)
at com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:48)
at com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:276)
at com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:269)
at com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:48)
at com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:408)
at com.databricks.backend.daemon.driver.DriverWrapper.$anonfun$tryExecutingCommand$1(DriverWrapper.scala:653)
at scala.util.Try$.apply(Try.scala:213)
at com.databricks.backend.daemon.driver.DriverWrapper.tryExecutingCommand(DriverWrapper.scala:645)
at com.databricks.backend.daemon.driver.DriverWrapper.getCommandOutputAndError(DriverWrapper.scala:486)
at com.databricks.backend.daemon.driver.DriverWrapper.executeCommand(DriverWrapper.scala:598)
at com.databricks.backend.daemon.driver.DriverWrapper.runInnerLoop(DriverWrapper.scala:391)
at com.databricks.backend.daemon.driver.DriverWrapper.runInner(DriverWrapper.scala:337)
at com.databricks.backend.daemon.driver.DriverWrapper.run(DriverWrapper.scala:219)
at java.lang.Thread.run(Thread.java:748)

How to load HashMap properly from a .yml file?

I am trying to load a HashMap from the config file using the standard Bukkit configuration files API.
HashMap:
public static HashMap<String, String> banned = new HashMap<String, String>();
This is the way I am trying to get the data:
public static boolean isBanned(String uuid) {
    if (Dogends.config.getConfigurationSection("Banned").getKeys(true).contains(uuid)) {
        return true;
    }
    return false;
}
If the player is banned it works fine, but when the player is not banned it throws a NullPointerException.
NullPointerException:
Could not pass event PlayerLoginEvent to Dogends v1.0
org.bukkit.event.EventException
at org.bukkit.plugin.java.JavaPluginLoader$1.execute(JavaPluginLoader.java:302) ~[cb.jar:git-Bukkit-880a532]
at org.bukkit.plugin.RegisteredListener.callEvent(RegisteredListener.java:62) ~[cb.jar:git-Bukkit-880a532]
at org.bukkit.plugin.SimplePluginManager.fireEvent(SimplePluginManager.java:501) [cb.jar:git-Bukkit-880a532]
at org.bukkit.plugin.SimplePluginManager.callEvent(SimplePluginManager.java:486) [cb.jar:git-Bukkit-880a532]
at net.minecraft.server.v1_8_R3.PlayerList.attemptLogin(PlayerList.java:439) [cb.jar:git-Bukkit-880a532]
at net.minecraft.server.v1_8_R3.LoginListener.b(LoginListener.java:89) [cb.jar:git-Bukkit-880a532]
at net.minecraft.server.v1_8_R3.LoginListener.c(LoginListener.java:53) [cb.jar:git-Bukkit-880a532]
at net.minecraft.server.v1_8_R3.NetworkManager.a(NetworkManager.java:222) [cb.jar:git-Bukkit-880a532]
at net.minecraft.server.v1_8_R3.ServerConnection.c(SourceFile:168) [cb.jar:git-Bukkit-880a532]
at net.minecraft.server.v1_8_R3.MinecraftServer.B(MinecraftServer.java:744) [cb.jar:git-Bukkit-880a532]
at net.minecraft.server.v1_8_R3.DedicatedServer.B(DedicatedServer.java:335) [cb.jar:git-Bukkit-880a532]
at net.minecraft.server.v1_8_R3.MinecraftServer.A(MinecraftServer.java:628) [cb.jar:git-Bukkit-880a532]
at net.minecraft.server.v1_8_R3.MinecraftServer.run(MinecraftServer.java:536) [cb.jar:git-Bukkit-880a532]
at java.lang.Thread.run(Unknown Source) [?:1.8.0_91]
Caused by: java.lang.NullPointerException
at me.woulfiee.server.ban.BanCommand.isBanned(BanCommand.java:47) ~[?:?]
at me.woulfiee.server.ban.BanCommand.onPlayerLogin(BanCommand.java:103) ~[?:?]
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[?:1.8.0_91]
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source) ~[?:1.8.0_91]
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source) ~[?:1.8.0_91]
at java.lang.reflect.Method.invoke(Unknown Source) ~[?:1.8.0_91]
at org.bukkit.plugin.java.JavaPluginLoader$1.execute(JavaPluginLoader.java:300) ~[cb.jar:git-Bukkit-880a532]
... 13 more
config.yml:
Ranks:
  Player:
    Players: []
  Mythic:
    Players: []
  Doge:
    Players: []
  Youtuber:
    Players: []
  Builder:
    Players: []
  Mod:
    Players: []
  Admin:
    Players: []
  Owner:
    Players:
    - d166739c-32d3-4b37-a1be-883be57d736c
Broadcast:
  Interval: 120
Banned:
  d166739c-32d3-4b37-a1be-883be57d736c: "CONSOLE \xa7eHELP"
To accomplish what you wish, try the following. First, make sure your config exists and is not null, then read the keys of the Banned section directly:
boolean isBanned(String uuid) {
    FileConfiguration yourConfig;
    // Getting the Banned section
    ConfigurationSection banned = yourConfig.getConfigurationSection("Banned");
    // All the keys inside the Banned configuration section
    Set<String> keys = banned.getKeys(false); // We don't want it to be deep
    if (keys.contains(uuid)) return true; // UUID is on the keys list, so the player is banned
    return false; // UUID is not on the keys list, so the player is not banned
}
I don't believe you actually need the HashMap, unless you're using it for something else.
getConfigurationSection:
If the ConfigurationSection does not exist but a default value has been specified, this will return the default value. If the ConfigurationSection does not exist and no default value was specified, this will return null.
I'm guessing that if no users are banned, there is no Banned section, so getConfigurationSection returns null, which is why your getKeys() call throws an NPE.
So you should first check if the configuration section exists, and only then try to use it.
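A minimal sketch of that guard, reusing the names from the snippet above:
// Guard against a missing "Banned" section before reading its keys.
ConfigurationSection banned = yourConfig.getConfigurationSection("Banned");
if (banned == null) {
    return false; // the section was never written, so nobody is banned
}
return banned.getKeys(false).contains(uuid);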

JClouds-Chef BootstrapConfig Builder MissingMethodException

Please note: although this question involves the JClouds-Chef library and Groovy, I think this is a Java API question at heart.
On JClouds-Chef 1.7.3 here:
List<String> runlist = new RunListBuilder().addRole("typicalapp").build();
ArrayList<String> runList2 = new ArrayList<String>();
for (String item : runlist) {
    runList2.add(item);
}
System.out.println("runList2 is of type: " + runList2.getClass().getName());
BootstrapConfig bootstrapConfig = BootstrapConfig.builder().runlist(runList2).build();
Produces the following output/exception:
runList2 is of type: java.util.ArrayList
Exception in thread "main" groovy.lang.MissingMethodException: No signature of method: org.jclouds.chef.domain.BootstrapConfig$Builder.runlist() is applicable for argument types: (java.util.ArrayList) values: [[role[typicalapp]]]
Possible solutions: runList(java.lang.Iterable), build(), split(groovy.lang.Closure)
at org.codehaus.groovy.runtime.ScriptBytecodeAdapter.unwrap(ScriptBytecodeAdapter.java:55)
at org.codehaus.groovy.runtime.callsite.PojoMetaClassSite.call(PojoMetaClassSite.java:46)
at org.codehaus.groovy.runtime.callsite.CallSiteArray.defaultCall(CallSiteArray.java:45)
at org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:108)
at org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:116)
at net.myuser.chef.test.ChefPlugin.provision(ChefPlugin.groovy:71)
at net.myuser.chef.test.ChefPlugin$provision.call(Unknown Source)
at org.codehaus.groovy.runtime.callsite.CallSiteArray.defaultCall(CallSiteArray.java:45)
at org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:108)
at org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:112)
at net.myuser.chef.test.ChefPlugin.main(ChefPlugin.groovy:27)
I'm pretty sure the code for this version of BootstrapConfig#Builder is here. As far as I can tell, ArrayList implements Iterable, so I can't see what's going on here.
You are using runlist instead of runList; the builder method has a capital L.
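So the call from the question becomes (only the method name changes):
BootstrapConfig bootstrapConfig = BootstrapConfig.builder().runList(runList2).build();
The MissingMethodException already points at this: Groovy lists runList(java.lang.Iterable) under "Possible solutions", and since ArrayList implements Iterable, the argument type was never the problem.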

How to set the Hadoop DistributedCache?

When I run the Hadoop code to add a third-party jar, like the following:
public static void addTmpJar(String jarPath, JobConf conf) throws IOException {
    System.setProperty("path.separator", ":");
    FileSystem fs = FileSystem.getLocal(conf);
    String newJarPath = new Path(jarPath).makeQualified(fs).toString();
    String tmpjars = conf.get("tmpjars");
    if (tmpjars == null || tmpjars.length() == 0) {
        conf.set("tmpjars", newJarPath);
    } else {
        conf.set("tmpjars", tmpjars + "," + newJarPath);
    }
}
I get the following exception:
Error initializing attempt_201405281453_0053_m_000002_0:
org.apache.hadoop.util.DiskChecker$DiskErrorException: Could not find any valid local directory for taskTracker/hadoop/distcache/-7315515059647727905_-860888033_1107570546/nn.hadoop.dev/tmp/hadoop-hadoop/mapred/staging/hadoop/.staging/job_201405281453_0053/libjars/mahout-core-0.8-job.jar
at org.apache.hadoop.fs.LocalDirAllocator$AllocatorPerContext.getLocalPathForWrite(LocalDirAllocator.java:381)
at org.apache.hadoop.fs.LocalDirAllocator.getLocalPathForWrite(LocalDirAllocator.java:146)
at org.apache.hadoop.filecache.TrackerDistributedCacheManager.getLocalCache(TrackerDistributedCacheManager.java:173)
at org.apache.hadoop.filecache.TaskDistributedCacheManager.setupCache(TaskDistributedCacheManager.java:187)
at org.apache.hadoop.mapred.TaskTracker$4.run(TaskTracker.java:1320)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190)
at org.apache.hadoop.mapred.TaskTracker.initializeJob(TaskTracker.java:1311)
at org.apache.hadoop.mapred.TaskTracker.localizeJob(TaskTracker.java:1226)
at org.apache.hadoop.mapred.TaskTracker$5.run(TaskTracker.java:2603)
at java.lang.Thread.run(Thread.java:744)
Can anyone tell me how to solve this problem? Thanks!
From the command line you can add a jar to the distributed cache using -libjars. The only prerequisite is that your MR program implements Tool, which uses GenericOptionsParser; the latter takes care of adding the jar to the cache.
This page explains the above in more detail.
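A minimal sketch of such a driver, using the old mapred API that the question already uses (class and job names here are placeholders, not from the question):
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MyJobDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // getConf() already reflects any -libjars / -D options parsed by GenericOptionsParser
        JobConf job = new JobConf(getConf(), MyJobDriver.class);
        job.setJobName("my-job");
        // ... set mapper/reducer classes and input/output paths here ...
        JobClient.runJob(job);
        return 0;
    }
    public static void main(String[] args) throws Exception {
        // ToolRunner runs GenericOptionsParser before calling run()
        System.exit(ToolRunner.run(new MyJobDriver(), args));
    }
}
You would then submit it with something like (paths are illustrative):
hadoop jar myjob.jar com.example.MyJobDriver -libjars /path/to/mahout-core-0.8-job.jar <input> <output>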

Java Stanford NLP: ArrayIndexOutOfBounds after loading second lexicon

I am using the Stanford Natural Language processing toolkit. I've been trying to find spelling errors with Lexicon's isKnown method, but it produces quite a few false positives. So I thought I'd load a second lexicon, and check that too. However, that causes a problem.
private static LexicalizedParser lp = new LexicalizedParser(Constants.stdLexFile);
private static LexicalizedParser wsjLexParse = new LexicalizedParser(Constants.wsjLexFile);

static {
    lp.setOptionFlags(Constants.lexOptionFlags);
    wsjLexParse.setOptionFlags(Constants.lexOptionFlags);
}

public ParseTree(String input) throws IllegalArgumentException, IllegalAccessException, InvocationTargetException {
    initialInput = input;
    DocumentPreprocessor process = new DocumentPreprocessor();
    sentences = process.getSentencesFromText(new StringReader(input));
    for (List<? extends HasWord> sent : sentences) {
        if (lp.parse(sent)) { // line 65
            forest.add(lp.getBestParse()); // non-determinism?
        }
    }
    partsOfSpeech = pos();
    runAnalysis();
}
The following failure trace is produced:
java.lang.ArrayIndexOutOfBoundsException: 45547
at edu.stanford.nlp.parser.lexparser.BaseLexicon.initRulesWithWord(BaseLexicon.java:300)
at edu.stanford.nlp.parser.lexparser.BaseLexicon.isKnown(BaseLexicon.java:160)
at edu.stanford.nlp.parser.lexparser.BaseLexicon.ruleIteratorByWord(BaseLexicon.java:212)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.initializeChart(ExhaustivePCFGParser.java:1299)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.parse(ExhaustivePCFGParser.java:388)
at edu.stanford.nlp.parser.lexparser.LexicalizedParser.parse(LexicalizedParser.java:234)
at nth.compling.ParseTree.<init>(ParseTree.java:65)
at nth.compling.ParseTreeTest.constructor(ParseTreeTest.java:33)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at org.junit.internal.runners.BeforeAndAfterRunner.invokeMethod(BeforeAndAfterRunner.java:74)
at org.junit.internal.runners.BeforeAndAfterRunner.runBefores(BeforeAndAfterRunner.java:50)
at org.junit.internal.runners.BeforeAndAfterRunner.runProtected(BeforeAndAfterRunner.java:33)
at org.junit.internal.runners.TestClassRunner.run(TestClassRunner.java:52)
at org.eclipse.jdt.internal.junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:45)
at org.eclipse.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:460)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:673)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:386)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:196)
If I comment out this line (and the other references to wsjLexParse):
private static LexicalizedParser wsjLexParse = new LexicalizedParser(Constants.wsjLexFile);
then everything works fine. What am I doing wrong here?
Looks like a bug in the Stanford library. You should report it to them.
Does the second lexicon work when you load only it (and not the other one)?
Does the same error occur when you load the two lexica in a different order?
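A rough sketch of that second test, reusing only the constructors and flags already shown in the question (Constants and the surrounding class are assumed to be yours):
// Hypothetical isolation test: construct the parsers in the opposite order
// and see whether the ArrayIndexOutOfBoundsException moves or disappears.
private static LexicalizedParser wsjLexParse = new LexicalizedParser(Constants.wsjLexFile); // now loaded first
private static LexicalizedParser lp = new LexicalizedParser(Constants.stdLexFile);          // now loaded second
static {
    wsjLexParse.setOptionFlags(Constants.lexOptionFlags);
    lp.setOptionFlags(Constants.lexOptionFlags);
}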
