I'm trying to implement an NLP pipeline using Spark/Scala.
Right now I'm stuck on subtracting one collection (implemented as a DataFrame) from another:
items in both collections have IDs, but the number of attributes associated with an ID differs between the two collections.
Example:
Entry in Collection A: "_id" -> "someUniqueID", "attribute1" -> "someValue"
Entry in Collection B: "_id" -> "someUniqueID", "attribute1" -> "someValue", "attribute2" -> "someValue"
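For context, here is a minimal sketch of how the two collections could be built by hand for testing. The column names mirror the example above; in the actual pipeline the DataFrames come out of earlier NLP stages, so the setup below is just an assumption for reproduction:

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical standalone setup; in the real pipeline the DataFrames
// are produced by earlier stages rather than built from literals.
val spark = SparkSession.builder()
  .appName("nlp-pipeline-repro")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

// Collection A: ID plus one attribute
val collection_A = Seq(
  ("someUniqueID", "someValue")
).toDF("_id", "attribute1")

// Collection B: same ID, but one extra attribute
val collection_B = Seq(
  ("someUniqueID", "someValue", "someValue")
).toDF("_id", "attribute1", "attribute2")
```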
I tried to accomplish this by using:
collection_A.join(collection_B, Seq("_id"), "left_semi")
After doing so, I can no longer call actions such as .show() or .count() on the result, but .printSchema() works and the resulting structure is the desired one.
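For illustration, a minimal sketch of the sequence that fails (joined is just a hypothetical name for the join result):

```scala
val joined = collection_A.join(collection_B, Seq("_id"), "left_semi")

joined.printSchema() // works: prints the schema of collection_A
joined.show()        // throws java.lang.AbstractMethodError
joined.count()       // fails the same way
```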
Calling either of the methods mentioned above results in the error log listed below:
Exception in thread "main" java.lang.AbstractMethodError
at scala.collection.TraversableLike$class.filter(TraversableLike.scala:270)
at org.apache.spark.sql.catalyst.expressions.ExpressionSet.filter(ExpressionSet.scala:55)
at org.apache.spark.sql.catalyst.plans.logical.QueryPlanConstraints$class.constraints(QueryPlanConstraints.scala:36)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.constraints$lzycompute(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.constraints(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.optimizer.InferFiltersFromConstraints$.org$apache$spark$sql$catalyst$optimizer$InferFiltersFromConstraints$$getAllConstraints(Optimizer.scala:805)
at org.apache.spark.sql.catalyst.optimizer.InferFiltersFromConstraints$$anonfun$inferFilters$1.applyOrElse(Optimizer.scala:780)
at org.apache.spark.sql.catalyst.optimizer.InferFiltersFromConstraints$$anonfun$inferFilters$1.applyOrElse(Optimizer.scala:765)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:258)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:258)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:257)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDown(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$class.transformDown(AnalysisHelper.scala:149)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDown(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDown(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:263)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:263)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:328)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:186)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:326)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:263)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDown(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$class.transformDown(AnalysisHelper.scala:149)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDown(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDown(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:263)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:263)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:328)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:186)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:326)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:263)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDown(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$class.transformDown(AnalysisHelper.scala:149)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDown(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDown(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:263)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:263)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:328)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:186)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:326)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:263)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDown(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$class.transformDown(AnalysisHelper.scala:149)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDown(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDown(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:247)
at org.apache.spark.sql.catalyst.optimizer.InferFiltersFromConstraints$.inferFilters(Optimizer.scala:765)
at org.apache.spark.sql.catalyst.optimizer.InferFiltersFromConstraints$.apply(Optimizer.scala:759)
at org.apache.spark.sql.catalyst.optimizer.InferFiltersFromConstraints$.apply(Optimizer.scala:754)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:87)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:84)
at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:35)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:84)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:76)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:76)
at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:67)
at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:67)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:73)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:69)
at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:78)
at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:78)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3365)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
at org.apache.spark.sql.Dataset.show(Dataset.scala:751)
at org.apache.spark.sql.Dataset.show(Dataset.scala:710)
at org.apache.spark.sql.Dataset.show(Dataset.scala:719)
at App$.main(App.scala:49)
at App.main(App.scala)
I'd really appreciate any help or hints regarding this error.