Synapse - Py4JJavaError: An error occurred while calling None.com.amazon.deequ.analyzers.Size
We used the code below in a Synapse Spark notebook:
%%pyspark
from pyspark.sql import SparkSession, Row
import pydeequ
spark = (SparkSession
    .builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate())

df = spark.sparkContext.parallelize([
    Row(a="foo", b=1, c=5),
    Row(a="bar", b=2, c=6),
    Row(a="baz", b=3, c=None)]).toDF()
%%pyspark
from pydeequ.analyzers import *
analysisResult = AnalysisRunner(spark) \
    .onData(df) \
    .addAnalyzer(Size()) \
    .addAnalyzer(Completeness("b")) \
    .run()
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.show()
Py4JJavaError: An error occurred while calling None.com.amazon.deequ.analyzers.Size.
: java.lang.NoSuchMethodError: scala.Product.$init$(Lscala/Product;)V
at com.amazon.deequ.analyzers.Size.<init>(Size.scala:37)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Traceback (most recent call last):
File "/home/trusted-service-user/cluster-env/env/lib/python3.6/site-packages/pydeequ/analyzers.py", line 122, in addAnalyzer
_analyzer_jvm = analyzer._analyzer_jvm
File "/home/trusted-service-user/cluster-env/env/lib/python3.6/site-packages/pydeequ/analyzers.py", line 743, in _analyzer_jvm
self._jvm.scala.Option.apply(self.where)
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1525, in call
answer, self._gateway_client, None, self._fqn)
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 69, in deco
return f(*a, **kw)
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling None.com.amazon.deequ.analyzers.Size.
: java.lang.NoSuchMethodError: scala.Product.$init$(Lscala/Product;)V
at com.amazon.deequ.analyzers.Size.<init>(Size.scala:37)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
We also got the error below:

Traceback (most recent call last):
ModuleNotFoundError: No module named 'pydeequ'
We loaded the deequ-1.2.2-spark-2.4.jar file into Workspace packages.
We monitored the Apache Spark applications, and the Livy job completed successfully.
Download locations:
deequ jar: https://repo1.maven.org/maven2/com/amazon/deequ/deequ/
pydeequ package: https://pypi.org/project/pydeequ/#files
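
For reference, below is a minimal sketch of the session setup, with the deequ coordinate pinned explicitly instead of relying on the workspace upload, and the pool's Spark and Scala versions printed so they can be compared against the jar. The Maven coordinate is assumed to match the downloaded deequ-1.2.2-spark-2.4.jar, and reading the Scala version through the Py4J gateway is an assumption on our part.

%%pyspark
# Sketch: pin the deequ coordinate explicitly and print the runtime versions,
# so the jar can be checked against the pool's Spark/Scala build.
from pyspark.sql import SparkSession

spark = (SparkSession
    .builder
    .config("spark.jars.packages", "com.amazon.deequ:deequ:1.2.2-spark-2.4")
    .getOrCreate())

print(spark.version)  # Spark version of the pool
# Scala version of the notebook JVM (assumed accessor via the Py4J gateway)
print(spark.sparkContext._jvm.scala.util.Properties.versionNumberString())

Note that in Synapse the Spark session already exists when the cell runs, so the spark.jars.packages setting may need to go into the pool or session configuration instead; the cell is only meant to show which coordinate and versions are in play.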
It does not work with Scala either:
%%spark
val df = spark.read.format("csv").option("header","true").load("abfss://xxx@Piepel.dfs.core.windows.net/xxx/xxx.csv")
%%spark
import spark.implicits._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.concat
import com.amazon.deequ.{VerificationSuite, VerificationResult}
import com.amazon.deequ.VerificationResult.checkResultsAsDataFrame
import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus}
import com.amazon.deequ.suggestions.{ConstraintSuggestionRunner, Rules}
import com.amazon.deequ.analyzers._
import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import com.amazon.deequ.analyzers.runners.AnalyzerContext.successMetricsAsDataFrame
import com.amazon.deequ.analyzers.{Analysis, ApproxCountDistinct, Completeness, Compliance, Distinctness, InMemoryStateProvider, Size}
val analysisResult: AnalyzerContext = {
  AnalysisRunner
    // data to run the analysis on
    .onData(df)
    // define analyzers that compute metrics
    .addAnalyzer(Size())
    .addAnalyzer(Completeness("MaterialNumber"))
    //.addAnalyzer(ApproxCountDistinct("review_id"))
    //.addAnalyzer(Mean("star_rating"))
    //.addAnalyzer(Compliance("top star_rating", "star_rating >= 4.0"))
    //.addAnalyzer(Correlation("total_votes", "star_rating"))
    //.addAnalyzer(Correlation("total_votes", "helpful_votes"))
    // compute metrics
    .run()
}
// retrieve successfully computed metrics as a Spark data frame
val metrics = successMetricsAsDataFrame(spark, analysisResult)
Error: java.lang.NoSuchMethodError: scala.Product.$init$(Lscala/Product;)V
at com.amazon.deequ.analyzers.Size.<init>(Size.scala:37)
Regards,
navin