Synapse - Py4JJavaError: An error occurred while calling None.com.amazon.deequ.analyzers.Size

Dondapati, Navin 281 Reputation points
2021-05-24T19:48:08.963+00:00

The following code was run in a Spark notebook:

%%pyspark
from pyspark.sql import SparkSession, Row
import pydeequ

spark = (SparkSession
    .builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate())

df = spark.sparkContext.parallelize([
    Row(a="foo", b=1, c=5),
    Row(a="bar", b=2, c=6),
    Row(a="baz", b=3, c=None)]).toDF()

%%pyspark
from pydeequ.analyzers import *

analysisResult = AnalysisRunner(spark) \
    .onData(df) \
    .addAnalyzer(Size()) \
    .addAnalyzer(Completeness("b")) \
    .run()

analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.show()

Running the second cell fails with the error below:

Traceback (most recent call last):

File "/home/trusted-service-user/cluster-env/env/lib/python3.6/site-packages/pydeequ/analyzers.py", line 122, in addAnalyzer
_analyzer_jvm = analyzer._analyzer_jvm

File "/home/trusted-service-user/cluster-env/env/lib/python3.6/site-packages/pydeequ/analyzers.py", line 743, in _analyzer_jvm
self._jvm.scala.Option.apply(self.where)

File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1525, in call
answer, self._gateway_client, None, self._fqn)

File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 69, in deco
return f(*a, **kw)

File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)

py4j.protocol.Py4JJavaError: An error occurred while calling None.com.amazon.deequ.analyzers.Size.
: java.lang.NoSuchMethodError: scala.Product.$init$(Lscala/Product;)V
at com.amazon.deequ.analyzers.Size.<init>(Size.scala:37)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
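
The NoSuchMethodError on scala.Product.$init$(Lscala/Product;)V is the typical signature of a Scala binary-version mismatch, i.e. the deequ jar was built against a different Scala version (2.11 vs 2.12) than the one the Spark pool runs. A quick diagnostic sketch, assuming pydeequ imports and the default spark session is available in the notebook:

%%pyspark
# Diagnostic sketch: compare the deequ coordinate pydeequ requests with the
# Spark/Scala build the Synapse pool is actually running.
import pydeequ

print("deequ coordinate requested:", pydeequ.deequ_maven_coord)
print("Spark version:", spark.version)
# scala.util.Properties.versionString() returns e.g. "version 2.11.12"
print("Scala version:", spark.sparkContext._jvm.scala.util.Properties.versionString())

If the Scala version printed here does not match the Scala version the deequ jar was built for, the Size constructor (and every other analyzer) fails exactly as shown above.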

We also got the error below:

ModuleNotFoundError: No module named 'pydeequ'
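
The ModuleNotFoundError is a separate issue from the jar: the pydeequ Python package itself is not available to the notebook's interpreter. A minimal sketch of making it available, assuming the pool picks up a requirements file for Python packages (the wheel from the PyPI link further down could also be uploaded as a workspace package):

# requirements.txt applied to the Spark pool (sketch; version left unpinned)
pydeequ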

We loaded the jar deequ-1.2.2-spark-2.4.jar as a workspace package.

We monitored the Apache Spark applications, and the Livy job completed successfully.

Jar download location:
https://repo1.maven.org/maven2/com/amazon/deequ/deequ/

pydeequ package download location:
https://pypi.org/project/pydeequ/#files
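
As an alternative to resolving the jar through pydeequ.deequ_maven_coord, the exact build named above can be pinned in spark.jars.packages. This is only a sketch: it assumes the jar name deequ-1.2.2-spark-2.4.jar corresponds to the Maven coordinate com.amazon.deequ:deequ:1.2.2-spark-2.4, and in Synapse the session usually already exists, so getOrCreate() may return it without applying new jar settings (the coordinate may instead need to go into the pool or session configuration).

%%pyspark
# Sketch: pin the exact deequ build rather than relying on pydeequ.deequ_maven_coord,
# so the session resolves an artifact known to match the pool's Spark/Scala version.
from pyspark.sql import SparkSession
import pydeequ

spark = (SparkSession
    .builder
    .config("spark.jars.packages", "com.amazon.deequ:deequ:1.2.2-spark-2.4")  # assumed coordinate
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate())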

It does not work with Scala either:
%%spark
val df = spark.read.format("csv").option("header","true").load("abfss://xxx@Piepel.dfs.core.windows.net/xxx/xxx.csv")

%%spark
import spark.implicits._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.concat
import com.amazon.deequ.{VerificationSuite, VerificationResult}
import com.amazon.deequ.VerificationResult.checkResultsAsDataFrame
import com.amazon.deequ.checks.{Check, CheckLevel, CheckStatus}
import com.amazon.deequ.suggestions.{ConstraintSuggestionRunner, Rules}
import com.amazon.deequ.analyzers._
import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import com.amazon.deequ.analyzers.runners.AnalyzerContext.successMetricsAsDataFrame
import com.amazon.deequ.analyzers.{Analysis, ApproxCountDistinct, Completeness, Compliance, Distinctness, InMemoryStateProvider, Size}

val analysisResult: AnalyzerContext = {
  AnalysisRunner
    // data to run the analysis on
    .onData(df)
    // define analyzers that compute metrics
    .addAnalyzer(Size())
    .addAnalyzer(Completeness("MaterialNumber"))
    //.addAnalyzer(ApproxCountDistinct("review_id"))
    //.addAnalyzer(Mean("star_rating"))
    //.addAnalyzer(Compliance("top star_rating", "star_rating >= 4.0"))
    //.addAnalyzer(Correlation("total_votes", "star_rating"))
    //.addAnalyzer(Correlation("total_votes", "helpful_votes"))
    // compute metrics
    .run()
}

// retrieve successfully computed metrics as a Spark data frame
val metrics = successMetricsAsDataFrame(spark, analysisResult)

Error: java.lang.NoSuchMethodError: scala.Product.$init$(Lscala/Product;)V
at com.amazon.deequ.analyzers.Size.<init>(Size.scala:37)

Regards,
navin
