Cannot read data from Cosmos DB analytical store via managed Virtual Network in a Synapse workspace.
I created a Synapse workspace with a managed Virtual Network, then added managed private endpoints to Cosmos DB for both the analytical store and the SQL (transactional) store, and approved the managed private endpoints. I created the Synapse Link to Cosmos DB via the managed VNet runtime. Everything was fine until I tried to read data into a DataFrame in a Spark pool.
source:
df = spark.read\
.format("cosmos.olap")\
.option("spark.synapse.linkedService", "CosmosDb_p")\
.option("spark.cosmos.container", "xxxxx_cost")\
.load()
display(df.limit(10))
I get the error below:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
/tmp/ipykernel_6648/509115221.py in <module>
2 # To select a preferred list of regions in a multi-region Cosmos DB account, add .option("spark.cosmos.preferredRegions", "<Region1>,<Region2>")
3
----> 4 df = spark.read\
5 .format("cosmos.olap")\
6 .option("spark.synapse.linkedService", "CosmosDb_p")\
/opt/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
162 return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
163 else:
--> 164 return self._df(self._jreader.load())
165
166 def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
~/cluster-env/env/lib/python3.8/site-packages/py4j/java_gateway.py in call(self, *args)
1319
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1323
/opt/spark/python/lib/pyspark.zip/pyspark/sql/utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
~/cluster-env/env/lib/python3.8/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTERtype
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o717.load.
: org.apache.hadoop.fs.azure.AzureException: com.microsoft.azure.storage.StorageException:
at org.apache.hadoop.fs.azure.AzureNativeFileSystemStore.retrieveMetadata(AzureNativeFileSystemStore.java:2223)
at org.apache.hadoop.fs.azure.NativeAzureFileSystem.listStatus(NativeAzureFileSystem.java:2793)
at com.microsoft.azure.cosmos.analytics.spark.connector.store.alos.ALoSFileManager.listAllRootMetadataPath(ALoSFileManager.scala:268)
at com.microsoft.azure.cosmos.analytics.spark.connector.store.alos.ALoSFileManager.getRootMetadataPathList(ALoSFileManager.scala:289)
at com.microsoft.azure.cosmos.analytics.spark.connector.store.alos.ALoSSnapshot.createFileSegmentsMetadata(ALoSSnapshot.scala:158)
at com.microsoft.azure.cosmos.analytics.spark.connector.store.alos.ALoSSnapshot.<init>(ALoSSnapshot.scala:33)
at com.microsoft.azure.cosmos.analytics.spark.connector.store.alos.ALoSLogicalTable.startTransaction(ALoSLogicalTable.scala:43)
at com.microsoft.azure.cosmos.analytics.spark.connector.CosmosLogicalTable.startTransaction(CosmosLogicalTable.scala:54)
at com.microsoft.azure.cosmos.analytics.spark.connector.datasource.CosmosOLAPSourceBase.createAndInitializeCosmosLogicalTable(CosmosOLAPSourceBase.scala:67)
at com.microsoft.azure.cosmos.analytics.spark.connector.datasource.CosmosOLAPSourceBase.createAndInitializeCosmosLogicalTable$(CosmosOLAPSourceBase.scala:48)
at com.microsoft.azure.cosmos.analytics.spark.connector.datasource.CosmosOLAPSource.createAndInitializeCosmosLogicalTable(CosmosOLAPSource.scala:18)
at com.microsoft.azure.cosmos.analytics.spark.connector.datasource.CosmosOLAPSource.inferSchema(CosmosOLAPSource.scala:30)
at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.getTableFromProvider(DataSourceV2Utils.scala:81)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$1(DataFrameReader.scala:241)
at scala.Option.map(Option.scala:230)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:218)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:176)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: com.microsoft.azure.storage.StorageException:
at com.microsoft.azure.storage.StorageException.translateException(StorageException.java:87)
at com.microsoft.azure.storage.core.ExecutionEngine.executeWithRetry(ExecutionEngine.java:209)
at com.microsoft.azure.storage.blob.CloudBlob.exists(CloudBlob.java:1994)
at com.microsoft.azure.storage.blob.CloudBlob.exists(CloudBlob.java:1981)
at org.apache.hadoop.fs.azure.StorageInterfaceImpl$CloudBlobWrapperImpl.exists(StorageInterfaceImpl.java:333)
at org.apache.hadoop.fs.azure.AzureNativeFileSystemStore.retrieveMetadata(AzureNativeFileSystemStore.java:2158)
... 27 more
Caused by: java.net.UnknownHostException: 1307346684204174987.z29.blob.storage.azure.net
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:184)
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
at java.net.Socket.connect(Socket.java:607)
at sun.security.ssl.SSLSocketImpl.connect(SSLSocketImpl.java:288)
at sun.net.NetworkClient.doConnect(NetworkClient.java:175)
at sun.net.www.http.HttpClient.openServer(HttpClient.java:463)
at sun.net.www.http.HttpClient.openServer(HttpClient.java:558)
at sun.net.www.protocol.https.HttpsClient.<init>(HttpsClient.java:264)
at sun.net.www.protocol.https.HttpsClient.New(HttpsClient.java:367)
at sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.getNewHttpClient(AbstractDelegateHttpsURLConnection.java:203)
at sun.net.www.protocol.http.HttpURLConnection.plainConnect0(HttpURLConnection.java:1162)
at sun.net.www.protocol.http.HttpURLConnection.plainConnect(HttpURLConnection.java:1056)
at sun.net.www.protocol.https.AbstractDelegateHttpsURLConnection.connect(AbstractDelegateHttpsURLConnection.java:189)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1567)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1495)
at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:480)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.getResponseCode(HttpsURLConnectionImpl.java:352)
at com.microsoft.azure.storage.core.ExecutionEngine.executeWithRetry(ExecutionEngine.java:115)
... 31 more
---------------------------------------------------------------------------
I tried increasing the memory to a large pool size and switched the Spark version from 3.2 to 2.4, but nothing changed.
However, I found that it works fine when I use "spark.read.format("cosmos.oltp")", i.e. reading from the SQL (transactional) store instead. Note that the failure is a java.net.UnknownHostException for the analytical store's blob endpoint (*.blob.storage.azure.net), which suggests the analytical-store private endpoint DNS is not being resolved from the managed VNet.
Is there a bug in the managed Virtual Network, or might I be doing something wrong?
Thanks!