Hello,
I am trying to read a parquet files from the storage accounts. I am using parquet.net library for reading the parquet files. My goal is to reading all the parquet files in the storage account and check which columns has null values.
I tried using azure databricks pyspark, however since some of the column names has special characters its not working. I tried pandas in azure databricks, its taking long time for processing. Hence i tried using azure functions with c# . However i am getting error since each parquet file has different order of columns.
Could someone help me in what other options i have or how can i fix this
string connectionString = "<<storage account connection string>>";
log.LogInformation($"C# Timer trigger function executed at: {DateTime.Now}");
BlobServiceClient blobServiceClient = new BlobServiceClient(connectionString);
string containerName = "containername";
BlobContainerClient containerClient = blobServiceClient.GetBlobContainerClient(containerName);
//Getting the sample structure of the blob to create the final table
string sampleBlobName = "Blobname.parquet";
BlobClient sampleblobClient = containerClient.GetBlobClient(sampleBlobName);
var sampleStream = sampleblobClient.OpenRead();
var sampleReader = new ParquetReader(sampleStream);
Table sampleTable = sampleReader.ReadAsTable();
DataField[] datafields = sampleTable.Schema.GetDataFields();
Table finalTable = new Table(datafields);
//looping through the container for getting the blobs and read the file and add the rows to the final table
await foreach (BlobItem blobItem in containerClient.GetBlobsAsync())
{
if (blobItem.Name.Contains("2021072")) //for testing
{
log.LogInformation("\t" + blobItem.Name);
BlobClient blobClient = containerClient.GetBlobClient(blobItem.Name);
var stream = blobClient.OpenRead();
var reader = new ParquetReader(stream);
Table table = reader.ReadAsTable();
foreach (Row row in table)
{
finalTable.Add(row,);
}
log.LogInformation(finalTable.Count.ToString());
}
}