Unzipping a nested zip file stored in Blob Storage is not working in Azure Databricks

manish verma 441 Reputation points
2024-09-01T15:52:41.77+00:00

Hi All,

I am trying to unzip a nested zip file in Azure Databricks; the nested zip file is stored in Blob Storage. Below is the code I am using. It does not give any error message, but I don't see any files in the destination folder.

import zipfile
from pyspark.dbutils import DBUtils

def unzip_nested_zipfile(file_path, extract_path):
    """Extract a zip archive and, recursively, every zip archive inside it.

    All members of ``file_path`` are extracted below ``extract_path``. Each
    member whose name ends in ``.zip`` is then opened from its extracted
    location and unpacked into the same ``extract_path``. The nested archives
    themselves are left in place after extraction.
    """
    with zipfile.ZipFile(file_path, 'r') as archive:
        archive.extractall(extract_path)
        # Only members that are themselves archives need a second pass.
        nested_members = [name for name in archive.namelist() if name.endswith('.zip')]
        for member in nested_members:
            nested_archive = "{}/{}".format(extract_path, member)
            unzip_nested_zipfile(nested_archive, extract_path)

source_folder = '/mnt/input/zip'
destination_folder = '/mnt/output/unzip'

# zipfile goes through the driver's local file API, so a DBFS location has to
# be addressed through the /dbfs mount. destination_folder carries no "dbfs:"
# prefix, so the previous .replace("dbfs:", "/dbfs") left it untouched and the
# archive was silently extracted to /mnt/output/unzip on the driver's local
# disk instead of DBFS. Build the local path explicitly.
local_destination_folder = f"/dbfs{destination_folder}"

for file_info in dbutils.fs.ls(source_folder):
    # The "dbfs:" scheme on the listed path is swapped for the /dbfs mount
    # below so zipfile can open the archive as a local file.
    file_path = file_info.path
    if file_path.endswith('.zip'):
        try:
            unzip_nested_zipfile(file_path.replace("dbfs:", "/dbfs"), local_destination_folder)
            print("Unzipping completed successfully!")
        except FileNotFoundError:
            print(f"File not found: {file_path}")
Azure Databricks
Azure Databricks
An Apache Spark-based analytics platform optimized for Azure.
2,151 questions
{count} votes

1 answer

Sort by: Most helpful
  1. Amira Bedhiafi 22,616 Reputation points
    2024-09-02T15:09:22.4666667+00:00

    The zipfile module might not work directly with the DBFS path, so you should first copy the file to a local path in Databricks, unzip it, and then handle nested zips.

    After unzipping, check if the extracted files contain any zip files, and if so, unzip those as well.

    
    import zipfile
    
    import os
    
    from pyspark.dbutils import DBUtils
    
    def unzip_nested_zipfile(file_path, extract_path):
        """Extract a zip archive, then unpack and delete every nested zip.

        All members of ``file_path`` are extracted below ``extract_path``.
        Each member whose name ends in ``.zip`` is unpacked into the same
        ``extract_path`` and the nested archive file is removed afterwards.
        """
        archive = zipfile.ZipFile(file_path, 'r')
        with archive:
            archive.extractall(extract_path)
            nested_members = [name for name in archive.namelist() if name.endswith('.zip')]
            for member in nested_members:
                nested_archive = os.path.join(extract_path, member)
                unzip_nested_zipfile(nested_archive, extract_path)
                # The nested archive is no longer needed once its contents are out.
                os.remove(nested_archive)
    
    import tempfile

    source_folder = '/mnt/input/zip'
    destination_folder = '/mnt/output/unzip'
    local_tmp_folder = '/tmp/zip_tmp/'

    dbutils.fs.mkdirs(destination_folder)  # Ensure the destination folder exists
    os.makedirs(local_tmp_folder, exist_ok=True)  # Create a local temp directory

    for file_info in dbutils.fs.ls(source_folder):
        file_path = file_info.path
        if not file_path.endswith('.zip'):
            continue

        # Each archive gets its own scratch directory that is removed
        # (files AND sub-directories) when the block exits, even on error.
        with tempfile.TemporaryDirectory(dir=local_tmp_folder) as work_dir:
            local_file_path = os.path.join(work_dir, os.path.basename(file_path))
            # Extract into a separate sub-directory so the downloaded source
            # archive is not walked and copied to the destination.
            extract_dir = os.path.join(work_dir, 'extracted')

            dbutils.fs.cp(file_path, f"file:{local_file_path}")

            try:
                unzip_nested_zipfile(local_file_path, extract_dir)

                # Copy unzipped contents to the destination in DBFS
                for root, _, files in os.walk(extract_dir):
                    for file_name in files:
                        local_unzipped_file = os.path.join(root, file_name)
                        # Build the destination from the path relative to the
                        # extraction root; a plain str.replace() of the temp
                        # prefix dropped the "/" between folder and file name.
                        relative_path = os.path.relpath(local_unzipped_file, extract_dir)
                        dest_file_path = (
                            f"{destination_folder.rstrip('/')}/"
                            f"{relative_path.replace(os.sep, '/')}"
                        )
                        dbutils.fs.cp(f"file:{local_unzipped_file}", dest_file_path)

                print(f"Unzipping of {file_path} completed successfully!")
            except FileNotFoundError:
                print(f"File not found: {file_path}")
    

Your answer

Answers can be marked as Accepted Answers by the question author, which helps users to know the answer solved the author's problem.