The `zipfile` module cannot open DBFS (`dbfs:/`) paths directly, so first copy each archive to a local path on the Databricks driver, unzip it there, and then handle any nested zips.
After unzipping, check whether any of the extracted files are themselves zip archives; if so, unzip those as well, recursively.
import zipfile
import os
from pyspark.dbutils import DBUtils
def unzip_nested_zipfile(file_path, extract_path):
    """Extract the archive at *file_path* into *extract_path*, recursively.

    Any member whose name ends in ``.zip`` is itself extracted into the same
    *extract_path*, and the inner archive file is deleted afterwards.
    """
    with zipfile.ZipFile(file_path, 'r') as archive:
        archive.extractall(extract_path)
        members = archive.namelist()
    for member in members:
        if not member.endswith('.zip'):
            continue
        nested_archive = os.path.join(extract_path, member)
        unzip_nested_zipfile(nested_archive, extract_path)
        # Drop the inner archive now that its contents are extracted.
        os.remove(nested_archive)
# Databricks notebook script: copy each *.zip in the DBFS source folder to
# local disk, extract it (including nested zips), upload the extracted files
# back to DBFS, then clean up the local scratch space before the next archive.
#
# NOTE(review): `dbutils` is the Databricks-provided notebook global; the
# `from pyspark.dbutils import DBUtils` import above appears unused in a
# notebook context — confirm before removing.
source_folder = '/mnt/input/zip'
destination_folder = '/mnt/output/unzip'
local_tmp_folder = '/tmp/zip_tmp/'

dbutils.fs.mkdirs(destination_folder)  # Ensure the DBFS destination exists
os.makedirs(local_tmp_folder, exist_ok=True)  # Local scratch directory

for file_info in dbutils.fs.ls(source_folder):
    file_path = file_info.path
    if not file_path.endswith('.zip'):
        continue

    local_file_path = os.path.join(local_tmp_folder, os.path.basename(file_path))
    dbutils.fs.cp(file_path, f"file:{local_file_path}")
    try:
        unzip_nested_zipfile(local_file_path, local_tmp_folder)

        # Upload extracted files to DBFS, preserving their relative layout.
        # BUGFIX: the original built the destination via str.replace() with a
        # trailing slash on local_tmp_folder and none on destination_folder,
        # producing broken paths like '/mnt/output/unzipfile.txt'. Use
        # relpath + join instead.
        for root, _, files in os.walk(local_tmp_folder):
            for file_name in files:
                local_unzipped_file = os.path.join(root, file_name)
                # BUGFIX: don't upload the downloaded archive itself.
                if local_unzipped_file == local_file_path:
                    continue
                relative_path = os.path.relpath(local_unzipped_file, local_tmp_folder)
                dest_file_path = os.path.join(destination_folder, relative_path)
                dbutils.fs.cp(f"file:{local_unzipped_file}", dest_file_path)
        print(f"Unzipping of {file_path} completed successfully!")
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    finally:
        # Clean up every local file (including the downloaded archive) and any
        # now-empty subdirectories so the next iteration starts from a clean
        # scratch directory. Bottom-up walk so directories are empty when
        # os.rmdir runs. (The original left empty subdirectories behind.)
        for root, dirs, files in os.walk(local_tmp_folder, topdown=False):
            for file_name in files:
                os.remove(os.path.join(root, file_name))
            for dir_name in dirs:
                os.rmdir(os.path.join(root, dir_name))