Excel files saved on Sharepoint corrupted with embed_image function of Xlsxwriter (Microsoft Graph and Python)
Hello,
I have the following section of my Python script that works perfectly fine locally but as soon as I work on remote files on a SharePoint site, the Excel file gets corrupted after running the script (see attached image). All other formatting tasks using xlsxwriter work perfectly fine on remote files on SH.
I have included below the section of the script that causes the issue. I am not sure if this could come from the BytesIO used for the file and then the images, or the way the file is uploaded to SH in the script? I tried resetting the BytesIO and/or closing it in the loop and it does not work then (i.e. no embedded images) but the file is no longer corrupted .
Do you have any idea what could be causing this issue?
Thanks in advance for your help with this.
# Function to upload files to SharePoint
def upload_file_to_sharepoint(access_token, site_id, library_id, folder_path, file_name, file_data):
formatted_path = f"/{folder_path}/{file_name}".replace(" ", "%20")
endpoint = f'https://graph.microsoft.com/v1.0/sites/{site_id}/drives/{library_id}/root:{formatted_path}:/content'
headers = {'Authorization': 'Bearer ' + access_token, 'Content-Type': 'application/octet-stream'}
response = requests.put(endpoint, headers=headers, data=file_data)
if response.status_code in [200, 201]:
print("File uploaded successfully.")
else:
print(f"Failed to upload file: {response.status_code}, {response.text}")
# Create a new Excel file with xlsxwriter
output_stream = BytesIO()
workbook = xlsxwriter.Workbook(output_stream, {'in_memory': True})
worksheet = workbook.add_worksheet()
# Write data to Excel, consider URL formatting and image embedding
for row_num, row in enumerate(filtered_df.itertuples(index=False), start=1):
# Set row height to 300 if the row has any data
if any(pd.notna(value) for value in row):
worksheet.set_row(row_num, 300)
for col_num, value in enumerate(row):
column_name = filtered_df.columns[col_num]
# Determine the format based on whether the column is a designated URL column
format_to_use = url_format if column_name in url_columns and pd.notna(value) else cell_format
# Handle specific behavior for image URLs in 'Post Overview'
if column_name == 'Post Overview' and pd.notna(value):
imgUrl = getattr(row, 'imgUrl', None)
try:
image_data = requests.get(imgUrl).content
image_stream = BytesIO(image_data)
#image_stream.seek(0) # Reset stream position. Test failed
worksheet.embed_image(row_num, col_num, 'image.png', {'image_data': image_stream})
#image_stream.close() # Test failed
except Exception as e:
print(f"Failed to load image from {imgUrl}: {e}")
worksheet.write(row_num, col_num, 'Failed to load image', format_to_use)
else:
worksheet.write(row_num, col_num, value if pd.notna(value) else '', format_to_use)
workbook.close()
output_stream.seek(0)
processed_file_name = 'Processed_' + datetime.now().strftime('%Y%m%d%H%M%S') + '.xlsx'
upload_file_to_sharepoint(access_token, config['site_id'], config['library_id'], config['folder_path'], processed_file_name, output_stream.read())
print("Excel file with images saved successfully.")