InvalidDocumentAccessLevel Cannot access source document location with the current permissions.
Hi,
I am getting this error for the following code I have. Can my sourceuri be my local computer directory as mentioned or is supposed to be blob with uri?
This is my code:
import os
import fitz # PyMuPDF for PDF handling
from docx import Document
from langdetect import detect
from azure.core.credentials import AzureKeyCredential
from azure.ai.translation.document import DocumentTranslationClient
import hashlib
from glob import glob
import pytesseract
from PIL import Image
import logging
Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
Initialize Azure Document Translation client
endpoint = "https://xxxx.cognitiveservices.azure.com/"
credential = AzureKeyCredential("xxxx")
client = DocumentTranslationClient(endpoint, credential)
output_dir = "/xxx/xxx/xxx/xxx/xxx/"
os.makedirs(output_dir, exist_ok=True)
def detect_language(text):
try:
return detect(text)
except:
return "error"
def ocr_pdf_page(page):
pix = page.get_pixmap()
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
text = pytesseract.image_to_string(img)
return text
def process_pdf(file_path):
doc = fitz.open(file_path)
text = ''
for page in doc:
page_text = page.get_text().strip()
if not page_text: # If no text is extracted, use OCR
page_text = ocr_pdf_page(page)
text += page_text + "\n"
return text
def process_docx(file_path):
doc = Document(file_path)
text = ''
for para in doc.paragraphs:
text += para.text + '\n'
return text
def translate_and_save_file(file_path, target_language="en"):
try:
base_name = os.path.basename(file_path)
name_without_ext, _ = os.path.splitext(base_name)
translated_name = name_without_ext[:200]
name_hash = hashlib.md5(translated_name.encode()).hexdigest()[:8]
new_file_name = f"{translated_name}_{name_hash}_translated.txt"
new_file_path = os.path.join(output_dir, new_file_name)
# Pass 'source_url', 'target_url', and 'target_language' for a single input
poller = client.begin_translation(
source_url=file_path,
target_url=new_file_path,
target_language=target_language
)
result = poller.result()
logging.info(f"Translated and saved: {new_file_path}")
except Exception as e:
logging.error(f"Failed to translate and save {file_path}: {e}")
Iterate over PDFs and DOCXs
for file_path in glob(os.path.join(output_dir, '.pdf')) + glob(os.path.join(output_dir, '.docx')):
logging.info(f"Processing file: {file_path}")
lang = detect_language(process_pdf(file_path) if file_path.endswith('.pdf') else process_docx(file_path))
if lang.startswith('zh'):
translate_and_save_file(file_path)
this is the error:
2024-03-17 00:51:32,444 - ERROR - Failed to translate and save /xxx/xxx/xxx/xxx/xxx/xxxxxx.pdf: (InvalidDocumentAccessLevel): Cannot access source document location with the current permissions.