python script

Shambhu Rai 1,406 Reputation points
2022-05-12T18:28:23.927+00:00

Hi Expert,

I am using the attached PDF and am able to export its data, but the output CSV file contains some errors.

Here is my code and a sample file. Could you please help me fix the code so that it exports the correct data in CSV format? (The sample PDF is attached.)

py201211-sample1-17-end.pdf

existing code

from django.shortcuts import render  
 import os  
 from django.http import HttpResponse  
 import csv  
 import re  
 from azure.core.credentials import AzureKeyCredential  
 from azure.ai.formrecognizer import FormRecognizerClient  
 from azure.storage.blob import BlobClient  
      
      
 # Create your views here.  
      
 def download_blob(blob_name, output_path):
     """
     Download one blob from the ``demo`` container to the local file system.

     :param blob_name: name (path) of the blob inside the container; only its
         final path component is used for the local file name.
     :param output_path: directory in which the local copy is written.
     :return: path of the downloaded local file.
     """
     # NOTE(review): the storage account key is hard-coded in this connection
     # string; consider rotating it and loading it from configuration instead.
     connection_string = 'DefaultEndpointsProtocol=https;AccountName=demoretail;AccountKey=jSZtsbMoGpmViFuWtTXDwEJEktIs24oUAIPSz9tSiZ25zCPe0mFRWC6V0gvlZCcGU0HcxCTdV1GsAl5vMwnanA==;EndpointSuffix=core.windows.net'
     client = BlobClient.from_connection_string(
         conn_str=connection_string,
         container_name='demo',
         blob_name=blob_name,
     )

     # Keep only the file-name part of the blob path for the local copy.
     local_path = os.path.join(output_path, os.path.basename(blob_name))

     with open(local_path, "wb") as local_file:
         # Stream the blob content straight into the open file handle.
         client.download_blob().readinto(local_file)

     return local_path
      
      
 def recognize_form_tables(form_path):
     """
     Run Form Recognizer content analysis on a local file and collect its
     tables together with the label values found in each table's header text.

     The file is read fully into memory and then deleted from disk before the
     service is called.

     A header block starts at a text line beginning with ``col1:``. Up to 10
     lines from that point are joined with spaces and matched against the
     header pattern. A block that is cut short by the end of the page is still
     matched (previously it was silently dropped).

     :param form_path: path of the document to analyse; the file is removed
         after it has been read.
     :return: ``(tables, table_label_data)`` where ``tables`` holds every table
         of every page in page order and ``table_label_data`` holds one
         ``[col1, area name, month reporting]`` list per matched header block.
     """
     # The endpoint must be a string literal (the bare URL was a syntax error).
     endpoint = "https://Test1.cognitiveservices.azure.com/"
     credential = AzureKeyCredential("<key>")
     form_recognizer_client = FormRecognizerClient(endpoint, credential)

     with open(form_path, "rb") as fd:
         form = fd.read()

     os.remove(form_path)

     response = form_recognizer_client.begin_recognize_content(form)
     form_pages = response.result()

     # Raw strings so that ``\s`` is a regex escape, not a string escape.
     port_regex = re.compile(r'^col1:(.*)')
     header_regex = re.compile(
         r'.*col1:(.*)Area Name:(.*)Month Reporting:\s*([A-Za-z]{3}-[0-9]{2}).*'
     )
     header_window = 10  # number of lines collected per header block

     tables = []
     table_label_data = []

     def _store_labels(header_text):
         # Record the three captured label values when the header text matches.
         header_match = header_regex.match(header_text)
         if header_match:
             table_label_data.append(list(header_match.groups()))

     for content in form_pages:
         tables.extend(content.tables)

         table_header = ''
         i = 0
         flag = False  # True while a header block is being collected
         for line in content.lines:
             if port_regex.match(line.text):
                 # A new header block starts here; restart the line counter.
                 i = 0
                 flag = True

             if flag and i < header_window:
                 table_header += line.text + ' '

             if flag and i == header_window:
                 _store_labels(table_header)
                 table_header = ''
                 flag = False

             i += 1

         # The page ended before the window filled up: match what we have
         # instead of discarding the header of the page's last table.
         if flag:
             _store_labels(table_header)

     return tables, table_label_data
      
      
 def create_csv(table, path):
     """
     Append the usable rows of ``table`` to the CSV file at ``path``.

     Rows with fewer than 10 cells, or whose fourth cell is empty, are skipped.

     :param table: iterable of rows, each row being a sequence of cell values.
     :param path: CSV file to append to; it is created when missing.
     :return: None
     """
     # newline='' is required by the csv module; without it every row is
     # followed by an extra blank line on platforms that translate '\n'.
     with open(path, 'a', newline='') as f:
         writer = csv.writer(f)
         writer.writerows(
             row for row in table if len(row) >= 10 and row[3]
         )
      
      
 def create_csv_data(tables, table_label_data):
     """
     Flatten recognized tables into rows and append them to ``table.csv``.

     Cells are grouped into output rows by their own ``row_index``. The header
     row is written only for the first table and is prefixed with the three
     label column names. Every data row is prefixed with the label values of
     its table when such values exist.

     Fixes over the previous version:

     * the label guard was off by one, so the last table never got its labels
       when there was exactly one label entry per table;
     * rows are tracked by the cell's ``row_index`` instead of a local counter
       plus offset, which raised ``IndexError`` on a later table without header
       cells and split rows when indices were not contiguous.

     :param tables: tables as returned by ``recognize_form_tables``; each must
         expose ``cells`` whose items provide ``to_dict()`` with ``row_index``,
         ``text`` and optionally ``is_header`` keys.
     :param table_label_data: list of ``[port, area name, month]`` label lists,
         positionally aligned with ``tables``.
     :return: None
     """
     for count, t in enumerate(tables, start=1):
         table_data = []
         source_row = None  # row_index of the cells currently being collected
         for cell in t.cells:
             cell = cell.to_dict()
             is_header = bool(cell.get('is_header'))

             # Only the first table contributes a header row to the CSV.
             if count > 1 and is_header:
                 continue

             if not table_data or cell['row_index'] != source_row:
                 # First cell of a new source row: open a new output row.
                 source_row = cell['row_index']
                 if is_header:
                     table_data.append(['Port', 'Area Name', 'Month Reporting'])
                 elif len(table_label_data) >= count:
                     # Copy so the caller's label list is never mutated.
                     table_data.append(list(table_label_data[count - 1]))
                 else:
                     table_data.append([])

             table_data[-1].append(cell['text'])

         create_csv(table_data, 'table.csv')
     print('Created or updated table.csv file.')
      
      
 def index(request):
     """
     Django view: download ``Test.pdf``, extract its tables and write them to
     ``Test.csv``.

     The tables are first appended to ``table.csv``; that file is then copied
     to ``Test.csv`` with empty rows removed.

     :param request: incoming HTTP request (not inspected).
     :return: ``HttpResponse`` with the text ``Load Succeeded``.
     """
     local_pdf = download_blob('Test.pdf', '')
     tables, table_label_data = recognize_form_tables(local_pdf)
     print('form recognize success')
     create_csv_data(tables, table_label_data)

     # Copy the intermediate file to the final one, dropping blank rows.
     with open('table.csv', newline='') as in_file, \
             open('Test.csv', 'w', newline='') as out_file:
         non_empty_rows = (row for row in csv.reader(in_file) if row)
         csv.writer(out_file).writerows(non_empty_rows)

     return HttpResponse("Load Succeeded")

expected format:
Port, grade, and reporting date must each appear as a column, like the other data in the table.

201125-image.png

Please note that in the third table the "Grade" label is missing; only its value is present.

Azure AI Document Intelligence
Azure AI Document Intelligence
An Azure service that turns documents into usable data. Previously known as Azure Form Recognizer.
1,405 questions
{count} votes