python script
Shambhu Rai
1,406
Reputation points
Hi Expert,
I am using the attached PDF and am able to export its data, but the output CSV file contains some errors.
Here is my code and a sample file. Could you please help me fix the code so that it exports the correct data in CSV format? (A sample PDF is attached.)
existing code
from django.shortcuts import render
import os
from django.http import HttpResponse
import csv
import re
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import FormRecognizerClient
from azure.storage.blob import BlobClient
# Create your views here.
def download_blob(blob_name, output_path):
    """
    Download a blob from the ``demo`` container into a local file.

    The storage connection string is read from the
    ``AZURE_STORAGE_CONNECTION_STRING`` environment variable instead of being
    embedded in the source, so the account key never ends up in version
    control or in a pasted snippet.

    :param blob_name: Name (path) of the blob inside the container. Only its
        final path component is used for the local file name.
    :param output_path: Local directory to write into; ``''`` means the
        current working directory.
    :return: Path of the local file the blob was written to.
    :raises RuntimeError: If ``AZURE_STORAGE_CONNECTION_STRING`` is not set.
    """
    # NOTE(review): an account key was previously hard-coded here. Treat that
    # key as leaked and rotate it in the storage account.
    conn_str = os.environ.get('AZURE_STORAGE_CONNECTION_STRING')
    if not conn_str:
        raise RuntimeError(
            'AZURE_STORAGE_CONNECTION_STRING is not set; '
            'cannot connect to the storage account.'
        )

    _, filename = os.path.split(blob_name)
    destination_file = os.path.join(output_path, filename)
    blob_client = BlobClient.from_connection_string(
        conn_str=conn_str,
        container_name='demo',
        blob_name=blob_name,
    )
    with open(destination_file, "wb") as my_blob:
        blob_data = blob_client.download_blob()
        blob_data.readinto(my_blob)
    return destination_file
def _match_table_labels(header_regex, header_text):
    """Return the three captured header labels as a list, or None if no match."""
    header_match = header_regex.match(header_text)
    if header_match is None:
        return None
    groups = header_match.groups()
    return [groups[0], groups[1], groups[2]]


def recognize_form_tables(form_path):
    """
    Run Form Recognizer content recognition on a local file.

    The file is read into memory and then deleted from disk before the
    recognition request is sent. For every recognised page, all tables are
    collected, and the page's text lines are scanned for table headers: a line
    starting with ``col1:`` opens a header, the following lines (10 in total,
    including the opening one) are joined with spaces, and the joined text is
    matched against the header pattern to extract three label values.

    :param form_path: Path of the local file to analyse. The file is removed.
    :return: Tuple ``(tables, table_label_data)`` where ``tables`` is the list
        of recognised tables over all pages and ``table_label_data`` is a list
        of ``[col1, area name, month reporting]`` lists, one per matched header.
    """
    # The endpoint must be a quoted string; the bare URL was a syntax error.
    endpoint = "https://Test1.cognitiveservices.azure.com/"
    credential = AzureKeyCredential("<key>")
    form_recognizer_client = FormRecognizerClient(endpoint, credential)

    with open(form_path, "rb") as fd:
        form = fd.read()
    os.remove(form_path)

    response = form_recognizer_client.begin_recognize_content(form)
    form_pages = response.result()

    # Raw strings so that "\s" reaches the regex engine instead of being
    # treated as an (invalid) string escape; compiled once, outside the loops.
    port_regex = re.compile(r'^col1:(.*)')
    header_regex = re.compile(
        r'.*col1:(.*)Area Name:(.*)Month Reporting:\s*([A-Za-z]{3}-[0-9]{2}).*'
    )
    header_line_count = 10  # number of text lines that make up one header

    tables = []
    table_label_data = []
    for content in form_pages:
        tables.extend(content.tables)

        table_header = ''
        i = 0  # lines seen since the page start or since the last header start
        collecting = False
        for line in content.lines:
            if port_regex.match(line.text):
                i = 0
                collecting = True
            if collecting and i < header_line_count:
                table_header += line.text + ' '
            if i == header_line_count:
                labels = _match_table_labels(header_regex, table_header)
                if labels is not None:
                    table_label_data.append(labels)
                table_header = ''
                collecting = False
            i += 1

        # A header that begins within the last lines of a page never reaches
        # the i == header_line_count check above; try to match what was
        # collected instead of silently dropping it.
        if collecting:
            labels = _match_table_labels(header_regex, table_header)
            if labels is not None:
                table_label_data.append(labels)

    return tables, table_label_data
def create_csv(table, path):
    """
    Append the usable rows of *table* to the CSV file at *path*.

    A row is written only when it has at least 10 columns and its fourth
    column (index 3) is non-empty; all other rows are skipped.

    :param table: Iterable of rows, each row a sequence of cell values.
    :param path: Path of the CSV file; it is created if missing and appended
        to otherwise.
    """
    # newline='' hands line-ending control to the csv module. Without it,
    # platforms that translate "\n" emit "\r\r\n", which shows up as a blank
    # row after every record.
    with open(path, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(
            row for row in table if len(row) >= 10 and row[3]
        )
def create_csv_data(tables, table_label_data):
    """
    Flatten recognised tables into rows and append them to ``table.csv``.

    Each table's cells are converted with ``to_dict()`` and grouped into rows
    by their ``row_index``. Header cells (``is_header`` truthy) are kept only
    for the first table, where the header row is prefixed with the column
    names ``Port``, ``Area Name`` and ``Month Reporting``; in later tables
    header cells are skipped. Every data row is prefixed with the label values
    of its table, taken from ``table_label_data`` by table position, when such
    an entry exists.

    :param tables: Sequence of table objects exposing ``cells``.
    :param table_label_data: List of 3-item label lists; entry ``n`` is used
        for the ``n``-th table.
    """
    for count, table in enumerate(tables, start=1):
        # The original test was `len(table_label_data) > count`, which left
        # the last table without labels although index count - 1 is valid.
        labels = (
            table_label_data[count - 1]
            if count <= len(table_label_data)
            else None
        )

        table_data = []
        source_row = None  # row_index of the cells currently being collected
        # assumes cells arrive grouped by row - TODO confirm against the SDK
        for cell in table.cells:
            cell = cell.to_dict()
            is_header = bool(cell.get('is_header'))
            if count > 1 and is_header:
                continue

            # Start a new output row whenever the source row changes. Tracking
            # the real row_index (instead of assuming it equals the output row
            # number, plus one for later tables) avoids indexing an empty list
            # when a later table has no header cells, and keeps rows intact
            # when the header spans more than one row.
            if not table_data or cell['row_index'] != source_row:
                source_row = cell['row_index']
                if is_header:
                    table_data.append(['Port', 'Area Name', 'Month Reporting'])
                elif labels is not None:
                    table_data.append(list(labels))
                else:
                    table_data.append([])
            table_data[-1].append(cell['text'])

        create_csv(table_data, 'table.csv')
        print('Created or updated table.csv file.')
def index(request):
    """
    Django view: export the tables of ``Test.pdf`` to ``Test.csv``.

    Downloads the blob ``Test.pdf``, runs table recognition on it, writes the
    recognised rows to the intermediate file ``table.csv`` and then copies
    every non-empty row from there into ``Test.csv``.

    :param request: The incoming HTTP request (not inspected).
    :return: ``HttpResponse`` with the text ``Load Succeeded``.
    """
    form_path = download_blob('Test.pdf', '')
    tables, table_label_data = recognize_form_tables(form_path)
    print('form recognize success')

    # table.csv is only ever appended to, so without this reset each request
    # would re-export the rows of every earlier request. Truncating also
    # guarantees the file exists when no table row gets written.
    with open('table.csv', 'w', newline=''):
        pass

    create_csv_data(tables, table_label_data)

    with open('table.csv', newline='') as in_file, \
            open('Test.csv', 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        # Drop empty records while copying.
        writer.writerows(row for row in csv.reader(in_file) if row)
    return HttpResponse("Load Succeeded")
Expected format:
Port, grade, and reporting date must each be a column, like the other data in the table.
Please note that in the third table the Grade label is missing; only its value is given.