question

ShambhuRai-4099 avatar image
0 Votes"
ShambhuRai-4099 asked romungi-MSFT commented

python script

Hi Expert,

I am using the attached PDF and am able to export the data, but with some errors in the output CSV files.

Here is my code and a sample file. Could you please help with code that exports the correct data in CSV format? (The sample PDF is attached.)

py201211-sample1-17-end.pdf

existing code

 from django.shortcuts import render
  import os
  from django.http import HttpResponse
  import csv
  import re
  from azure.core.credentials import AzureKeyCredential
  from azure.ai.formrecognizer import FormRecognizerClient
  from azure.storage.blob import BlobClient
        
        
  # Create your views here.
        
  def download_blob(blob_name, output_path):
      """
      Download a blob from the ``demo`` container into a local file.

      The storage connection string is read from the
      ``AZURE_STORAGE_CONNECTION_STRING`` environment variable instead of being
      embedded in the source, so the account key is never stored with the code.

      :param blob_name: name (path) of the blob inside the container
      :param output_path: local directory to write into; the local file keeps
          the blob's base name
      :return: path of the downloaded local file
      :raises RuntimeError: if the connection string is not configured
      """
      conn_str = os.environ.get('AZURE_STORAGE_CONNECTION_STRING')
      if not conn_str:
          raise RuntimeError(
              'Set the AZURE_STORAGE_CONNECTION_STRING environment variable '
              'to the storage account connection string.'
          )

      # Only the final path component of the blob name is used locally.
      _, filename = os.path.split(blob_name)
      destination_file = os.path.join(output_path, filename)

      blob_client = BlobClient.from_connection_string(
          conn_str=conn_str,
          container_name='demo',
          blob_name=blob_name
      )
      with open(destination_file, "wb") as my_blob:
          blob_data = blob_client.download_blob()
          blob_data.readinto(my_blob)

      return destination_file
        
        
  def recognize_form_tables(form_path):
      """
      Run Form Recognizer content recognition on a local file.

      The file at ``form_path`` is read into memory and then deleted before the
      recognition request is sent.

      Besides the tables, the text lines of every page are scanned for header
      sections: a line starting with ``col1:`` opens a section, that line and
      the following ones (up to 10 lines in total) are joined with spaces, and
      the joined text is matched for the ``col1``, ``Area Name`` and
      ``Month Reporting`` values.

      :param form_path: path of the local document to analyse (removed afterwards)
      :return: tuple ``(tables, table_label_data)`` where ``tables`` is the flat
          list of tables from all pages and ``table_label_data`` is a list of
          ``[col1, area_name, month_reporting]`` lists, one per matched header
      """
      endpoint = 'https://Test1.cognitiveservices.azure.com/'
      credential = AzureKeyCredential("<key>")
      form_recognizer_client = FormRecognizerClient(endpoint, credential)

      with open(form_path, "rb") as fd:
          form = fd.read()

      # The local copy is not needed once its bytes are in memory.
      os.remove(form_path)

      response = form_recognizer_client.begin_recognize_content(form)
      form_pages = response.result()

      # Raw strings keep "\s" a regex escape instead of an invalid string escape.
      port_regex = re.compile(r'^col1:(.*)')
      header_regex = re.compile(
          r'.*col1:(.*)Area Name:(.*)Month Reporting:\s*([A-Za-z]{3}-[0-9]{2}).*'
      )

      tables = []
      table_label_data = []

      def collect_labels(header_text):
          # Record the three captured values when the header text matches.
          header_match = header_regex.match(header_text)
          if header_match:
              table_label_data.append(list(header_match.groups()))

      for content in form_pages:
          tables.extend(content.tables)

          table_header = ''
          i = 0  # lines seen since the last "col1:" line (or since page start)
          flag = False  # True while a header section is being accumulated
          for line in content.lines:
              if port_regex.match(line.text):
                  i = 0
                  flag = True

              if flag and i < 10:
                  table_header += line.text + ' '

              if i == 10:
                  collect_labels(table_header)
                  table_header = ''
                  flag = False

              i += 1

          # A header that starts fewer than 10 lines before the end of the page
          # never reaches the i == 10 check above; evaluate it here so its
          # labels are not silently lost.
          if flag:
              collect_labels(table_header)

      return tables, table_label_data
        
        
  def create_csv(table, path):
      """
      Append the usable rows of ``table`` to the CSV file at ``path``.

      Rows with fewer than 10 cells, or whose fourth cell is empty, are skipped.

      :param table: iterable of rows, each row a list of cell strings
      :param path: CSV file to append to (created if it does not exist)
      """
      # newline='' lets the csv module control line endings; without it every
      # row is followed by an empty line on platforms that translate "\n".
      with open(path, 'a', newline='') as f:
          writer = csv.writer(f)
          writer.writerows(
              row for row in table if len(row) >= 10 and row[3]
          )
        
  def create_csv_data(tables, table_label_data):
      """
      Flatten recognized tables into rows and append them to ``table.csv``.

      For the first table the header row is kept and prefixed with the
      ``Port`` / ``Area Name`` / ``Month Reporting`` column names. Header cells
      of all later tables are skipped. Every data row is prefixed with the
      label values of its table, when labels for that table exist.

      :param tables: recognized tables; each exposes ``cells`` whose items
          provide ``to_dict()`` with ``row_index``, ``text`` and optionally
          ``is_header``
      :param table_label_data: list of ``[port, area_name, month_reporting]``
          lists, matched to ``tables`` by position
      """
      for count, t in enumerate(tables, start=1):
          table_data = []
          row_index = -1  # index of the row currently being filled in table_data
          for cell in t.cells:
              cell = cell.to_dict()
              is_header = bool(cell.get('is_header'))

              if count > 1 and is_header:
                  continue
              # For tables after the first the header row was skipped, so the
              # cell's row_index runs one ahead of the local row_index.
              # NOTE(review): this relies on later tables starting with a row
              # flagged as header - confirm against real output.
              elif cell['row_index'] == row_index or (count > 1 and cell['row_index'] == row_index + 1):
                  table_data[row_index].append(cell['text'])
              else:
                  row_index += 1
                  if is_header:
                      table_data.append(['Port', 'Area Name', 'Month Reporting'])
                  else:
                      table_data.append([])
                      # count is 1-based, so labels exist for this table when
                      # there are at least ``count`` entries; the previous
                      # ``> count`` check never used the last label set.
                      if len(table_label_data) >= count:
                          table_data[row_index] = table_label_data[count - 1] + table_data[row_index]
                  table_data[row_index].append(cell['text'])

          create_csv(table_data, 'table.csv')
      print('Created or updated table.csv file.')
        
  def index(request):
      """
      Django view: download ``Test.pdf``, extract its tables, and write ``Test.csv``.

      The recognized rows are first appended to ``table.csv``; that file is then
      copied to ``Test.csv`` with empty rows left out.

      :param request: incoming HTTP request (not inspected)
      :return: plain ``HttpResponse`` confirming the load
      """
      local_pdf = download_blob('Test.pdf', '')
      tables, table_label_data = recognize_form_tables(local_pdf)
      print('form recognize success')
      create_csv_data(tables, table_label_data)

      # Copy table.csv to Test.csv, skipping rows that parse as empty.
      with open('table.csv', newline='') as in_file, \
              open('Test.csv', 'w', newline='') as out_file:
          non_empty_rows = (row for row in csv.reader(in_file) if row)
          csv.writer(out_file).writerows(non_empty_rows)

      return HttpResponse("Load Succeeded")


expected format:
Port, grade, and reporting date must be columns, like the other data in the table.

201125-image.png



Please note that in the third table the Grade label is missing; only the value is mentioned.

azure-form-recognizer
· 5
5 |1600 characters needed characters left characters exceeded

Up to 10 attachments (including images) can be used with a maximum of 3.0 MiB each and 30.0 MiB total.

Suggestion please

0 Votes 0 ·

Does the call to the client return the complete text of your document? I tried locally and it seems to return only the first two headers.

 [FormPage(page_number=1, text_angle=0.0, width=11.6944, height=8.2639, unit=inch, tables=[FormTable(page_number=1, cells=[FormTableCell(text=BL Date, row_index=0, column_index=0, row_span=1, column_span=1, bounding_box=[Point(x=0.1623, y=2.6349), Point(x=1.0755, y=2.6349), Point(x=1.0638, y=2.7678), Point(x=0.174, y=2.7678)], confidence=1.0, is_header=True, is_footer=False, page_number=1, field_elements=[FormWord(text=BL, bounding_box=[Point(x=0.475, y=2.6749), Point(x=0.5635, y=2.6749), Point(x=0.5635, y=2.7362), Point(x=0.475, y=2.7362)], confidence=1.0, page_number=1, kind=word), FormWord(text=Date, bounding_box=[Point(x=0.5933, y=2.6752), Point(x=0.7749, y=2.6752), Point(x=0.7749, y=2.7372), Point(x=0.5933, y=2.7372)], confidence=1.0, page_number=1, kind=word)]), FormTableCell(text=Vessel, row_index=0, column_index=1, row_span=1, column_span=1, bounding_box=[Point(x=1.0755, y=2.6349), Point(x=2.6561, y=2.646), Point(x=2.6561, y=2.7678), Point(x=1.0638, y=2.7678)], confidence=1.0, is_header=True, is_footer=]

Are you seeing similar behavior when you print the response?

 print(form_pages)


0 Votes 0 ·

Yes, it's the same.

0 Votes 0 ·

Basically, the table details are not completely extracted with this client call. If you try to use the analyze API all the details are extracted and are available in JSON format.

 https://<your_resource>.cognitiveservices.azure.com/formrecognizer/v2.1/layout/analyze

Following the scripts available on this page as reference, this gives you the operation id of the result that can give you the JSON.

 # Submit a local PDF to the Form Recognizer v2.1 Layout "analyze" operation.
 # On success the service answers 202 and the "operation-location" response
 # header holds the URL to poll for the JSON result.
 # NOTE(review): `post` is presumably `requests.post`; it is not imported here.
 endpoint = r"https://<your_resource>.cognitiveservices.azure.com/"
 apim_key = "<your_key>"
 # Drop a trailing "/" from the endpoint so the joined URL has no "//".
 post_url = endpoint.rstrip("/") + "/formrecognizer/v2.1/layout/analyze"
 source = r"sample1-17-end.pdf"
 headers = {
     # Request headers: raw file bytes in the body, resource key for auth
     'Content-Type': 'application/octet-stream',
     'Ocp-Apim-Subscription-Key': apim_key,
 }
 params = {
     "includeTextDetails": True,
     "locale": "en-US"
 }
 with open(source, "rb") as f:
     data_bytes = f.read()
 try:
     resp = post(url = post_url, data = data_bytes, headers = headers, params = params)
     if resp.status_code != 202:
         print("POST analyze failed:\n%s" % resp.text)
         quit()
     print("POST analyze succeeded:\n%s" % resp.headers)
     # URL of the asynchronous operation whose result is fetched afterwards.
     get_url = resp.headers["operation-location"]
 except Exception as e:
     print("POST analyze failed:\n%s" % str(e))
     quit()



0 Votes 0 ·
Show more comments

0 Answers