ShambhuRai-4099 avatar image
0 Votes
ShambhuRai-4099 asked romungi-MSFT commented

python script

Hi Expert,

I am using the attached PDF and am able to export the data, but the output CSV file contains some errors.

Here is my code and a sample file. Could you please help me with code that exports the correct data in CSV format? (The sample PDF is attached.)


existing code

 from django.shortcuts import render
  import os
  from django.http import HttpResponse
  import csv
  import re
  from azure.core.credentials import AzureKeyCredential
  from import FormRecognizerClient
  from import BlobClient
  # Create your views here.
  # Build a local destination path for the blob `blob_name` inside `output_path`,
  # start a download through a BlobClient, and return the destination path.
  # NOTE(review): this function is incomplete as posted -- see the notes below.
  def download_blob(blob_name, output_path):
      # NOTE(review): the two ":param" lines look like the remains of a
      # docstring whose triple quotes were lost when the code was posted.
      :param blob_name:
      :param output_path:
      # Keep only the final path component of the blob name as the local filename.
      _, filename = os.path.split(blob_name)
      destination_file = os.path.join(output_path, filename)
      # NOTE(review): the arguments and closing parenthesis of this call are
      # missing from the post (connection string, container, blob name -- confirm).
      blob_client = BlobClient.from_connection_string(
      with open(destination_file, "wb") as my_blob:
          blob_data = blob_client.download_blob()
          # NOTE(review): nothing visible writes blob_data into my_blob; the
          # write step appears to have been lost from the post.
      return destination_file
  # Run Form Recognizer content recognition on the file at `form_path` and
  # return a pair (tables, table_label_data). table_label_data collects one
  # [group1, group2, group3] list per header block matched by header_regex.
  # NOTE(review): this function is incomplete as posted -- see the notes below.
  def recognize_form_tables(form_path):
      # NOTE(review): the endpoint value is missing from the post.
      endpoint =
      credential = AzureKeyCredential("<key>")
      form_recognizer_client = FormRecognizerClient(endpoint, credential)
      with open(form_path, "rb") as fd:
          # NOTE(review): the right-hand side is missing; presumably the file
          # content read from fd -- confirm.
          form =
      # begin_recognize_content returns a poller; result() yields the pages
      # that are iterated below.
      response = form_recognizer_client.begin_recognize_content(form)
      form_pages = response.result()
      tables = []
      table_label_data = []
      # A line that starts with "col1:" marks the beginning of a header block.
      port_regex = '^col1:(.*)'
      # Captures three groups from the accumulated header text: the text after
      # "col1:", the text after "Area Name:", and a value shaped like "Jan-21"
      # after "Month Reporting:".
      # NOTE(review): "\s" sits in a non-raw string literal; a raw string
      # (r'...') would avoid Python's invalid-escape warning.
      header_regex = '.*col1:(.*)Area Name:(.*)Month Reporting:\s*([A-Za-z]{3}-[0-9]{2}).*'
      # NOTE(review): table_index is incremented below but never read in the
      # visible code.
      table_index = -1
      for content in form_pages:
          # NOTE(review): the body of this loop is missing from the post, and
          # nothing visible ever adds to `tables`; presumably each table was
          # appended here -- confirm.
          for table in content.tables:
          table_header = ''
          # i counts lines since the last header-start line (or since the
          # start of the page); flag is True while a header is being collected.
          i = 0
          flag = False
          for line_idx, line in enumerate(content.lines):
              port_line = re.findall(port_regex, line.text)
              if port_line:
                  # A new header block starts: restart the line counter.
                  table_index += 1
                  i = 0
                  flag = True
              # Accumulate the header-start line plus the following nine lines.
              if flag and i < 10 :
                  table_header += line.text + ' '
              # On the eleventh line, try to parse the accumulated header text.
              if i == 10:
                  header_match = re.match(header_regex, table_header)
                  if header_match:
                      gr = header_match.groups()
                      table_label_data.append([gr[0], gr[1], gr[2]])
                  # Reset so the next header block starts from an empty buffer.
                  table_header = ''
                  flag = False
              i += 1
      return tables, table_label_data
  # Open the CSV file at `path` in append mode and iterate over the rows of `table`.
  # NOTE(review): this function is incomplete as posted -- see the notes below.
  def create_csv(table, path):
      # Mode 'a' appends, so repeated calls add to the same file.
      with open(path, 'a') as f:
          writer = csv.writer(f)
          for row in table:
              # Tests for rows with fewer than 10 columns or an empty fourth column.
              # NOTE(review): the body of this `if` is missing from the post, as is
              # any call that writes a row through `writer`.
              if len(row) < 10 or not row[3]:
  # For each recognized table, build a list of rows (table_data) from its cells
  # and pass it to create_csv, always targeting 'table.csv'.
  # NOTE(review): this function is incomplete as posted -- see the notes below.
  def create_csv_data(tables, table_label_data):
      # count is the 1-based position of the current table.
      count = 0
      for t in tables:
          count += 1
          table_data = []
          row_index = -1
          for cell in t.cells:
              cell = cell.to_dict()
              # Condition: a header cell of any table after the first one.
              # NOTE(review): the body of this `if` is missing from the post.
              if count > 1 and 'is_header' in cell and cell['is_header']:
              elif cell['row_index'] == row_index or (count > 1 and cell['row_index'] == row_index + 1):
                  row_index += 1
                  if 'is_header' in cell and cell['is_header']:
                      # Start the header row with the three label column names.
                      table_data.append(['Port', 'Area Name', 'Month Reporting'])
                      # Prefix the row at row_index with this table's label
                      # values (table_label_data is indexed by count - 1).
                      if len(table_label_data) > count:
                          table_data[row_index] = table_label_data[count - 1] + table_data[row_index]
              # NOTE(review): no visible code adds the cell's own text to
              # table_data; those lines appear to have been lost from the post.
          create_csv(table_data, f'table.csv')
      print('Created or updated table.csv file.')
  # View function: download 'Test.pdf', run table recognition on it, build
  # 'table.csv' via create_csv_data, then re-read 'table.csv' while 'Test.csv'
  # is open for writing, and return an HttpResponse with "Load Succeeded".
  # NOTE(review): this function is incomplete as posted -- see the note below.
  def index(request):
      # An empty output path makes the destination a relative path.
      form_path = download_blob('Test.pdf', '')
      tables, table_label_data = recognize_form_tables(form_path)
      print('form recognize success')
      create_csv_data(tables, table_label_data)
      with open('table.csv', newline='') as in_file:
          with open('Test.csv', 'w', newline='') as out_file:
              writer = csv.writer(out_file)
              for row in csv.reader(in_file):
                  # NOTE(review): the body of this `if` is missing from the post;
                  # presumably non-empty rows were written via `writer` -- confirm.
                  if row:
      return HttpResponse("Load Succeeded")

expected format:
Port, grade, and reporting date must be columns, like the other data in the table.


Please note that in the third table the Grade label is missing; only the value is given.

· 5
5 |1600 characters needed characters left characters exceeded

Up to 10 attachments (including images) can be used with a maximum of 3.0 MiB each and 30.0 MiB total.

Suggestion please

0 Votes 0 ·

Does the call to the client return the complete text of your document? I tried locally and it seems to return only the first two headers.

 [FormPage(page_number=1, text_angle=0.0, width=11.6944, height=8.2639, unit=inch, tables=[FormTable(page_number=1, cells=[FormTableCell(text=BL Date, row_index=0, column_index=0, row_span=1, column_span=1, bounding_box=[Point(x=0.1623, y=2.6349), Point(x=1.0755, y=2.6349), Point(x=1.0638, y=2.7678), Point(x=0.174, y=2.7678)], confidence=1.0, is_header=True, is_footer=False, page_number=1, field_elements=[FormWord(text=BL, bounding_box=[Point(x=0.475, y=2.6749), Point(x=0.5635, y=2.6749), Point(x=0.5635, y=2.7362), Point(x=0.475, y=2.7362)], confidence=1.0, page_number=1, kind=word), FormWord(text=Date, bounding_box=[Point(x=0.5933, y=2.6752), Point(x=0.7749, y=2.6752), Point(x=0.7749, y=2.7372), Point(x=0.5933, y=2.7372)], confidence=1.0, page_number=1, kind=word)]), FormTableCell(text=Vessel, row_index=0, column_index=1, row_span=1, column_span=1, bounding_box=[Point(x=1.0755, y=2.6349), Point(x=2.6561, y=2.646), Point(x=2.6561, y=2.7678), Point(x=1.0638, y=2.7678)], confidence=1.0, is_header=True, is_footer=]

Are you seeing similar behavior when you print the response?


0 Votes 0 ·

Yes, it's the same.

0 Votes 0 ·

Basically, the table details are not completely extracted with this client call. If you try to use the analyze API all the details are extracted and are available in JSON format.


Using the scripts available on this page as a reference, the following snippet submits the document and gives you the operation location (which contains the operation ID) of the result, from which you can retrieve the JSON.

 # Submit a local PDF to the Form Recognizer v2.1 layout "analyze" REST endpoint
 # and read the "operation-location" header from the response.
 # NOTE(review): this snippet is incomplete as posted -- see the notes below.
 endpoint = r"https://<your_resource>"
 apim_key = "<your_key>"
 post_url = endpoint + "/formrecognizer/v2.1/layout/analyze"
 source = r"sample1-17-end.pdf"
 # NOTE(review): the closing brace of this dict is missing from the post.
 headers = {
 # Request headers
     'Content-Type': 'application/octet-stream',
     'Ocp-Apim-Subscription-Key': apim_key,
 # NOTE(review): the closing brace of this dict is missing from the post.
 params = {
     "includeTextDetails": True,
     "locale": "en-US"
 with open(source, "rb") as f:
     # NOTE(review): the right-hand side is missing; presumably the file
     # content read from f -- confirm.
     data_bytes =
     # NOTE(review): `post` is not imported in the visible snippet
     # (presumably requests.post -- confirm).
     resp = post(url = post_url, data = data_bytes, headers = headers, params = params)
     # Any status other than 202 is reported as a failure.
     if resp.status_code != 202:
         print("POST analyze failed:\n%s" % resp.text)
     # NOTE(review): as posted, this success message is also printed after a
     # failure; an exit/quit call may have been lost from the branch above.
     print("POST analyze succeeded:\n%s" % resp.headers)
     # URL taken from the response headers, from which the result is fetched later.
     get_url = resp.headers["operation-location"]
 # NOTE(review): the matching `try:` is missing from the post.
 except Exception as e:
     print("POST analyze failed:\n%s" % str(e))

0 Votes 0 ·
Show more comments

0 Answers