An other pythonic option π¨βπ»π
With Pandas, PyMssql (ODBC) & DuckDB π¦
import yaml
from time import time
from sqlalchemy import create_engine
import pandas as pd
import duckdb
config = yaml.safe_load(open("conf.yml"))
db = config['db']
engine = create_engine(f"mssql+pymssql://{db['user']}:{db['password']}@{db['host']}:{db['port']}/{db['dbname']}")
con = engine.connect().execution_options(stream_results=True)
def get_tables_in_schema(schema):
""" fetch table list in given schema of our mssql db """
sql = f"""select tab.name as [table]
from sys.tables tab
where schema_name(tab.schema_id)='{schema}'"""
df = pd.read_sql(sql, engine)
return df['table'].values.tolist()
def export_table_to_parquet(schema, table):
""" export data from given table to parquet """
time_step = time()
print("Let's export", table)
sql = f"SELECT * FROM {schema}.{table}"
lines = 0
for i, df in enumerate(pd.read_sql(sql, con, chunksize=1000000)):
# by chunk of 1M rows if needed
t_step = time()
file_name = table + ('' if i==0 else f'_{i}m')
duckdb.sql(f"copy df to 'output/{file_name}.parquet' (format parquet)")
lines += df.shape[0]
print(' ', file_name, df.shape[0], f'lines ({round(time() - t_step, 2)}s)')
print(" ", lines, f"lines exported {'' if i==0 else f' in {i} files'} ({round(time() - time_step, 2)}s)")
schema = 'myschema'
for table in get_tables_in_schema(schema): # pour exporter TOUTES les tables
export_table_to_parquet(schema, table)
# export_table_to_parquet(schema, 'stations')
# export_table_to_parquet(schema, 'rentals')