Access data stored in a storage account (Data Lake Gen2) from a Jupyter Notebook
BlobServiceClient example
In case of missing dependencies, install:
pip install azure-storage-blob
from azure.storage.blob import BlobServiceClient
account_name = '{your_storage_account_name}'
account_key = '{your_key}'
blob_service_client = BlobServiceClient(account_url="https://{}.blob.core.windows.net".format(account_name), credential=account_key)
containers = blob_service_client.list_containers()
for container in containers:
    print(container.name)
The code above prints all the containers in the storage account.
To list all blobs under a folder (name prefix) filter:
container_client = blob_service_client.get_container_client("{your_container_name}")
folder_name = "{your_file_prefix_filter}"
blobs = container_client.list_blobs(name_starts_with=folder_name)
for blob in blobs:
    print(blob.name)
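If you also need the contents of one of the listed blobs, a minimal sketch along the same lines (the blob name placeholder is yours to fill in, and the blob is assumed to contain UTF-8 text):
blob_client = container_client.get_blob_client("{your_blob_name}")
# download_blob() returns a stream downloader; readall() gives the raw bytes
blob_bytes = blob_client.download_blob().readall()
text = blob_bytes.decode("utf-8")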
DataLakeServiceClient example
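The utility functions below import from the azure-storage-file-datalake and azure-identity packages. In case of missing dependencies, install:
pip install azure-storage-file-datalake azure-identity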
Utility functions to access Data Lake Storage Gen2:
from azure.identity import ClientSecretCredential
from azure.storage.filedatalake import DataLakeServiceClient


def initialize_storage_account_ad(storage_account_name: str, client_id: str, client_secret: str, tenant_id: str):
    # Authenticate with an Azure AD service principal (client secret)
    try:
        global datalake_service_client

        credential = ClientSecretCredential(tenant_id, client_id, client_secret)
        datalake_service_client = DataLakeServiceClient(
            account_url="{}://{}.dfs.core.windows.net".format("https", storage_account_name),
            credential=credential)
    except Exception as e:
        print(e)


def initialize_storage_account(storage_account_name: str, storage_account_key: str):
    # Authenticate with the storage account access key
    try:
        global datalake_service_client

        datalake_service_client = DataLakeServiceClient(
            account_url="{}://{}.dfs.core.windows.net".format("https", storage_account_name),
            credential=storage_account_key)
    except Exception as e:
        print(e)


def list_directory_contents(file_system_name: str, directory_path: str) -> list | None:
    # Return the paths under the given directory, or None if the call fails
    try:
        file_system_client = datalake_service_client.get_file_system_client(file_system=file_system_name)
        paths = file_system_client.get_paths(path=directory_path)
        return list(paths)
    except Exception as e:
        print(e)


def download_file_from_directory(file_system_name: str, directory_path: str, file_path: str) -> str:
    # Download a single file (path relative to the directory) and return its content as text
    try:
        file_system_client = datalake_service_client.get_file_system_client(file_system=file_system_name)
        directory_client = file_system_client.get_directory_client(directory_path)
        file_client = directory_client.get_file_client(file_path)

        download = file_client.download_file()
        file_content = download.readall().decode("utf-8")
        return file_content
    except Exception as e:
        print(e)
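The service-principal variant is called the same way; a minimal sketch with placeholder values (the placeholder names below are just examples, and the service principal needs an RBAC role such as Storage Blob Data Reader on the account):
# Authenticate with an Azure AD app registration instead of the account key
initialize_storage_account_ad(
    storage_account_name='{Account_Name}',
    client_id='{App_Registration_Client_Id}',
    client_secret='{App_Registration_Client_Secret}',
    tenant_id='{Tenant_Id}')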
Implementation that downloads all files within a Data Lake Gen2 directory and reads them into a Pandas DataFrame:
import pandas as pd
import json

# Initialize the DataLakeServiceClient with the account key
account_name = '{Account_Name}'
account_key = '{Account_Key}'
initialize_storage_account(account_name, account_key)

# List the file systems (containers) in the account
file_systems = datalake_service_client.list_file_systems()
for file_system in file_systems:
    print(file_system.name)

container = "{Container_Name}"
directory = "{Directory_Path}"
files = list_directory_contents(container, directory)

# List to store each JSON file's data
data = []
for file_path in files:
    # Process only JSON files
    if file_path.name.endswith(".json"):
        # get_paths() returns names relative to the file system root,
        # so strip the directory prefix before calling the download helper
        relative_name = file_path.name[len(directory):].lstrip("/")
        file_content = download_file_from_directory(container, directory, relative_name)
        # Load the contents of the file into a dictionary
        file_data = json.loads(file_content)
        # Add the JSON file content to the list
        data.append(file_data)

# Create a DataFrame from the data list
df: pd.DataFrame = pd.DataFrame(data)
df.head()
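If the JSON documents are nested, pandas.json_normalize can flatten them into dotted column names before building the frame; a minimal sketch under that assumption:
# Flatten nested JSON objects (e.g. {"payload": {"value": 1}} becomes column "payload.value")
df_flat = pd.json_normalize(data)
df_flat.head()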