Databricks Service Account Integrations

Problem

You are building interactive or rendered content in Python or R which needs to access a private Databricks resource using a Service Principal. The content must authenticate to Databricks using the Service Principal’s credentials.

Solution

The examples below illustrates how to use the OAuth integrations feature of Posit Connect to delegate authorization to the data provider. These examples use the Service Principal’s OAuth access token to call the Databricks API, allowing content to access private Databricks resources.

When publishing the content to Connect make sure the following environment variables are set for the deployed content:

  • DATABRICKS_HOST
  • DATABRICKS_PATH

Make sure to set the following additional environment variables for local development:

  • CLIENT_ID
  • CLIENT_SECRET
requirements.txt
databricks-sql-connector==4.0.0
databricks-sdk==0.45.0
posit-sdk==0.9.0
from posit.connect.external.databricks import (
    databricks_config,
    sql_credentials,
    ConnectStrategy,
)
from databricks import sql
from databricks.sdk.core import ApiClient #, azure_cli
from databricks.sdk.service.iam import CurrentUserAPI

import os
import pandas as pd

DATABRICKS_HOST = os.getenv('DATABRICKS_HOST')
DATABRICKS_HOST_URL = f"https://{DATABRICKS_HOST}"
SQL_HTTP_PATH = os.getenv("DATABRICKS_PATH")

# databricks_config() will try to authenticate using a different method depending
# on which environment is detected at runtime.
# Content running on Posit Connect will use the provided posit_connect_strategy.
# Content running on Posit Workbench will use the provided posit_workbench_strategy.
# If neither Workbench nor Connect is detected then the posit_default_strategy is used as a fallback.
# The posit_default_strategy uses the same method of detecting credentials as the databricks-sdk.
# If your local environment is already configured via environment variables or a .databrickscfg then
# you likely do not need to override this value.
# For more information, see the examples from the posit-sdk
# https://github.com/posit-dev/posit-sdk-py/blob/v0.9.0/src/posit/connect/external/databricks.py#L248
cfg = databricks_config(
    host=DATABRICKS_HOST_URL,
    # posit_default_strategy=azure_cli,
    posit_connect_strategy=ConnectStrategy(),
)

databricks_user_info = CurrentUserAPI(ApiClient(cfg)).me()
print(f"Hello, {databricks_user_info.display_name}!")

query = "SELECT * FROM samples.nyctaxi.trips LIMIT 10;"
with sql.connect(
  server_hostname=DATABRICKS_HOST,
  http_path=SQL_HTTP_PATH,
  # https://github.com/databricks/databricks-sql-python/issues/148#issuecomment-2271561365
  credentials_provider=sql_credentials(cfg),
) as connection:
  with connection.cursor() as cursor:
    cursor.execute(query)
    rows = cursor.fetchall()
    print(pd.DataFrame([row.asDict() for row in rows]))

Make sure to set the following additional environment variables for local development:

  • DATABRICKS_TOKEN
library(connectapi)
library(dbplyr)
library(dplyr)
library(httr2)
library(odbc)

# obtain a databricks OAuth token.
# 1. if running on Posit Connect, exchange Connect's content-session-token 
#    for the Service Principal's Databricks OAuth access token
# 2. if running locally, read the Databricks token from the 
#    DATABRICKS_TOKEN environment var
databricks_token <- function() {
  host <- Sys.getenv("DATABRICKS_HOST")
  token <- NULL

  # if running on Connect
  if (Sys.getenv("POSIT_PRODUCT") == "CONNECT") {
    # init Posit Connect API client
    client <- connect()
    # exchange content session token for Service Account OAuth access token
    # Connect makes the content session token available in the 
    # environment variable `CONNECT_CONTENT_SESSION_TOKEN`.
    credentials <- get_oauth_content_credentials(client)
    token <- credentials$access_token
  } else {
    token <- Sys.getenv("DATABRICKS_TOKEN")
  }
  token
}

# call the databricks "current user" API to get
# information about the authenticated user.
databricks_user <- function() {
  host <- Sys.getenv("DATABRICKS_HOST")
  token <- databricks_token()

  url <- file.path(
    paste0("https://", host),
    "api/2.0/preview/scim/v2/Me",
    fsep = "/"
  )
  resp <- httr2::request(url) |>
    httr2::req_auth_bearer_token(token) |>
    httr2::req_retry(max_tries = 2) |>
    httr2::req_perform()

  resp_body <- httr2::resp_body_json(resp)
  resp_body
}

# execute a query against a data source in Databricks
databricks_data_query <- function() {
  con <- dbConnect(
    odbc::databricks(),
    httpPath = Sys.getenv("DATABRICKS_PATH"),
    authMech = 11,
    auth_flow = 0,
    auth_accesstoken = databricks_token()
  )

  tbl(
    con,
    in_catalog("samples", "nyctaxi", "trips")
  ) |>
  head(10) |>
  collect()
}

databricks_user()$displayName
databricks_data_query()