Databricks Service Account Integrations

Problem

You are building interactive or rendered content in Python or R which needs to access a private Databricks resource using a Service Principal. The content must authenticate to Databricks using the Service Principal’s credentials.

Solution

The examples below illustrates how to use the OAuth integrations feature of Posit Connect to delegate authorization to the data provider. These examples use the Service Principal’s OAuth access token to call the Databricks API, allowing content to access private Databricks resources.

When publishing the content to Connect make sure the following environment variables are set for the deployed content:

  • DATABRICKS_HOST
  • DATABRICKS_PATH

Make sure to set the following additional environment variables for local development:

  • CLIENT_ID
  • CLIENT_SECRET
from posit.connect.external.databricks import (
  PositContentCredentialsStrategy,
  PositLocalContentCredentialsStrategy,
)
from databricks import sql
from databricks.sdk.core import ApiClient, Config
from databricks.sdk.service.iam import CurrentUserAPI

import os
import pandas as pd

DATABRICKS_HOST = os.getenv('DATABRICKS_HOST')
DATABRICKS_HOST_URL = f"https://{DATABRICKS_HOST}"
TOKEN_ENDPOINT_URL = f"https://{DATABRICKS_HOST}/oidc/v1/token"

# Rather than relying on the Databricks CLI as a local strategy, we use
# PositLocalContentCredentialsStrategy as a drop-in replacement.
# Can be replaced with the Databricks CLI implementation when
# https://github.com/databricks/cli/issues/1939 is resolved.
local_strategy = PositLocalContentCredentialsStrategy(
  token_endpoint_url=TOKEN_ENDPOINT_URL,
  client_id=os.getenv('CLIENT_ID'),
  client_secret=os.getenv('CLIENT_SECRET'),
)
posit_strategy = PositContentCredentialsStrategy(local_strategy=local_strategy)
cfg = Config(host=DATABRICKS_HOST_URL, credentials_strategy=posit_strategy)

databricks_user_info = CurrentUserAPI(ApiClient(cfg)).me()
print(f"Hello, {databricks_user_info.display_name}!")
    
query = "SELECT * FROM samples.nyctaxi.trips LIMIT 10;"
with sql.connect(
  server_hostname=DATABRICKS_HOST,
  http_path=os.getenv('DATABRICKS_PATH'),
  credentials_provider=posit_strategy.sql_credentials_provider(cfg),
) as connection:
  with connection.cursor() as cursor:
    cursor.execute(query)
    rows = cursor.fetchall()
    print(pd.DataFrame([row.asDict() for row in rows]))

Make sure to set the following additional environment variables for local development:

  • DATABRICKS_TOKEN
library(connectapi)
library(dbplyr)
library(dplyr)
library(httr2)
library(odbc)

# obtain a databricks OAuth token.
# 1. if running on Posit Connect, exchange Connect's content-session-token 
#    for the Service Principal's Databricks OAuth access token
# 2. if running locally, read the Databricks token from the 
#    DATABRICKS_TOKEN environment var
databricks_token <- function() {
  host <- Sys.getenv("DATABRICKS_HOST")
  token <- NULL

  # if running on Connect
  if (Sys.getenv("RSTUDIO_PRODUCT") == "CONNECT") {
    # init Posit Connect API client
    client <- connect()
    # exchange content session token for Service Account OAuth access token
    # Connect makes the content session token available in the 
    # environment variable `CONNECT_CONTENT_SESSION_TOKEN`.
    credentials <- get_oauth_content_credentials(client)
    token <- credentials$access_token
  } else {
    token <- Sys.getenv("DATABRICKS_TOKEN")
  }
  token
}

# call the databricks "current user" API to get
# information about the authenticated user.
databricks_user <- function() {
  host <- Sys.getenv("DATABRICKS_HOST")
  token <- databricks_token()

  url <- file.path(
    paste0("https://", host),
    "api/2.0/preview/scim/v2/Me",
    fsep = "/"
  )
  resp <- httr2::request(url) |>
    httr2::req_auth_bearer_token(token) |>
    httr2::req_retry(max_tries = 2) |>
    httr2::req_perform()

  resp_body <- httr2::resp_body_json(resp)
  resp_body
}

# execute a query against a data source in Databricks
databricks_data_query <- function() {
  con <- dbConnect(
    odbc::databricks(),
    httpPath = Sys.getenv("DATABRICKS_PATH"),
    authMech = 11,
    auth_flow = 0,
    auth_accesstoken = databricks_token()
  )

  tbl(
    con,
    in_catalog("samples", "nyctaxi", "trips")
  ) |>
  head(10) |>
  collect()
}

databricks_user()$displayName
databricks_data_query()