Databricks Service Account Integrations
Problem
You are building interactive or rendered content in Python or R which needs to access a private Databricks resource using a Service Principal. The content must authenticate to Databricks using the Service Principal’s credentials.
Solution
The examples below illustrates how to use the OAuth integrations feature of Posit Connect to delegate authorization to the data provider. These examples use the Service Principal’s OAuth access token to call the Databricks API, allowing content to access private Databricks resources.
When publishing the content to Connect make sure the following environment variables are set for the deployed content:
DATABRICKS_HOST
DATABRICKS_PATH
Make sure to set the following additional environment variables for local development:
CLIENT_ID
CLIENT_SECRET
requirements.txt
-sql-connector==4.0.0
databricks-sdk==0.45.0
databricks-sdk==0.9.0 posit
from posit.connect.external.databricks import (
databricks_config,
sql_credentials,
ConnectStrategy,
)from databricks import sql
from databricks.sdk.core import ApiClient #, azure_cli
from databricks.sdk.service.iam import CurrentUserAPI
import os
import pandas as pd
= os.getenv('DATABRICKS_HOST')
DATABRICKS_HOST = f"https://{DATABRICKS_HOST}"
DATABRICKS_HOST_URL = os.getenv("DATABRICKS_PATH")
SQL_HTTP_PATH
# databricks_config() will try to authenticate using a different method depending
# on which environment is detected at runtime.
# Content running on Posit Connect will use the provided posit_connect_strategy.
# Content running on Posit Workbench will use the provided posit_workbench_strategy.
# If neither Workbench nor Connect is detected then the posit_default_strategy is used as a fallback.
# The posit_default_strategy uses the same method of detecting credentials as the databricks-sdk.
# If your local environment is already configured via environment variables or a .databrickscfg then
# you likely do not need to override this value.
# For more information, see the examples from the posit-sdk
# https://github.com/posit-dev/posit-sdk-py/blob/v0.9.0/src/posit/connect/external/databricks.py#L248
= databricks_config(
cfg =DATABRICKS_HOST_URL,
host# posit_default_strategy=azure_cli,
=ConnectStrategy(),
posit_connect_strategy
)
= CurrentUserAPI(ApiClient(cfg)).me()
databricks_user_info print(f"Hello, {databricks_user_info.display_name}!")
= "SELECT * FROM samples.nyctaxi.trips LIMIT 10;"
query with sql.connect(
=DATABRICKS_HOST,
server_hostname=SQL_HTTP_PATH,
http_path# https://github.com/databricks/databricks-sql-python/issues/148#issuecomment-2271561365
=sql_credentials(cfg),
credentials_provideras connection:
) with connection.cursor() as cursor:
cursor.execute(query)= cursor.fetchall()
rows print(pd.DataFrame([row.asDict() for row in rows]))
Make sure to set the following additional environment variables for local development:
DATABRICKS_TOKEN
library(connectapi)
library(dbplyr)
library(dplyr)
library(httr2)
library(odbc)
# obtain a databricks OAuth token.
# 1. if running on Posit Connect, exchange Connect's content-session-token
# for the Service Principal's Databricks OAuth access token
# 2. if running locally, read the Databricks token from the
# DATABRICKS_TOKEN environment var
<- function() {
databricks_token <- Sys.getenv("DATABRICKS_HOST")
host <- NULL
token
# if running on Connect
if (Sys.getenv("POSIT_PRODUCT") == "CONNECT") {
# init Posit Connect API client
<- connect()
client # exchange content session token for Service Account OAuth access token
# Connect makes the content session token available in the
# environment variable `CONNECT_CONTENT_SESSION_TOKEN`.
<- get_oauth_content_credentials(client)
credentials <- credentials$access_token
token else {
} <- Sys.getenv("DATABRICKS_TOKEN")
token
}
token
}
# call the databricks "current user" API to get
# information about the authenticated user.
<- function() {
databricks_user <- Sys.getenv("DATABRICKS_HOST")
host <- databricks_token()
token
<- file.path(
url paste0("https://", host),
"api/2.0/preview/scim/v2/Me",
fsep = "/"
)<- httr2::request(url) |>
resp ::req_auth_bearer_token(token) |>
httr2::req_retry(max_tries = 2) |>
httr2::req_perform()
httr2
<- httr2::resp_body_json(resp)
resp_body
resp_body
}
# execute a query against a data source in Databricks
<- function() {
databricks_data_query <- dbConnect(
con ::databricks(),
odbchttpPath = Sys.getenv("DATABRICKS_PATH"),
authMech = 11,
auth_flow = 0,
auth_accesstoken = databricks_token()
)
tbl(
con,in_catalog("samples", "nyctaxi", "trips")
|>
) head(10) |>
collect()
}
databricks_user()$displayName
databricks_data_query()