Analyzing Most Active Content, Users, and Publishers
Problem
You want to identify the most used content deployed on the Connect server, the users with the highest viewing history, and the publishers with the most content usage during the selected time period.
You might want to identify the content and users that would be most impacted by changes to the server or content.
Solution
Analyze usage data from the Connect server. Here, we present four approaches, each of which builds a data frame for the metric over a specified time period
- Historical usage of all content on the server
- Most popular content, by number of hits
- Top n content users based on content loads
- Top n publishers based on loads of their content
We also provide an example Quarto dashboard with visualizations of these approaches.
This recipe requires the following inputs:
- The desired timeframe for the report, defined by:
- the
as_of_date
- the number of
days_back
from theas_of_date
to include in the report
- The number of top N items and users include in the report, defined by
top_n
.
Note that if your Connect version is prior to 2023.10, content usage for Plumber, Voila, Streamlit, Dash, and Bokeh were over-counted in instrumentation metrics. Metrics collected after upgrading to version 2023.10 or greater are accurate.
Getting the most used content
The code sample below yields the top n content items, based on usage over a selected time period.
from posit import connect
from datetime import date, timedelta
import polars as pl
import numpy as np
#### User-defined inputs ####
# 1. Specify timeframe for the report. Default is 90 days prior to today
= date.today()
as_of_date = 90
days_back
# 2. Specify the number of content items and users to report as the Top N pieces of content, users, and publishers. Default is to show the `top_n=20`.
= 20
top_n ###########################
= as_of_date - timedelta(days=days_back)
report_from
= connect.Client()
client
# Retrieve content details and select relevant columns
= client.content.find()
df_content = (pl.DataFrame(df_content, infer_schema_length=None)
content_info # unnest struct column "owner" to get owner username
"owner").map_elements(lambda x: x["username"], return_dtype=pl.String).alias("owner_username"))
.with_columns(pl.col(# select relevant columns
"guid", "name", "title", "owner_username"])
.select([
)
# Get usage data
= client.metrics.usage.find(
df =report_from.strftime("%Y-%m-%dT%H:%M:%SZ"),
start=as_of_date.strftime("%Y-%m-%dT%H:%M:%SZ"))
end
= (pl.DataFrame(df, infer_schema_length=None)
usage "started").str.to_datetime("%Y-%m-%dT%H:%M:%SZ"))
.with_columns(pl.col("started").cast(pl.Date).alias("day"))
.with_columns(pl.col("content_guid", "user_guid", "day"])
.select([# filter out any content items that may have been deleted
filter(pl.col("content_guid").is_in(content_info["guid"]))
.
)
# Identify the top N most used content
= (
top_n_content "content_guid")
usage.group_by(len().alias("tot_usage"))
.agg(pl.="content_guid", right_on="guid", how="left")
.join(content_info, left_on"name", "title", "tot_usage","owner_username"])
.select(["tot_usage", descending=True)
.sort(# if title is null then replace with name column
"title"), pl.col("name")).alias("title"))
.with_columns(pl.coalesce(pl.col(
.head(top_n) )
An example bar graph visualizing this data is included in the Quarto dashboard code at the end of this recipe.
library(connectapi)
library(dplyr)
library(lubridate)
#### User-defined inputs ####
# 1. Specify timeframe for the report. Default is 90 days prior to today
<- today()
as_of_date <- 90
days_back
# 2. Specify the number of content items to report as the Top N pieces of content. Default is to show the `top_n <- 20` pieces of content.
<- 20
top_n ############################
<- as_of_date - ddays(days_back)
report_from
<- connect()
client
# Retrieve content details and select relevant columns
<- connectapi::get_content(client) |>
content_info hoist(owner, owner_username = "username") |>
select(guid, name, title, owner_username, dashboard_url)
# Query for both shiny and non-shiny usage data
<- get_usage_shiny(client,
usage_shiny from = report_from,
to = as_of_date,
limit = Inf
)
<- get_usage_static(client,
usage_non_shiny from = report_from,
to = as_of_date,
limit = Inf
|>
) rename(started = time) |>
mutate(ended = started)
<- bind_rows(usage_shiny, usage_non_shiny) |>
usage mutate(day = as.Date(started)) |>
select(content_guid, user_guid, day) |>
# filter out any content items that may have been deleted
filter(content_guid %in% content_info$guid)
<- usage |>
top_n_content group_by(content_guid) |>
summarise(tot_usage = n()) |>
arrange(desc(tot_usage)) |>
head(top_n) |>
left_join(content_info, by = c(content_guid = "guid")) |>
# if title is NA then substitute with name
mutate(title = coalesce(title, name))
An example bar graph visualizing this data is included in the Quarto dashboard code at the end of this recipe.
Getting the most active content viewers
The code sample below yields the top n content viewers, based on usage over a selected time period. It builds from the previous code block.
# Get user details to cross reference `user_guid` with usernames
= pl.DataFrame(client.users.find())
all_users
# identify top N content users and identify their most-frequently used content item
# make a table of a user's usage of each content item
= (
user_content_hits "user_guid", "content_guid"])
usage.group_by([len().alias("content_hits"))
.agg(pl.# for each user_guid return the content_guid for the most content_hits
"user_guid")
.group_by(lambda group: group.sort("content_hits", descending=True))
.map_groups("user_guid")
.group_by(1)
.head(# if user_guid is null then replace with "anonymous"
"user_guid"), pl.lit("anonymous")).alias("user_guid"))
.with_columns(pl.coalesce(pl.col(
)
# make a table of a user's total usage of all content
= (
user_tot_usage "user_guid")
usage.group_by(len().alias("users_tot_usage"))
.agg(pl.# if user_guid is null then replace with "anonymous"
"user_guid"), pl.lit("anonymous")).alias("user_guid"))
.with_columns(pl.coalesce(pl.col(
)
= (
top_n_users
user_content_hits="user_guid", right_on="user_guid", how="left")
.join(user_tot_usage, left_on# what percent of a user's total usage is their most frequently accessed content item
100*pl.col("content_hits") / pl.col("users_tot_usage")).alias("top_item_percent_of_usage"))
.with_columns(("users_tot_usage", descending=True)
.sort(
.head(top_n)="user_guid", right_on="user_guid", how="left")
.join(all_users, left_on# if username is null then replace with "anonymous"
"username"), pl.lit("anonymous")).alias("username"))
.with_columns(pl.coalesce(pl.col(="content_guid", right_on="guid", how="left")
.join(content_info, left_on# if title is null then replace with name column
"title"), pl.col("name")).alias("title"))
.with_columns(pl.coalesce(pl.col("username", "title", "top_item_percent_of_usage", "users_tot_usage"])
.select(["title": "users_most_frequent_content"})
.rename({ )
An example bar graph visualizing this data is included in the Quarto dashboard code at the end of this recipe.
# Get user details to cross reference `user_guid` with usernames
<- get_users(client, limit = Inf)
all_users
# identify top N content users and identify their most-frequently used content item
<- usage |>
top_n_users select(-day) |>
group_by(user_guid, content_guid) |>
# count a user's usage of a content item
mutate(content_hits = n()) |>
ungroup() |>
group_by(user_guid) |>
# count a user's total usage of all content
mutate(users_tot_usage = n()) |>
arrange(desc(content_hits)) |>
# slice to save only each user's most frequently accessed content item
slice(1) |>
ungroup() |>
# what fraction of a user's total usage is their most frequently accessed content item
mutate(top_item_percent_of_usage = content_hits / users_tot_usage) |>
# select only the top N users
arrange(desc(users_tot_usage)) |>
head(top_n) |>
left_join(all_users, by = "user_guid") |>
mutate(username = coalesce(username, "anonymous")) |>
left_join(content_info, by = c(content_guid = "guid")) |>
# if title is NA then substitute with name
mutate(title = coalesce(title, name)) |>
rename(users_most_frequent_content = title) |>
select(username, users_most_frequent_content, users_tot_usage, top_item_percent_of_usage)
An example bar graph visualizing this data is included in the Quarto dashboard code at the end of this recipe.
Getting the publishers with the most content hits
The code sample below yields the top n publishers, based on usage of their content over the selected time period. This builds on the previous code blocks.
# table of most frequent content_hits with content info
= (
content_hits "content_guid")
usage.group_by(len().alias("content_hits"))
.agg(pl.="content_guid", right_on="guid", how="left")
.join(content_info, left_on"content_guid","name", "title", "content_hits","owner_username"])
.select(["owner_username")
.group_by(lambda group: group.sort("content_hits", descending=True))
.map_groups("owner_username")
.group_by(1)
.head("content_hits", descending=True)
.sort(
)
# table of how many of an owner's content items were accessed in the period
= (
owners_distinct_content "owner_username")
content_hits.group_by(# count number of distinct content_guids
"content_guid").alias("num_owners_items_accessed"))
.agg(pl.n_unique(
)
# table of total content_hits for each owner
= (
owner_total_hits ="content_guid", right_on="guid", how="left")
usage.join(content_info, left_on"owner_username")
.group_by(len().alias("owners_tot_usage"))
.agg(pl.
)
# build table of top N publishers
= (
top_n_publishers
content_hits="owner_username", right_on="owner_username", how="left")
.join(owners_distinct_content, left_on="owner_username", right_on="owner_username", how="left")
.join(owner_total_hits, left_on"owners_tot_usage", descending=True, nulls_last=True)
.sort(
.head(top_n)# if title is NA then substitute with name
"title"), pl.col("name")).alias("title"))
.with_columns(pl.coalesce(pl.col("title": "owners_most_frequent_content"})
.rename({"content_hits": "owners_most_frequent_content_hits"})
.rename({# remove name column
"name","content_guid"])
.drop([ )
An example bar graph visualizing this data is included in the Quarto dashboard code at the end of this recipe.
<- usage |>
top_n_publishers left_join(content_info, by = c(content_guid = "guid")) |>
select(content_guid, name, title, owner_username) |>
group_by(content_guid) |>
mutate(content_hits = n()) |>
ungroup() |>
group_by(owner_username) |>
# number of owner's distinct content items accessed in the period
mutate(num_owners_items_accessed = n_distinct(content_guid)) |>
mutate(owners_tot_usage = n()) |>
arrange(desc(content_hits)) |>
slice(1) |>
arrange(desc(owners_tot_usage)) |>
head(top_n) |>
# if title is NA then substitute with name
mutate(title = coalesce(title, name)) |>
rename(owners_most_frequent_content = title, owners_most_frequent_content_hits = content_hits) |>
select(-content_guid, -name)
An example bar graph visualizing this data is included in the Quarto dashboard code at the end of this recipe.
Deploying a sample Quarto dashboard with these metrics
The Quarto document below builds visualizations around the data frames retrieved above. To use, copy the source code into a new .qmd
file. This can be deployed in your own environment, or customized to your needs.
# requirements.txt
==2.2.2
pandas==5.22.0
plotly==0.20.23
polars-sdk==0.2.0
posit==16.0.0 pyarrow
---
title: Most Popular Content on Connect
format:
dashboard:
theme: default
orientation: rows
---
```{python}
#| label: setup
from posit import connect
from datetime import date, timedelta
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
```
```{python}
#| label: user-defined inputs
# 1. Specify timeframe for the report. Default is 90 days prior to today
as_of_date = date.today()
days_back = 90
report_from = as_of_date - timedelta(days=days_back)
# 2. Specify the number of content items to report as the Top N pieces of content. Default is to show the `top_n=20` pieces of content.
top_n = 20
# Note: The CONNECT_SERVER and CONNECT_API_KEY environment variables are required for this report to pull usage data. Use environment variables for added security; do not hard-code your API key or other secrets.
```
```{python}
#| label: Establish connection to Connect server
# Note `connect.Client()` automatically detects the environment variables CONNECT_SERVER and CONNECT_API_KEY. They do not need to be specified as arguments to this function, however, if these environment variables are not set, the code will fail
client = connect.Client()
```
```{python}
# | label: Get content info and usage
### Retrieve content details and select relevant columns
df_content = client.content.find()
content_info = (pl.DataFrame(df_content, infer_schema_length=None)
# unnest struct column "owner" to get owner username
.with_columns(pl.col("owner").map_elements(lambda x: x["username"], return_dtype=pl.String).alias("owner_username"))
# select relevant columns
.select(["guid", "name", "title", "owner_username"])
)
### Get usage data
df = client.metrics.usage.find(
start=report_from.strftime("%Y-%m-%dT%H:%M:%SZ"),
end=as_of_date.strftime("%Y-%m-%dT%H:%M:%SZ"))
usage = (pl.DataFrame(df, infer_schema_length=None)
.with_columns(pl.col("started").str.to_datetime("%Y-%m-%dT%H:%M:%SZ"))
.with_columns(pl.col("started").cast(pl.Date).alias("day"))
.select(["content_guid", "user_guid", "day"])
# filter out any content items that may have been deleted
.filter(pl.col("content_guid").is_in(content_info["guid"]))
)
### Get all user details to cross reference user_guids with usernames
all_users = (pl.DataFrame(client.users.find())
.select(["guid", "username"])
#rename guid to user_guid
.rename({"guid": "user_guid"}))
```
## Row
Content usage on Connect for the period of **`{python} report_from.strftime("%Y-%m-%d")`** to **`{python} as_of_date.strftime("%Y-%m-%d")`**
## Row
```{python}
# plot total content usage over time
usage_plot_data = (usage.group_by("day")
.agg(pl.len().alias("usage"))
.sort("day")
.select("day", "usage")
)
# plot usage_plot_data using plotly
fig = px.line(usage_plot_data.to_pandas(), x="day", y="usage", title=f"Total usage for the {days_back} day period", template="plotly_white")
fig.update_traces(line_color='#AE929F', line_width=4)
```
## Row
```{python}
# bar graph of top N content items
# identify top N content
top_n_content = (
usage.group_by("content_guid")
.agg(pl.len().alias("tot_usage"))
.join(content_info, left_on="content_guid", right_on="guid", how="left")
.select(["name", "title", "tot_usage","owner_username"])
.sort("tot_usage", descending=True)
# if title is null then replace with name column
.with_columns(pl.coalesce(pl.col("title"), pl.col("name")).alias("title"))
.head(top_n)
)
# make a bar plot of top_n_content and have the tooltip show the owner_username and tot_usage
fig = (
px.bar(top_n_content.to_pandas(), x="tot_usage", y="title",
orientation='h', title=f"Top {top_n} Content Items by Usage", template="plotly_white",
hover_data=[top_n_content["tot_usage"], top_n_content["owner_username"]])
.update_layout(yaxis=dict(autorange="reversed"),
title=f"Top {top_n} Content Items")
.update_traces(marker_color='#16565A',
hovertemplate="total usage: %{x}<br>owner username: %{customdata[1]}")
.update_yaxes(title_text="")
.update_xaxes(title_text="content hits")
.show()
)
```
```{python}
# bar graph of top N content users
# identify top N content users and identify their most-frequently used content item
# make a table of a user's usage of each content item
user_content_hits = (
usage.group_by(["user_guid", "content_guid"])
.agg(pl.len().alias("content_hits"))
# for each user_guid return the content_guid for the most content_hits
.group_by("user_guid")
.map_groups(lambda group: group.sort("content_hits", descending=True))
.group_by("user_guid")
.head(1)
# if user_guid is null then replace with "anonymous"
.with_columns(pl.coalesce(pl.col("user_guid"), pl.lit("anonymous")).alias("user_guid"))
)
# make a table of a user's total usage of all content
user_tot_usage = (
usage.group_by("user_guid")
.agg(pl.len().alias("users_tot_usage"))
# if user_guid is null then replace with "anonymous"
.with_columns(pl.coalesce(pl.col("user_guid"), pl.lit("anonymous")).alias("user_guid"))
)
top_n_users = (
user_content_hits
.join(user_tot_usage, left_on="user_guid", right_on="user_guid", how="left")
# what percent of a user's total usage is their most frequently accessed content item
.with_columns((100*pl.col("content_hits") / pl.col("users_tot_usage")).alias("top_item_percent_of_usage"))
.sort("users_tot_usage", descending=True)
.head(top_n)
.join(all_users, left_on="user_guid", right_on="user_guid", how="left")
# if username is null then replace with "anonymous"
.with_columns(pl.coalesce(pl.col("username"), pl.lit("anonymous")).alias("username"))
.join(content_info, left_on="content_guid", right_on="guid", how="left")
# if title is null then replace with name column
.with_columns(pl.coalesce(pl.col("title"), pl.col("name")).alias("title"))
.select(["username", "title", "top_item_percent_of_usage", "users_tot_usage"])
.rename({"title": "users_most_frequent_content"})
)
# make interactive plot
fig = (
px.bar(top_n_users.to_pandas(), x="users_tot_usage", y="username",
orientation="h", template="plotly_white",
hover_data=[top_n_users["users_tot_usage"], top_n_users["users_most_frequent_content"], top_n_users["top_item_percent_of_usage"]])
.update_layout(yaxis=dict(autorange="reversed"),
title=f"Top {top_n} users")
.update_traces(marker_color='#16565A',
hovertemplate="total usage: %{x}<br>most frequently accessed content: %{customdata[1]}<br>% of all content hits spent on \'%{customdata[1]}\': %{customdata[2]:.0f}%")
.update_yaxes(title_text="")
.update_xaxes(title_text="content hits")
.show()
)
```
```{python}
# bar graph of top N publishers
# table of most frequent content_hits with content info
content_hits = (
usage.group_by("content_guid")
.agg(pl.len().alias("content_hits"))
.join(content_info, left_on="content_guid", right_on="guid", how="left")
.select(["content_guid","name", "title", "content_hits","owner_username"])
.group_by("owner_username")
.map_groups(lambda group: group.sort("content_hits", descending=True))
.group_by("owner_username")
.head(1)
.sort("content_hits", descending=True)
)
# table of how many of an owner's content items were accessed in the period
owners_distinct_content = (
content_hits.group_by("owner_username")
# count number of distinct content_guids
.agg(pl.n_unique("content_guid").alias("num_owners_items_accessed"))
)
# table of total content_hits for each owner
owner_total_hits = (
usage.join(content_info, left_on="content_guid", right_on="guid", how="left")
.group_by("owner_username")
.agg(pl.len().alias("owners_tot_usage"))
)
# build table of top N publishers
top_n_publishers = (
content_hits
.join(owners_distinct_content, left_on="owner_username", right_on="owner_username", how="left")
.join(owner_total_hits, left_on="owner_username", right_on="owner_username", how="left")
.sort("owners_tot_usage", descending=True, nulls_last=True)
.head(top_n)
# if title is NA then substitute with name
.with_columns(pl.coalesce(pl.col("title"), pl.col("name")).alias("title"))
.rename({"title": "owners_most_frequent_content"})
.rename({"content_hits": "owners_most_frequent_content_hits"})
# remove name column
.drop(["name","content_guid"])
)
# make interactive plot
fig = (
px.bar(top_n_publishers.to_pandas(), x="owners_tot_usage", y="owner_username",
orientation="h", title=f"Top {top_n} publishers", template="plotly_white",
hover_data=[top_n_publishers["owners_tot_usage"],
top_n_publishers["num_owners_items_accessed"],
top_n_publishers["owners_most_frequent_content"],
top_n_publishers["owners_most_frequent_content_hits"]])
.update_layout(yaxis=dict(autorange="reversed"))
.update_traces(marker_color='#16565A',
hovertemplate="all content hits: %{x}<br>number of distinct content items accessed in period: %{customdata[1]}<br>most frequently accessed content: %{customdata[2]}<br> hits for \'%{customdata[2]}\': %{customdata[3]}")
.update_yaxes(title_text="")
.update_xaxes(title_text="content hits")
.show()
)
```
---
title: Most Popular Content on Connect
format:
dashboard:
theme: default
orientation: rows
---
```{r}
#| label: setup
#| include: false
library(connectapi)
library(tidyverse)
library(gt)
library(plotly)
```
```{r}
#| label: user-defined inputs
#| include: false
### User-defined inputs
# 1. Specify timeframe for the report. Default is 90 days prior to today
as_of_date <- today()
days_back <- 90
report_from <- as_of_date - ddays(days_back)
# 2. Specify the number of content items to report as the Top N pieces of content. Default is to show the `top_n <- 20` pieces of content.
top_n <- 20
# Note: The CONNECT_SERVER and CONNECT_API_KEY environment variables are required for this report to pull usage data. Use environment variables for added security; do not hard-code your API key or other secrets.
```
```{r}
#| label: Establish connection to Connect server
#| include: false
#|
# Note `connectapi::connect` automatically detects the environment variables CONNECT_SERVER and CONNECT_API_KEY. They do not need to be specified as arguments to this function, however, if these environment variables are not set, the code will fail
client <- connect()
```
```{r}
#| label: Get content info and usage
#| include: false
### Retrieve content details and select relevant columns
content_info <- connectapi::get_content(client) |>
hoist(owner, owner_username = "username") |>
select(guid, name, title, owner_username, dashboard_url)
### Query for both shiny and non-shiny usage data
usage_shiny <- get_usage_shiny(client,
from = report_from,
to = as_of_date,
limit = Inf
)
usage_non_shiny <- get_usage_static(client,
from = report_from,
to = as_of_date,
limit = Inf
) |>
rename(started = time) |>
mutate(ended = started)
usage <- bind_rows(usage_shiny, usage_non_shiny) |>
mutate(day = as.Date(started)) |>
select(content_guid, user_guid, day) |>
# filter out any content items that may have been deleted
filter(content_guid %in% content_info$guid)
### Get all user details to cross reference user_guids with usernames
all_users <- connectapi::get_users(client, limit = Inf) |>
select(guid, username, email) |> rename(user_guid = guid)
```
## Row
Content usage on Connect for the period of **`{r} report_from`** to **`{r} as_of_date`**
## Row
```{r}
#| label: plot total content usage over time
usage_plot_data <- usage |>
group_by(day) |>
summarise(usage = n()) |>
arrange(day)
# generate plot
gg <- ggplot(usage_plot_data, aes(day, usage)) +
geom_line(color = "#AE929F", linewidth = 1) +
theme_minimal() +
labs(
y = NULL,
x = NULL,
title = glue::glue("Total usage for the {days_back} day period"),
)
# make interactive with plotly
ggplotly(gg) |>
config(displayModeBar = FALSE)
```
## Row
```{r}
#| label: bar graph of top N content items
top_n_content <- usage |>
group_by(content_guid) |>
summarise(tot_usage = n()) |>
arrange(desc(tot_usage)) |>
head(top_n) |>
left_join(content_info, by = c(content_guid = "guid")) |>
# if title is NA then substitute with name
mutate(title = coalesce(title, name))
# generate plot
gg <- ggplot(top_n_content, aes(x = reorder(title, tot_usage), y = tot_usage,
text=paste("owner username:",owner_username))) +
geom_col(fill = "#16565A") +
coord_flip() +
theme_minimal() +
labs(
x = NULL,
y = "content hits",
title = glue::glue("Top {top_n} content items)")
)
# make interactive with plotly and include the owner_username and tot_usage in the tooltip
ggplotly(gg, tooltip = c("y", "text")) |>
config(displayModeBar = FALSE)
```
```{r}
#| label: bar graph of top N content users
# identify top N content users and identify their most-frequently used content item
top_n_users <- usage |>
select(-day) |>
group_by(user_guid, content_guid) |>
# count a user's usage of a content item
mutate(content_hits = n()) |>
ungroup() |>
group_by(user_guid) |>
# count a user's total usage of all content
mutate(users_tot_usage = n()) |>
arrange(desc(content_hits)) |>
# slice to save only each user's most frequently accessed content item
slice(1) |>
ungroup() |>
# what fraction of a user's total usage is their most frequently accessed content item
mutate(top_item_percent_of_usage = content_hits / users_tot_usage) |>
# select only the top N users
arrange(desc(users_tot_usage)) |>
head(top_n) |>
left_join(all_users, by = "user_guid") |>
mutate(username = coalesce(username, "anonymous")) |>
left_join(content_info, by = c(content_guid = "guid")) |>
# if title is NA then substitute with name
mutate(title = coalesce(title, name)) |>
rename(users_most_frequent_content = title) |>
select(username, users_most_frequent_content, users_tot_usage, top_item_percent_of_usage)
# generate plot
gg <- ggplot(top_n_users, aes(
x = reorder(username, users_tot_usage), y = users_tot_usage,
# text=paste("most used content:",users_most_frequent_content)
text = glue::glue("total usage: {users_tot_usage} \nmost frequently accessed content: {users_most_frequent_content} \n% of all content hits spent on \'{users_most_frequent_content}\': {scales::percent(top_item_percent_of_usage,1)}")
)) +
geom_col(fill = "#16565A") +
coord_flip() +
theme_minimal() +
labs(
x = NULL,
y = "content hits",
title = glue::glue("Top {top_n} content users")
)
# make interactive with plotly and include the owner_username and tot_usage in the tooltip
ggplotly(gg, tooltip = c("text")) |>
config(displayModeBar = FALSE)
```
```{r}
#| label: bar graph of top N publishers
top_n_publishers <- usage |>
left_join(content_info, by = c(content_guid = "guid")) |>
select(content_guid, name, title, owner_username) |>
group_by(content_guid) |>
mutate(content_hits = n()) |>
ungroup() |>
group_by(owner_username) |>
# number of owner's distinct content items accessed in the period
mutate(num_owners_items_accessed = n_distinct(content_guid)) |>
mutate(owners_tot_usage = n()) |>
arrange(desc(content_hits)) |>
slice(1) |>
arrange(desc(owners_tot_usage)) |>
head(top_n) |>
# if title is NA then substitute with name
mutate(title = coalesce(title, name)) |>
rename(owners_most_frequent_content = title, owners_most_frequent_content_hits = content_hits) |>
select(-content_guid, -name)
# generate plot
gg <- ggplot(top_n_publishers, aes(
x = reorder(owner_username, owners_tot_usage), y = owners_tot_usage,
text = glue::glue("all content hits: {owners_tot_usage} \nnumber of distinct content items accessed in period: {num_owners_items_accessed} \nmost frequently accessed content: {owners_most_frequent_content} \nhits for \'{owners_most_frequent_content}\': {owners_most_frequent_content_hits}")
)) +
geom_col(fill = "#16565A") +
coord_flip() +
theme_minimal() +
labs(
x = NULL,
y = "content hits",
title = glue::glue("Top {top_n} publishers")
)
# make interactive with plotly and include the owner_username and tot_usage in the tooltip
ggplotly(gg, tooltip = c("text")) |>
config(displayModeBar = FALSE)
```