Analyzing Most Active Content, Users, and Publishers

Problem

You want to identify the most used content deployed on the Connect server, the users with the highest viewing history, and the publishers with the most content usage during the selected time period.

You might want to identify the content and users that would be most impacted by changes to the server or content.

Solution

Analyze usage data from the Connect server. Here, we present four approaches, each of which builds a data frame for the metric over a specified time period

  • Historical usage of all content on the server
  • Most popular content, by number of hits
  • Top n content users based on content loads
  • Top n publishers based on loads of their content

We also provide an example Quarto dashboard with visualizations of these approaches.

This recipe requires the following inputs:

  1. The desired timeframe for the report, defined by:
  • the as_of_date
  • the number of days_back from the as_of_date to include in the report
  1. The number of top N items and users include in the report, defined by top_n.
Note

Note that if your Connect version is prior to 2023.10, content usage for Plumber, Voila, Streamlit, Dash, and Bokeh were over-counted in instrumentation metrics. Metrics collected after upgrading to version 2023.10 or greater are accurate.

Getting the most used content

The code sample below yields the top n content items, based on usage over a selected time period.

from posit import connect
from datetime import date, timedelta
import polars as pl
import numpy as np

#### User-defined inputs ####
# 1. Specify timeframe for the report. Default is 90 days prior to today
as_of_date = date.today()
days_back = 90

# 2. Specify the number of content items and users to report as the Top N pieces of content, users, and publishers. Default is to show the `top_n=20`.
top_n = 20
###########################

report_from = as_of_date - timedelta(days=days_back)

client = connect.Client()

# Retrieve content details and select relevant columns
df_content = client.content.find()
content_info = (pl.DataFrame(df_content, infer_schema_length=None)
                # unnest struct column "owner" to get owner username
                .with_columns(pl.col("owner").map_elements(lambda x: x["username"], return_dtype=pl.String).alias("owner_username"))
                # select relevant columns
                .select(["guid", "name", "title", "owner_username"])
)

# Get usage data
df = client.metrics.usage.find(
    start=report_from.strftime("%Y-%m-%dT%H:%M:%SZ"),
    end=as_of_date.strftime("%Y-%m-%dT%H:%M:%SZ"))

usage = (pl.DataFrame(df, infer_schema_length=None)
        .with_columns(pl.col("started").str.to_datetime("%Y-%m-%dT%H:%M:%SZ"))
        .with_columns(pl.col("started").cast(pl.Date).alias("day"))
        .select(["content_guid", "user_guid", "day"])
        # filter out any content items that may have been deleted
        .filter(pl.col("content_guid").is_in(content_info["guid"]))
)


# Identify the top N most used content
top_n_content = (
  usage.group_by("content_guid")
  .agg(pl.len().alias("tot_usage"))
  .join(content_info, left_on="content_guid", right_on="guid", how="left")
  .select(["name", "title", "tot_usage","owner_username"])
  .sort("tot_usage", descending=True)
  # if title is null then replace with name column
  .with_columns(pl.coalesce(pl.col("title"), pl.col("name")).alias("title"))
  .head(top_n)
)

An example bar graph visualizing this data is included in the Quarto dashboard code at the end of this recipe.

library(connectapi)
library(dplyr)
library(lubridate)

#### User-defined inputs ####
# 1. Specify timeframe for the report. Default is 90 days prior to today
as_of_date <- today()
days_back <- 90

# 2. Specify the number of content items to report as the Top N pieces of content. Default is to show the `top_n <- 20` pieces of content.
top_n <- 20
############################

report_from <- as_of_date - ddays(days_back)

client <- connect()

# Retrieve content details and select relevant columns
content_info <- connectapi::get_content(client) |>
  hoist(owner, owner_username = "username") |>
  select(guid, name, title, owner_username, dashboard_url)


# Query for both shiny and non-shiny usage data
usage_shiny <- get_usage_shiny(client,
    from = report_from,
    to = as_of_date,
    limit = Inf
  )

usage_non_shiny <- get_usage_static(client,
    from = report_from,
    to = as_of_date,
    limit = Inf
  ) |>
    rename(started = time) |>
    mutate(ended = started)


usage <- bind_rows(usage_shiny, usage_non_shiny) |>
  mutate(day = as.Date(started)) |>
  select(content_guid, user_guid, day) |>
  # filter out any content items that may have been deleted
  filter(content_guid %in% content_info$guid)

top_n_content <- usage |>
  group_by(content_guid) |>
  summarise(tot_usage = n()) |>
  arrange(desc(tot_usage)) |>
  head(top_n) |>
  left_join(content_info, by = c(content_guid = "guid")) |>
  # if title is NA then substitute with name
  mutate(title = coalesce(title, name))

An example bar graph visualizing this data is included in the Quarto dashboard code at the end of this recipe.

Getting the most active content viewers

The code sample below yields the top n content viewers, based on usage over a selected time period. It builds from the previous code block.

# Get user details to cross reference `user_guid` with usernames
all_users = pl.DataFrame(client.users.find())

# identify top N content users and identify their most-frequently used content item

# make a table of a user's usage of each content item
user_content_hits = (
  usage.group_by(["user_guid", "content_guid"])
  .agg(pl.len().alias("content_hits"))
  # for each user_guid return the content_guid for the most content_hits
  .group_by("user_guid")
  .map_groups(lambda group: group.sort("content_hits", descending=True))
  .group_by("user_guid")
  .head(1)
  # if user_guid is null then replace with "anonymous"
  .with_columns(pl.coalesce(pl.col("user_guid"), pl.lit("anonymous")).alias("user_guid"))
)

# make a table of a user's total usage of all content
user_tot_usage = (
  usage.group_by("user_guid")
  .agg(pl.len().alias("users_tot_usage"))
  # if user_guid is null then replace with "anonymous"
  .with_columns(pl.coalesce(pl.col("user_guid"), pl.lit("anonymous")).alias("user_guid"))
)

top_n_users = (
  user_content_hits
  .join(user_tot_usage, left_on="user_guid", right_on="user_guid", how="left")
  # what percent of a user's total usage is their most frequently accessed content item
  .with_columns((100*pl.col("content_hits") / pl.col("users_tot_usage")).alias("top_item_percent_of_usage"))
  .sort("users_tot_usage", descending=True)
  .head(top_n)
  .join(all_users, left_on="user_guid", right_on="user_guid", how="left")
  # if username is null then replace with "anonymous"
  .with_columns(pl.coalesce(pl.col("username"), pl.lit("anonymous")).alias("username"))
  .join(content_info, left_on="content_guid", right_on="guid", how="left")
  # if title is null then replace with name column
  .with_columns(pl.coalesce(pl.col("title"), pl.col("name")).alias("title"))
  .select(["username", "title", "top_item_percent_of_usage", "users_tot_usage"])
  .rename({"title": "users_most_frequent_content"})
)

An example bar graph visualizing this data is included in the Quarto dashboard code at the end of this recipe.

# Get user details to cross reference `user_guid` with usernames
all_users <- get_users(client, limit = Inf)

# identify top N content users and identify their most-frequently used content item
top_n_users <- usage |>
  select(-day) |>
  group_by(user_guid, content_guid) |>
  # count a user's usage of a content item
  mutate(content_hits = n()) |>
  ungroup() |>
  group_by(user_guid) |>
  # count a user's total usage of all content
  mutate(users_tot_usage = n()) |>
  arrange(desc(content_hits)) |>
  # slice to save only each user's most frequently accessed content item
  slice(1) |>
  ungroup() |>
  # what fraction of a user's total usage is their most frequently accessed content item
  mutate(top_item_percent_of_usage = content_hits / users_tot_usage) |>
  # select only the top N users
  arrange(desc(users_tot_usage)) |>
  head(top_n) |>
  left_join(all_users, by = "user_guid") |>
  mutate(username = coalesce(username, "anonymous")) |>
  left_join(content_info, by = c(content_guid = "guid")) |>
  # if title is NA then substitute with name
  mutate(title = coalesce(title, name)) |>
  rename(users_most_frequent_content = title) |>
  select(username, users_most_frequent_content, users_tot_usage, top_item_percent_of_usage)

An example bar graph visualizing this data is included in the Quarto dashboard code at the end of this recipe.

Getting the publishers with the most content hits

The code sample below yields the top n publishers, based on usage of their content over the selected time period. This builds on the previous code blocks.

# table of most frequent content_hits with content info
content_hits = (
  usage.group_by("content_guid")
  .agg(pl.len().alias("content_hits"))
  .join(content_info, left_on="content_guid", right_on="guid", how="left")
  .select(["content_guid","name", "title", "content_hits","owner_username"])
  .group_by("owner_username")
  .map_groups(lambda group: group.sort("content_hits", descending=True))
  .group_by("owner_username")
  .head(1)
  .sort("content_hits", descending=True)
 )

# table of how many of an owner's content items were accessed in the period
owners_distinct_content = (
  content_hits.group_by("owner_username")
  # count number of distinct content_guids
  .agg(pl.n_unique("content_guid").alias("num_owners_items_accessed"))
 )

# table of total content_hits for each owner
owner_total_hits = (
  usage.join(content_info, left_on="content_guid", right_on="guid", how="left")
  .group_by("owner_username")
  .agg(pl.len().alias("owners_tot_usage"))
 )

# build table of top N publishers
top_n_publishers = (
  content_hits
  .join(owners_distinct_content, left_on="owner_username", right_on="owner_username", how="left")
  .join(owner_total_hits, left_on="owner_username", right_on="owner_username", how="left")
  .sort("owners_tot_usage", descending=True, nulls_last=True)
  .head(top_n)
  # if title is NA then substitute with name
  .with_columns(pl.coalesce(pl.col("title"), pl.col("name")).alias("title"))
  .rename({"title": "owners_most_frequent_content"})
  .rename({"content_hits": "owners_most_frequent_content_hits"})
  # remove name column
  .drop(["name","content_guid"])
)

An example bar graph visualizing this data is included in the Quarto dashboard code at the end of this recipe.

top_n_publishers <- usage |>
    left_join(content_info, by = c(content_guid = "guid")) |>
    select(content_guid, name, title, owner_username) |>
    group_by(content_guid) |>
    mutate(content_hits = n()) |>
    ungroup() |>
    group_by(owner_username) |>
    # number of owner's distinct content items accessed in the period
    mutate(num_owners_items_accessed = n_distinct(content_guid)) |>
    mutate(owners_tot_usage = n()) |>
    arrange(desc(content_hits)) |>
    slice(1) |>
    arrange(desc(owners_tot_usage)) |>
    head(top_n) |>
    # if title is NA then substitute with name
    mutate(title = coalesce(title, name)) |>
    rename(owners_most_frequent_content = title, owners_most_frequent_content_hits = content_hits) |>
    select(-content_guid, -name)

An example bar graph visualizing this data is included in the Quarto dashboard code at the end of this recipe.

Deploying a sample Quarto dashboard with these metrics

The Quarto document below builds visualizations around the data frames retrieved above. To use, copy the source code into a new .qmd file. This can be deployed in your own environment, or customized to your needs.

# requirements.txt
pandas==2.2.2
plotly==5.22.0
polars==0.20.23
posit-sdk==0.2.0
pyarrow==16.0.0
---
title: Most Popular Content on Connect
format:
  dashboard:
     theme: default
     orientation: rows
---

```{python}
#| label: setup
from posit import connect
from datetime import date, timedelta
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
```


```{python}
#| label: user-defined inputs

# 1. Specify timeframe for the report. Default is 90 days prior to today
as_of_date = date.today()
days_back = 90
report_from = as_of_date - timedelta(days=days_back)


# 2. Specify the number of content items to report as the Top N pieces of content. Default is to show the `top_n=20` pieces of content.
top_n = 20

# Note: The CONNECT_SERVER and CONNECT_API_KEY environment variables are required for this report to pull usage data. Use environment variables for added security; do not hard-code your API key or other secrets.
```


```{python}
#| label: Establish connection to Connect server

# Note `connect.Client()` automatically detects the environment variables CONNECT_SERVER and CONNECT_API_KEY. They do not need to be specified as arguments to this function, however, if these environment variables are not set, the code will fail
client = connect.Client()
```

```{python}
# | label: Get content info and usage

### Retrieve content details and select relevant columns
df_content = client.content.find()
content_info = (pl.DataFrame(df_content, infer_schema_length=None)
                # unnest struct column "owner" to get owner username
                .with_columns(pl.col("owner").map_elements(lambda x: x["username"], return_dtype=pl.String).alias("owner_username"))
                # select relevant columns
                .select(["guid", "name", "title", "owner_username"])
)

### Get usage data
df = client.metrics.usage.find(
    start=report_from.strftime("%Y-%m-%dT%H:%M:%SZ"),
    end=as_of_date.strftime("%Y-%m-%dT%H:%M:%SZ"))

usage = (pl.DataFrame(df, infer_schema_length=None)
        .with_columns(pl.col("started").str.to_datetime("%Y-%m-%dT%H:%M:%SZ"))
        .with_columns(pl.col("started").cast(pl.Date).alias("day"))
        .select(["content_guid", "user_guid", "day"])
        # filter out any content items that may have been deleted
        .filter(pl.col("content_guid").is_in(content_info["guid"]))
)



### Get all user details to cross reference user_guids with usernames
all_users = (pl.DataFrame(client.users.find())
             .select(["guid", "username"])
             #rename guid to user_guid
              .rename({"guid": "user_guid"}))
```

## Row

Content usage on Connect for the period of **`{python} report_from.strftime("%Y-%m-%d")`** to **`{python} as_of_date.strftime("%Y-%m-%d")`**


## Row


```{python}
# plot total content usage over time

usage_plot_data = (usage.group_by("day")
  .agg(pl.len().alias("usage"))
  .sort("day")
  .select("day", "usage")
)
# plot usage_plot_data using plotly
fig = px.line(usage_plot_data.to_pandas(), x="day", y="usage", title=f"Total usage for the {days_back} day period", template="plotly_white")
fig.update_traces(line_color='#AE929F', line_width=4)
```


## Row

```{python}
# bar graph of top N content items

# identify top N content
top_n_content = (
  usage.group_by("content_guid")
  .agg(pl.len().alias("tot_usage"))
  .join(content_info, left_on="content_guid", right_on="guid", how="left")
  .select(["name", "title", "tot_usage","owner_username"])
  .sort("tot_usage", descending=True)
  # if title is null then replace with name column
  .with_columns(pl.coalesce(pl.col("title"), pl.col("name")).alias("title"))
  .head(top_n)
)

# make a bar plot of top_n_content and have the tooltip show the owner_username and tot_usage
fig = (
  px.bar(top_n_content.to_pandas(), x="tot_usage", y="title",
              orientation='h', title=f"Top {top_n} Content Items by Usage", template="plotly_white",
              hover_data=[top_n_content["tot_usage"], top_n_content["owner_username"]])
        .update_layout(yaxis=dict(autorange="reversed"),
                  title=f"Top {top_n} Content Items")
        .update_traces(marker_color='#16565A',
                  hovertemplate="total usage: %{x}<br>owner username: %{customdata[1]}")
        .update_yaxes(title_text="")
        .update_xaxes(title_text="content hits")
        .show()
        )
```


```{python}
# bar graph of top N content users

# identify top N content users and identify their most-frequently used content item

# make a table of a user's usage of each content item
user_content_hits = (
  usage.group_by(["user_guid", "content_guid"])
  .agg(pl.len().alias("content_hits"))
  # for each user_guid return the content_guid for the most content_hits
  .group_by("user_guid")
  .map_groups(lambda group: group.sort("content_hits", descending=True))
  .group_by("user_guid")
  .head(1)
  # if user_guid is null then replace with "anonymous"
  .with_columns(pl.coalesce(pl.col("user_guid"), pl.lit("anonymous")).alias("user_guid"))
)

# make a table of a user's total usage of all content
user_tot_usage = (
  usage.group_by("user_guid")
  .agg(pl.len().alias("users_tot_usage"))
  # if user_guid is null then replace with "anonymous"
  .with_columns(pl.coalesce(pl.col("user_guid"), pl.lit("anonymous")).alias("user_guid"))
)


top_n_users = (
  user_content_hits
  .join(user_tot_usage, left_on="user_guid", right_on="user_guid", how="left")
  # what percent of a user's total usage is their most frequently accessed content item
  .with_columns((100*pl.col("content_hits") / pl.col("users_tot_usage")).alias("top_item_percent_of_usage"))
  .sort("users_tot_usage", descending=True)
  .head(top_n)
  .join(all_users, left_on="user_guid", right_on="user_guid", how="left")
  # if username is null then replace with "anonymous"
  .with_columns(pl.coalesce(pl.col("username"), pl.lit("anonymous")).alias("username"))
  .join(content_info, left_on="content_guid", right_on="guid", how="left")
  # if title is null then replace with name column
  .with_columns(pl.coalesce(pl.col("title"), pl.col("name")).alias("title"))
  .select(["username", "title", "top_item_percent_of_usage", "users_tot_usage"])
  .rename({"title": "users_most_frequent_content"})
)

# make interactive plot
fig = (
  px.bar(top_n_users.to_pandas(), x="users_tot_usage", y="username",
              orientation="h", template="plotly_white",
              hover_data=[top_n_users["users_tot_usage"], top_n_users["users_most_frequent_content"], top_n_users["top_item_percent_of_usage"]])
    .update_layout(yaxis=dict(autorange="reversed"),
                  title=f"Top {top_n} users")
    .update_traces(marker_color='#16565A',
                  hovertemplate="total usage: %{x}<br>most frequently accessed content: %{customdata[1]}<br>% of all content hits spent on \'%{customdata[1]}\': %{customdata[2]:.0f}%")
    .update_yaxes(title_text="")
    .update_xaxes(title_text="content hits")
    .show()
)
```


```{python}
# bar graph of top N publishers

# table of most frequent content_hits with content info
content_hits = (
  usage.group_by("content_guid")
  .agg(pl.len().alias("content_hits"))
  .join(content_info, left_on="content_guid", right_on="guid", how="left")
  .select(["content_guid","name", "title", "content_hits","owner_username"])
  .group_by("owner_username")
  .map_groups(lambda group: group.sort("content_hits", descending=True))
  .group_by("owner_username")
  .head(1)
  .sort("content_hits", descending=True)
 )

# table of how many of an owner's content items were accessed in the period
owners_distinct_content = (
  content_hits.group_by("owner_username")
  # count number of distinct content_guids
  .agg(pl.n_unique("content_guid").alias("num_owners_items_accessed"))
 )

# table of total content_hits for each owner
owner_total_hits = (
  usage.join(content_info, left_on="content_guid", right_on="guid", how="left")
  .group_by("owner_username")
  .agg(pl.len().alias("owners_tot_usage"))
 )

# build table of top N publishers
top_n_publishers = (
  content_hits
  .join(owners_distinct_content, left_on="owner_username", right_on="owner_username", how="left")
  .join(owner_total_hits, left_on="owner_username", right_on="owner_username", how="left")
  .sort("owners_tot_usage", descending=True, nulls_last=True)
  .head(top_n)
  # if title is NA then substitute with name
  .with_columns(pl.coalesce(pl.col("title"), pl.col("name")).alias("title"))
  .rename({"title": "owners_most_frequent_content"})
  .rename({"content_hits": "owners_most_frequent_content_hits"})
  # remove name column
  .drop(["name","content_guid"])
)

# make interactive plot
fig = (
  px.bar(top_n_publishers.to_pandas(), x="owners_tot_usage", y="owner_username",
              orientation="h", title=f"Top {top_n} publishers", template="plotly_white",
              hover_data=[top_n_publishers["owners_tot_usage"],
              top_n_publishers["num_owners_items_accessed"],
              top_n_publishers["owners_most_frequent_content"],
              top_n_publishers["owners_most_frequent_content_hits"]])
      .update_layout(yaxis=dict(autorange="reversed"))
      .update_traces(marker_color='#16565A',
                  hovertemplate="all content hits: %{x}<br>number of distinct content items accessed in period: %{customdata[1]}<br>most frequently accessed content: %{customdata[2]}<br> hits for \'%{customdata[2]}\': %{customdata[3]}")
      .update_yaxes(title_text="")
      .update_xaxes(title_text="content hits")
      .show()
)
```
---
title: Most Popular Content on Connect
format:
  dashboard:
     theme: default
     orientation: rows
---

```{r}
#| label: setup
#| include: false

library(connectapi)
library(tidyverse)
library(gt)
library(plotly)
```


```{r}
#| label: user-defined inputs
#| include: false

### User-defined inputs
# 1. Specify timeframe for the report. Default is 90 days prior to today
as_of_date <- today()
days_back <- 90
report_from <- as_of_date - ddays(days_back)

# 2. Specify the number of content items to report as the Top N pieces of content. Default is to show the `top_n <- 20` pieces of content.
top_n <- 20


# Note: The CONNECT_SERVER and CONNECT_API_KEY environment variables are required for this report to pull usage data. Use environment variables for added security; do not hard-code your API key or other secrets.
```


```{r}
#| label: Establish connection to Connect server
#| include: false
#| 
# Note `connectapi::connect` automatically detects the environment variables CONNECT_SERVER and CONNECT_API_KEY. They do not need to be specified as arguments to this function, however, if these environment variables are not set, the code will fail
client <- connect()
```

```{r}
#| label: Get content info and usage
#| include: false

### Retrieve content details and select relevant columns
content_info <- connectapi::get_content(client) |>
  hoist(owner, owner_username = "username") |>
  select(guid, name, title, owner_username, dashboard_url)

### Query for both shiny and non-shiny usage data

usage_shiny <- get_usage_shiny(client,
    from = report_from,
    to = as_of_date,
    limit = Inf
  )

usage_non_shiny <- get_usage_static(client,
    from = report_from,
    to = as_of_date,
    limit = Inf
  ) |>
    rename(started = time) |>
    mutate(ended = started)


usage <- bind_rows(usage_shiny, usage_non_shiny) |>
  mutate(day = as.Date(started)) |>
  select(content_guid, user_guid, day) |>
  # filter out any content items that may have been deleted
  filter(content_guid %in% content_info$guid)

### Get all user details to cross reference user_guids with usernames
all_users <- connectapi::get_users(client, limit = Inf) |>
  select(guid, username, email) |> rename(user_guid = guid)
```

## Row

Content usage on Connect for the period of **`{r} report_from`** to **`{r} as_of_date`**


## Row

```{r}
#| label: plot total content usage over time

usage_plot_data <- usage |>
    group_by(day) |>
    summarise(usage = n()) |>
    arrange(day)

# generate plot
gg <- ggplot(usage_plot_data, aes(day, usage)) +
     geom_line(color = "#AE929F", linewidth = 1) +
     theme_minimal() +
     labs(
       y = NULL,
       x = NULL,
       title = glue::glue("Total usage for the {days_back} day period"),
     )

# make interactive with plotly
ggplotly(gg) |>
      config(displayModeBar = FALSE)
```

## Row

```{r}
#| label: bar graph of top N content items

top_n_content <- usage |>
  group_by(content_guid) |>
  summarise(tot_usage = n()) |>
  arrange(desc(tot_usage)) |>
  head(top_n) |>
  left_join(content_info, by = c(content_guid = "guid")) |>
  # if title is NA then substitute with name
  mutate(title = coalesce(title, name))

# generate plot
gg <- ggplot(top_n_content, aes(x = reorder(title, tot_usage), y = tot_usage,
text=paste("owner username:",owner_username))) +
  geom_col(fill = "#16565A") +
  coord_flip() +
  theme_minimal() +
  labs(
    x = NULL,
    y = "content hits",
    title = glue::glue("Top {top_n} content items)")
  )

# make interactive with plotly and include the owner_username and tot_usage in the tooltip
ggplotly(gg, tooltip = c("y", "text")) |>
      config(displayModeBar = FALSE)
```

```{r}
#| label: bar graph of top N content users

# identify top N content users and identify their most-frequently used content item
top_n_users <- usage |>
  select(-day) |>
  group_by(user_guid, content_guid) |>
  # count a user's usage of a content item
  mutate(content_hits = n()) |>
  ungroup() |>
  group_by(user_guid) |>
  # count a user's total usage of all content
  mutate(users_tot_usage = n()) |>
  arrange(desc(content_hits)) |>
  # slice to save only each user's most frequently accessed content item
  slice(1) |>
  ungroup() |>
  # what fraction of a user's total usage is their most frequently accessed content item
  mutate(top_item_percent_of_usage = content_hits / users_tot_usage) |>
  # select only the top N users
  arrange(desc(users_tot_usage)) |>
  head(top_n) |>
  left_join(all_users, by = "user_guid") |>
  mutate(username = coalesce(username, "anonymous")) |>
  left_join(content_info, by = c(content_guid = "guid")) |>
  # if title is NA then substitute with name
  mutate(title = coalesce(title, name)) |>
  rename(users_most_frequent_content = title) |>
  select(username, users_most_frequent_content, users_tot_usage, top_item_percent_of_usage)


# generate plot
gg <- ggplot(top_n_users, aes(
  x = reorder(username, users_tot_usage), y = users_tot_usage,
  # text=paste("most used content:",users_most_frequent_content)
  text = glue::glue("total usage: {users_tot_usage} \nmost frequently accessed content: {users_most_frequent_content} \n% of all content hits spent on \'{users_most_frequent_content}\': {scales::percent(top_item_percent_of_usage,1)}")
)) +
  geom_col(fill = "#16565A") +
  coord_flip() +
  theme_minimal() +
  labs(
    x = NULL,
    y = "content hits",
    title = glue::glue("Top {top_n} content users")
  )


# make interactive with plotly and include the owner_username and tot_usage in the tooltip
ggplotly(gg, tooltip = c("text")) |>
  config(displayModeBar = FALSE)
```


```{r}
#| label: bar graph of top N publishers

top_n_publishers <- usage |>
    left_join(content_info, by = c(content_guid = "guid")) |>
    select(content_guid, name, title, owner_username) |>
    group_by(content_guid) |>
    mutate(content_hits = n()) |>
    ungroup() |>
    group_by(owner_username) |>
    # number of owner's distinct content items accessed in the period
    mutate(num_owners_items_accessed = n_distinct(content_guid)) |>
    mutate(owners_tot_usage = n()) |>
    arrange(desc(content_hits)) |>
    slice(1) |>
    arrange(desc(owners_tot_usage)) |>
    head(top_n) |>
    # if title is NA then substitute with name
    mutate(title = coalesce(title, name)) |>
    rename(owners_most_frequent_content = title, owners_most_frequent_content_hits = content_hits) |>
    select(-content_guid, -name)


# generate plot
gg <- ggplot(top_n_publishers, aes(
  x = reorder(owner_username, owners_tot_usage), y = owners_tot_usage,
  text = glue::glue("all content hits: {owners_tot_usage} \nnumber of distinct content items accessed in period: {num_owners_items_accessed} \nmost frequently accessed content: {owners_most_frequent_content} \nhits for \'{owners_most_frequent_content}\': {owners_most_frequent_content_hits}")
)) +
  geom_col(fill = "#16565A") +
  coord_flip() +
  theme_minimal() +
  labs(
    x = NULL,
    y = "content hits",
    title = glue::glue("Top {top_n} publishers")
  )


# make interactive with plotly and include the owner_username and tot_usage in the tooltip
ggplotly(gg, tooltip = c("text")) |>
  config(displayModeBar = FALSE)
```