In biomedical research, it's not uncommon to do an experiment where one varies a bunch of conditions (e.g., genotypes of mice or cell lines, treatment with various drugs), and measures a single major output variable for each experiment (e.g. blood sugar level, expression of a reporter gene).
A common way one plots these is as a series of bar or boxplots (or violin plots, or whatever), arranged horizontally, with a grid underneath depicting the variables that were changed in each condition.
It's easy to make a graph like this in Excel, setting the left-to-right order however you want, but then of course you have to mark it up in Illustrator or the like. It would be much nicer to be able to do this in R, especially for exploratory analysis, where one could hopefully have the markup done automatically. But I can't seem to find a good way to do this.
I've pasted code below, for an example of the kind of data I'm working with and how it is anized, and how I was able to kludge up a horizontal barplot, with labels, by manually specifying an ordering variable that accounts for the absence or presence of two treatment conditions.
I also included a genotype variable, with separates the data into two groups—the only way to get this to plot horizontally was to use facet_wrap
. Below, on the left, is the output I could get from native ggplot, and on the right is an Illustrator-edited version that indicates how I would like the figure laid out. Is there a package that can make a graph like this, or a workaround that might be used to put such a graph together "by hand" in R?
(I'm pretty comfortable with ggplot coding.)
library(tidyverse)
test <- structure(list(replicate = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 3, 3, 3),
genotype = c("A", "A", "A", "A",
"B", "B", "B", "B", "A", "A", "A", "A", "B", "B", "B", "B", "A",
"A", "A", "A"),
treat1 = c(FALSE, FALSE, TRUE, TRUE, FALSE, FALSE,
TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE,
FALSE, FALSE, TRUE, TRUE),
treat2 = c(FALSE, TRUE, FALSE, TRUE,
FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE,
FALSE, TRUE, FALSE, TRUE, FALSE, TRUE),
output_norm = c(1, 5,
1.75, 4.74, 1, 13.61, 0.7, 7.96, 1, 3, 1.67, 2.51, 1, 6.44, 0.93,
10.92, 1, 3.63, 2.24, 6.59)),
row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"))
# Create ordered factor variable accounting for treat1 vs treat2, for x-axis
condition <- tibble(
treat1=c(F, T, F, T),
treat2=c(F, F, T, T),
treatment=c('untreated', 'treat1', 'treat2', 'treat1 + treat2')) %>%
mutate(treatment=ordered(treatment, levels=treatment)) %>% print()
# Add treatment variable to test data
test <- left_join(test, condition, by=c('treat1', 'treat2')) %>% print()
# Bar graph with individual data points
ggplot(test, aes(x=treatment, y=output_norm)) +
geom_bar(stat='summary', fun='mean', position='dodge') +
geom_jitter(width=0.1, height=0) +
facet_wrap(~genotype) +
labs(title='output by genotype', x='Treatment', y='output') +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
UPDATE: The answer below by stefan worked well, and motivated me to create a generalized function to use legendry
to create plots like this. The only ugly part was that I have to manually build the guides
as a string, to assemble the stack of one guide_axis_base
call per variable, and then use eval(parse(text=))
to execute it within ggplot
.
grid_plot <- function(df, output, vars,
fun='mean',
logical.convert=c('', '+'),
width=0.1,
theme='bw') {
# function to plot df as summarized barplot + jittered individual points, with
# grid of variables underneath
# df = dataframe to plot
# output = name of output variable in df
# vars = ordered vector of variable names for anizing and summarizing data
# fun = function to summarize data (default = 'mean')
# logical.convert = vector of strings for replacing FALSE/TRUE (default = c('', '+'))
# width = width of jitter plot (default = 0.1)
# theme = theme for plot (default = 'bw')
# return = ggplot object
require(legendry)
# remove any columns other than output and vars
df <- select(df, all_of(c(vars, output)))
# convert logical variables to strings
if(!is.null(logical.convert)) {
df <- mutate(df, across(where(is.logical), function(x) {
ifelse(x, logical.convert[2], logical.convert[1])
}))
}
# summarize df using vars and fun
df_summ <- group_by(df, across(all_of(vars))) %>%
summarise(output_summ = do.call(fun, list(.data[[output]])), .groups="drop")
# arrange df_summ by vars, add x.axis variable based on sort order
df_summ <- arrange(df_summ, pick(all_of(vars))) %>%
mutate(x.axis = 1:nrow(df_summ))
# merge x.axis variable into original df
df <- left_join(df, select(df_summ, -output_summ), by=vars) %>%
arrange(pick(all_of(vars)))
# create key_labels dataframe
key_labels <- select(df_summ, -c(output_summ, x.axis))
# create text for calling stack creation function
# (there must be a smarter way to do this but I haven't found it yet)
st_call <- sapply(seq_along(vars), function(i) {
paste0(
'guide_axis_base(key = key_manual(aesthetic = 1:', nrow(key_labels), ',\n',
'label=c("', str_flatten(key_labels[[vars[i]]], '", "'), '")),\n',
'theme=theme(',
ifelse(i>1, 'axis.ticks.x=element_blank(),\n', ''),
'axis.text.x = element_text(vjust = 0)))'
) %>% return()
})
st_call <- str_flatten(st_call, collapse = ',\n')
gu <- paste0('guides(\n x=compose_stack(\n',
st_call,
',\n',
'side.titles = c("',
str_flatten(vars, collapse='", "'),
'"),\n',
'theme = theme(legendry.axis.subtitle = element_text(vjust = 0))))')
# uncomment below to see guides text
# tidy_source(text = gu, args.newline=T)
# create ggplot object
pl <- ggplot(df_summ, aes(x = x.axis, y = output_summ)) +
# bar plot of summarized values
geom_bar(stat = "summary", fun = "mean", position = "dodge") +
# jitter plot of individual values
geom_jitter(
data=df, aes(x = x.axis, y = .data[[output]]),
width = width, height = 0) +
xlim(0.5, nrow(key_labels) + 0.5) +
labs(title = NULL, x = NULL, y = output) +
eval(parse(text=paste0('theme_', theme, '()'))) +
# Add margin to make room for side titles
theme(plot.margin = margin(5.5, 5.5, 5.5, 22)) +
eval(parse(text=gu))
return(pl)
}
Example output:
grid_plot(test, 'output_norm', c('genotype', 'treat2', 'treat1'), theme='gray')
In biomedical research, it's not uncommon to do an experiment where one varies a bunch of conditions (e.g., genotypes of mice or cell lines, treatment with various drugs), and measures a single major output variable for each experiment (e.g. blood sugar level, expression of a reporter gene).
A common way one plots these is as a series of bar or boxplots (or violin plots, or whatever), arranged horizontally, with a grid underneath depicting the variables that were changed in each condition.
It's easy to make a graph like this in Excel, setting the left-to-right order however you want, but then of course you have to mark it up in Illustrator or the like. It would be much nicer to be able to do this in R, especially for exploratory analysis, where one could hopefully have the markup done automatically. But I can't seem to find a good way to do this.
I've pasted code below, for an example of the kind of data I'm working with and how it is anized, and how I was able to kludge up a horizontal barplot, with labels, by manually specifying an ordering variable that accounts for the absence or presence of two treatment conditions.
I also included a genotype variable, with separates the data into two groups—the only way to get this to plot horizontally was to use facet_wrap
. Below, on the left, is the output I could get from native ggplot, and on the right is an Illustrator-edited version that indicates how I would like the figure laid out. Is there a package that can make a graph like this, or a workaround that might be used to put such a graph together "by hand" in R?
(I'm pretty comfortable with ggplot coding.)
library(tidyverse)
test <- structure(list(replicate = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 3, 3, 3),
genotype = c("A", "A", "A", "A",
"B", "B", "B", "B", "A", "A", "A", "A", "B", "B", "B", "B", "A",
"A", "A", "A"),
treat1 = c(FALSE, FALSE, TRUE, TRUE, FALSE, FALSE,
TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE,
FALSE, FALSE, TRUE, TRUE),
treat2 = c(FALSE, TRUE, FALSE, TRUE,
FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE,
FALSE, TRUE, FALSE, TRUE, FALSE, TRUE),
output_norm = c(1, 5,
1.75, 4.74, 1, 13.61, 0.7, 7.96, 1, 3, 1.67, 2.51, 1, 6.44, 0.93,
10.92, 1, 3.63, 2.24, 6.59)),
row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"))
# Create ordered factor variable accounting for treat1 vs treat2, for x-axis
condition <- tibble(
treat1=c(F, T, F, T),
treat2=c(F, F, T, T),
treatment=c('untreated', 'treat1', 'treat2', 'treat1 + treat2')) %>%
mutate(treatment=ordered(treatment, levels=treatment)) %>% print()
# Add treatment variable to test data
test <- left_join(test, condition, by=c('treat1', 'treat2')) %>% print()
# Bar graph with individual data points
ggplot(test, aes(x=treatment, y=output_norm)) +
geom_bar(stat='summary', fun='mean', position='dodge') +
geom_jitter(width=0.1, height=0) +
facet_wrap(~genotype) +
labs(title='output by genotype', x='Treatment', y='output') +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
UPDATE: The answer below by stefan worked well, and motivated me to create a generalized function to use legendry
to create plots like this. The only ugly part was that I have to manually build the guides
as a string, to assemble the stack of one guide_axis_base
call per variable, and then use eval(parse(text=))
to execute it within ggplot
.
grid_plot <- function(df, output, vars,
fun='mean',
logical.convert=c('', '+'),
width=0.1,
theme='bw') {
# function to plot df as summarized barplot + jittered individual points, with
# grid of variables underneath
# df = dataframe to plot
# output = name of output variable in df
# vars = ordered vector of variable names for anizing and summarizing data
# fun = function to summarize data (default = 'mean')
# logical.convert = vector of strings for replacing FALSE/TRUE (default = c('', '+'))
# width = width of jitter plot (default = 0.1)
# theme = theme for plot (default = 'bw')
# return = ggplot object
require(legendry)
# remove any columns other than output and vars
df <- select(df, all_of(c(vars, output)))
# convert logical variables to strings
if(!is.null(logical.convert)) {
df <- mutate(df, across(where(is.logical), function(x) {
ifelse(x, logical.convert[2], logical.convert[1])
}))
}
# summarize df using vars and fun
df_summ <- group_by(df, across(all_of(vars))) %>%
summarise(output_summ = do.call(fun, list(.data[[output]])), .groups="drop")
# arrange df_summ by vars, add x.axis variable based on sort order
df_summ <- arrange(df_summ, pick(all_of(vars))) %>%
mutate(x.axis = 1:nrow(df_summ))
# merge x.axis variable into original df
df <- left_join(df, select(df_summ, -output_summ), by=vars) %>%
arrange(pick(all_of(vars)))
# create key_labels dataframe
key_labels <- select(df_summ, -c(output_summ, x.axis))
# create text for calling stack creation function
# (there must be a smarter way to do this but I haven't found it yet)
st_call <- sapply(seq_along(vars), function(i) {
paste0(
'guide_axis_base(key = key_manual(aesthetic = 1:', nrow(key_labels), ',\n',
'label=c("', str_flatten(key_labels[[vars[i]]], '", "'), '")),\n',
'theme=theme(',
ifelse(i>1, 'axis.ticks.x=element_blank(),\n', ''),
'axis.text.x = element_text(vjust = 0)))'
) %>% return()
})
st_call <- str_flatten(st_call, collapse = ',\n')
gu <- paste0('guides(\n x=compose_stack(\n',
st_call,
',\n',
'side.titles = c("',
str_flatten(vars, collapse='", "'),
'"),\n',
'theme = theme(legendry.axis.subtitle = element_text(vjust = 0))))')
# uncomment below to see guides text
# tidy_source(text = gu, args.newline=T)
# create ggplot object
pl <- ggplot(df_summ, aes(x = x.axis, y = output_summ)) +
# bar plot of summarized values
geom_bar(stat = "summary", fun = "mean", position = "dodge") +
# jitter plot of individual values
geom_jitter(
data=df, aes(x = x.axis, y = .data[[output]]),
width = width, height = 0) +
xlim(0.5, nrow(key_labels) + 0.5) +
labs(title = NULL, x = NULL, y = output) +
eval(parse(text=paste0('theme_', theme, '()'))) +
# Add margin to make room for side titles
theme(plot.margin = margin(5.5, 5.5, 5.5, 22)) +
eval(parse(text=gu))
return(pl)
}
Example output:
grid_plot(test, 'output_norm', c('genotype', 'treat2', 'treat1'), theme='gray')
Share
Improve this question
edited Mar 8 at 17:07
C. Murtaugh
asked Mar 4 at 14:56
C. MurtaughC. Murtaugh
6755 silver badges15 bronze badges
1
- This looks kind of like an upset plot — I wonder if UpSetR would be helpful. – zephryl Commented Mar 4 at 15:26
5 Answers
Reset to default 6A more recent option would be to use the legendry
package which via e.g. compose_stack
allows to stack multiple axes and allows for side titles, too. However, it requires some effort to set up the single guide_axis_base
objects.
library(tidyverse)
library(legendry)
test <- test |>
mutate(
treat1 = if_else(treat1, "+", " "),
treat2 = if_else(treat2, "+", " "),
x = interaction(treat1, treat2, genotype)
)
key_labels <- distinct(
test,
x, genotype, treat1, treat2
) |>
mutate(x = as.numeric(x))
gab <- function(which, ticks = TRUE) {
guide_axis_base(
key = key_manual(
key_labels$x, key_labels[[which]]
),
theme = theme(
axis.ticks.x = if (!ticks) element_blank(),
axis.text.x = element_text(vjust = 0)
)
)
}
ggplot(test, aes(x = x, y = output_norm)) +
geom_bar(stat = "summary", fun = "mean", position = "dodge") +
geom_jitter(width = 0.1, height = 0) +
labs(title = "output by genotype", x = NULL, y = "output") +
theme_bw() +
guides(
x = compose_stack(
gab("genotype"),
gab("treat1", ticks = FALSE),
gab("treat2", ticks = FALSE),
side.titles = c("genotype", "treat1", "treat2"),
theme = theme(
legendry.axis.subtitle = element_text(
vjust = 0
)
)
)
) +
# Add margin to make room for side titles
theme(plot.margin = margin(5.5, 5.5, 5.5, 22))
test %>%
mutate(ord = as.numeric(as.factor(genotype)) * 10 + treat1 + 2 * treat2) %>%
arrange(ord) %>%
mutate(ord = as.factor(ord),
lbl = paste(genotype,
ifelse(treat1, "+", ""),
ifelse(treat2, "+", ""), sep = "\n")) -> test2
ggplot(test2, aes(x=ord, y=output_norm)) +
geom_bar(stat='summary', fun='mean', position='dodge') +
geom_jitter(width=0.1, height=0) +
labs(title='output by genotype', x='Treatment', y='output') +
theme_bw() +
scale_x_discrete(labels = unique(test2$lbl))
This matches your desired output quite closely. Unfortunately, the plot.tag.position
values, which position the genotype/treat1/treat2 label, are relative to the plotting area, so the correct values to align it with the axis labels will have to be adjusted based on the dimensions of your output.
test = test |>
arrange(genotype, treat2, treat1) |>
mutate(
x_axis = fct_inorder(
paste(
genotype,
ifelse(treat1, "+", ""),
ifelse(treat2, "+", ""),
sep = "\n"
)
)
)
# bar graph with individual data points
ggplot(test, aes(x = x_axis, y=output_norm)) +
geom_bar(stat='summary', fun='mean', position='dodge') +
geom_jitter(width=0.1, height=0) +
labs(
title = 'output by genotype',
x = NULL,
y = 'output',
tag = paste("genotype", "treat1", "treat2", sep = "\n")
) +
theme_bw() +
theme(
plot.margin = margin(l = 20, 5.5, 5.5, 5.5),
plot.tag.position = c(0.06, .075), ## adjust these for a given output size
plot.tag = element_text(
size = rel(.8), ## this matches `theme_bw()$axis.text
hjust = 1, ## right justify
vjust = 1, ## probably doesn't matter, but matches $axis.text.x
lineheight = 1 ## this seems to be correct
)
)
I am surprised we need lineheight = 1
to get the line spacing correct. As far as I can see, the theme_bw()$text
sets lineheight = 0.9
, which I would think is inherited by both axis.text.x
and plot.tag
, but if I don't set it to 1
for plot.tag
then the spacing doesn't line up. The line height for the axis text must be set somewhere else I'm not seeing.
Using the ggh4x
-package on your sample data gets pretty close with minimal code.
library(ggh4x)
ggplot(test, aes(x = interaction(treat1, treat2, genotype), y = output_norm)) +
geom_bar(stat = "summary", fun = "mean", position = "dodge") +
scale_x_discrete(NULL, guide = "axis_nested")
sample data
test <- structure(list(replicate = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 3, 3, 3),
genotype = c("A", "A", "A", "A",
"B", "B", "B", "B", "A", "A", "A", "A", "B", "B", "B", "B", "A",
"A", "A", "A"),
treat1 = c(FALSE, FALSE, TRUE, TRUE, FALSE, FALSE,
TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE,
FALSE, FALSE, TRUE, TRUE),
treat2 = c(FALSE, TRUE, FALSE, TRUE,
FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE,
FALSE, TRUE, FALSE, TRUE, FALSE, TRUE),
output_norm = c(1, 5,
1.75, 4.74, 1, 13.61, 0.7, 7.96, 1, 3, 1.67, 2.51, 1, 6.44, 0.93,
10.92, 1, 3.63, 2.24, 6.59)),
row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"))
A base R approach would be to make a few tweaks in the data and then plotting the text in the margins:
# Data tweaks
test$x_plot <- paste0(test$genotype, test$treatment) # create unique pairings
tabtext <- unique(test[c("x_plot", "treat1", "treat2")]) # for table
tabtext[2:3] <- t(apply(tabtext[2:3], 1, ifelse, "+", "")) # for table
mean_bar <- tapply(test$output_norm, test$x_plot, mean) # for mean bars
mean_bar <- mean_bar[c(4, 1, 3, 2, 8, 5, 7, 6)] # order however you'd like
# Plot
bb <- barplot(mean_bar, ylim = c(0, max(test$output_norm)+1),
names.arg = c(rep("A", 4), rep("B", 4)))
mtext(side = 1, tabtext$treat1, at = bb, padj = 4)
mtext(side = 1, tabtext$treat2, at = bb, padj = 6)
mtext(side = 1, c("Genotype", "Treat 1", "Treat 2"), at = 0, padj = c(2, 4, 6))
# jitter points
lapply(seq_along(mean_bar), \(x){
points(x = jitter(rep(bb[x], sum(test$x_plot %in% names(mean_bar[x])))),
y = test$output_norm[test$x_plot %in% names(mean_bar[x])],
pch = 21, bg = "maroon")
})
发布者:admin,转转请注明出处:http://www.yc00.com/questions/1745037110a4607588.html
评论列表(0条)