I have a data.frame containing different combinations of a group and count values where both of them exist. I need to plot or create a matrix similar to correlation matrix. I have come up with a simple example here
dat <- data.frame(source = c('A','A','A','B','B','C'),
target = c('B','C','D','C','D','D'),
count = c(4,5,6,3,3,5))
> dat
source target count
1 A B 4
2 A C 5
3 A D 6
4 B C 3
5 B D 3
6 C D 5
How do I get a matrix like this?? and plot this matrix
I have a data.frame containing different combinations of a group and count values where both of them exist. I need to plot or create a matrix similar to correlation matrix. I have come up with a simple example here
dat <- data.frame(source = c('A','A','A','B','B','C'),
target = c('B','C','D','C','D','D'),
count = c(4,5,6,3,3,5))
> dat
source target count
1 A B 4
2 A C 5
3 A D 6
4 B C 3
5 B D 3
6 C D 5
How do I get a matrix like this?? and plot this matrix
Share edited Mar 4 at 12:23 user20650 26k5 gold badges59 silver badges94 bronze badges asked Mar 4 at 11:43 KoundyKoundy 5,5753 gold badges27 silver badges39 bronze badges 4 |8 Answers
Reset to default 4This uses dplyr, tapply and pheatmap:
library(dplyr)
by <- c("source", "target")
m <- dat %>%
mutate(across(any_of(by), \(x) factor(x, sort(unique(unlist(.[by])))))) %>%
{ tapply(.$count, .[by], c, default = 0) } %>%
{ . + t(.) + diag(ncol(.)) }
m
giving
target
source A B C D
A 1 4 5 6
B 4 1 3 3
C 5 3 1 5
D 6 3 5 1
library(pheatmap)
pheatmap(m, display_numbers = TRUE, cluster_rows = FALSE, cluster_cols = FALSE)
(continued after image)
We can also try the Bioconductor ComplexHeatmap package.
library(circlize)
library(ComplexHeatmap)
library(RColorBrewer)
col_fun <- colorRamp2(c(0, 3, 6), brewer.pal(n = 3, name = "RdYlBu"))
Heatmap(m, name = "m", col = col_fun,
cluster_rows = FALSE,
cluster_columns = FALSE,
column_names_rot = 0,
row_names_gp = gpar(fontsize = 20),
column_names_gp = gpar(fontsize = 20),
cell_fun = function(j, i, x, y, width, height, fill) {
grid.text(sprintf("%.0f", m[i, j]), x, y, gp = gpar(fontsize = 30))})
(continued after image)
or without color
Heatmap(m, name = "m",
rect_gp = gpar(type = "none"),
cluster_rows = FALSE,
cluster_columns = FALSE,
column_names_rot = 0,
row_names_gp = gpar(fontsize = 20),
column_names_gp = gpar(fontsize = 20),
show_heatmap_legend = FALSE,
cell_fun = function(j, i, x, y, width, height, fill) {
grid.rect(x = x, y = y, width = width, height = height, gp = gpar(fill = NA))
grid.text(sprintf("%.0f", m[i, j]), x, y, gp = gpar(fontsize = 30))})
(continued after image)
Another possibility is a balloonplot using the gplots package.
library(gplots)
balloonplot(as.table(t(m)), show.margins = FALSE, cum.margins = FALSE, main = "m")
(continued after image)
The ggpubr package also has a balloon plot function:
library(ggpubr)
ggballoonplot(m, show.label = TRUE, rotate.x.text = 0)
You can try the following options
- Option 1:
xtabs
with base R
> `diag<-`(xtabs(count ~ ., rbind(dat, setNames(dat, names(dat)[c(2, 1, 3)]))), 1)
target
source A B C D
A 1 4 5 6
B 4 1 3 3
C 5 3 1 5
D 6 3 5 1
- Option 2:
as_adjacency_matrix
withigraph
package
library(igraph)
dat %>%
graph_from_data_frame(directed = FALSE) %>%
as_adjacency_matrix(attr = "count", sparse = FALSE) %>%
`diag<-`(1)
which gives
A B C D
A 1 4 5 6
B 4 1 3 3
C 5 3 1 5
D 6 3 5 1
"Extend" your data to include (1) the inverse of every combination and (2) identities, then plot using ggplot2::geom_tile()
and geom_text()
:
library(ggplot2)
library(forcats)
vals <- union(dat$source, dat$target)
dat_ext <- data.frame(
source = c(dat$source, dat$target, vals),
target = c(dat$target, dat$source, vals),
count = c(dat$count, dat$count, rep(1, length(vals)))
)
ggplot(dat_ext, aes(source, fct_rev(target))) +
geom_tile(color = "black") +
geom_text(aes(label = count)) +
scale_x_discrete(position = "top") +
theme_void() +
theme(axis.text = element_text(face = "bold"))
dat <- data.frame(
source = c('A', 'A', 'A', 'B', 'B', 'C'),
target = c('B', 'C', 'D', 'C', 'D', 'D'),
count = c(4, 5, 6, 3, 3, 5)
)
names <- unique(c(dat$source, dat$target))
dat$source <- factor(dat$source, levels = names)
dat$target <- factor(dat$target, levels = names)
mat <- xtabs(count ~ source + target, data = dat)
mat[lower.tri(mat)] <- t(mat)[lower.tri(mat)]
mat
target
source A B C D
A 0 4 5 6
B 4 0 3 3
C 5 3 0 5
D 6 3 5 0
We can create an edge matrix and use it for indexing.
> edges <- lapply(dat[1:2], factor, levels=(lv <- unique(unlist(dat[1:2])))) |>
+ sapply(as.integer)
> res <- diag(1, max(edges)) |> `dimnames<-`(list(lv, lv))
> for (i in list(1:2, 2:1)) res[edges[, i]] <- dat$count
> res
A B C D
A 1 4 5 6
B 4 1 3 3
C 5 3 1 5
D 6 3 5 1
I guess this is similar to what igraph is doing under the hood.
To get back to the original data frame we can use which(..., arr.ind=TRUE)
> idx <- which(upper.tri(res), arr.ind=TRUE)
> data.frame(apply(idx, 2, \(i) rownames(res)[i]), res[idx]) |>
+ sort_by(~list(row, col))
row col res.idx.
1 A B 4
2 A C 5
4 A D 6
3 B C 3
5 B D 3
6 C D 5
Doing {igraph}
:
# X =
dat |>
igraph::graph_from_data_frame(directed = FALSE) |>
igraph::as_adjacency_matrix(type = 'both', sparse = FALSE, attr = 'count') |>
`diag<-`(1L)
A B C D
A 1 4 5 6
B 4 1 3 3
C 5 3 1 5
D 6 3 5 1
Edit. Base R colour heatmap from X
with text labels.
local({
rn = unique(rownames(X)); cn = unique(colnames(X))
nr = length(rn); nc = length(cn)
s = seq(nr)
M = t(X[rev(s), ])
image(x = s, y = s, z = M, xlab = '', ylab = '', axes = FALSE,
col = hcl.colors(1e3, 'Reds', rev = TRUE, alpha = .8))
with(expand.grid(x=s, y=s), text(x, y, labels = M, col = 'black'))
axis(2, at = s, labels = rev(rn), las = 2, col = NA) # left
axis(3, at = s, labels = cn, col = NA) # top
})
Primitive base-R Solution:
# Original data
dat <- data.frame(
source = c('A', 'A', 'A', 'B', 'B', 'C'),
target = c('B', 'C', 'D', 'C', 'D', 'D'),
count = c(4, 5, 6, 3, 3, 5)
)
# Get unique character list
nodes <- unique(unlist(dat[1:2]))
# Create an empty matrix
mat <- matrix(nrow = length(nodes), ncol = length(nodes))
# Fill the diagonal with 1
diag(mat) <- 1
# Index mapping
for (k in seq_len(nrow(dat))) {
i <- which(nodes == dat[[1]][k]) # Find row index
j <- which(nodes == dat[[2]][k]) # Find column index
mat[i, j] <- dat$count[k] # Assign value
}
# Fill lower triangle
mat[lower.tri(mat)] <- t(mat)[lower.tri(mat)]
mat
[,1] [,2] [,3] [,4]
[1,] 1 4 5 6
[2,] 4 1 3 3
[3,] 5 3 1 5
[4,] 6 3 5 1
in Base R:
l <- factor(unlist(dat[1:2]))
`diag<-`(as.matrix(structure(dat$count, Size = nlevels(l),
Labels = levels(l), class = "dist")), 1)
A B C D
A 1 4 5 6
B 4 1 3 3
C 5 3 1 5
D 6 3 5 1
发布者:admin,转转请注明出处:http://www.yc00.com/questions/1745045917a4608095.html
dat
they show doesn't match the code used to generate it, hence the confusion – user20650 Commented Mar 4 at 12:21