Title: | Biclustering with Missing Data |
---|---|
Description: | Biclustering is a statistical learning technique that simultaneously partitions and clusters rows and columns of a data matrix. Since the solution space of biclustering is infeasible to completely search with current computational mechanisms, this package uses a greedy heuristic. The algorithm featured in this package is, to the best of our knowledge, the first biclustering algorithm to work on data with missing values. Li, J., Reisner, J., Pham, H., Olafsson, S., and Vardeman, S. (2020) Biclustering with Missing Data. Information Sciences, 510, 304–316. |
Authors: | John Reisner [cre, aut, cph], Hieu Pham [ctb, cph], Jing Li [ctb, cph] |
Maintainer: | John Reisner <[email protected]> |
License: | MIT + file LICENSE |
Version: | 0.2.3 |
Built: | 2024-10-27 03:34:23 UTC |
Source: | https://github.com/jreisner/biclustermd |
The main function is biclustermd()
. Results can be plotted with autoplot()
and as.Biclust()
converts results to Biclust objects.
biclustermd
object to a Biclust
object
Convert a biclustermd
object to a Biclust
object
as.Biclust(object)
as.Biclust(object)
object |
The |
Returns an object of class Biclust
.
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) bc as.Biclust(bc) # biclust::drawHeatmap won't work since it doesn't exclude NAs ## Not run: biclust::drawHeatmap(synthetic, as.Biclust(bc), 6) # bicluster 6 is in the top right-hand corner here: autoplot(bc) # compare with bicust::drawHeatmap2: biclust::drawHeatmap2(synthetic, as.Biclust(bc), 6) # bicluster 3 is in the bottom right-hand corner here: autoplot(bc) # compare with bicust::drawHeatmap2: biclust::drawHeatmap2(synthetic, as.Biclust(bc), 3)
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) bc as.Biclust(bc) # biclust::drawHeatmap won't work since it doesn't exclude NAs ## Not run: biclust::drawHeatmap(synthetic, as.Biclust(bc), 6) # bicluster 6 is in the top right-hand corner here: autoplot(bc) # compare with bicust::drawHeatmap2: biclust::drawHeatmap2(synthetic, as.Biclust(bc), 6) # bicluster 3 is in the bottom right-hand corner here: autoplot(bc) # compare with bicust::drawHeatmap2: biclust::drawHeatmap2(synthetic, as.Biclust(bc), 3)
Make a heatmap of sparse biclustering results
## S3 method for class 'biclustermd' autoplot( object, axis.text = NULL, reorder = FALSE, transform_colors = FALSE, c = 1/6, cell_alpha = 1/5, col_clusts = NULL, row_clusts = NULL, ... )
## S3 method for class 'biclustermd' autoplot( object, axis.text = NULL, reorder = FALSE, transform_colors = FALSE, c = 1/6, cell_alpha = 1/5, col_clusts = NULL, row_clusts = NULL, ... )
object |
An object of class "biclustermd". |
axis.text |
A character vector specifying for which axes text should be
drawn. Can be any of |
reorder |
A logical. If |
transform_colors |
If equals |
c |
Value to scale the data by before running it through a standard normal CDF. Default is 1/6. |
cell_alpha |
A scalar defining the transparency of shading over a cell and by default this equals 1/5. The color corresponds to the cell mean. |
col_clusts |
A vector of column cluster indices to display. If |
row_clusts |
A vector of row cluster indices to display. If |
... |
Arguments to be passed to |
An object of class ggplot.
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) bc autoplot(bc) autoplot(bc, axis.text = c('x', 'row')) + ggplot2::scale_fill_distiller(palette = "Spectral", na.value = "white") # Complete shading autoplot(bc, axis.text = c('col', 'row'), cell_alpha = 1) # Transformed values and no shading autoplot(bc, transform_colors = TRUE, c = 1/20, cell_alpha = 0) # Focus on row cluster 1 and column cluster 2 autoplot(bc, col_clusts = 2, row_clusts = 1)
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) bc autoplot(bc) autoplot(bc, axis.text = c('x', 'row')) + ggplot2::scale_fill_distiller(palette = "Spectral", na.value = "white") # Complete shading autoplot(bc, axis.text = c('col', 'row'), cell_alpha = 1) # Transformed values and no shading autoplot(bc, transform_colors = TRUE, c = 1/20, cell_alpha = 0) # Focus on row cluster 1 and column cluster 2 autoplot(bc, col_clusts = 2, row_clusts = 1)
Creates a ggplot of the three similarity measures used in biclustermd::biclustermd()
for both row and column dimensions.
## S3 method for class 'biclustermd_sim' autoplot(object, similarity = NULL, facet = TRUE, ncol = NULL, ...)
## S3 method for class 'biclustermd_sim' autoplot(object, similarity = NULL, facet = TRUE, ncol = NULL, ...)
object |
Object of class "biclustermd_sim" |
similarity |
A character vector indicating which similarity measure to plot.
Can be any of |
facet |
If |
ncol |
If faceting, the number of columns to arrange the plots in. |
... |
Arguments to pass to |
A ggplot object.
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) bc autoplot(bc$Similarities, ncol = 1)
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) bc autoplot(bc$Similarities, ncol = 1)
Creates a ggplot of the decrease in SSE recorded in biclustermd::biclustermd()
.
## S3 method for class 'biclustermd_sse' autoplot(object, ...)
## S3 method for class 'biclustermd_sse' autoplot(object, ...)
object |
Object of class "biclustermd_sse" with columns "Iteration" and "SSE" |
... |
Arguments to pass to |
A ggplot object.
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) bc autoplot(bc$SSE)
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) bc autoplot(bc$SSE)
Bicluster data with non-random missing values
biclustermd( data, row_clusters = floor(sqrt(nrow(data))), col_clusters = floor(sqrt(ncol(data))), miss_val = mean(data, na.rm = TRUE), miss_val_sd = 1, similarity = "Rand", row_min_num = floor(nrow(data)/row_clusters), col_min_num = floor(ncol(data)/col_clusters), row_num_to_move = 1, col_num_to_move = 1, row_shuffles = 1, col_shuffles = 1, max.iter = 100, verbose = FALSE )
biclustermd( data, row_clusters = floor(sqrt(nrow(data))), col_clusters = floor(sqrt(ncol(data))), miss_val = mean(data, na.rm = TRUE), miss_val_sd = 1, similarity = "Rand", row_min_num = floor(nrow(data)/row_clusters), col_min_num = floor(ncol(data)/col_clusters), row_num_to_move = 1, col_num_to_move = 1, row_shuffles = 1, col_shuffles = 1, max.iter = 100, verbose = FALSE )
data |
Dataset to bicluster. Must be a data matrix with only numbers and missing values in the data set. It should have row names and column names. |
row_clusters |
The number of clusters to partition the rows into. The
default is |
col_clusters |
The number of clusters to partition the columns into. The
default is |
miss_val |
Value or function to put in empty cells of the prototype matrix.
If a value, a random normal variable with sd = |
miss_val_sd |
Standard deviation of the normal distribution |
similarity |
The metric used to compare two successive clusterings. Can be "Rand" (default), "HA" for the Hubert and Arabie adjusted Rand index or "Jaccard". See RRand for details. |
row_min_num |
Minimum row prototype size in order to be eligible to be
chosen when filling an empty row prototype. Default is |
col_min_num |
Minimum column prototype size in order to be eligible to be
chosen when filling an empty column prototype. Default is |
row_num_to_move |
Number of rows to remove from the sampled prototype to put in the empty row prototype. Default is 1. |
col_num_to_move |
Number of columns to remove from the sampled prototype to put in the empty column prototype. Default is 1. |
row_shuffles |
Number of times to shuffle rows in each iteration. Default is 1. |
col_shuffles |
Number of times to shuffle columns in each iteration. Default is 1. |
max.iter |
Maximum number of iterations to let the algorithm run for. |
verbose |
Logical. If TRUE, will report progress. |
A list of class biclustermd
:
params |
a list of all arguments passed to the function, including defaults. |
data |
the inputted two-way table of data. |
P0 |
the initial column partition matrix. |
Q0 |
the initial row partition matrix. |
InitialSSE |
the SSE of the original partitioning. |
P |
the final column partition matrix. |
Q |
the final row partition matrix. |
SSE |
a matrix of class biclustermd_sse detailing the SSE recorded at the end of each iteration. |
Similarities |
a data frame of class biclustermd_sim detailing the
value of row and column similarity measures recorded at the end of each
iteration. Contains information for all three similarity measures.
This carries an attribute |
iteration |
the number of iterations the algorithm ran for, whether |
A |
the final prototype matrix which gives the average of each bicluster. |
Li, J., Reisner, J., Pham, H., Olafsson, S., and Vardeman, S. (2020) Biclustering with Missing Data. Information Sciences, 510, 304–316.
rep_biclustermd
, tune_biclustermd
data("synthetic") # default parameters bc <- biclustermd(synthetic) bc autoplot(bc) # providing the true number of row and column clusters bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2) bc autoplot(bc) # an example with the nycflights13::flights dataset library(nycflights13) data("flights") library(dplyr) flights_bcd <- flights %>% select(month, dest, arr_delay) flights_bcd <- flights_bcd %>% group_by(month, dest) %>% summarise(mean_arr_delay = mean(arr_delay, na.rm = TRUE)) %>% spread(dest, mean_arr_delay) %>% as.data.frame() rownames(flights_bcd) <- flights_bcd$month flights_bcd <- as.matrix(flights_bcd[, -1]) flights_bc <- biclustermd(data = flights_bcd, col_clusters = 6, row_clusters = 4, row_min_num = 3, col_min_num = 5, max.iter = 20, verbose = TRUE) flights_bc
data("synthetic") # default parameters bc <- biclustermd(synthetic) bc autoplot(bc) # providing the true number of row and column clusters bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2) bc autoplot(bc) # an example with the nycflights13::flights dataset library(nycflights13) data("flights") library(dplyr) flights_bcd <- flights %>% select(month, dest, arr_delay) flights_bcd <- flights_bcd %>% group_by(month, dest) %>% summarise(mean_arr_delay = mean(arr_delay, na.rm = TRUE)) %>% spread(dest, mean_arr_delay) %>% as.data.frame() rownames(flights_bcd) <- flights_bcd$month flights_bcd <- as.matrix(flights_bcd[, -1]) flights_bc <- biclustermd(data = flights_bcd, col_clusters = 6, row_clusters = 4, row_min_num = 3, col_min_num = 5, max.iter = 20, verbose = TRUE) flights_bc
Make a binary vector with all values equal to zero except for one
binary_vector_gen(n, i)
binary_vector_gen(n, i)
n |
Desired vector length. |
i |
Index whose value is one. |
A vector
Make a heat map of bicluster cell sizes.
cell_heatmap(x, ...)
cell_heatmap(x, ...)
x |
An object of class |
... |
Arguments to pass to |
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) cell_heatmap(bc) cell_heatmap(bc) + ggplot2::scale_fill_viridis_c()
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) cell_heatmap(bc) cell_heatmap(bc) + ggplot2::scale_fill_viridis_c()
Make a data frame containing the MSE for each bicluster cell
cell_mse(x)
cell_mse(x)
x |
An object of class |
A data frame giving the row cluster, column cluster, the number of data points in each row and column cluster, the number of data points missing in the cell, and the cell MSE.
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) cell_mse(bc)
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) cell_mse(bc)
Calculate the sum cluster SSE in each iteration
cluster_iteration_sum_sse(data, P, Q)
cluster_iteration_sum_sse(data, P, Q)
data |
The data being biclustered. Must be a data matrix with only numbers and missing values in the data set. It should have row names and column names. |
P |
Matrix for column prototypes. |
Q |
Matrix for row prototypes. |
The SSE for the parameters specified.
Get column names in each column cluster
col_cluster_names(x, data)
col_cluster_names(x, data)
x |
Biclustering object to extract column cluster designation from |
data |
Data that contains the column names |
A data frame with two columns: cluster
corresponds to the column
cluster and name
gives the column names in each cluster.
data("synthetic") rownames(synthetic) <- letters[1:nrow(synthetic)] colnames(synthetic) <- letters[1:ncol(synthetic)] bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) bc
data("synthetic") rownames(synthetic) <- letters[1:nrow(synthetic)] colnames(synthetic) <- letters[1:ncol(synthetic)] bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) bc
A generic to gather column names
col.names(x)
col.names(x)
x |
an object to retrieve column names from |
Get data matrix column names and their corresponding column cluster membership
## S3 method for class 'biclustermd' col.names(x)
## S3 method for class 'biclustermd' col.names(x)
x |
an object of class |
a data frame with column names of the shuffled matrix and corresponding column cluster names.
data("synthetic") # default parameters bc <- biclustermd(synthetic) bc col.names(bc) # this is a simplified version of the output for gather(bc): library(dplyr) gather(bc) %>% distinct(col_cluster, col_name)
data("synthetic") # default parameters bc <- biclustermd(synthetic) bc col.names(bc) # this is a simplified version of the output for gather(bc): library(dplyr) gather(bc) %>% distinct(col_cluster, col_name)
Compare two biclusterings or a pair of partition matrices
compare_biclusters(bc1, bc2)
compare_biclusters(bc1, bc2)
bc1 |
the first biclustering or partition matrix. Must be either of class
|
bc2 |
the second biclustering or partition matrix. Must be either of class
|
If comparing a pair of biclusterings, a list containing the column similarity indices and the row similarity indices, in that order. If a pair of matrices, a vector of similarity indices.
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2) bc2 <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2) # compare the two biclusterings compare_biclusters(bc, bc2) # determine the similarity between initial and final row clusterings compare_biclusters(bc$Q0, bc$Q)
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2) bc2 <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2) # compare the two biclusterings compare_biclusters(bc, bc2) # determine the similarity between initial and final row clusterings compare_biclusters(bc$Q0, bc$Q)
Randomly select a column prototype to fill an empty column prototype with
fill_empties_P(data, obj, col_min_num = 10, col_num_to_move = 5)
fill_empties_P(data, obj, col_min_num = 10, col_num_to_move = 5)
data |
The data being biclustered. Must be a data matrix with only numbers and missing values in the data set. It should have row names and column names. |
obj |
A matrix for column clusters, typically named P. |
col_min_num |
Minimum column prototype size in order to be eligible to be chosen when filling an empty column prototype. Default is 10. |
col_num_to_move |
Number of columns to remove from the sampled prototype to put in the empty column prototype. Default is 5. |
A matrix for column clusters, i.e., a P matrix.
Randomly select a row prototype to fill an empty row prototype with
fill_empties_Q(data, obj, row_min_num = 10, row_num_to_move = 5)
fill_empties_Q(data, obj, row_min_num = 10, row_num_to_move = 5)
data |
The data being biclustered. Must be a data matrix with only numbers and missing values in the data set. It should have row names and column names. |
obj |
A matrix for row clusters, typically named Q |
row_min_num |
Minimum row prototype size in order to be eligible to be chosen when filling an empty row prototype. Default is 10. |
row_num_to_move |
Number of rows to remove from the sampled prototype to put in the empty row prototype. Default is 5. |
A matrix for row clusters, i.e., a Q matrix.
Formats a partition matrix so that subsets in a partition will be ordered by the value of the smallest element in each subset
format_partition(P1)
format_partition(P1)
P1 |
A partition matrix. |
A formatted partition matrix.
Gather a biclustermd object
## S3 method for class 'biclustermd' gather( data, key = NULL, value = NULL, ..., na.rm = FALSE, convert = FALSE, factor_key = FALSE )
## S3 method for class 'biclustermd' gather( data, key = NULL, value = NULL, ..., na.rm = FALSE, convert = FALSE, factor_key = FALSE )
data |
a |
key |
unused; included for consistency with |
value |
unused; included for consistency with |
... |
unused; included for consistency with |
na.rm |
unused; included for consistency with |
convert |
unused; included for consistency with |
factor_key |
unused; included for consistency with |
A data frame containing the row names and column names of both the two-way table of data biclustered and the cell-average matrix.
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) gather(bc) # bicluster 6 is in the top right-hand corner here: autoplot(bc) # bicluster 3 is in the bottom right-hand corner here: autoplot(bc)
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) gather(bc) # bicluster 6 is in the top right-hand corner here: autoplot(bc) # bicluster 3 is in the bottom right-hand corner here: autoplot(bc)
Compute the Jaccard similarity coefficient for two clusterings
jaccard_similarity(clus1, clus2)
jaccard_similarity(clus1, clus2)
clus1 |
vector giving the first set of clusters |
clus2 |
vector giving the second set of clusters |
a numeric
Milligan, G.W. and Cooper, M. C. (1986) A study of the comparability of external criteria for hierarchical cluster analysis. Multivariate Behavioral Research, 21, 441-458.
Make a heatmap of cell MSEs
mse_heatmap(x, ...)
mse_heatmap(x, ...)
x |
An object of class |
... |
Arguments to pass to |
A ggplot object.
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) mse_heatmap(bc) mse_heatmap(bc) + ggplot2::scale_fill_viridis_c()
data("synthetic") bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) mse_heatmap(bc) mse_heatmap(bc) + ggplot2::scale_fill_viridis_c()
For each row in a partition matrix, this function gets the column index for which the row is equal to one. That is, for row i, this function returns the index of the row entry that is equal to one.
part_matrix_to_vector(P0)
part_matrix_to_vector(P0)
P0 |
A partition matrix |
An integer vector
This function is used to randomly generate a partition matrix and assign rows or columns to prototypes. It must be the case that N > K.
partition_gen(N, K)
partition_gen(N, K)
N |
Number of objects/rows in a partition matrix |
K |
Desired number of partitions |
A partition matrix.
Create a partition matrix with a partition vector p
partition_gen_by_p(N, K, p)
partition_gen_by_p(N, K, p)
N |
Rows in a partition matrix |
K |
Number of prototypes to create |
p |
Integer vector containing the cluster each row in a partition matrix is to be assigned to. |
A partition matrix.
Find the index of the first nonzero value in a vector
position_finder(vec)
position_finder(vec)
vec |
A binary vector |
Position of the first nonzero value in a vector.
Print an object of class biclustermd
## S3 method for class 'biclustermd' print(x, ...)
## S3 method for class 'biclustermd' print(x, ...)
x |
a |
... |
arguments passed to or from other methods |
Reorder a bicluster object for making a heat map
reorder_biclust(x)
reorder_biclust(x)
x |
A bicluster object. |
A list containing the two partition matrices used by gg_bicluster.
Repeat a biclustering to achieve a minimum SSE solution
rep_biclustermd( data, nrep = 10, parallel = FALSE, ncores = 2, col_clusters = floor(sqrt(ncol(data))), row_clusters = floor(sqrt(nrow(data))), miss_val = mean(data, na.rm = TRUE), miss_val_sd = 1, similarity = "Rand", row_min_num = 5, col_min_num = 5, row_num_to_move = 1, col_num_to_move = 1, row_shuffles = 1, col_shuffles = 1, max.iter = 100 )
rep_biclustermd( data, nrep = 10, parallel = FALSE, ncores = 2, col_clusters = floor(sqrt(ncol(data))), row_clusters = floor(sqrt(nrow(data))), miss_val = mean(data, na.rm = TRUE), miss_val_sd = 1, similarity = "Rand", row_min_num = 5, col_min_num = 5, row_num_to_move = 1, col_num_to_move = 1, row_shuffles = 1, col_shuffles = 1, max.iter = 100 )
data |
Dataset to bicluster. Must be a data matrix with only numbers and missing values in the data set. It should have row names and column names. |
nrep |
The number of times to repeat the biclustering. Default 10. |
parallel |
Logical indicating if the user would like to utilize the
|
ncores |
The number of cores to use if parallel computing. Default 2. |
col_clusters |
The number of clusters to partition the columns into. |
row_clusters |
The number of clusters to partition the rows into. |
miss_val |
Value or function to put in empty cells of the prototype matrix.
If a value, a random normal variable with sd = |
miss_val_sd |
Standard deviation of the normal distribution |
similarity |
The metric used to compare two successive clusterings. Can be "Rand" (default), "HA" for the Hubert and Arabie adjusted Rand index or "Jaccard". See RRand for details. |
row_min_num |
Minimum row prototype size in order to be eligible to be chosen when filling an empty row prototype. Default is 5. |
col_min_num |
Minimum column prototype size in order to be eligible to be chosen when filling an empty column prototype. Default is 5. |
row_num_to_move |
Number of rows to remove from the sampled prototype to put in the empty row prototype. Default is 1. |
col_num_to_move |
Number of columns to remove from the sampled prototype to put in the empty column prototype. Default is 1. |
row_shuffles |
Number of times to shuffle rows in each iteration. Default is 1. |
col_shuffles |
Number of times to shuffle columns in each iteration. Default is 1. |
max.iter |
Maximum number of iterations to let the algorithm run for. |
A list of the minimum SSE biclustering, a vector containing the final SSE of each repeat, and the time it took the function to run.
Li, J., Reisner, J., Pham, H., Olafsson, S., and Vardeman, S. (2020) Biclustering with Missing Data. Information Sciences, 510, 304–316.
data("synthetic") # 20 repeats without parallelization repeat_bc <- rep_biclustermd(synthetic, nrep = 20, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) repeat_bc autoplot(repeat_bc$best_bc) plot(repeat_bc$rep_sse, type = 'b', pch = 20) repeat_bc$runtime # 20 repeats with parallelization over 2 cores repeat_bc <- rep_biclustermd(synthetic, nrep = 20, parallel = TRUE, ncores = 2, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) repeat_bc$runtime
data("synthetic") # 20 repeats without parallelization repeat_bc <- rep_biclustermd(synthetic, nrep = 20, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) repeat_bc autoplot(repeat_bc$best_bc) plot(repeat_bc$rep_sse, type = 'b', pch = 20) repeat_bc$runtime # 20 repeats with parallelization over 2 cores repeat_bc <- rep_biclustermd(synthetic, nrep = 20, parallel = TRUE, ncores = 2, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) repeat_bc$runtime
Make a heatmap of sparse biclustering results
results_heatmap( x, reorder = FALSE, transform_colors = FALSE, c = 1/6, cell_alpha = 1/5, col_clusts = NULL, row_clusts = NULL, ... )
results_heatmap( x, reorder = FALSE, transform_colors = FALSE, c = 1/6, cell_alpha = 1/5, col_clusts = NULL, row_clusts = NULL, ... )
x |
A |
reorder |
A logical. If TRUE, heatmap will be sorted according to the cell-average matrix, |
transform_colors |
If equals |
c |
Value to scale the data by before running it through a standard normal CDF. Default is 1/6. |
cell_alpha |
A scalar defining the transparency of shading over a cell and by default this equals 1/5. The color corresponds to the cell mean. |
col_clusts |
A vector of column cluster indices to display. If NULL (default), all are displayed. |
row_clusts |
A vector of row cluster indices to display. If NULL (default), all are displayed. |
... |
Arguments to be passed to |
An object of class ggplot.
Get row names in each row cluster
row_cluster_names(x, data)
row_cluster_names(x, data)
x |
Biclustering object to extract row cluster designation from |
data |
Data that contains the row names |
A data frame with two columns: cluster
corresponds to the row
cluster and name
gives the row names in each cluster.
data("synthetic") rownames(synthetic) <- letters[1:nrow(synthetic)] colnames(synthetic) <- letters[1:ncol(synthetic)] bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) bc
data("synthetic") rownames(synthetic) <- letters[1:nrow(synthetic)] colnames(synthetic) <- letters[1:ncol(synthetic)] bc <- biclustermd(synthetic, col_clusters = 3, row_clusters = 2, miss_val = mean(synthetic, na.rm = TRUE), miss_val_sd = sd(synthetic, na.rm = TRUE), col_min_num = 2, row_min_num = 2, col_num_to_move = 1, row_num_to_move = 1, max.iter = 10) bc
Get data matrix row names and their corresponding row cluster membership
## S3 method for class 'biclustermd' row.names(x)
## S3 method for class 'biclustermd' row.names(x)
x |
an object of class |
a data frame with row names of the shuffled matrix and corresponding row cluster names.
data("synthetic") # default parameters bc <- biclustermd(synthetic) bc row.names(bc) # this is a simplified version of the output for gather(bc): library(dplyr) gather(bc) %>% distinct(row_cluster, row_name)
data("synthetic") # default parameters bc <- biclustermd(synthetic) bc row.names(bc) # this is a simplified version of the output for gather(bc): library(dplyr) gather(bc) %>% distinct(row_cluster, row_name)
This dataset stems from the R journal article introducing biclustermd
to R users. It describes the data attributes and run time for varying data
sizes and structures.
runtimes
runtimes
An object of class data.frame
with 2400 rows and 13 columns.
A data frame of 2400 rows and 13 variables (defined range, inclusive):
Unique identifier of a combination of parameters.
Number of rows in the data matrix. (50, 1500)
Number of columns in the data matrix. (50, 1500)
Product of the dimensions of the data. (2500, 2250000)
Number of clusters to partition the rows into. (4, 300)
Number of clusters to partition the columns into. (4, 300)
Average row cluster size. rows / row_clusts
Average column cluster size. cols / col_clusts
Percent of data values which are missing.
CPU time used executing instructions to calls (from ?proc.time
).
CPU time used executing calls (from ?proc.time
).
Amount of time in seconds it took the algorithm to converge.
Number of iterations to convergence.
This simple dataset allows users to use data that are easy to understand while learning
biclustermd
. This is a matrix with 6 rows and 12 columns. 50% of values are missing.
synthetic
synthetic
An object of class matrix
with 6 rows and 12 columns.
Bicluster data over a grid of tuning parameters
tune_biclustermd( data, nrep = 10, parallel = FALSE, ncores = 2, tune_grid = NULL )
tune_biclustermd( data, nrep = 10, parallel = FALSE, ncores = 2, tune_grid = NULL )
data |
Dataset to bicluster. Must be a data matrix with only numbers and missing values in the data set. It should have row names and column names. |
nrep |
The number of times to repeat the biclustering for each set of parameters. Default 10. |
parallel |
Logical indicating if the user would like to utilize the
|
ncores |
The number of cores to use if parallel computing. Default 2. |
tune_grid |
A data frame of parameters to tune over. The column names of
this must match the arguments passed to |
A list of:
best_combn |
The best combination of parameters, |
best_bc |
The minimum SSE biclustering using the parameters in
|
grid |
|
runtime |
CPU runtime & elapsed time. |
Li, J., Reisner, J., Pham, H., Olafsson, S., and Vardeman, S. (2020) Biclustering with Missing Data. Information Sciences, 510, 304–316.
library(dplyr) library(ggplot2) data("synthetic") tg <- expand.grid( miss_val = fivenum(synthetic), similarity = c("Rand", "HA", "Jaccard"), col_min_num = 2, row_min_num = 2, col_clusters = 3:5, row_clusters = 2 ) tg # in parallel: two cores: tbc <- tune_biclustermd(synthetic, nrep = 2, parallel = TRUE, ncores = 2, tune_grid = tg) tbc tbc$grid %>% group_by(miss_val, col_clusters) %>% summarise(avg_sd = mean(sd_sse)) %>% ggplot(aes(miss_val, avg_sd, color = col_clusters, group = col_clusters)) + geom_line() + geom_point() tbc <- tune_biclustermd(synthetic, nrep = 2, tune_grid = tg) tbc boxplot(tbc$grid$mean_sse ~ tbc$grid$similarity) boxplot(tbc$grid$sd_sse ~ tbc$grid$similarity) # nycflights13::flights dataset library(nycflights13) data("flights") library(dplyr) flights_bcd <- flights %>% select(month, dest, arr_delay) flights_bcd <- flights_bcd %>% group_by(month, dest) %>% summarise(mean_arr_delay = mean(arr_delay, na.rm = TRUE)) %>% spread(dest, mean_arr_delay) %>% as.data.frame() # months as rows rownames(flights_bcd) <- flights_bcd$month flights_bcd <- as.matrix(flights_bcd[, -1]) flights_grid <- expand.grid( row_clusters = 4, col_clusters = c(6, 9, 12), miss_val = fivenum(flights_bcd), similarity = c("Rand", "Jaccard") ) # RUN TIME: approximately 40 seconds across two cores. flights_tune <- tune_biclustermd( flights_bcd, nrep = 10, parallel = TRUE, ncores = 2, tune_grid = flights_grid ) flights_tune
library(dplyr) library(ggplot2) data("synthetic") tg <- expand.grid( miss_val = fivenum(synthetic), similarity = c("Rand", "HA", "Jaccard"), col_min_num = 2, row_min_num = 2, col_clusters = 3:5, row_clusters = 2 ) tg # in parallel: two cores: tbc <- tune_biclustermd(synthetic, nrep = 2, parallel = TRUE, ncores = 2, tune_grid = tg) tbc tbc$grid %>% group_by(miss_val, col_clusters) %>% summarise(avg_sd = mean(sd_sse)) %>% ggplot(aes(miss_val, avg_sd, color = col_clusters, group = col_clusters)) + geom_line() + geom_point() tbc <- tune_biclustermd(synthetic, nrep = 2, tune_grid = tg) tbc boxplot(tbc$grid$mean_sse ~ tbc$grid$similarity) boxplot(tbc$grid$sd_sse ~ tbc$grid$similarity) # nycflights13::flights dataset library(nycflights13) data("flights") library(dplyr) flights_bcd <- flights %>% select(month, dest, arr_delay) flights_bcd <- flights_bcd %>% group_by(month, dest) %>% summarise(mean_arr_delay = mean(arr_delay, na.rm = TRUE)) %>% spread(dest, mean_arr_delay) %>% as.data.frame() # months as rows rownames(flights_bcd) <- flights_bcd$month flights_bcd <- as.matrix(flights_bcd[, -1]) flights_grid <- expand.grid( row_clusters = 4, col_clusters = c(6, 9, 12), miss_val = fivenum(flights_bcd), similarity = c("Rand", "Jaccard") ) # RUN TIME: approximately 40 seconds across two cores. flights_tune <- tune_biclustermd( flights_bcd, nrep = 10, parallel = TRUE, ncores = 2, tune_grid = flights_grid ) flights_tune