Benchmarking of sci-rocket

Author

J. van Riet

Introduction

This workflow visualizes the benchmarking of two sci-RNA-seqv3 datasets: a large cohort of Four Core Genotypes (FCG) mice (11.3 billion mate-pairs) and a smaller Danio rerio cohort (490 million mate-pairs) to which additional nuclear oligo hashing barcodes were added.

Show code
library(dplyr)
library(patchwork)
source('misc_functions.R')

# Parallel options: resolve futures with the multisession strategy on 10 workers.
# The strategy function itself is passed to plan(); its arguments follow.
future::plan(future::multisession, workers = 10)

# Set seed.
base::set.seed(708813)

# Location of benchmarking logs; full paths are returned so that each file can
# be read directly later on.
files_benchmark <- list.files('~/DKFZ/odomLab/manuscript_scirocket/benchmarks/', full.names = TRUE)

Overview of cohorts

Show code
# Sample sheet describing the benchmarked cohorts.
cohorts <- readr::read_tsv(file = '../config/testing_samplesheet.tsv', show_col_types = FALSE)

# Number of distinct samples per experiment, rendered as a striped table.
cohorts %>%
    dplyr::group_by(experiment_name) %>%
    dplyr::summarise(`Total samples` = dplyr::n_distinct(sample_name), .groups = 'drop') %>%
    dplyr::rename(`Cohort` = experiment_name) %>%
    knitr::kable() %>%
    kableExtra::kable_styling(bootstrap_options = 'striped', full_width = FALSE)
Cohort Total samples
sx42b 48
test_hashing 1

Overview of cohorts

Import of benchmarking logs

The runtime, IO and memory usage of experiments are logged using the Snakemake benchmarking suite. We now import the benchmarking logs of the two cohorts.

Show code
# Import every benchmarking log (in parallel) and annotate each record with the
# workflow step and the cohort it belongs to, both derived from the file name.
data_benchmark <- dplyr::bind_rows(future.apply::future_lapply(files_benchmark, function(x){
    file_name <- basename(x)

    readr::read_tsv(x, show_col_types = FALSE) %>%
        dplyr::mutate(
            # Strip the experiment-specific suffix so that only the step name remains.
            step = gsub('_test_.*', '', file_name),
            step = gsub('_zebra.*|_mouse.*', '', step),
            step = gsub('_sx42b.*', '', step),
            # Match on the file name only (as done for `step`), so that a directory
            # name containing 'sx42b' or 'mouse' cannot label every file as FCG.
            experiment = dplyr::if_else(grepl('sx42b|mouse', file_name), 'FCG', 'Zebrafish (Hashing)')
        )
}))

# Calc. mean + SD per step and cohort.
# `s` is divided by 60 (presumably seconds to minutes), the I/O and memory
# columns by 1024 and the load by 100 - units as reported by the benchmark
# files; TODO confirm against the Snakemake benchmark documentation.
data_benchmark <- data_benchmark %>%
    dplyr::group_by(step, experiment) %>%
    dplyr::summarise(
        mean_m = mean(s / 60),
        sd_m = sd(s / 60),
        mean_io_in = mean(io_in / 1024),
        sd_io_in = sd(io_in / 1024),
        mean_io_out = mean(io_out / 1024),
        sd_io_out = sd(io_out / 1024),
        mean_max_rss = mean(max_rss / 1024),
        sd_max_rss = sd(max_rss / 1024),
        mean_mean_load = mean(mean_load / 100),
        sd_mean_load = sd(mean_load / 100),
        # Drop the grouping right away; the relabelling below is not group-wise.
        .groups = 'drop'
    ) %>%
    dplyr::mutate(
        # Fix the order of the steps; any step not listed in `levels` becomes NA.
        step = factor(step, levels = c('bcl2fastq', 'split_R1', 'split_R2', 'demultiplex_fastq_split', 'gather_demultiplexed_sequencing', 'gather_demultiplexed_samples', 'trim_fastp', 'generate_index_STAR', 'starSolo_align', 'sambamba_index', 'sci_dash')),
        # Replace the rule names by human-readable (markdown) labels.
        step = dplyr::recode_factor(
            step,
            bcl2fastq = 'Converting BCL (**bcl2fastq**)',
            split_R1 = "Splitting R1 into chunks",
            split_R2 = "Splitting R2 into chunks",
            demultiplex_fastq_split = "Barcode demultiplexing (on chunks)",
            gather_demultiplexed_sequencing = "Merging experiment-based files",
            gather_demultiplexed_samples = "Merging sample-based files",
            trim_fastp = "Trimming (**fastp**)",
            generate_index_STAR = "Generating alignment index (**STAR**)",
            starSolo_align = "Alignment and UMI counting (**STARSolo**)",
            sambamba_index = "Generating BAM indexes (**sambamba**)",
            sci_dash = "Generating interactive dashboard"
        )
    )
Show code
# Benchmarking figure restricted to the FCG cohort.
data_benchmark %>%
    dplyr::filter(experiment == 'FCG') %>%
    generate_benchmarking_plot()
Figure 1: Benchmarking of the FCG cohort
Show code
# Benchmarking figure for everything that is not FCG, with custom limits and
# nudges handed to the plotting helper.
data_benchmark %>%
    dplyr::filter(experiment != 'FCG') %>%
    generate_benchmarking_plot(
        ylimits_runtime = c(0, 45),
        nudge_runtime = 2.5,
        nudge_io = 2.5,
        ylimits_maxio_read = c(0,100),
        ylimits_maxio_write = c(0, 100)
    )
Figure 2: Benchmarking of the Zebrafish cohort

Determine speed of demultiplexing

Using a single split chunk, we can determine the speed of de-multiplexing by checking the de-multiplexing time per 1M reads.

Show code
# Parse the progress lines ('INFO: Done: ...') of a single de-multiplexing chunk
# log into the reported number of read-pairs and the elapsed time.
x <- readr::read_tsv('~/DKFZ/odomLab/manuscript_scirocket/logs/step2_demultiplexing_reads/demultiplex_fastq_split_sx42b_1-of-25.log', col_names = 'line', show_col_types = FALSE) %>%
    dplyr::filter(grepl("INFO: Done:", line)) %>%
    dplyr::mutate(
        # Number between 'INFO: Done: ' and ' read-pairs' in the progress message.
        n_reads = as.integer(gsub(' read-pairs.*', '', gsub('.*INFO: Done: ', '', line))),
        # The timestamp is everything before the first ' -'.
        time = lubridate::as_datetime(gsub(' -.*', '', line)),
        # Elapsed time since the first progress line, explicitly in seconds so the
        # unit always matches the axis label; plain subtraction of date-times
        # lets R choose the unit of the resulting difftime.
        time = as.numeric(difftime(time, min(time), units = 'secs'))
    )

# Elapsed time against the number of processed read-pairs, annotated with the
# Pearson correlation.
ggplot2::ggplot(x, ggplot2::aes(x = n_reads, y = time)) +
    ggplot2::geom_point(size = 1, shape = 21) +
    ggplot2::scale_x_continuous(labels = scales::unit_format(suffix = ' million', scale = 0.000001), expand = c(0,0.01)) +
    ggplot2::scale_y_continuous(limits = c(0, 12000), expand = c(0,0.01)) +
    ggplot2::labs(x = 'No. read-pairs', y = 'Time (in seconds)') +
    ggpubr::stat_cor(label.y = 11000, p.digits = 0, method = 'pearson', alternative = 'two.sided') +
    theme_job
Figure 3: De-multiplexing speed per 1M reads.

Write suppl. table

Show code
# Sheets of the supplementary workbook, keyed by sheet name.
data_xl <- list(Demultiplexing = cohorts, Benchmarking = data_benchmark)

# Write the named list to disk, formatting the data as Excel tables.
openxlsx::write.xlsx(
    x = data_xl,
    file = '~/DKFZ/odomLab/manuscript_scirocket/tables/SupplTable1.xlsx',
    asTable = TRUE
)

Session information

Show code
# Report the R session settings and package versions used for this analysis.
sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.3.2 (2023-10-31)
 os       macOS Sonoma 14.3.1
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       Europe/Berlin
 date     2024-03-11
 pandoc   3.1.12.2 @ /opt/homebrew/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package      * version date (UTC) lib source
 abind          1.4-5   2016-07-21 [1] CRAN (R 4.3.0)
 backports      1.4.1   2021-12-13 [1] CRAN (R 4.3.0)
 bit            4.0.5   2022-11-15 [1] CRAN (R 4.3.0)
 bit64          4.0.5   2020-08-30 [1] CRAN (R 4.3.0)
 broom          1.0.5   2023-06-09 [1] CRAN (R 4.3.0)
 car            3.1-2   2023-03-30 [1] CRAN (R 4.3.0)
 carData        3.0-5   2022-01-06 [1] CRAN (R 4.3.0)
 cli            3.6.2   2023-12-11 [1] CRAN (R 4.3.1)
 codetools      0.2-19  2023-02-01 [1] CRAN (R 4.3.2)
 colorspace     2.1-0   2023-01-23 [1] CRAN (R 4.3.0)
 commonmark     1.9.1   2024-01-30 [1] CRAN (R 4.3.1)
 crayon         1.5.2   2022-09-29 [1] CRAN (R 4.3.0)
 digest         0.6.34  2024-01-11 [1] CRAN (R 4.3.1)
 dplyr        * 1.1.4   2023-11-17 [1] CRAN (R 4.3.1)
 evaluate       0.23    2023-11-01 [1] CRAN (R 4.3.1)
 fansi          1.0.6   2023-12-08 [1] CRAN (R 4.3.1)
 farver         2.1.1   2022-07-06 [1] CRAN (R 4.3.0)
 fastmap        1.1.1   2023-02-24 [1] CRAN (R 4.3.0)
 future         1.33.1  2023-12-22 [1] CRAN (R 4.3.1)
 future.apply   1.11.1  2023-12-21 [1] CRAN (R 4.3.1)
 generics       0.1.3   2022-07-05 [1] CRAN (R 4.3.0)
 ggplot2        3.5.0   2024-02-23 [1] CRAN (R 4.3.1)
 ggpubr         0.6.0   2023-02-10 [1] CRAN (R 4.3.0)
 ggsignif       0.6.4   2022-10-13 [1] CRAN (R 4.3.0)
 ggtext         0.1.2   2022-09-16 [1] CRAN (R 4.3.0)
 globals        0.16.2  2022-11-21 [1] CRAN (R 4.3.0)
 glue           1.7.0   2024-01-09 [1] CRAN (R 4.3.1)
 gridtext       0.1.5   2022-09-16 [1] CRAN (R 4.3.0)
 gtable         0.3.4   2023-08-21 [1] CRAN (R 4.3.0)
 highr          0.10    2022-12-22 [1] CRAN (R 4.3.0)
 hms            1.1.3   2023-03-21 [1] CRAN (R 4.3.0)
 htmltools      0.5.7   2023-11-03 [1] CRAN (R 4.3.1)
 htmlwidgets    1.6.4   2023-12-06 [1] CRAN (R 4.3.1)
 hues           0.2.0   2019-12-01 [1] CRAN (R 4.3.0)
 jsonlite       1.8.8   2023-12-04 [1] CRAN (R 4.3.1)
 kableExtra     1.4.0   2024-01-24 [1] CRAN (R 4.3.1)
 knitr          1.45    2023-10-30 [1] CRAN (R 4.3.1)
 labeling       0.4.3   2023-08-29 [1] CRAN (R 4.3.0)
 lifecycle      1.0.4   2023-11-07 [1] CRAN (R 4.3.1)
 listenv        0.9.1   2024-01-29 [1] CRAN (R 4.3.1)
 lubridate      1.9.3   2023-09-27 [1] CRAN (R 4.3.1)
 magrittr       2.0.3   2022-03-30 [1] CRAN (R 4.3.0)
 markdown       1.12    2023-12-06 [1] CRAN (R 4.3.1)
 munsell        0.5.0   2018-06-12 [1] CRAN (R 4.3.0)
 openxlsx       4.2.5.2 2023-02-06 [1] CRAN (R 4.3.0)
 parallelly     1.37.1  2024-02-29 [1] CRAN (R 4.3.1)
 patchwork    * 1.2.0   2024-01-08 [1] CRAN (R 4.3.1)
 pillar         1.9.0   2023-03-22 [1] CRAN (R 4.3.0)
 pkgconfig      2.0.3   2019-09-22 [1] CRAN (R 4.3.0)
 purrr          1.0.2   2023-08-10 [1] CRAN (R 4.3.0)
 R6             2.5.1   2021-08-19 [1] CRAN (R 4.3.0)
 Rcpp           1.0.12  2024-01-09 [1] CRAN (R 4.3.1)
 readr          2.1.5   2024-01-10 [1] CRAN (R 4.3.1)
 rlang          1.1.3   2024-01-10 [1] CRAN (R 4.3.1)
 rmarkdown      2.26    2024-03-05 [1] CRAN (R 4.3.1)
 rstatix        0.7.2   2023-02-01 [1] CRAN (R 4.3.0)
 rstudioapi     0.15.0  2023-07-07 [1] CRAN (R 4.3.0)
 scales         1.3.0   2023-11-28 [1] CRAN (R 4.3.1)
 sessioninfo    1.2.2   2021-12-06 [1] CRAN (R 4.3.0)
 stringi        1.8.3   2023-12-11 [1] CRAN (R 4.3.1)
 stringr        1.5.1   2023-11-14 [1] CRAN (R 4.3.1)
 svglite        2.1.3   2023-12-08 [1] CRAN (R 4.3.1)
 systemfonts    1.0.5   2023-10-09 [1] CRAN (R 4.3.1)
 tibble         3.2.1   2023-03-20 [1] CRAN (R 4.3.0)
 tidyr          1.3.1   2024-01-24 [1] CRAN (R 4.3.1)
 tidyselect     1.2.0   2022-10-10 [1] CRAN (R 4.3.0)
 timechange     0.3.0   2024-01-18 [1] CRAN (R 4.3.1)
 tzdb           0.4.0   2023-05-12 [1] CRAN (R 4.3.0)
 utf8           1.2.4   2023-10-22 [1] CRAN (R 4.3.1)
 vctrs          0.6.5   2023-12-01 [1] CRAN (R 4.3.1)
 viridisLite    0.4.2   2023-05-02 [1] CRAN (R 4.3.0)
 vroom          1.6.5   2023-12-05 [1] CRAN (R 4.3.1)
 withr          3.0.0   2024-01-16 [1] CRAN (R 4.3.1)
 xfun           0.42    2024-02-08 [1] CRAN (R 4.3.1)
 xml2           1.3.6   2023-12-04 [1] CRAN (R 4.3.1)
 yaml           2.3.8   2023-12-11 [1] CRAN (R 4.3.1)
 zip            2.3.1   2024-01-27 [1] CRAN (R 4.3.1)

 [1] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library

──────────────────────────────────────────────────────────────────────────────