Benchmarking of sci-rocket

Author

J. van Riet

Introduction

This workflow visualizes the benchmarking of two sci-RNA-seqv3 data sets: a large cohort of Four Core Genotypes (FCG) mice (11.3 billion mate-pairs) and a smaller *Danio rerio* (zebrafish) cohort (490 million mate-pairs) in which additional nuclear oligo hashing barcodes were added.

Show code

library(dplyr)
library(patchwork)
source('misc_functions.R')

# Parallel options: resolve futures on 10 background R sessions.
future::plan(strategy = future::multisession(workers = 10))

# Set seed.
base::set.seed(708813)

# Location of benchmarking logs.
# Full paths are requested so that each file can be read directly later on.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
files_benchmark <- list.files('~/DKFZ/odomLab/manuscript_scirocket/benchmarks/', full.names = TRUE)

Overview of cohorts

Show code

# Read the sample sheet that describes the benchmarked cohorts.
cohorts <- readr::read_tsv('../config/testing_samplesheet.tsv', show_col_types = FALSE)

# Count the distinct samples within each experiment.
samples_per_cohort <- cohorts %>%
    dplyr::group_by(experiment_name) %>%
    dplyr::summarise(n = dplyr::n_distinct(sample_name), .groups = 'drop') %>%
    dplyr::select(`Cohort` = experiment_name, `Total samples` = n)

# Render the overview as a striped table that does not span the full page width.
kableExtra::kable_styling(
    knitr::kable(samples_per_cohort),
    bootstrap_options = "striped",
    full_width = FALSE
)

Cohort	Total samples
sx42b	48
test_hashing	1

Overview of cohorts

Import of benchmarking logs

The runtime, IO and memory usage of experiments are logged using the Snakemake benchmarking suite. We now import the benchmarking logs of the two cohorts.

Show code

# Import every benchmarking log (in parallel) and annotate its rows with the
# workflow step and the cohort, both of which are derived from the file name.
data_benchmark <- dplyr::bind_rows(future.apply::future_lapply(files_benchmark, function(x){
    # Classify on the file name only, so that a parent directory containing
    # 'mouse' or 'sx42b' cannot mislabel every log as FCG.
    file_name <- basename(x)

    readr::read_tsv(x, show_col_types = FALSE) %>%
        dplyr::mutate(
            # Strip the experiment-specific suffixes to retain the step name.
            step = gsub('_test_.*', '', file_name),
            step = gsub('_zebra.*|_mouse.*', '', step),
            step = gsub('_sx42b.*', '', step),
            # Logs that are not tagged with 'sx42b' or 'mouse' are assigned to the zebrafish cohort.
            experiment = dplyr::if_else(grepl('sx42b|mouse', file_name), 'FCG', 'Zebrafish (Hashing)')
        )
}))

# Calc. mean + SD (standard deviation, not standard error) per step and experiment.
# NOTE(review): the unit conversions presumably follow Snakemake's benchmark format
# (s in seconds; io_in, io_out and max_rss in MB; mean_load in percent) -- confirm against the logs.
# Groups with a single measurement get NA for every sd_* column.
data_benchmark <- data_benchmark %>%
    dplyr::group_by(step, experiment) %>%
    dplyr::summarise(
        mean_m = mean(s / 60),
        sd_m = sd(s / 60),
        mean_io_in = mean(io_in / 1024),
        sd_io_in = sd(io_in / 1024),
        mean_io_out = mean(io_out / 1024),
        sd_io_out = sd(io_out / 1024),
        mean_max_rss = mean(max_rss / 1024),
        sd_max_rss = sd(max_rss / 1024),
        mean_mean_load = mean(mean_load / 100),
        sd_mean_load = sd(mean_load / 100),
        # Drop the grouping right away: the re-labelling below rewrites `step`,
        # which would otherwise still be a grouping column.
        .groups = 'drop'
    ) %>%
    dplyr::mutate(
        # Fix the order of the steps; any step that is not listed here becomes NA.
        step = factor(step, levels = c('bcl2fastq', 'split_R1', 'split_R2', 'demultiplex_fastq_split', 'gather_demultiplexed_sequencing', 'gather_demultiplexed_samples', 'trim_fastp', 'generate_index_STAR', 'starSolo_align', 'sambamba_index', 'sci_dash')),
        # Replace the internal step names with human-readable labels (markdown bold for tool names).
        step = dplyr::recode_factor(
            step,
            bcl2fastq = 'Converting BCL (**bcl2fastq**)',
            split_R1 = "Splitting R1 into chunks",
            split_R2 = "Splitting R2 into chunks",
            demultiplex_fastq_split = "Barcode demultiplexing (on chunks)",
            gather_demultiplexed_sequencing = "Merging experiment-based files",
            gather_demultiplexed_samples = "Merging sample-based files",
            trim_fastp = "Trimming (**fastp**)",
            generate_index_STAR = "Generating alignment index (**STAR**)",
            starSolo_align = "Alignment and UMI counting (**STARSolo**)",
            sambamba_index = "Generating BAM indexes (**sambamba**)",
            sci_dash = "Generating interactive dashboard"
        )
    )

Show code

# Plot the benchmarking summary of the FCG cohort only.
benchmark_fcg <- dplyr::filter(data_benchmark, experiment == 'FCG')
generate_benchmarking_plot(benchmark_fcg)

Figure 1: Benchmarking of the FCG cohort

Show code

# Plot the benchmarking summary of everything that is not FCG,
# using custom axis limits and label nudges.
benchmark_non_fcg <- dplyr::filter(data_benchmark, experiment != 'FCG')
generate_benchmarking_plot(
    benchmark_non_fcg,
    ylimits_runtime = c(0, 45),
    nudge_runtime = 2.5,
    nudge_io = 2.5,
    ylimits_maxio_read = c(0, 100),
    ylimits_maxio_write = c(0, 100)
)

Figure 2: Benchmarking of the Zebrafish cohort

Determine speed of demultiplexing

Using a single split chunk, we can determine the speed of de-multiplexing by checking the de-multiplexing time per 1M reads.

Show code

# Read the log of a single de-multiplexing chunk (one log line per row) and keep
# only the progress lines that contain "INFO: Done:".
x <- readr::read_tsv('~/DKFZ/odomLab/manuscript_scirocket/logs/step2_demultiplexing_reads/demultiplex_fastq_split_sx42b_1-of-25.log', col_names = 'line', show_col_types = FALSE) %>%
    dplyr::filter(grepl("INFO: Done:", line)) %>%
    dplyr::mutate(
        # Number of read-pairs reported on the line; stored as double so that
        # counts above the 32-bit integer limit do not turn into NA.
        n_reads = as.numeric(gsub(' read-pairs.*', '', gsub('.*INFO: Done: ', '', line))),
        # Timestamp at the start of the line (everything before the first ' -').
        time = lubridate::as_datetime(gsub(' -.*', '', line))
    )

# Elapsed time since the first progress line. The unit is pinned to seconds
# (instead of the automatically chosen difftime unit) to match the axis label.
x$time <- as.numeric(difftime(x$time, min(x$time), units = 'secs'))

# Processed read-pairs against elapsed time, annotated with the Pearson correlation.
ggplot2::ggplot(x, ggplot2::aes(x = n_reads, y = time)) +
    ggplot2::geom_point(size = 1, shape = 21) +
    ggplot2::scale_x_continuous(labels = scales::unit_format(suffix = ' million', scale = 0.000001), expand = c(0,0.01)) +
    ggplot2::scale_y_continuous(limits = c(0, 12000), expand = c(0,0.01)) +
    ggplot2::labs(x = 'No. read-pairs', y = 'Time (in seconds)') +
    ggpubr::stat_cor(label.y = 11000, p.digits = 0, method = 'pearson', alternative = 'two.sided') +
    theme_job

Figure 3: De-multiplexing speed per 1M reads.

Write suppl. table

Show code

# One worksheet per list element: the sample sheet and the benchmarking summary.
data_xl <- list(
    'Demultiplexing' = cohorts,
    'Benchmarking' = data_benchmark
)

# Export both sheets, each formatted as an Excel table.
openxlsx::write.xlsx(
    data_xl,
    file = '~/DKFZ/odomLab/manuscript_scirocket/tables/SupplTable1.xlsx',
    asTable = TRUE
)

Session information

Show code

# Report the R version, platform and package versions used to render this document.
sessioninfo::session_info()

─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.3.2 (2023-10-31)
 os       macOS Sonoma 14.3.1
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       Europe/Berlin
 date     2024-03-11
 pandoc   3.1.12.2 @ /opt/homebrew/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package      * version date (UTC) lib source
 abind          1.4-5   2016-07-21 [1] CRAN (R 4.3.0)
 backports      1.4.1   2021-12-13 [1] CRAN (R 4.3.0)
 bit            4.0.5   2022-11-15 [1] CRAN (R 4.3.0)
 bit64          4.0.5   2020-08-30 [1] CRAN (R 4.3.0)
 broom          1.0.5   2023-06-09 [1] CRAN (R 4.3.0)
 car            3.1-2   2023-03-30 [1] CRAN (R 4.3.0)
 carData        3.0-5   2022-01-06 [1] CRAN (R 4.3.0)
 cli            3.6.2   2023-12-11 [1] CRAN (R 4.3.1)
 codetools      0.2-19  2023-02-01 [1] CRAN (R 4.3.2)
 colorspace     2.1-0   2023-01-23 [1] CRAN (R 4.3.0)
 commonmark     1.9.1   2024-01-30 [1] CRAN (R 4.3.1)
 crayon         1.5.2   2022-09-29 [1] CRAN (R 4.3.0)
 digest         0.6.34  2024-01-11 [1] CRAN (R 4.3.1)
 dplyr        * 1.1.4   2023-11-17 [1] CRAN (R 4.3.1)
 evaluate       0.23    2023-11-01 [1] CRAN (R 4.3.1)
 fansi          1.0.6   2023-12-08 [1] CRAN (R 4.3.1)
 farver         2.1.1   2022-07-06 [1] CRAN (R 4.3.0)
 fastmap        1.1.1   2023-02-24 [1] CRAN (R 4.3.0)
 future         1.33.1  2023-12-22 [1] CRAN (R 4.3.1)
 future.apply   1.11.1  2023-12-21 [1] CRAN (R 4.3.1)
 generics       0.1.3   2022-07-05 [1] CRAN (R 4.3.0)
 ggplot2        3.5.0   2024-02-23 [1] CRAN (R 4.3.1)
 ggpubr         0.6.0   2023-02-10 [1] CRAN (R 4.3.0)
 ggsignif       0.6.4   2022-10-13 [1] CRAN (R 4.3.0)
 ggtext         0.1.2   2022-09-16 [1] CRAN (R 4.3.0)
 globals        0.16.2  2022-11-21 [1] CRAN (R 4.3.0)
 glue           1.7.0   2024-01-09 [1] CRAN (R 4.3.1)
 gridtext       0.1.5   2022-09-16 [1] CRAN (R 4.3.0)
 gtable         0.3.4   2023-08-21 [1] CRAN (R 4.3.0)
 highr          0.10    2022-12-22 [1] CRAN (R 4.3.0)
 hms            1.1.3   2023-03-21 [1] CRAN (R 4.3.0)
 htmltools      0.5.7   2023-11-03 [1] CRAN (R 4.3.1)
 htmlwidgets    1.6.4   2023-12-06 [1] CRAN (R 4.3.1)
 hues           0.2.0   2019-12-01 [1] CRAN (R 4.3.0)
 jsonlite       1.8.8   2023-12-04 [1] CRAN (R 4.3.1)
 kableExtra     1.4.0   2024-01-24 [1] CRAN (R 4.3.1)
 knitr          1.45    2023-10-30 [1] CRAN (R 4.3.1)
 labeling       0.4.3   2023-08-29 [1] CRAN (R 4.3.0)
 lifecycle      1.0.4   2023-11-07 [1] CRAN (R 4.3.1)
 listenv        0.9.1   2024-01-29 [1] CRAN (R 4.3.1)
 lubridate      1.9.3   2023-09-27 [1] CRAN (R 4.3.1)
 magrittr       2.0.3   2022-03-30 [1] CRAN (R 4.3.0)
 markdown       1.12    2023-12-06 [1] CRAN (R 4.3.1)
 munsell        0.5.0   2018-06-12 [1] CRAN (R 4.3.0)
 openxlsx       4.2.5.2 2023-02-06 [1] CRAN (R 4.3.0)
 parallelly     1.37.1  2024-02-29 [1] CRAN (R 4.3.1)
 patchwork    * 1.2.0   2024-01-08 [1] CRAN (R 4.3.1)
 pillar         1.9.0   2023-03-22 [1] CRAN (R 4.3.0)
 pkgconfig      2.0.3   2019-09-22 [1] CRAN (R 4.3.0)
 purrr          1.0.2   2023-08-10 [1] CRAN (R 4.3.0)
 R6             2.5.1   2021-08-19 [1] CRAN (R 4.3.0)
 Rcpp           1.0.12  2024-01-09 [1] CRAN (R 4.3.1)
 readr          2.1.5   2024-01-10 [1] CRAN (R 4.3.1)
 rlang          1.1.3   2024-01-10 [1] CRAN (R 4.3.1)
 rmarkdown      2.26    2024-03-05 [1] CRAN (R 4.3.1)
 rstatix        0.7.2   2023-02-01 [1] CRAN (R 4.3.0)
 rstudioapi     0.15.0  2023-07-07 [1] CRAN (R 4.3.0)
 scales         1.3.0   2023-11-28 [1] CRAN (R 4.3.1)
 sessioninfo    1.2.2   2021-12-06 [1] CRAN (R 4.3.0)
 stringi        1.8.3   2023-12-11 [1] CRAN (R 4.3.1)
 stringr        1.5.1   2023-11-14 [1] CRAN (R 4.3.1)
 svglite        2.1.3   2023-12-08 [1] CRAN (R 4.3.1)
 systemfonts    1.0.5   2023-10-09 [1] CRAN (R 4.3.1)
 tibble         3.2.1   2023-03-20 [1] CRAN (R 4.3.0)
 tidyr          1.3.1   2024-01-24 [1] CRAN (R 4.3.1)
 tidyselect     1.2.0   2022-10-10 [1] CRAN (R 4.3.0)
 timechange     0.3.0   2024-01-18 [1] CRAN (R 4.3.1)
 tzdb           0.4.0   2023-05-12 [1] CRAN (R 4.3.0)
 utf8           1.2.4   2023-10-22 [1] CRAN (R 4.3.1)
 vctrs          0.6.5   2023-12-01 [1] CRAN (R 4.3.1)
 viridisLite    0.4.2   2023-05-02 [1] CRAN (R 4.3.0)
 vroom          1.6.5   2023-12-05 [1] CRAN (R 4.3.1)
 withr          3.0.0   2024-01-16 [1] CRAN (R 4.3.1)
 xfun           0.42    2024-02-08 [1] CRAN (R 4.3.1)
 xml2           1.3.6   2023-12-04 [1] CRAN (R 4.3.1)
 yaml           2.3.8   2023-12-11 [1] CRAN (R 4.3.1)
 zip            2.3.1   2024-01-27 [1] CRAN (R 4.3.1)

 [1] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library

──────────────────────────────────────────────────────────────────────────────