Skip to content

Conversation

@ben-schwen
Copy link
Member

@ben-schwen ben-schwen commented Dec 16, 2025

Not directly demanded but improving benchmarks

image image
Details
library(atime)
library(data.table)

pkg.path = '.'
limit = 5

# taken from .ci/atime/tests.R
pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) {
  pkg_find_replace = function(glob, FIND, REPLACE) {
    atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
  }
  Package_regex = gsub(".", "_?", old.Package, fixed = TRUE)
  Package_ = gsub(".", "_", old.Package, fixed = TRUE)
  new.Package_ = paste0(Package_, "_", sha)
  pkg_find_replace(
    "DESCRIPTION",
    paste0("Package:\\s+", old.Package),
    paste("Package:", new.Package))
  pkg_find_replace(
    file.path("src", "Makevars.*in"),
    Package_regex,
    new.Package_)
  pkg_find_replace(
    file.path("R", "onLoad.R"),
    Package_regex,
    new.Package_)
  pkg_find_replace(
    file.path("R", "onLoad.R"),
    sprintf('packageVersion\\("%s"\\)', old.Package),
    sprintf('packageVersion\\("%s"\\)', new.Package))
  pkg_find_replace(
    file.path("src", "init.c"),
    paste0("R_init_", Package_regex),
    paste0("R_init_", gsub("[.]", "_", new.Package_)))
  pkg_find_replace(
    "NAMESPACE",
    sprintf('useDynLib\\("?%s"?', Package_regex),
    paste0('useDynLib(', new.Package_))
}

versions = c(
  'before' = 'b0c4ac3b',
  'after' = 'df05b9d0'
)

set.seed(123)

# ===== Benchmark 1: Small groups (10 elements each) =====
N = as.integer(10^seq(2, 6, 0.25))
grpsize_small = 10

gmedian_work1 = lapply(setNames(nm = N), function(n) {
  data.table(
    grp = rep(1:n, each = grpsize_small),
    x = rnorm(n * grpsize_small)
  )
})

gmedian_bench1 = atime_versions(
  pkg.path, N,
  setup = {
    dt = gmedian_work1[[as.character(N)]]
  },
  expr = data.table:::`[.data.table`(dt, , .(med = median(x)), by = grp),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench1)

# ===== Benchmark 2: Medium groups (100 elements each) =====
N = as.integer(10^seq(2, 5, 0.25))
grpsize_medium = 100

gmedian_work2 = lapply(setNames(nm = N), function(n) {
  data.table(
    grp = rep(1:n, each = grpsize_medium),
    x = rnorm(n * grpsize_medium)
  )
})

gmedian_bench2 = atime_versions(
  pkg.path, N,
  setup = {
    dt = gmedian_work2[[as.character(N)]]
  },
  expr = data.table:::`[.data.table`(dt, , .(med = median(x)), by = grp),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench2)

# ===== Benchmark 3: Large groups (1000 elements each) =====
N = as.integer(10^seq(2, 4.5, 0.25))
grpsize_large = 1000

gmedian_work3 = lapply(setNames(nm = N), function(n) {
  data.table(
    grp = rep(1:n, each = grpsize_large),
    x = rnorm(n * grpsize_large)
  )
})

gmedian_bench3 = atime_versions(
  pkg.path, N,
  setup = {
    dt = gmedian_work3[[as.character(N)]]
  },
  expr = data.table:::`[.data.table`(dt, , .(med = median(x)), by = grp),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench3)

# ===== Benchmark 4: Very large groups (10000 elements each) =====
N = as.integer(10^seq(1, 3.5, 0.25))
grpsize_vlarge = 10000

gmedian_work4 = lapply(setNames(nm = N), function(n) {
  data.table(
    grp = rep(1:n, each = grpsize_vlarge),
    x = rnorm(n * grpsize_vlarge)
  )
})

gmedian_bench4 = atime_versions(
  pkg.path, N,
  setup = {
    dt = gmedian_work4[[as.character(N)]]
  },
  expr = data.table:::`[.data.table`(dt, , .(med = median(x)), by = grp),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench4)

# ===== Benchmark 5: Integer data =====
N = as.integer(10^seq(2, 4.5, 0.25))
grpsize_int = 1000

gmedian_work5 = lapply(setNames(nm = N), function(n) {
  data.table(
    grp = rep(1:n, each = grpsize_int),
    x = sample(1L:1000L, n * grpsize_int, replace = TRUE)
  )
})

gmedian_bench5 = atime_versions(
  pkg.path, N,
  setup = {
    dt = gmedian_work5[[as.character(N)]]
  },
  expr = data.table:::`[.data.table`(dt, , .(med = median(x)), by = grp),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench5)

# ===== Benchmark 6: With NA values =====
N = as.integer(10^seq(2, 4.5, 0.25))
grpsize_na = 1000

gmedian_work6 = lapply(setNames(nm = N), function(n) {
  dt = data.table(
    grp = rep(1:n, each = grpsize_na),
    x = rnorm(n * grpsize_na)
  )
  # Set 10% of values to NA
  dt[sample(.N, as.integer(.N * 0.1)), x := NA_real_]
  dt
})

gmedian_bench6 = atime_versions(
  pkg.path, N,
  setup = {
    dt = gmedian_work6[[as.character(N)]]
  },
  expr = data.table:::`[.data.table`(dt, , .(med = median(x, na.rm = TRUE)), by = grp),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench6)

# ===== SAVE BENCHMARK RESULTS =====
results_dir = "benchmark_results"
dir.create(results_dir, showWarnings = FALSE)

save(
  gmedian_bench1, gmedian_bench2, gmedian_bench3,
  gmedian_bench4, gmedian_bench5, gmedian_bench6,
  file = file.path(results_dir, "benchmarks_gmedian.rda")
)

library(ggplot2)
library(patchwork)

load(file.path(results_dir, "benchmarks_gmedian.rda"))

g = function(p, title) p + ggtitle(title)

p = list(
  g(plot(gmedian_bench1), "Small groups (10 elements)"),
  g(plot(gmedian_bench2), "Medium groups (100 elements)"),
  g(plot(gmedian_bench3), "Large groups (1000 elements)"),
  g(plot(gmedian_bench4), "Very large groups (10000 elements)"),
  g(plot(gmedian_bench5), "Integer data (1000 elements)"),
  g(plot(gmedian_bench6), "With 10% NA values (1000 elements)")
)

pdf("benchmark_gmedian_results.pdf", width = 12, height = 9)
for (i in seq(1, length(p), by = 4)) {
  pg = wrap_plots(p[i:min(i+3, length(p))], ncol = 2, nrow = 2, guides = "collect")
  print(pg)
}
dev.off()

@ben-schwen ben-schwen changed the title add floyd-rivest and parallelization to gmedian add floyd-rivest selection and parallelization to gmedian Dec 16, 2025
@codecov
Copy link

codecov bot commented Dec 16, 2025

Codecov Report

❌ Patch coverage is 62.50000% with 3 lines in your changes missing coverage. Please review.
✅ Project coverage is 98.98%. Comparing base (0216983) to head (19f0b8d).
⚠️ Report is 1 commits behind head on master.

Files with missing lines Patch % Lines
src/quickselect.c 40.00% 3 Missing ⚠️
Additional details and impacted files
@@            Coverage Diff             @@
##           master    #7481      +/-   ##
==========================================
- Coverage   99.00%   98.98%   -0.02%     
==========================================
  Files          87       87              
  Lines       16893    16879      -14     
==========================================
- Hits        16725    16708      -17     
- Misses        168      171       +3     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@github-actions
Copy link

github-actions bot commented Dec 16, 2025

  • HEAD=gmedian stopped early for DT[by,verbose=TRUE] improved in #6296
    Comparison Plot

Generated via commit 19f0b8d

Download link for the artifact containing the test results: ↓ atime-results.zip

Task Duration
R setup and installing dependencies 5 minutes and 23 seconds
Installing different package versions 11 minutes and 13 seconds
Running and plotting the test cases 4 minutes and 4 seconds

else subd[j-nacount] = xd[k];
#pragma omp parallel num_threads(getDTthreads(ngrp, true))
{
double *thread_subd = malloc(maxgrpn * sizeof(double));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it make sense to allocate outside of parallel and use R alloc, so the memory usage can be tracked by apps that track memory usage using R API.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, will do.

if (nacount && !narm) {
ansd[i] = NA_REAL;
} else {
// Use Floyd-Rivest for groups larger than 100, quickselect for smaller
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it make sense to put 100 as a parameter, so we can explore impact of this code path branches by options

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could do. Like all of these parameters, it should be tuned.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants