From 59fb014173b9234efcf729835d14d20b0babcdef Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Mon, 19 Jan 2026 07:16:35 -0800 Subject: [PATCH 1/3] Migrate third party tests from unittest to pytest --- .../cupy/binary_tests/test_elementwise.py | 4 +- .../cupy/binary_tests/test_packing.py | 4 +- .../cupy/core_tests/test_array_function.py | 4 +- .../third_party/cupy/core_tests/test_core.py | 9 +- .../cupy/core_tests/test_internal.py | 179 +++++++----------- 5 files changed, 83 insertions(+), 117 deletions(-) diff --git a/dpnp/tests/third_party/cupy/binary_tests/test_elementwise.py b/dpnp/tests/third_party/cupy/binary_tests/test_elementwise.py index 431f1f27d8c7..95ec1d4374c8 100644 --- a/dpnp/tests/third_party/cupy/binary_tests/test_elementwise.py +++ b/dpnp/tests/third_party/cupy/binary_tests/test_elementwise.py @@ -1,11 +1,9 @@ from __future__ import annotations -import unittest - from dpnp.tests.third_party.cupy import testing -class TestElementwise(unittest.TestCase): +class TestElementwise: @testing.for_int_dtypes() @testing.numpy_cupy_array_equal() diff --git a/dpnp/tests/third_party/cupy/binary_tests/test_packing.py b/dpnp/tests/third_party/cupy/binary_tests/test_packing.py index a72a8a558b08..518e74d98868 100644 --- a/dpnp/tests/third_party/cupy/binary_tests/test_packing.py +++ b/dpnp/tests/third_party/cupy/binary_tests/test_packing.py @@ -1,4 +1,4 @@ -import unittest +from __future__ import annotations import numpy import pytest @@ -11,7 +11,7 @@ ) -class TestPacking(unittest.TestCase): +class TestPacking: @testing.for_int_dtypes() @testing.numpy_cupy_array_equal() diff --git a/dpnp/tests/third_party/cupy/core_tests/test_array_function.py b/dpnp/tests/third_party/cupy/core_tests/test_array_function.py index 7878a5a0aaff..f0c45900bcda 100644 --- a/dpnp/tests/third_party/cupy/core_tests/test_array_function.py +++ b/dpnp/tests/third_party/cupy/core_tests/test_array_function.py @@ -1,4 +1,4 @@ -import unittest +from __future__ import annotations import numpy import pytest @@ -11,7 +11,7 @@ ) -class TestArrayFunction(unittest.TestCase): +class TestArrayFunction: @testing.with_requires("numpy>=1.17.0") def test_array_function(self): diff --git a/dpnp/tests/third_party/cupy/core_tests/test_core.py b/dpnp/tests/third_party/cupy/core_tests/test_core.py index c959b4f24954..d9f8196db562 100644 --- a/dpnp/tests/third_party/cupy/core_tests/test_core.py +++ b/dpnp/tests/third_party/cupy/core_tests/test_core.py @@ -1,20 +1,23 @@ from __future__ import annotations import sys -import unittest import numpy import pytest import dpnp as cupy + +# from cupy._core import core from dpnp.tests.third_party.cupy import testing from dpnp.tests.third_party.cupy.testing._protocol_helpers import ( DummyObjectWithCudaArrayInterface, DummyObjectWithCuPyGetNDArray, ) +# from cupy_tests.core_tests import test_raw + -class TestSize(unittest.TestCase): +class TestSize: # def tearDown(self): # # Free huge memory for slow test @@ -58,7 +61,7 @@ def test_size_huge(self, xp): @pytest.mark.skip("no cupy._core submodule") -class TestOrder(unittest.TestCase): +class TestOrder: @testing.for_orders(_orders.keys()) def test_ndarray(self, order): diff --git a/dpnp/tests/third_party/cupy/core_tests/test_internal.py b/dpnp/tests/third_party/cupy/core_tests/test_internal.py index 205661e80d75..2f4239a2e220 100644 --- a/dpnp/tests/third_party/cupy/core_tests/test_internal.py +++ b/dpnp/tests/third_party/cupy/core_tests/test_internal.py @@ -1,18 +1,16 @@ -import math -import unittest +from __future__ import annotations import numpy 
import pytest # from cupy._core import internal -from dpnp.tests.third_party.cupy import testing pytest.skip( "CuPy internal functions are not supported", allow_module_level=True ) -class TestProd(unittest.TestCase): +class TestProd: def test_empty(self): assert internal.prod([]) == 1 @@ -24,7 +22,7 @@ def test_two(self): assert internal.prod([2, 3]) == 6 -class TestProdSequence(unittest.TestCase): +class TestProdSequence: def test_empty(self): assert internal.prod_sequence(()) == 1 @@ -74,7 +72,7 @@ def test_float(self): assert internal.get_size(1.0) == (1.0,) -class TestVectorEqual(unittest.TestCase): +class TestVectorEqual: def test_empty(self): assert internal.vector_equal([], []) is True @@ -89,7 +87,7 @@ def test_different_size(self): assert internal.vector_equal([1, 2, 3], [1, 2]) is False -class TestGetCContiguity(unittest.TestCase): +class TestGetCContiguity: def test_zero_in_shape(self): assert internal.get_c_contiguity((1, 0, 1), (1, 1, 1), 3) @@ -122,134 +120,101 @@ def test_no_contiguous3(self): assert not internal.get_c_contiguity((3, 1, 3), (6, 6, 4), 2) -class TestInferUnknownDimension(unittest.TestCase): +class TestInferUnknownDimension: def test_known_all(self): assert internal.infer_unknown_dimension((1, 2, 3), 6) == [1, 2, 3] def test_multiple_unknown(self): - with self.assertRaises(ValueError): + with pytest.raises(ValueError): internal.infer_unknown_dimension((-1, 1, -1), 10) def test_infer(self): assert internal.infer_unknown_dimension((-1, 2, 3), 12) == [2, 2, 3] -@testing.parameterize( - {"slice": (2, 8, 1), "expect": (2, 8, 1)}, - {"slice": (2, None, 1), "expect": (2, 10, 1)}, - {"slice": (2, 1, 1), "expect": (2, 2, 1)}, - {"slice": (2, -1, 1), "expect": (2, 9, 1)}, - {"slice": (None, 8, 1), "expect": (0, 8, 1)}, - {"slice": (-3, 8, 1), "expect": (7, 8, 1)}, - {"slice": (11, 8, 1), "expect": (10, 10, 1)}, - {"slice": (11, 11, 1), "expect": (10, 10, 1)}, - {"slice": (-11, 8, 1), "expect": (0, 8, 1)}, - {"slice": (-11, -11, 1), "expect": (0, 0, 1)}, - {"slice": (8, 2, -1), "expect": (8, 2, -1)}, - {"slice": (8, None, -1), "expect": (8, -1, -1)}, - {"slice": (8, 9, -1), "expect": (8, 8, -1)}, - {"slice": (8, -3, -1), "expect": (8, 7, -1)}, - {"slice": (None, 8, -1), "expect": (9, 8, -1)}, - {"slice": (-3, 6, -1), "expect": (7, 6, -1)}, - {"slice": (10, 10, -1), "expect": (9, 9, -1)}, - {"slice": (10, 8, -1), "expect": (9, 8, -1)}, - {"slice": (9, 10, -1), "expect": (9, 9, -1)}, - {"slice": (9, 9, -1), "expect": (9, 9, -1)}, - {"slice": (9, 8, -1), "expect": (9, 8, -1)}, - {"slice": (8, 8, -1), "expect": (8, 8, -1)}, - {"slice": (-9, -8, -1), "expect": (1, 1, -1)}, - {"slice": (-9, -9, -1), "expect": (1, 1, -1)}, - {"slice": (-9, -10, -1), "expect": (1, 0, -1)}, - {"slice": (-9, -11, -1), "expect": (1, -1, -1)}, - {"slice": (-9, -12, -1), "expect": (1, -1, -1)}, - {"slice": (-10, -9, -1), "expect": (0, 0, -1)}, - {"slice": (-10, -10, -1), "expect": (0, 0, -1)}, - {"slice": (-10, -11, -1), "expect": (0, -1, -1)}, - {"slice": (-10, -12, -1), "expect": (0, -1, -1)}, - {"slice": (-11, 8, -1), "expect": (-1, -1, -1)}, - {"slice": (-11, -9, -1), "expect": (-1, -1, -1)}, - {"slice": (-11, -10, -1), "expect": (-1, -1, -1)}, - {"slice": (-11, -11, -1), "expect": (-1, -1, -1)}, - {"slice": (-11, -12, -1), "expect": (-1, -1, -1)}, +@pytest.mark.parametrize( + ("slice_", "expect"), + [ + ((2, 8, 1), (2, 8, 1)), + ((2, None, 1), (2, 10, 1)), + ((2, 1, 1), (2, 2, 1)), + ((2, -1, 1), (2, 9, 1)), + ((None, 8, 1), (0, 8, 1)), + ((-3, 8, 1), (7, 8, 1)), + ((11, 8, 1), (10, 10, 1)), + 
((11, 11, 1), (10, 10, 1)), + ((-11, 8, 1), (0, 8, 1)), + ((-11, -11, 1), (0, 0, 1)), + ((8, 2, -1), (8, 2, -1)), + ((8, None, -1), (8, -1, -1)), + ((8, 9, -1), (8, 8, -1)), + ((8, -3, -1), (8, 7, -1)), + ((None, 8, -1), (9, 8, -1)), + ((-3, 6, -1), (7, 6, -1)), + ((10, 10, -1), (9, 9, -1)), + ((10, 8, -1), (9, 8, -1)), + ((9, 10, -1), (9, 9, -1)), + ((9, 9, -1), (9, 9, -1)), + ((9, 8, -1), (9, 8, -1)), + ((8, 8, -1), (8, 8, -1)), + ((-9, -8, -1), (1, 1, -1)), + ((-9, -9, -1), (1, 1, -1)), + ((-9, -10, -1), (1, 0, -1)), + ((-9, -11, -1), (1, -1, -1)), + ((-9, -12, -1), (1, -1, -1)), + ((-10, -9, -1), (0, 0, -1)), + ((-10, -10, -1), (0, 0, -1)), + ((-10, -11, -1), (0, -1, -1)), + ((-10, -12, -1), (0, -1, -1)), + ((-11, 8, -1), (-1, -1, -1)), + ((-11, -9, -1), (-1, -1, -1)), + ((-11, -10, -1), (-1, -1, -1)), + ((-11, -11, -1), (-1, -1, -1)), + ((-11, -12, -1), (-1, -1, -1)), + ], ) -class TestCompleteSlice(unittest.TestCase): +def test_complete_slice(slice_, expect): + assert internal.complete_slice(slice(*slice_), 10) == slice(*expect) - def test_complete_slice(self): - assert internal.complete_slice(slice(*self.slice), 10) == slice( - *self.expect - ) - -class TestCompleteSliceError(unittest.TestCase): +class TestCompleteSliceError: def test_invalid_step_value(self): - with self.assertRaises(ValueError): + with pytest.raises(ValueError): internal.complete_slice(slice(1, 1, 0), 1) def test_invalid_step_type(self): - with self.assertRaises(TypeError): + with pytest.raises(TypeError): internal.complete_slice(slice(1, 1, (1, 2)), 1) def test_invalid_start_type(self): - with self.assertRaises(TypeError): + with pytest.raises(TypeError): internal.complete_slice(slice((1, 2), 1, 1), 1) - with self.assertRaises(TypeError): + with pytest.raises(TypeError): internal.complete_slice(slice((1, 2), 1, -1), 1) def test_invalid_stop_type(self): - with self.assertRaises(TypeError): + with pytest.raises(TypeError): internal.complete_slice(slice((1, 2), 1, 1), 1) - with self.assertRaises(TypeError): + with pytest.raises(TypeError): internal.complete_slice(slice((1, 2), 1, -1), 1) -@testing.parameterize( - {"x": 0, "expect": 0}, - {"x": 1, "expect": 1}, - {"x": 2, "expect": 2}, - {"x": 3, "expect": 4}, - {"x": 2**10, "expect": 2**10}, - {"x": 2**10 - 1, "expect": 2**10}, - {"x": 2**10 + 1, "expect": 2**11}, - {"x": 2**40, "expect": 2**40}, - {"x": 2**40 - 1, "expect": 2**40}, - {"x": 2**40 + 1, "expect": 2**41}, +@pytest.mark.parametrize( + ("x", "expect"), + [ + (0, 0), + (1, 1), + (2, 2), + (3, 4), + (2**10, 2**10), + (2**10 - 1, 2**10), + (2**10 + 1, 2**11), + (2**40, 2**40), + (2**40 - 1, 2**40), + (2**40 + 1, 2**41), + ], ) -class TestClp2(unittest.TestCase): - - def test_clp2(self): - assert internal.clp2(self.x) == self.expect - - -@testing.parameterize( - *testing.product( - { - "value": [ - 0.0, - 1.0, - -1.0, - 0.25, - -0.25, - 11.0, - -11.0, - 2**-15, - -(2**-15), # Denormalized Number - float("inf"), - float("-inf"), - ], - } - ) -) -class TestConvertFloat16(unittest.TestCase): - - def test_conversion(self): - half = internal.to_float16(self.value) - assert internal.from_float16(half) == self.value - - -class TestConvertFloat16Nan(unittest.TestCase): - - def test_conversion(self): - half = internal.to_float16(float("nan")) - assert math.isnan(internal.from_float16(half)) +def test_clp2(x, expect): + assert internal.clp2(x) == expect From 7ee68fda2e1ccd2063d53fec62aeafab8ac2df8a Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Mon, 19 Jan 2026 07:27:59 -0800 Subject: [PATCH 2/3] Update FFT tests 
due to changes in FFT config to make it more thread-safe --- .../third_party/cupy/fft_tests/test_cache.py | 74 +- .../cupy/fft_tests/test_callback.py | 965 +++++++++++++++--- .../third_party/cupy/fft_tests/test_fft.py | 143 ++- 3 files changed, 940 insertions(+), 242 deletions(-) diff --git a/dpnp/tests/third_party/cupy/fft_tests/test_cache.py b/dpnp/tests/third_party/cupy/fft_tests/test_cache.py index e32f8e8305a0..29e0a7724edf 100644 --- a/dpnp/tests/third_party/cupy/fft_tests/test_cache.py +++ b/dpnp/tests/third_party/cupy/fft_tests/test_cache.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import contextlib import io import queue @@ -14,7 +16,7 @@ # from cupy.cuda import runtime # from cupy.fft import config -# from .test_fft import (multi_gpu_config, _skip_multi_gpu_bug) +# from .test_fft import multi_gpu_config pytest.skip("FFT cache functions are not supported", allow_module_level=True) @@ -30,26 +32,29 @@ def intercept_stdout(func): class TestPlanCache(unittest.TestCase): - def setUp(self): - self.caches = [] - self.old_sizes = [] + @contextlib.contextmanager + @staticmethod + def prepare_and_restore_caches(): + old_sizes = [] for i in range(n_devices): with device.Device(i): cache = config.get_plan_cache() - self.old_sizes.append(cache.get_size()) + old_sizes.append(cache.get_size()) cache.clear() cache.set_memsize(-1) cache.set_size(2) - self.caches.append(cache) - def tearDown(self): - for i in range(n_devices): - with device.Device(i): - cache = config.get_plan_cache() - cache.clear() - cache.set_size(self.old_sizes[i]) - cache.set_memsize(-1) + try: + yield + finally: + for i in range(n_devices): + with device.Device(i): + cache = config.get_plan_cache() + cache.clear() + cache.set_size(old_sizes[i]) + cache.set_memsize(-1) + @prepare_and_restore_caches() def test_LRU_cache1(self): # test if insertion and clean-up works cache = config.get_plan_cache() @@ -62,6 +67,7 @@ def test_LRU_cache1(self): cache.clear() assert cache.get_curr_size() == 0 <= cache.get_size() + @prepare_and_restore_caches() def test_LRU_cache2(self): # test if plan is reused cache = config.get_plan_cache() @@ -83,6 +89,7 @@ def test_LRU_cache2(self): # we should get the same plan assert plan0 is plan1 + @prepare_and_restore_caches() def test_LRU_cache3(self): # test if cache size is limited cache = config.get_plan_cache() @@ -108,6 +115,7 @@ def test_LRU_cache3(self): for _, node in cache: assert plan is not node.plan + @prepare_and_restore_caches() def test_LRU_cache4(self): # test if fetching the plan will reorder it to the top cache = config.get_plan_cache() @@ -149,6 +157,8 @@ def test_LRU_cache4(self): cache[next(iterator)[0]] @testing.multi_gpu(2) + @prepare_and_restore_caches() + @pytest.mark.thread_unsafe(reason="intercepts stdout") def test_LRU_cache5(self): # test if the LRU cache is thread-local @@ -210,10 +220,13 @@ def thread_init_caches(gpus, queue): assert stdout.count("uninitialized") == n_devices - 2 @testing.multi_gpu(2) - def test_LRU_cache6(self): + @prepare_and_restore_caches() + def test_LRU_cache6(self, gpus=None): # test if each device has a separate cache - cache0 = self.caches[0] - cache1 = self.caches[1] + with device.Device(0): + cache0 = config.get_plan_cache() + with device.Device(1): + cache1 = config.get_plan_cache() # ensure a fresh state assert cache0.get_curr_size() == 0 <= cache0.get_size() @@ -247,10 +260,13 @@ def test_LRU_cache6(self): @pytest.mark.skipif( runtime.is_hip, reason="hipFFT doesn't support multi-GPU" ) - def test_LRU_cache7(self): + 
@prepare_and_restore_caches() + def test_LRU_cache7(self, gpus=None): # test accessing a multi-GPU plan - cache0 = self.caches[0] - cache1 = self.caches[1] + with device.Device(0): + cache0 = config.get_plan_cache() + with device.Device(1): + cache1 = config.get_plan_cache() # ensure a fresh state assert cache0.get_curr_size() == 0 <= cache0.get_size() @@ -319,6 +335,7 @@ def test_LRU_cache7(self): assert cache0.get_curr_size() == 1 <= cache0.get_size() assert cache1.get_curr_size() == 2 <= cache1.get_size() + @prepare_and_restore_caches() def test_LRU_cache8(self): # test if Plan1d and PlanNd can coexist in the same cache cache = config.get_plan_cache() @@ -340,6 +357,7 @@ def test_LRU_cache8(self): assert isinstance(next(iterator)[1].plan, cufft.PlanNd) assert isinstance(next(iterator)[1].plan, cufft.Plan1d) + @prepare_and_restore_caches() def test_LRU_cache9(self): # test if memsizes in the cache adds up cache = config.get_plan_cache() @@ -358,6 +376,8 @@ def test_LRU_cache9(self): assert memsize == cache.get_curr_memsize() + @prepare_and_restore_caches() + @pytest.mark.thread_unsafe(reason="intercepts stdout") def test_LRU_cache10(self): # test if deletion works and if show_info() is consistent with data cache = config.get_plan_cache() @@ -406,11 +426,13 @@ def test_LRU_cache10(self): @pytest.mark.skipif( runtime.is_hip, reason="hipFFT doesn't support multi-GPU" ) + @prepare_and_restore_caches() def test_LRU_cache11(self): # test if collectively deleting a multi-GPU plan works - _skip_multi_gpu_bug((128,), self.gpus) - cache0 = self.caches[0] - cache1 = self.caches[1] + with device.Device(0): + cache0 = config.get_plan_cache() + with device.Device(1): + cache1 = config.get_plan_cache() # ensure a fresh state assert cache0.get_curr_size() == 0 <= cache0.get_size() @@ -441,11 +463,14 @@ def test_LRU_cache11(self): @pytest.mark.skipif( runtime.is_hip, reason="hipFFT doesn't support multi-GPU" ) + @prepare_and_restore_caches() def test_LRU_cache12(self): # test if an error is raise when one of the caches is unable # to fit it a multi-GPU plan - cache0 = self.caches[0] - cache1 = self.caches[1] + with device.Device(0): + cache0 = config.get_plan_cache() + with device.Device(1): + cache1 = config.get_plan_cache() # ensure a fresh state assert cache0.get_curr_size() == 0 <= cache0.get_size() @@ -467,6 +492,7 @@ def test_LRU_cache12(self): runtime.runtimeGetVersion() >= 11080, "CUDA 11.8 has different plan size", ) + @prepare_and_restore_caches() def test_LRU_cache13(self): # test if plan insertion respect the memory size limit cache = config.get_plan_cache() diff --git a/dpnp/tests/third_party/cupy/fft_tests/test_callback.py b/dpnp/tests/third_party/cupy/fft_tests/test_callback.py index 12bfd4aa73df..e2682d0aaa6e 100644 --- a/dpnp/tests/third_party/cupy/fft_tests/test_callback.py +++ b/dpnp/tests/third_party/cupy/fft_tests/test_callback.py @@ -1,18 +1,62 @@ +from __future__ import annotations + import contextlib +import os import string import sys import tempfile from unittest import mock +try: + import Cython +except ImportError: + Cython = None +else: + if Cython.__version__ < "0.29.0": + Cython = None import numpy as np import pytest import dpnp as cupy from dpnp.tests.third_party.cupy import testing +# from cupy.cuda import cufft +# from cupy.cuda.device import get_compute_capability + pytest.skip("FFT callbacks are not supported", allow_module_level=True) +def cuda_version(): + return cupy.cuda.runtime.runtimeGetVersion() + + +cb_ver_for_test = ("legacy", "jit") + + +def 
check_should_skip_legacy_test(): + if not sys.platform.startswith("linux"): + pytest.skip("legacy callbacks are only supported on Linux") + if Cython is None: + pytest.skip("no working Cython") + if "LD_PRELOAD" in os.environ: + pytest.skip( + "legacy callback does not work if libcufft.so " "is preloaded" + ) + if cufft.getVersion() >= 12000 and get_compute_capability() == "75": + pytest.skip( + "cuFFT legacy callbacks in CUDA 13.0+ do not support " "cc 7.5" + ) + if cufft.getVersion() == 11303 and get_compute_capability() == "120": + pytest.skip( + "cuFFT legacy callbacks in CUDA 12.8.0 do not support " "cc 12.0" + ) + + +def check_should_skip_jit_test(): + if cufft.getVersion() < 11303: + pytest.skip("JIT callbacks require cuFFT from CUDA 12.8+") + + @contextlib.contextmanager def use_temporary_cache_dir(): target = "cupy.fft._callback.get_cache_dir" @@ -21,45 +65,50 @@ def use_temporary_cache_dir(): yield path +suppress_legacy_warning = pytest.mark.filterwarnings( + "ignore:.*legacy callback.*:DeprecationWarning" +) + + _load_callback = r""" -__device__ ${data_type} CB_ConvertInput( - void* dataIn, size_t offset, void* callerInfo, void* sharedPtr) +__device__ ${data_type} ${cb_name}( + void* dataIn, ${offset_type} offset, void* callerInfo, void* sharedPtr) { ${data_type} x = ((${data_type}*)dataIn)[offset]; ${element} *= 2.5; return x; } -__device__ ${load_type} d_loadCallbackPtr = CB_ConvertInput; +__device__ ${load_type} d_loadCallbackPtr = ${cb_name}; """ _load_callback_with_aux = r""" -__device__ ${data_type} CB_ConvertInput( - void* dataIn, size_t offset, void* callerInfo, void* sharedPtr) +__device__ ${data_type} ${cb_name}( + void* dataIn, ${offset_type} offset, void* callerInfo, void* sharedPtr) { ${data_type} x = ((${data_type}*)dataIn)[offset]; ${element} *= *((${aux_type}*)callerInfo); return x; } -__device__ ${load_type} d_loadCallbackPtr = CB_ConvertInput; +__device__ ${load_type} d_loadCallbackPtr = ${cb_name}; """ _load_callback_with_aux2 = r""" -__device__ ${data_type} CB_ConvertInput( - void* dataIn, size_t offset, void* callerInfo, void* sharedPtr) +__device__ ${data_type} ${cb_name}( + void* dataIn, ${offset_type} offset, void* callerInfo, void* sharedPtr) { ${data_type} x = ((${data_type}*)dataIn)[offset]; ${element} *= ((${aux_type}*)callerInfo)[offset]; return x; } -__device__ ${load_type} d_loadCallbackPtr = CB_ConvertInput; +__device__ ${load_type} d_loadCallbackPtr = ${cb_name}; """ _store_callback = r""" -__device__ void CB_ConvertOutput( - void *dataOut, size_t offset, ${data_type} element, +__device__ void ${cb_name}( + void *dataOut, ${offset_type} offset, ${data_type} element, void *callerInfo, void *sharedPointer) { ${data_type} x = element; @@ -67,12 +116,12 @@ def use_temporary_cache_dir(): ((${data_type}*)dataOut)[offset] = x; } -__device__ ${store_type} d_storeCallbackPtr = CB_ConvertOutput; +__device__ ${store_type} d_storeCallbackPtr = ${cb_name}; """ _store_callback_with_aux = r""" -__device__ void CB_ConvertOutput( - void *dataOut, size_t offset, ${data_type} element, +__device__ void ${cb_name}( + void *dataOut, ${offset_type} offset, ${data_type} element, void *callerInfo, void *sharedPointer) { ${data_type} x = element; @@ -80,26 +129,209 @@ def use_temporary_cache_dir(): ((${data_type}*)dataOut)[offset] = x; } -__device__ ${store_type} d_storeCallbackPtr = CB_ConvertOutput; +__device__ ${store_type} d_storeCallbackPtr = ${cb_name}; """ -def _set_load_cb(code, element, data_type, callback_type, aux_type=None): - return 
string.Template(code).substitute( +def _set_load_cb( + code, + element, + data_type, + callback_type, + callback_name, + aux_type=None, + cb_ver="", +): + if cb_ver == "jit": + callback_type = callback_type.replace( + "cufftCallback", "cufftJITCallback" + ) + callback = string.Template(code).substitute( data_type=data_type, aux_type=aux_type, load_type=callback_type, + cb_name=callback_name, element=element, + offset_type=("size_t" if cb_ver == "legacy" else "unsigned long long"), ) - - -def _set_store_cb(code, element, data_type, callback_type, aux_type=None): - return string.Template(code).substitute( + if cb_ver == "jit": + callback = "#include \n\n" + callback + return callback + + +def _set_store_cb( + code, + element, + data_type, + callback_type, + callback_name, + aux_type=None, + cb_ver="", +): + if cb_ver == "jit": + callback_type = callback_type.replace( + "cufftCallback", "cufftJITCallback" + ) + callback = string.Template(code).substitute( data_type=data_type, aux_type=aux_type, store_type=callback_type, + cb_name=callback_name, element=element, + offset_type=("size_t" if cb_ver == "legacy" else "unsigned long long"), ) + if cb_ver == "jit": + callback = "#include \n\n" + callback + return callback + + +# Note: this class is place here instead of at the end of this file, because +# pytest does not reset warnings internally, and other tests would suppress +# the warnings such that at the end we have no warnings to capture, but we want +# to ensure warnings are raised. +@pytest.mark.skipif( + cupy.cuda.runtime.is_hip, reason="hipFFT does not support callbacks" +) +class TestInputValidationWith1dCallbacks: + + shape = (10,) + norm = "ortho" + dtype = np.complex64 + + @classmethod + def setup_class(cls): + # All tests in this class use a temporary cache dir (also if threaded) + with use_temporary_cache_dir(): + yield + + def test_fft_load_legacy(self): + check_should_skip_legacy_test() + + fft = cupy.fft.fft + code = _load_callback + types = ( + "x.x", + "cufftComplex", + "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", + ) + cb_load = _set_load_cb(code, *types, cb_ver="legacy") + + a = testing.shaped_random(self.shape, cupy, self.dtype) + with pytest.deprecated_call( + match="legacy callback is considered deprecated" + ): + with cupy.fft.config.set_cufft_callbacks( + cb_load=cb_load, cb_ver="legacy" + ): + fft(a, norm=self.norm) + + def test_fft_load_jit_no_name(self): + check_should_skip_jit_test() + + fft = cupy.fft.fft + code = _load_callback + types = ( + "x.x", + "cufftComplex", + "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", + ) + cb_load = _set_load_cb(code, *types, cb_ver="jit") + + a = testing.shaped_random(self.shape, cupy, self.dtype) + # We omit passing cb_load_name. The test infra setup would check + # if we can infer it correctly. 
+ with cupy.fft.config.set_cufft_callbacks(cb_load=cb_load, cb_ver="jit"): + fft(a, norm=self.norm) + + def test_fft_store_legacy(self): + check_should_skip_legacy_test() + + fft = cupy.fft.fft + code = _store_callback + types = ( + "x.y", + "cufftComplex", + "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", + ) + cb_store = _set_store_cb(code, *types, cb_ver="legacy") + + a = testing.shaped_random(self.shape, cupy, self.dtype) + with pytest.deprecated_call( + match="legacy callback is considered deprecated" + ): + with cupy.fft.config.set_cufft_callbacks( + cb_store=cb_store, cb_ver="legacy" + ): + fft(a, norm=self.norm) + + def test_fft_store_jit_no_name(self): + check_should_skip_jit_test() + + fft = cupy.fft.fft + code = _store_callback + types = ( + "x.y", + "cufftComplex", + "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", + ) + cb_store = _set_store_cb(code, *types, cb_ver="jit") + + a = testing.shaped_random(self.shape, cupy, self.dtype) + # We omit passing cb_store_name. The test infra setup would check + # if we can infer it correctly. + with cupy.fft.config.set_cufft_callbacks( + cb_store=cb_store, cb_ver="jit" + ): + fft(a, norm=self.norm) + + def test_fft_load_store_legacy_aux(self): + check_should_skip_legacy_test() + + fft = cupy.fft.fft + dtype = self.dtype + load_code = _load_callback_with_aux + store_code = _store_callback_with_aux + load_aux = cupy.asarray(2.5, dtype=cupy.dtype(dtype).char.lower()) + store_aux = cupy.asarray(3.8, dtype=cupy.dtype(dtype).char.lower()) + + load_types = ( + "x.x", + "cufftComplex", + "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", + "float", + ) + store_types = ( + "x.y", + "cufftComplex", + "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", + "float", + ) + cb_load = _set_load_cb(load_code, *load_types, cb_ver="legacy") + cb_store = _set_store_cb(store_code, *store_types, cb_ver="legacy") + + a = testing.shaped_random(self.shape, cupy, self.dtype) + with ( + pytest.deprecated_call( + match="cb_load_aux_arr or cb_store_aux_arr is deprecated" + ), + pytest.deprecated_call( + match="legacy callback is considered deprecated" + ), + ): + with cupy.fft.config.set_cufft_callbacks( + cb_load=cb_load, + cb_store=cb_store, + cb_load_aux_arr=load_aux, + cb_store_aux_arr=store_aux, + cb_ver="legacy", + ): + fft(a, norm=self.norm) @testing.parameterize( @@ -108,31 +340,60 @@ def _set_store_cb(code, element, data_type, callback_type, aux_type=None): "n": [None, 5, 10, 15], "shape": [(10, 7), (10,), (10, 10)], "norm": [None, "ortho"], + "cb_ver": cb_ver_for_test, } ) ) -@testing.with_requires("cython>=0.29.0") -@pytest.mark.skipif( - not sys.platform.startswith("linux"), - reason="callbacks are only supported on Linux", -) @pytest.mark.skipif( cupy.cuda.runtime.is_hip, reason="hipFFT does not support callbacks" ) class Test1dCallbacks: + @classmethod + def setup_class(cls): + # All tests in this class use a temporary cache dir (also if threaded) + with use_temporary_cache_dir(): + yield + def _test_load_helper(self, xp, dtype, fft_func): + if self.cb_ver == "legacy": + check_should_skip_legacy_test() + else: + check_should_skip_jit_test() + + # for simplicity we use the JIT callback names for both legacy/jit fft = getattr(xp.fft, fft_func) code = _load_callback if dtype == np.complex64: - types = ("x.x", "cufftComplex", "cufftCallbackLoadC") + types = ( + "x.x", + "cufftComplex", + "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", + ) elif dtype == np.complex128: - types = ("x.x", "cufftDoubleComplex", 
"cufftCallbackLoadZ") + types = ( + "x.x", + "cufftDoubleComplex", + "cufftCallbackLoadZ", + "cufftJITCallbackLoadDoubleComplex", + ) elif dtype == np.float32: - types = ("x", "cufftReal", "cufftCallbackLoadR") - else: - types = ("x", "cufftDoubleReal", "cufftCallbackLoadD") - cb_load = _set_load_cb(code, *types) + types = ( + "x", + "cufftReal", + "cufftCallbackLoadR", + "cufftJITCallbackLoadReal", + ) + else: # float64 + types = ( + "x", + "cufftDoubleReal", + "cufftCallbackLoadD", + "cufftJITCallbackLoadDoubleReal", + ) + cb_load = _set_load_cb(code, *types, cb_ver=self.cb_ver) + cb_load_name = types[-1] if self.cb_ver == "jit" else None a = testing.shaped_random(self.shape, xp, dtype) if xp is np: @@ -144,50 +405,93 @@ def _test_load_helper(self, xp, dtype, fft_func): else: out = out.astype(np.float32) else: - with use_temporary_cache_dir(): - with xp.fft.config.set_cufft_callbacks(cb_load=cb_load): - out = fft(a, n=self.n, norm=self.norm) + with xp.fft.config.set_cufft_callbacks( + cb_load=cb_load, cb_load_name=cb_load_name, cb_ver=self.cb_ver + ): + out = fft(a, n=self.n, norm=self.norm) return out + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_fft_load(self, xp, dtype): return self._test_load_helper(xp, dtype, "fft") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_ifft_load(self, xp, dtype): return self._test_load_helper(xp, dtype, "ifft") + @suppress_legacy_warning @testing.for_float_dtypes(no_float16=True) @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_rfft_load(self, xp, dtype): return self._test_load_helper(xp, dtype, "rfft") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_irfft_load(self, xp, dtype): return self._test_load_helper(xp, dtype, "irfft") def _test_store_helper(self, xp, dtype, fft_func): + if self.cb_ver == "legacy": + check_should_skip_legacy_test() + else: + check_should_skip_jit_test() + fft = getattr(xp.fft, fft_func) code = _store_callback + + # for simplicity we use the JIT callback names for both legacy/jit if dtype == np.complex64: if fft_func != "irfft": - types = ("x.y", "cufftComplex", "cufftCallbackStoreC") - else: - types = ("x", "cufftReal", "cufftCallbackStoreR") + types = ( + "x.y", + "cufftComplex", + "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", + ) + else: # float32 for irfft + types = ( + "x", + "cufftReal", + "cufftCallbackStoreR", + "cufftJITCallbackStoreReal", + ) elif dtype == np.complex128: if fft_func != "irfft": - types = ("x.y", "cufftDoubleComplex", "cufftCallbackStoreZ") - else: - types = ("x", "cufftDoubleReal", "cufftCallbackStoreD") + types = ( + "x.y", + "cufftDoubleComplex", + "cufftCallbackStoreZ", + "cufftJITCallbackStoreDoubleComplex", + ) + else: # float64 for irfft + types = ( + "x", + "cufftDoubleReal", + "cufftCallbackStoreD", + "cufftJITCallbackStoreDoubleReal", + ) elif dtype == np.float32: - types = ("x.y", "cufftComplex", "cufftCallbackStoreC") + types = ( + "x.y", + "cufftComplex", + "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", + ) elif dtype == np.float64: - types = ("x.y", "cufftDoubleComplex", "cufftCallbackStoreZ") - cb_store = _set_store_cb(code, *types) + types = ( + "x.y", + "cufftDoubleComplex", + "cufftCallbackStoreZ", + "cufftJITCallbackStoreDoubleComplex", + ) + cb_store = 
_set_store_cb(code, *types, cb_ver=self.cb_ver) + cb_store_name = types[-1] if self.cb_ver == "jit" else None a = testing.shaped_random(self.shape, xp, dtype) if xp is np: @@ -201,67 +505,134 @@ def _test_store_helper(self, xp, dtype, fft_func): if dtype in (np.float32, np.complex64): out = out.astype(np.float32) else: - with use_temporary_cache_dir(): - with xp.fft.config.set_cufft_callbacks(cb_store=cb_store): - out = fft(a, n=self.n, norm=self.norm) + with xp.fft.config.set_cufft_callbacks( + cb_store=cb_store, + cb_store_name=cb_store_name, + cb_ver=self.cb_ver, + ): + out = fft(a, n=self.n, norm=self.norm) return out + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_fft_store(self, xp, dtype): return self._test_store_helper(xp, dtype, "fft") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_ifft_store(self, xp, dtype): return self._test_store_helper(xp, dtype, "ifft") + @suppress_legacy_warning @testing.for_float_dtypes(no_float16=True) @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_rfft_store(self, xp, dtype): return self._test_store_helper(xp, dtype, "rfft") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_irfft_store(self, xp, dtype): return self._test_store_helper(xp, dtype, "irfft") def _test_load_store_helper(self, xp, dtype, fft_func): + if self.cb_ver == "legacy": + check_should_skip_legacy_test() + else: + check_should_skip_jit_test() + + # for simplicity we use the JIT callback names for both legacy/jit fft = getattr(xp.fft, fft_func) load_code = _load_callback store_code = _store_callback if fft_func in ("fft", "ifft"): if dtype == np.complex64: - load_types = ("x.x", "cufftComplex", "cufftCallbackLoadC") - store_types = ("x.y", "cufftComplex", "cufftCallbackStoreC") - else: - load_types = ("x.x", "cufftDoubleComplex", "cufftCallbackLoadZ") + load_types = ( + "x.x", + "cufftComplex", + "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", + ) + store_types = ( + "x.y", + "cufftComplex", + "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", + ) + else: # complex128 + load_types = ( + "x.x", + "cufftDoubleComplex", + "cufftCallbackLoadZ", + "cufftJITCallbackLoadDoubleComplex", + ) store_types = ( "x.y", "cufftDoubleComplex", "cufftCallbackStoreZ", + "cufftJITCallbackStoreDoubleComplex", ) elif fft_func == "rfft": if dtype == np.float32: - load_types = ("x", "cufftReal", "cufftCallbackLoadR") - store_types = ("x.y", "cufftComplex", "cufftCallbackStoreC") - else: - load_types = ("x", "cufftDoubleReal", "cufftCallbackLoadD") + load_types = ( + "x", + "cufftReal", + "cufftCallbackLoadR", + "cufftJITCallbackLoadReal", + ) + store_types = ( + "x.y", + "cufftComplex", + "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", + ) + else: # float64 + load_types = ( + "x", + "cufftDoubleReal", + "cufftCallbackLoadD", + "cufftJITCallbackLoadDoubleReal", + ) store_types = ( "x.y", "cufftDoubleComplex", "cufftCallbackStoreZ", + "cufftJITCallbackStoreDoubleComplex", ) else: # irfft if dtype == np.complex64: - load_types = ("x.x", "cufftComplex", "cufftCallbackLoadC") - store_types = ("x", "cufftReal", "cufftCallbackStoreR") - else: - load_types = ("x.x", "cufftDoubleComplex", "cufftCallbackLoadZ") - store_types = ("x", "cufftDoubleReal", "cufftCallbackStoreD") - cb_load = 
_set_load_cb(load_code, *load_types) - cb_store = _set_store_cb(store_code, *store_types) + load_types = ( + "x.x", + "cufftComplex", + "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", + ) + store_types = ( + "x", + "cufftReal", + "cufftCallbackStoreR", + "cufftJITCallbackStoreReal", + ) + else: # complex128 + load_types = ( + "x.x", + "cufftDoubleComplex", + "cufftCallbackLoadZ", + "cufftJITCallbackLoadDoubleComplex", + ) + store_types = ( + "x", + "cufftDoubleReal", + "cufftCallbackStoreD", + "cufftJITCallbackStoreDoubleReal", + ) + cb_load = _set_load_cb(load_code, *load_types, cb_ver=self.cb_ver) + cb_load_name = load_types[-1] if self.cb_ver == "jit" else None + cb_store = _set_store_cb(store_code, *store_types, cb_ver=self.cb_ver) + cb_store_name = store_types[-1] if self.cb_ver == "jit" else None a = testing.shaped_random(self.shape, xp, dtype) if xp is np: @@ -276,47 +647,71 @@ def _test_load_store_helper(self, xp, dtype, fft_func): if dtype in (np.float32, np.complex64): out = out.astype(np.float32) else: - with use_temporary_cache_dir(): - with xp.fft.config.set_cufft_callbacks( - cb_load=cb_load, cb_store=cb_store - ): - out = fft(a, n=self.n, norm=self.norm) + with xp.fft.config.set_cufft_callbacks( + cb_load=cb_load, + cb_load_name=cb_load_name, + cb_store=cb_store, + cb_store_name=cb_store_name, + cb_ver=self.cb_ver, + ): + out = fft(a, n=self.n, norm=self.norm) return out + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_fft_load_store(self, xp, dtype): return self._test_load_store_helper(xp, dtype, "fft") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_ifft_load_store(self, xp, dtype): return self._test_load_store_helper(xp, dtype, "ifft") + @suppress_legacy_warning @testing.for_float_dtypes(no_float16=True) @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_rfft_load_store(self, xp, dtype): return self._test_load_store_helper(xp, dtype, "rfft") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_irfft_load_store(self, xp, dtype): return self._test_load_store_helper(xp, dtype, "irfft") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_fft_load_aux(self, xp, dtype): + if self.cb_ver == "legacy": + check_should_skip_legacy_test() + else: + check_should_skip_jit_test() + fft = xp.fft.fft c = _load_callback_with_aux2 + # for simplicity we use the JIT callback names for both legacy/jit if dtype == np.complex64: - cb_load = _set_load_cb( - c, "x.x", "cufftComplex", "cufftCallbackLoadC", "float" + types = ( + "x.x", + "cufftComplex", + "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", + "float", ) - else: - cb_load = _set_load_cb( - c, "x.x", "cufftDoubleComplex", "cufftCallbackLoadZ", "double" + else: # complex128 + types = ( + "x.x", + "cufftDoubleComplex", + "cufftCallbackLoadZ", + "cufftJITCallbackLoadDoubleComplex", + "double", ) + cb_load = _set_load_cb(c, *types, cb_ver=self.cb_ver) + cb_load_name = types[3] if self.cb_ver == "jit" else None a = testing.shaped_random(self.shape, xp, dtype) out_last = self.n if self.n is not None else self.shape[-1] @@ -333,15 +728,22 @@ def test_fft_load_aux(self, xp, dtype): if dtype in (np.float32, np.complex64): out = 
out.astype(np.complex64) else: - with use_temporary_cache_dir(): - with xp.fft.config.set_cufft_callbacks( - cb_load=cb_load, cb_load_aux_arr=b - ): - out = fft(a, n=self.n, norm=self.norm) + with xp.fft.config.set_cufft_callbacks( + cb_load=cb_load, + cb_load_name=cb_load_name, + cb_load_data=b.data, + cb_ver=self.cb_ver, + ): + out = fft(a, n=self.n, norm=self.norm) return out def _test_load_store_aux_helper(self, xp, dtype, fft_func): + if self.cb_ver == "legacy": + check_should_skip_legacy_test() + else: + check_should_skip_jit_test() + fft = getattr(xp.fft, fft_func) load_code = _load_callback_with_aux store_code = _store_callback_with_aux @@ -349,53 +751,67 @@ def _test_load_store_aux_helper(self, xp, dtype, fft_func): load_aux = xp.asarray(2.5, dtype=xp.dtype(dtype).char.lower()) store_aux = xp.asarray(3.8, dtype=xp.dtype(dtype).char.lower()) + # for simplicity we use the JIT callback names for both legacy/jit if fft_func in ("fft", "ifft"): if dtype == np.complex64: load_types = ( "x.x", "cufftComplex", "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", "float", ) store_types = ( "x.y", "cufftComplex", "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", "float", ) - else: + else: # complex128 load_types = ( "x.x", "cufftDoubleComplex", "cufftCallbackLoadZ", + "cufftJITCallbackLoadDoubleComplex", "double", ) store_types = ( "x.y", "cufftDoubleComplex", "cufftCallbackStoreZ", + "cufftJITCallbackStoreDoubleComplex", "double", ) elif fft_func == "rfft": if dtype == np.float32: - load_types = ("x", "cufftReal", "cufftCallbackLoadR", "float") + load_types = ( + "x", + "cufftReal", + "cufftCallbackLoadR", + "cufftJITCallbackLoadReal", + "float", + ) store_types = ( "x.y", "cufftComplex", "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", "float", ) - else: + else: # float64 load_types = ( "x", "cufftDoubleReal", "cufftCallbackLoadD", + "cufftJITCallbackLoadDoubleReal", "double", ) store_types = ( "x.y", "cufftDoubleComplex", "cufftCallbackStoreZ", + "cufftJITCallbackStoreDoubleComplex", "double", ) else: # irfft @@ -404,24 +820,35 @@ def _test_load_store_aux_helper(self, xp, dtype, fft_func): "x.x", "cufftComplex", "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", "float", ) - store_types = ("x", "cufftReal", "cufftCallbackStoreR", "float") - else: + store_types = ( + "x", + "cufftReal", + "cufftCallbackStoreR", + "cufftJITCallbackStoreReal", + "float", + ) + else: # complex128 load_types = ( "x.x", "cufftDoubleComplex", "cufftCallbackLoadZ", + "cufftJITCallbackLoadDoubleComplex", "double", ) store_types = ( "x", "cufftDoubleReal", "cufftCallbackStoreD", + "cufftJITCallbackStoreDoubleReal", "double", ) - cb_load = _set_load_cb(load_code, *load_types) - cb_store = _set_store_cb(store_code, *store_types) + cb_load = _set_load_cb(load_code, *load_types, cb_ver=self.cb_ver) + cb_load_name = load_types[3] if self.cb_ver == "jit" else None + cb_store = _set_store_cb(store_code, *store_types, cb_ver=self.cb_ver) + cb_store_name = store_types[3] if self.cb_ver == "jit" else None a = testing.shaped_random(self.shape, xp, dtype) if xp is np: @@ -436,32 +863,38 @@ def _test_load_store_aux_helper(self, xp, dtype, fft_func): if dtype in (np.float32, np.complex64): out = out.astype(np.float32) else: - with use_temporary_cache_dir(): - with xp.fft.config.set_cufft_callbacks( - cb_load=cb_load, - cb_store=cb_store, - cb_load_aux_arr=load_aux, - cb_store_aux_arr=store_aux, - ): - out = fft(a, n=self.n, norm=self.norm) + with xp.fft.config.set_cufft_callbacks( + cb_load=cb_load, + 
cb_load_name=cb_load_name, + cb_store=cb_store, + cb_store_name=cb_store_name, + cb_load_data=load_aux.data, + cb_store_data=store_aux.data, + cb_ver=self.cb_ver, + ): + out = fft(a, n=self.n, norm=self.norm) return out + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_fft_load_store_aux(self, xp, dtype): return self._test_load_store_aux_helper(xp, dtype, "fft") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_ifft_load_store_aux(self, xp, dtype): return self._test_load_store_aux_helper(xp, dtype, "ifft") + @suppress_legacy_warning @testing.for_float_dtypes(no_float16=True) @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_rfft_load_store_aux(self, xp, dtype): return self._test_load_store_aux_helper(xp, dtype, "rfft") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False) def test_irfft_load_store_aux(self, xp, dtype): @@ -469,38 +902,92 @@ def test_irfft_load_store_aux(self, xp, dtype): @testing.parameterize( - {"shape": (3, 4), "s": None, "axes": None, "norm": None}, - {"shape": (3, 4), "s": (1, 5), "axes": (-2, -1), "norm": None}, - {"shape": (3, 4), "s": None, "axes": (-2, -1), "norm": None}, - {"shape": (3, 4), "s": None, "axes": None, "norm": "ortho"}, - {"shape": (2, 3, 4), "s": None, "axes": None, "norm": None}, - {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": (-3, -2, -1), "norm": None}, - {"shape": (2, 3, 4), "s": None, "axes": (-3, -2, -1), "norm": None}, - {"shape": (2, 3, 4), "s": None, "axes": None, "norm": "ortho"}, - {"shape": (2, 3, 4), "s": (2, 3), "axes": (0, 1, 2), "norm": "ortho"}, -) -@testing.with_requires("cython>=0.29.0") -@pytest.mark.skipif( - not sys.platform.startswith("linux"), - reason="callbacks are only supported on Linux", + *( + testing.product_dict( + [ + {"shape": (3, 4), "s": None, "axes": None, "norm": None}, + {"shape": (3, 4), "s": (1, 5), "axes": (-2, -1), "norm": None}, + {"shape": (3, 4), "s": None, "axes": (-2, -1), "norm": None}, + {"shape": (3, 4), "s": None, "axes": None, "norm": "ortho"}, + {"shape": (2, 3, 4), "s": None, "axes": None, "norm": None}, + { + "shape": (2, 3, 4), + "s": (1, 4, 10), + "axes": (-3, -2, -1), + "norm": None, + }, + { + "shape": (2, 3, 4), + "s": None, + "axes": (-3, -2, -1), + "norm": None, + }, + {"shape": (2, 3, 4), "s": None, "axes": None, "norm": "ortho"}, + { + "shape": (2, 3, 4), + "s": (2, 3), + "axes": (0, 1, 2), + "norm": "ortho", + }, + ], + testing.product( + { + "cb_ver": cb_ver_for_test, + }, + ), + ) + ) ) @pytest.mark.skipif( cupy.cuda.runtime.is_hip, reason="hipFFT does not support callbacks" ) class TestNdCallbacks: + @classmethod + def setup_class(cls): + # All tests in this class use a temporary cache dir (also if threaded) + with use_temporary_cache_dir(): + yield + def _test_load_helper(self, xp, dtype, fft_func): + if self.cb_ver == "legacy": + check_should_skip_legacy_test() + else: + check_should_skip_jit_test() + + # for simplicity we use the JIT callback names for both legacy/jit fft = getattr(xp.fft, fft_func) load_code = _load_callback if dtype == np.complex64: - types = ("x.x", "cufftComplex", "cufftCallbackLoadC") + types = ( + "x.x", + "cufftComplex", + "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", + ) elif dtype == np.complex128: - types = ("x.x", "cufftDoubleComplex", 
"cufftCallbackLoadZ") + types = ( + "x.x", + "cufftDoubleComplex", + "cufftCallbackLoadZ", + "cufftJITCallbackLoadDoubleComplex", + ) elif dtype == np.float32: - types = ("x", "cufftReal", "cufftCallbackLoadR") - else: - types = ("x", "cufftDoubleReal", "cufftCallbackLoadD") - cb_load = _set_load_cb(load_code, *types) + types = ( + "x", + "cufftReal", + "cufftCallbackLoadR", + "cufftJITCallbackLoadReal", + ) + else: # float64 + types = ( + "x", + "cufftDoubleReal", + "cufftCallbackLoadD", + "cufftJITCallbackLoadDoubleReal", + ) + cb_load = _set_load_cb(load_code, *types, cb_ver=self.cb_ver) + cb_load_name = types[3] if self.cb_ver == "jit" else None a = testing.shaped_random(self.shape, xp, dtype) if xp is np: @@ -512,12 +999,14 @@ def _test_load_helper(self, xp, dtype, fft_func): else: out = out.astype(np.float32) else: - with use_temporary_cache_dir(): - with xp.fft.config.set_cufft_callbacks(cb_load=cb_load): - out = fft(a, s=self.s, axes=self.axes, norm=self.norm) + with xp.fft.config.set_cufft_callbacks( + cb_load=cb_load, cb_load_name=cb_load_name, cb_ver=self.cb_ver + ): + out = fft(a, s=self.s, axes=self.axes, norm=self.norm) return out + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -525,6 +1014,7 @@ def _test_load_helper(self, xp, dtype, fft_func): def test_fftn_load(self, xp, dtype): return self._test_load_helper(xp, dtype, "fftn") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -532,6 +1022,7 @@ def test_fftn_load(self, xp, dtype): def test_ifftn_load(self, xp, dtype): return self._test_load_helper(xp, dtype, "ifftn") + @suppress_legacy_warning @testing.for_float_dtypes(no_float16=True) @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -539,6 +1030,7 @@ def test_ifftn_load(self, xp, dtype): def test_rfftn_load(self, xp, dtype): return self._test_load_helper(xp, dtype, "rfftn") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -547,23 +1039,61 @@ def test_irfftn_load(self, xp, dtype): return self._test_load_helper(xp, dtype, "irfftn") def _test_store_helper(self, xp, dtype, fft_func): + if self.cb_ver == "legacy": + check_should_skip_legacy_test() + else: + check_should_skip_jit_test() + fft = getattr(xp.fft, fft_func) store_code = _store_callback + + # for simplicity we use the JIT callback names for both legacy/jit if dtype == np.complex64: if fft_func != "irfftn": - types = ("x.y", "cufftComplex", "cufftCallbackStoreC") - else: - types = ("x", "cufftReal", "cufftCallbackStoreR") + types = ( + "x.y", + "cufftComplex", + "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", + ) + else: # float32 for irfftn + types = ( + "x", + "cufftReal", + "cufftCallbackStoreR", + "cufftJITCallbackStoreReal", + ) elif dtype == np.complex128: if fft_func != "irfftn": - types = ("x.y", "cufftDoubleComplex", "cufftCallbackStoreZ") - else: - types = ("x", "cufftDoubleReal", "cufftCallbackStoreD") + types = ( + "x.y", + "cufftDoubleComplex", + "cufftCallbackStoreZ", + "cufftJITCallbackStoreDoubleComplex", + ) + else: # float64 for irfftn + types = ( + "x", + "cufftDoubleReal", + "cufftCallbackStoreD", + "cufftJITCallbackStoreDoubleReal", + ) elif dtype == np.float32: - types = ("x.y", "cufftComplex", 
"cufftCallbackStoreC") + types = ( + "x.y", + "cufftComplex", + "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", + ) elif dtype == np.float64: - types = ("x.y", "cufftDoubleComplex", "cufftCallbackStoreZ") - cb_store = _set_store_cb(store_code, *types) + types = ( + "x.y", + "cufftDoubleComplex", + "cufftCallbackStoreZ", + "cufftJITCallbackStoreDoubleComplex", + ) + cb_store = _set_store_cb(store_code, *types, cb_ver=self.cb_ver) + cb_store_name = types[3] if self.cb_ver == "jit" else None a = testing.shaped_random(self.shape, xp, dtype) if xp is np: @@ -577,12 +1107,16 @@ def _test_store_helper(self, xp, dtype, fft_func): if dtype in (np.float32, np.complex64): out = out.astype(np.float32) else: - with use_temporary_cache_dir(): - with xp.fft.config.set_cufft_callbacks(cb_store=cb_store): - out = fft(a, s=self.s, axes=self.axes, norm=self.norm) + with xp.fft.config.set_cufft_callbacks( + cb_store=cb_store, + cb_store_name=cb_store_name, + cb_ver=self.cb_ver, + ): + out = fft(a, s=self.s, axes=self.axes, norm=self.norm) return out + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -590,6 +1124,7 @@ def _test_store_helper(self, xp, dtype, fft_func): def test_fftn_store(self, xp, dtype): return self._test_store_helper(xp, dtype, "fftn") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -597,6 +1132,7 @@ def test_fftn_store(self, xp, dtype): def test_ifftn_store(self, xp, dtype): return self._test_store_helper(xp, dtype, "ifftn") + @suppress_legacy_warning @testing.for_float_dtypes(no_float16=True) @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -604,6 +1140,7 @@ def test_ifftn_store(self, xp, dtype): def test_rfftn_store(self, xp, dtype): return self._test_store_helper(xp, dtype, "rfftn") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -612,40 +1149,101 @@ def test_irfftn_store(self, xp, dtype): return self._test_store_helper(xp, dtype, "irfftn") def _test_load_store_helper(self, xp, dtype, fft_func): + if self.cb_ver == "legacy": + check_should_skip_legacy_test() + else: + check_should_skip_jit_test() + fft = getattr(xp.fft, fft_func) load_code = _load_callback store_code = _store_callback + + # for simplicity we use the JIT callback names for both legacy/jit if fft_func in ("fftn", "ifftn"): if dtype == np.complex64: - load_types = ("x.x", "cufftComplex", "cufftCallbackLoadC") - store_types = ("x.y", "cufftComplex", "cufftCallbackStoreC") - else: - load_types = ("x.x", "cufftDoubleComplex", "cufftCallbackLoadZ") + load_types = ( + "x.x", + "cufftComplex", + "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", + ) + store_types = ( + "x.y", + "cufftComplex", + "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", + ) + else: # complex128 + load_types = ( + "x.x", + "cufftDoubleComplex", + "cufftCallbackLoadZ", + "cufftJITCallbackLoadDoubleComplex", + ) store_types = ( "x.y", "cufftDoubleComplex", "cufftCallbackStoreZ", + "cufftJITCallbackStoreDoubleComplex", ) elif fft_func == "rfftn": if dtype == np.float32: - load_types = ("x", "cufftReal", "cufftCallbackLoadR") - store_types = ("x.y", "cufftComplex", "cufftCallbackStoreC") - else: - load_types = ("x", "cufftDoubleReal", "cufftCallbackLoadD") + 
load_types = ( + "x", + "cufftReal", + "cufftCallbackLoadR", + "cufftJITCallbackLoadReal", + ) + store_types = ( + "x.y", + "cufftComplex", + "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", + ) + else: # float64 + load_types = ( + "x", + "cufftDoubleReal", + "cufftCallbackLoadD", + "cufftJITCallbackLoadDoubleReal", + ) store_types = ( "x.y", "cufftDoubleComplex", "cufftCallbackStoreZ", + "cufftJITCallbackStoreDoubleComplex", ) else: # irfft if dtype == np.complex64: - load_types = ("x.x", "cufftComplex", "cufftCallbackLoadC") - store_types = ("x", "cufftReal", "cufftCallbackStoreR") - else: - load_types = ("x.x", "cufftDoubleComplex", "cufftCallbackLoadZ") - store_types = ("x", "cufftDoubleReal", "cufftCallbackStoreD") - cb_load = _set_load_cb(load_code, *load_types) - cb_store = _set_store_cb(store_code, *store_types) + load_types = ( + "x.x", + "cufftComplex", + "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", + ) + store_types = ( + "x", + "cufftReal", + "cufftCallbackStoreR", + "cufftJITCallbackStoreReal", + ) + else: # complex128 + load_types = ( + "x.x", + "cufftDoubleComplex", + "cufftCallbackLoadZ", + "cufftJITCallbackLoadDoubleComplex", + ) + store_types = ( + "x", + "cufftDoubleReal", + "cufftCallbackStoreD", + "cufftJITCallbackStoreDoubleReal", + ) + cb_load = _set_load_cb(load_code, *load_types, cb_ver=self.cb_ver) + cb_load_name = load_types[3] if self.cb_ver == "jit" else None + cb_store = _set_store_cb(store_code, *store_types, cb_ver=self.cb_ver) + cb_store_name = store_types[3] if self.cb_ver == "jit" else None a = testing.shaped_random(self.shape, xp, dtype) if xp is np: @@ -660,14 +1258,18 @@ def _test_load_store_helper(self, xp, dtype, fft_func): if dtype in (np.float32, np.complex64): out = out.astype(np.float32) else: - with use_temporary_cache_dir(): - with xp.fft.config.set_cufft_callbacks( - cb_load=cb_load, cb_store=cb_store - ): - out = fft(a, s=self.s, axes=self.axes, norm=self.norm) + with xp.fft.config.set_cufft_callbacks( + cb_load=cb_load, + cb_load_name=cb_load_name, + cb_store=cb_store, + cb_store_name=cb_store_name, + cb_ver=self.cb_ver, + ): + out = fft(a, s=self.s, axes=self.axes, norm=self.norm) return out + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -675,6 +1277,7 @@ def _test_load_store_helper(self, xp, dtype, fft_func): def test_fftn_load_store(self, xp, dtype): return self._test_load_store_helper(xp, dtype, "fftn") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -682,6 +1285,7 @@ def test_fftn_load_store(self, xp, dtype): def test_ifftn_load_store(self, xp, dtype): return self._test_load_store_helper(xp, dtype, "ifftn") + @suppress_legacy_warning @testing.for_float_dtypes(no_float16=True) @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -689,6 +1293,7 @@ def test_ifftn_load_store(self, xp, dtype): def test_rfftn_load_store(self, xp, dtype): return self._test_load_store_helper(xp, dtype, "rfftn") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -697,6 +1302,11 @@ def test_irfftn_load_store(self, xp, dtype): return self._test_load_store_helper(xp, dtype, "irfftn") def _test_load_store_aux_helper(self, xp, dtype, fft_func): + if self.cb_ver == 
"legacy": + check_should_skip_legacy_test() + else: + check_should_skip_jit_test() + fft = getattr(xp.fft, fft_func) load_code = _load_callback_with_aux store_code = _store_callback_with_aux @@ -704,53 +1314,67 @@ def _test_load_store_aux_helper(self, xp, dtype, fft_func): load_aux = xp.asarray(2.5, dtype=xp.dtype(dtype).char.lower()) store_aux = xp.asarray(3.8, dtype=xp.dtype(dtype).char.lower()) + # for simplicity we use the JIT callback names for both legacy/jit if fft_func in ("fftn", "ifftn"): if dtype == np.complex64: load_types = ( "x.x", "cufftComplex", "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", "float", ) store_types = ( "x.y", "cufftComplex", "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", "float", ) - else: + else: # complex128 load_types = ( "x.x", "cufftDoubleComplex", "cufftCallbackLoadZ", + "cufftJITCallbackLoadDoubleComplex", "double", ) store_types = ( "x.y", "cufftDoubleComplex", "cufftCallbackStoreZ", + "cufftJITCallbackStoreDoubleComplex", "double", ) elif fft_func == "rfftn": if dtype == np.float32: - load_types = ("x", "cufftReal", "cufftCallbackLoadR", "float") + load_types = ( + "x", + "cufftReal", + "cufftCallbackLoadR", + "cufftJITCallbackLoadReal", + "float", + ) store_types = ( "x.y", "cufftComplex", "cufftCallbackStoreC", + "cufftJITCallbackStoreComplex", "float", ) - else: + else: # float64 load_types = ( "x", "cufftDoubleReal", "cufftCallbackLoadD", + "cufftJITCallbackLoadDoubleReal", "double", ) store_types = ( "x.y", "cufftDoubleComplex", "cufftCallbackStoreZ", + "cufftJITCallbackStoreDoubleComplex", "double", ) else: # irfftn @@ -759,24 +1383,35 @@ def _test_load_store_aux_helper(self, xp, dtype, fft_func): "x.x", "cufftComplex", "cufftCallbackLoadC", + "cufftJITCallbackLoadComplex", "float", ) - store_types = ("x", "cufftReal", "cufftCallbackStoreR", "float") - else: + store_types = ( + "x", + "cufftReal", + "cufftCallbackStoreR", + "cufftJITCallbackStoreReal", + "float", + ) + else: # complex128 load_types = ( "x.x", "cufftDoubleComplex", "cufftCallbackLoadZ", + "cufftJITCallbackLoadDoubleComplex", "double", ) store_types = ( "x", "cufftDoubleReal", "cufftCallbackStoreD", + "cufftJITCallbackStoreDoubleReal", "double", ) - cb_load = _set_load_cb(load_code, *load_types) - cb_store = _set_store_cb(store_code, *store_types) + cb_load = _set_load_cb(load_code, *load_types, cb_ver=self.cb_ver) + cb_load_name = load_types[3] if self.cb_ver == "jit" else None + cb_store = _set_store_cb(store_code, *store_types, cb_ver=self.cb_ver) + cb_store_name = store_types[3] if self.cb_ver == "jit" else None a = testing.shaped_random(self.shape, xp, dtype) if xp is np: @@ -791,17 +1426,20 @@ def _test_load_store_aux_helper(self, xp, dtype, fft_func): if dtype in (np.float32, np.complex64): out = out.astype(np.float32) else: - with use_temporary_cache_dir(): - with xp.fft.config.set_cufft_callbacks( - cb_load=cb_load, - cb_store=cb_store, - cb_load_aux_arr=load_aux, - cb_store_aux_arr=store_aux, - ): - out = fft(a, s=self.s, axes=self.axes, norm=self.norm) + with xp.fft.config.set_cufft_callbacks( + cb_load=cb_load, + cb_load_name=cb_load_name, + cb_store=cb_store, + cb_store_name=cb_store_name, + cb_load_data=load_aux.data, + cb_store_data=store_aux.data, + cb_ver=self.cb_ver, + ): + out = fft(a, s=self.s, axes=self.axes, norm=self.norm) return out + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -809,6 +1447,7 @@ def 
_test_load_store_aux_helper(self, xp, dtype, fft_func): def test_fftn_load_store_aux(self, xp, dtype): return self._test_load_store_aux_helper(xp, dtype, "fftn") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -816,6 +1455,7 @@ def test_fftn_load_store_aux(self, xp, dtype): def test_ifftn_load_store_aux(self, xp, dtype): return self._test_load_store_aux_helper(xp, dtype, "ifftn") + @suppress_legacy_warning @testing.for_float_dtypes(no_float16=True) @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False @@ -823,6 +1463,7 @@ def test_ifftn_load_store_aux(self, xp, dtype): def test_rfftn_load_store_aux(self, xp, dtype): return self._test_load_store_aux_helper(xp, dtype, "rfftn") + @suppress_legacy_warning @testing.for_complex_dtypes() @testing.numpy_cupy_allclose( rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False diff --git a/dpnp/tests/third_party/cupy/fft_tests/test_fft.py b/dpnp/tests/third_party/cupy/fft_tests/test_fft.py index 534b474363f1..369409ba001c 100644 --- a/dpnp/tests/third_party/cupy/fft_tests/test_fft.py +++ b/dpnp/tests/third_party/cupy/fft_tests/test_fft.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import functools import warnings @@ -6,6 +8,14 @@ import dpnp as cupy from dpnp.tests.helper import has_support_aspect64 + +# from cupy.fft import config +# from cupy.fft._fft import ( +# _default_fft_func, +# _fft, +# _fftn, +# _size_last_transform_axis, +# ) from dpnp.tests.third_party.cupy import testing from dpnp.tests.third_party.cupy.testing._loops import _wraps_partial @@ -36,12 +46,16 @@ def decorator(impl): @_wraps_partial(impl, name) def test_func(self, *args, **kw): # get original global planning state - # planning_state = config.enable_nd_planning + # with pytest.warns(DeprecationWarning, match="enable_nd_planning"): + # planning_state = config.enable_nd_planning try: for nd_planning in states: try: # enable or disable nd planning - # config.enable_nd_planning = nd_planning + # with pytest.warns( + # DeprecationWarning, match="enable_nd_planning" + # ): + # config.enable_nd_planning = nd_planning kw[name] = nd_planning impl(self, *args, **kw) @@ -50,7 +64,10 @@ def test_func(self, *args, **kw): raise finally: # restore original global planning state - # config.enable_nd_planning = planning_state + # with pytest.warns( + # DeprecationWarning, match="enable_nd_planning" + # ): + # config.enable_nd_planning = planning_state pass return test_func @@ -71,8 +88,8 @@ def multi_gpu_config(gpu_configs=None): def decorator(impl): @functools.wraps(impl) def test_func(self, *args, **kw): - use_multi_gpus = config.use_multi_gpus - _devices = config._devices + use_multi_gpus = config._use_multi_gpus.get() + _devices = config._devices.get() try: for gpus in gpu_configs: @@ -81,23 +98,21 @@ def test_func(self, *args, **kw): assert nGPUs >= 2, "Must use at least two gpus" config.use_multi_gpus = True config.set_cufft_gpus(gpus) - self.gpus = gpus impl(self, *args, **kw) except Exception: print("GPU config is:", gpus) raise finally: - config.use_multi_gpus = use_multi_gpus - config._devices = _devices - del self.gpus + config._use_multi_gpus.set(use_multi_gpus) + config._devices.set(_devices) return test_func return decorator -# @testing.with_requires("numpy>=2.0") +@testing.with_requires("numpy>=2.0") @pytest.mark.usefixtures("skip_forward_backward") @testing.parameterize( *testing.product( @@ -138,7 
+153,7 @@ def test_ifft(self, xp, dtype): return xp.fft.ifft(a, n=self.n, norm=self.norm) -# @testing.with_requires("numpy>=2.0") +@testing.with_requires("numpy>=2.0") @testing.parameterize( *testing.product( { @@ -179,19 +194,6 @@ def test_ifft(self, xp, dtype): return xp.fft.ifft(a, axis=self.axis) -# See #3757 and NVIDIA internal ticket 3093094 -def _skip_multi_gpu_bug(shape, gpus): - # avoid CUDA 11.0 (will be fixed by CUDA 11.2) bug triggered by - # - batch = 1 - # - gpus = [1, 0] - if ( - 11000 <= cupy.cuda.runtime.runtimeGetVersion() < 11020 - and len(shape) == 1 - and gpus == [1, 0] - ): - pytest.skip("avoid CUDA 11 bug") - - # Almost identical to the TestFft class, except that # 1. multi-GPU cuFFT is used # 2. the tested parameter combinations are adjusted to meet the requirements @@ -208,6 +210,9 @@ def _skip_multi_gpu_bug(shape, gpus): ) @pytest.mark.skip("multi GPU is not supported") @testing.multi_gpu(2) +# @pytest.mark.skipif( +# cupy.cuda.runtime.is_hip, reason="hipFFT does not support multi-GPU FFT" +# ) class TestMultiGpuFft: @multi_gpu_config(gpu_configs=[[0, 1], [1, 0]]) @@ -216,8 +221,6 @@ class TestMultiGpuFft: rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False ) def test_fft(self, xp, dtype): - _skip_multi_gpu_bug(self.shape, self.gpus) - a = testing.shaped_random(self.shape, xp, dtype) return xp.fft.fft(a, n=self.n, norm=self.norm) @@ -230,8 +233,6 @@ def test_fft(self, xp, dtype): @testing.with_requires("numpy!=1.17.0") @testing.with_requires("numpy!=1.17.1") def test_ifft(self, xp, dtype): - _skip_multi_gpu_bug(self.shape, self.gpus) - a = testing.shaped_random(self.shape, xp, dtype) return xp.fft.ifft(a, n=self.n, norm=self.norm) @@ -251,6 +252,9 @@ def test_ifft(self, xp, dtype): ) @pytest.mark.skip("multi GPU is not supported") @testing.multi_gpu(2) +# @pytest.mark.skipif( +# cupy.cuda.runtime.is_hip, reason="hipFFT does not support multi-GPU FFT" +# ) class TestMultiGpuFftOrder: @multi_gpu_config(gpu_configs=[[0, 1], [1, 0]]) @testing.for_complex_dtypes() @@ -258,8 +262,6 @@ class TestMultiGpuFftOrder: rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False ) def test_fft(self, xp, dtype): - _skip_multi_gpu_bug(self.shape, self.gpus) - a = testing.shaped_random(self.shape, xp, dtype) if self.data_order == "F": a = xp.asfortranarray(a) @@ -271,8 +273,6 @@ def test_fft(self, xp, dtype): rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False ) def test_ifft(self, xp, dtype): - _skip_multi_gpu_bug(self.shape, self.gpus) - a = testing.shaped_random(self.shape, xp, dtype) if self.data_order == "F": a = xp.asfortranarray(a) @@ -348,9 +348,13 @@ def test_default_fft_func(self, enable_nd): @pytest.mark.skip("memory management is not supported") @testing.with_requires("numpy>=2.0") +# @pytest.mark.skipif( +# 10010 <= cupy.cuda.runtime.runtimeGetVersion() <= 11010, +# reason="avoid a cuFFT bug (cupy/cupy#3777)", +# ) @testing.slow class TestFftAllocate: - + # @pytest.mark.thread_unsafe(reason="does large allocations") def test_fft_allocate(self): # Check CuFFTError is not raised when the GPU memory is enough. 
# See https://github.com/cupy/cupy/issues/1063 @@ -368,7 +372,7 @@ def test_fft_allocate(self): cupy.fft.config.clear_plan_cache() -# @testing.with_requires("numpy>=2.0") +@testing.with_requires("numpy>=2.0") @pytest.mark.usefixtures("skip_forward_backward") @testing.parameterize( *( @@ -413,7 +417,8 @@ class TestFft2: type_check=has_support_aspect64(), ) def test_fft2(self, xp, dtype, order, enable_nd): - # assert config.enable_nd_planning == enable_nd + # with pytest.warns(DeprecationWarning, match="enable_nd_planning"): + # assert config.enable_nd_planning == enable_nd a = testing.shaped_random(self.shape, xp, dtype) if order == "F": a = xp.asfortranarray(a) @@ -440,7 +445,8 @@ def test_fft2(self, xp, dtype, order, enable_nd): type_check=has_support_aspect64(), ) def test_ifft2(self, xp, dtype, order, enable_nd): - # assert config.enable_nd_planning == enable_nd + # with pytest.warns(DeprecationWarning, match="enable_nd_planning"): + # assert config.enable_nd_planning == enable_nd a = testing.shaped_random(self.shape, xp, dtype) if order == "F": a = xp.asfortranarray(a) @@ -457,7 +463,7 @@ def test_ifft2(self, xp, dtype, order, enable_nd): return out -# @testing.with_requires("numpy>=2.0") +@testing.with_requires("numpy>=2.0") @pytest.mark.usefixtures("skip_forward_backward") @testing.parameterize( *( @@ -503,7 +509,8 @@ class TestFftn: type_check=has_support_aspect64(), ) def test_fftn(self, xp, dtype, order, enable_nd): - # assert config.enable_nd_planning == enable_nd + # with pytest.warns(DeprecationWarning, match="enable_nd_planning"): + # assert config.enable_nd_planning == enable_nd a = testing.shaped_random(self.shape, xp, dtype) if order == "F": a = xp.asfortranarray(a) @@ -530,7 +537,8 @@ def test_fftn(self, xp, dtype, order, enable_nd): type_check=has_support_aspect64(), ) def test_ifftn(self, xp, dtype, order, enable_nd): - # assert config.enable_nd_planning == enable_nd + # with pytest.warns(DeprecationWarning, match="enable_nd_planning"): + # assert config.enable_nd_planning == enable_nd a = testing.shaped_random(self.shape, xp, dtype) if order == "F": a = xp.asfortranarray(a) @@ -595,7 +603,8 @@ def skip_buggy(self): rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False ) def test_fftn(self, xp, dtype, enable_nd): - assert config.enable_nd_planning == enable_nd + with pytest.warns(DeprecationWarning, match="enable_nd_planning"): + assert config.enable_nd_planning == enable_nd a = testing.shaped_random(self.shape, xp, dtype) if xp is np: @@ -613,7 +622,8 @@ def test_fftn(self, xp, dtype, enable_nd): rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False ) def test_ifftn(self, xp, dtype, enable_nd): - assert config.enable_nd_planning == enable_nd + with pytest.warns(DeprecationWarning, match="enable_nd_planning"): + assert config.enable_nd_planning == enable_nd a = testing.shaped_random(self.shape, xp, dtype) if xp is np: @@ -635,7 +645,8 @@ def test_fftn_error_on_wrong_plan(self, dtype, enable_nd): from cupy.fft import fftn from cupyx.scipy.fftpack import get_fft_plan - assert config.enable_nd_planning == enable_nd + with pytest.warns(DeprecationWarning, match="enable_nd_planning"): + assert config.enable_nd_planning == enable_nd # can't get a plan, so skip if self.axes is not None: @@ -747,6 +758,9 @@ def test_fft_error_on_wrong_plan(self, dtype): ) @pytest.mark.skip("get_fft_plan() is not supported") @testing.multi_gpu(2) +# @pytest.mark.skipif( +# cupy.cuda.runtime.is_hip, reason="hipFFT does not support multi-GPU FFT" +# ) class 
TestMultiGpuPlanCtxManagerFft: @multi_gpu_config(gpu_configs=[[0, 1], [1, 0]]) @@ -755,8 +769,6 @@ class TestMultiGpuPlanCtxManagerFft: rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False ) def test_fft(self, xp, dtype): - _skip_multi_gpu_bug(self.shape, self.gpus) - a = testing.shaped_random(self.shape, xp, dtype) if xp is np: @@ -776,8 +788,6 @@ def test_fft(self, xp, dtype): rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False ) def test_ifft(self, xp, dtype): - _skip_multi_gpu_bug(self.shape, self.gpus) - a = testing.shaped_random(self.shape, xp, dtype) if xp is np: @@ -1029,7 +1039,8 @@ class TestRfft2: type_check=has_support_aspect64(), ) def test_rfft2(self, xp, dtype, order, enable_nd): - # assert config.enable_nd_planning == enable_nd + # with pytest.warns(DeprecationWarning, match="enable_nd_planning"): + # assert config.enable_nd_planning == enable_nd a = testing.shaped_random(self.shape, xp, dtype) if order == "F": a = xp.asfortranarray(a) @@ -1046,14 +1057,21 @@ def test_rfft2(self, xp, dtype, order, enable_nd): type_check=has_support_aspect64(), ) def test_irfft2(self, xp, dtype, order, enable_nd): - # assert config.enable_nd_planning == enable_nd + # with pytest.warns(DeprecationWarning, match="enable_nd_planning"): + # assert config.enable_nd_planning == enable_nd + # if ( + # 10020 >= cupy.cuda.runtime.runtimeGetVersion() >= 10010 + # and int(cupy.cuda.device.get_compute_capability()) < 70 + # and _size_last_transform_axis(self.shape, self.s, self.axes) == 2 + # ): + # pytest.skip("work-around for cuFFT issue") a = testing.shaped_random(self.shape, xp, dtype) if order == "F": a = xp.asfortranarray(a) return xp.fft.irfft2(a, s=self.s, axes=self.axes, norm=self.norm) -# @testing.with_requires("numpy>=2.0") +@testing.with_requires("numpy>=2.0") @testing.parameterize( {"shape": (3, 4), "s": None, "axes": (), "norm": None}, {"shape": (2, 3, 4), "s": None, "axes": (), "norm": None}, @@ -1115,7 +1133,8 @@ class TestRfftn: type_check=has_support_aspect64(), ) def test_rfftn(self, xp, dtype, order, enable_nd): - # assert config.enable_nd_planning == enable_nd + # with pytest.warns(DeprecationWarning, match="enable_nd_planning"): + # assert config.enable_nd_planning == enable_nd a = testing.shaped_random(self.shape, xp, dtype) if order == "F": a = xp.asfortranarray(a) @@ -1132,7 +1151,14 @@ def test_rfftn(self, xp, dtype, order, enable_nd): type_check=has_support_aspect64(), ) def test_irfftn(self, xp, dtype, order, enable_nd): - # assert config.enable_nd_planning == enable_nd + # with pytest.warns(DeprecationWarning, match="enable_nd_planning"): + # assert config.enable_nd_planning == enable_nd + # if ( + # 10020 >= cupy.cuda.runtime.runtimeGetVersion() >= 10010 + # and int(cupy.cuda.device.get_compute_capability()) < 70 + # and _size_last_transform_axis(self.shape, self.s, self.axes) == 2 + # ): + # pytest.skip("work-around for cuFFT issue") a = testing.shaped_random(self.shape, xp, dtype) if order == "F": a = xp.asfortranarray(a) @@ -1182,7 +1208,8 @@ def skip_buggy(self): rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False ) def test_rfftn(self, xp, dtype, enable_nd): - assert config.enable_nd_planning == enable_nd + with pytest.warns(DeprecationWarning, match="enable_nd_planning"): + assert config.enable_nd_planning == enable_nd a = testing.shaped_random(self.shape, xp, dtype) if xp is np: @@ -1194,13 +1221,17 @@ def test_rfftn(self, xp, dtype, enable_nd): with plan: return xp.fft.rfftn(a, s=self.s, axes=self.axes, 
norm=self.norm)

+    # @pytest.mark.skipif(
+    #     cupy.cuda.runtime.is_hip, reason="hipFFT's PlanNd for C2R is buggy"
+    # )
     @nd_planning_states()
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
         rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
     )
     def test_irfftn(self, xp, dtype, enable_nd):
-        assert config.enable_nd_planning == enable_nd
+        with pytest.warns(DeprecationWarning, match="enable_nd_planning"):
+            assert config.enable_nd_planning == enable_nd
         a = testing.shaped_random(self.shape, xp, dtype)
         if xp is np:
             return xp.fft.irfftn(a, s=self.s, axes=self.axes, norm=self.norm)
@@ -1282,7 +1313,7 @@ def test_ifftn_orders(self, dtype, enable_nd):
             pass


-# @testing.with_requires("numpy>=2.0")
+@testing.with_requires("numpy>=2.0")
 @testing.parameterize(
     {"shape": (3, 4), "s": None, "axes": (), "norm": None},
     {"shape": (2, 3, 4), "s": None, "axes": (), "norm": None},
@@ -1343,7 +1374,7 @@ def test_ihfft(self, xp, dtype):
         return xp.fft.ihfft(a, n=self.n, norm=self.norm)


-# @testing.with_requires("numpy>=2.0")
+@testing.with_requires("numpy>=2.0")
 @testing.parameterize(
     {"n": 1, "d": 1},
     {"n": 10, "d": 0.5},
@@ -1372,7 +1403,7 @@ def test_rfftfreq(self, xp, dtype):
         return xp.fft.rfftfreq(self.n, self.d)


-# @testing.with_requires("numpy>=2.0")
+@testing.with_requires("numpy>=2.0")
 @testing.parameterize(
     {"shape": (5,), "axes": None},
     {"shape": (5,), "axes": 0},

From 8f8a1f9ef2c712e79a316542f8b59e4535ea0e85 Mon Sep 17 00:00:00 2001
From: Anton Volkov
Date: Mon, 19 Jan 2026 07:45:16 -0800
Subject: [PATCH 3/3] Update tests for random.choice

---
 .../cupy/random_tests/test_generator.py | 160 ++++++++++++++++++
 1 file changed, 160 insertions(+)

diff --git a/dpnp/tests/third_party/cupy/random_tests/test_generator.py b/dpnp/tests/third_party/cupy/random_tests/test_generator.py
index 9d1a3d233473..abb58df07af9 100644
--- a/dpnp/tests/third_party/cupy/random_tests/test_generator.py
+++ b/dpnp/tests/third_party/cupy/random_tests/test_generator.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import functools
 import os
 import threading
@@ -850,6 +852,7 @@ def test_goodness_of_fit(self):
         assert _hypothesis.chi_square_test(counts, expected)

     @_condition.repeat(3, 10)
+    # @pytest.mark.xfail(runtime.is_hip, reason="ROCm/HIP may have a bug")
     def test_goodness_of_fit_2(self):
         vals = self.generate(3, (5, 20), True, [0.3, 0.3, 0.4]).get()
         counts = numpy.histogram(vals, bins=numpy.arange(4))[0]
@@ -929,6 +932,163 @@ def test_bound(self):
         assert numpy.unique(val).size == val.size

+
+@testing.parameterize(
+    # Edge cases with small domain sizes
+    {"a": 0, "size": 0},
+    {"a": 1, "size": 1},
+    {"a": 2, "size": 1},
+    {"a": 256, "size": 100},  # Minimum cipher bits threshold
+    {"a": 257, "size": 100},
+    # Large-scale uniqueness
+    {"a": 100, "size": 50},
+    {"a": 1000, "size": 500},
+    {"a": 10000, "size": 5000},
+    {"a": 100000, "size": 50000},
+    # Full input permutation
+    {"a": 10, "size": 10},
+    {"a": 100, "size": 100},
+    {"a": 1000, "size": 1000},
+    # Power of 2
+    {"a": 2**8, "size": 100},
+    {"a": 2**10, "size": 500},
+    {"a": 2**16, "size": 1000},
+    {"a": 2**20, "size": 5000},
+    {"a": 2**24, "size": 10000},
+    # Just below power of 2
+    {"a": 2**8 - 1, "size": 100},
+    {"a": 2**16 - 1, "size": 1000},
+    {"a": 2**20 - 1, "size": 5000},
+    # Just above power of 2
+    {"a": 2**8 + 1, "size": 100},
+    {"a": 2**16 + 1, "size": 1000},
+    {"a": 2**20 + 1, "size": 5000},
+    # Test multi-dimensional shapes.
+ {"a": 6, "size": (2, 3)}, + {"a": 32, "size": (4, 5)}, + {"a": 120, "size": (5, 4, 5)}, +) +@testing.fix_random() +class TestChoiceReplaceFalseLargeScale(RandomGeneratorTestCase): + """Test large-scale uniqueness for Feistel bijection implementation.""" + + target_method = "choice" + + def test_uniqueness_and_bounds(self): + """Test that samples have no duplicates and correct bounds.""" + val = self.generate(a=self.a, size=self.size, replace=False).get() + size = self.size if isinstance(self.size, tuple) else (self.size,) + + # Check shape + assert val.shape == size + + # Check bounds + assert (0 <= val).all() + assert (val < self.a).all() + + # Check uniqueness + val_flat = numpy.asarray(val).flatten() + assert ( + numpy.unique(val_flat).size == val_flat.size + ), "Found duplicate values in replace=False sample" + + +@testing.fix_random() +class TestChoiceReplaceFalseStatistical(RandomGeneratorTestCase): + """Statistical tests for uniformity of Feistel bijection.""" + + target_method = "choice" + + @_condition.repeat(3) + def test_small_domain_uniformity(self): + """Chi-square test for uniform sampling in small domain.""" + # Sample from domain of size 10, taking 5 elements + # Repeat many times and check each index appears uniformly + n = 10 + sample_size = 5 + n_trials = 1000 + + counts = cupy.zeros(n, dtype=int) + vals = self.generate_many( + n, size=sample_size, replace=False, _count=n_trials + ) + for val in vals: + counts[val] += 1 + counts = counts.get() + + # Each index should appear ~500 times (5/10 * 1000) + expected = numpy.ones(n, dtype=int) * (sample_size * n_trials // n) + assert _hypothesis.chi_square_test(counts, expected) + + @_condition.repeat(3, 10) + def test_permutation_variability(self): + """Test that repeated full permutations are different.""" + n = 20 + n_trials = 10 + + vals = self.generate_many(n, size=n, replace=False, _count=n_trials) + perms = cupy.vstack(vals) + + # Should have multiple unique permutations + unique_perms = cupy.unique(perms, axis=0) + assert ( + len(unique_perms) == n_trials + ), "Permutations should vary across multiple calls" + + +@testing.slow +@testing.fix_random() +class TestChoiceReplaceFalseVeryLargeDomain(unittest.TestCase): + """Test memory efficiency with very large domains.""" + + def setUp(self): + self.rs = _generator.RandomState(seed=testing.generate_seed()) + + def test_large_domain_memory_efficiency(self): + """Test that very large domains don't allocate full arrays.""" + # This should NOT allocate a 2^30 element array + # If it did, it would require ~8GB of memory + a = 2**30 + size = 1000 + + val = self.rs.choice(a=a, size=size, replace=False).get() + + # Check bounds + assert (0 <= val).all() + assert (val < a).all() + + # Check uniqueness + assert numpy.unique(val).size == size + + def test_near_32bit_limit(self): + """Test at the 32-bit boundary.""" + # Current implementation supports up to 2^32 + a = 2**31 + size = 500 + + val = self.rs.choice(a=a, size=size, replace=False).get() + + # Check bounds + assert (0 <= val).all() + assert (val < a).all() + + # Check uniqueness + assert numpy.unique(val).size == size + + +@testing.fix_random() +class TestChoiceReplaceFalseDtypeConsistency(RandomGeneratorTestCase): + """Test output dtype consistency.""" + + target_method = "choice" + + def test_integer_input_dtype(self): + """Integer input should produce int64/long dtype.""" + val = self.generate(a=100, size=50, replace=False) + + # Should be 'l' (long) dtype, which is int64 on most platforms + assert val.dtype == 
numpy.dtype("l") or val.dtype == numpy.int64 + + @testing.fix_random() class TestGumbel(RandomGeneratorTestCase):
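# A minimal, self-contained sketch of the technique the TestChoiceReplaceFalse*
# classes above refer to: a "Feistel bijection" lets choice(a, size,
# replace=False) draw distinct indices from a huge domain (e.g. a = 2**30)
# without materializing range(a). Everything below is illustrative only -- the
# helper names, the 4-round count, and the multiplicative-hash round function
# are assumptions for the sketch, not the actual dpnp/CuPy kernel.
import numpy as np


def _feistel_permute(x, keys, half_bits):
    # Balanced Feistel network over 2 * half_bits bits; the construction is a
    # bijection on [0, 2**(2 * half_bits)) regardless of the round function.
    mask = (1 << half_bits) - 1
    left, right = x >> half_bits, x & mask
    for k in keys:
        left, right = right, left ^ (((right * 0x9E3779B1) ^ k) & mask)
    return (left << half_bits) | right


def unique_choice_sketch(a, size, seed=0):
    # Return `size` distinct integers from range(a) using O(size) memory.
    assert 0 <= size <= a
    keys = [int(k) for k in np.random.default_rng(seed).integers(0, 2**32, 4)]
    # Smallest balanced Feistel domain 2**(2 * half_bits) that covers [0, a).
    half_bits = max(1, ((max(a, 2) - 1).bit_length() + 1) // 2)
    out = []
    for i in range(size):
        x = _feistel_permute(i, keys, half_bits)
        while x >= a:  # cycle-walking keeps the mapping a bijection on [0, a)
            x = _feistel_permute(x, keys, half_bits)
        out.append(x)
    return np.asarray(out, dtype=np.int64)


# Example: 1000 distinct samples from a 2**30-sized domain, with no need for
# the ~8 GB buffer a full int64 permutation of range(2**30) would require.
# assert np.unique(unique_choice_sketch(2**30, 1000)).size == 1000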