# Source code for hpctestlib.microbenchmarks.gpu.gpu_burn

# Copyright 2016-2024 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
# ReFrame Project Developers. See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: BSD-3-Clause

import os

import reframe as rfm
import reframe.utility.typecheck as typ
import reframe.utility.sanity as sn



# [docs] -- link marker left over from the rendered HTML page, not code
class gpu_burn_build(rfm.CompileOnlyRegressionTest, pin_prefix=True):
    '''Fixture for building the GPU burn benchmark.

    The build variant and the target GPU architecture are controlled by the
    :attr:`gpu_build` and :attr:`gpu_arch` variables. Whichever of the two is
    left unset is filled in right before the compilation stage, from the
    features of the current environment and from the GPU devices of the
    current partition, respectively.

    .. list-table:: Summary
       :widths: 20 40 40
       :header-rows: 1

       * - Variables
         - Parameters
         - Fixtures
       * - - :attr:`gpu_arch`
           - :attr:`gpu_build`
         - *None*
         - *None*
    '''

    #: Set the build option to either ``'cuda'`` or ``'hip'``.
    #:
    #: If left unset, the build option is chosen before compiling based on
    #: whether the current environment has the ``cuda`` or the ``hip``
    #: feature (``cuda`` is checked first).
    #:
    #: :type: :class:`str` or :obj:`None`
    #: :default: ``None``
    gpu_build = variable(str, type(None), value=None)

    #: Set the GPU architecture.
    #:
    #: This variable will be passed to the compiler to generate the
    #: arch-specific code. If left unset, the ``arch`` of the first GPU
    #: device of the current partition is used, when such a device exists.
    #:
    #: :type: :class:`str` or :obj:`None`
    #: :default: ``None``
    gpu_arch = variable(str, type(None), value=None)

    descr = 'GPU burn test build fixture'
    sourcesdir = 'src/gpu_burn'
    build_system = 'Make'

    @run_before('compile')
    def setup_build(self):
        '''Pick the makefile and the arch-specific compiler flags.

        :raises ValueError: if :attr:`gpu_build` is neither ``'cuda'`` nor
            ``'hip'`` once the automatic detection has run.
        '''
        partition = self.current_partition
        environ = self.current_environ

        if self.gpu_build is None:
            # No explicit request: derive the variant from the environment
            # features, checking 'cuda' before 'hip'
            for candidate in ('cuda', 'hip'):
                if candidate in environ.features:
                    self.gpu_build = candidate
                    break

        devices = partition.select_devices('gpu')
        if self.gpu_arch is None and devices:
            # All devices are assumed to share the architecture of the
            # first one
            self.gpu_arch = devices[0].arch

        variant = self.gpu_build
        arch = self.gpu_arch
        if variant == 'cuda':
            makefile = 'makefile.cuda'
            if arch:
                # The virtual architecture name is obtained from the real
                # one by swapping the 'sm_' prefix for 'compute_'
                virtual_arch = arch.replace('sm_', 'compute_')
                arch_flags = [f'-arch={virtual_arch}', f'-code={arch}']
            else:
                arch_flags = None
        elif variant == 'hip':
            makefile = 'makefile.hip'
            arch_flags = [f'--amdgpu-target={arch}'] if arch else None
        else:
            raise ValueError(f'unknown build variant: {self.gpu_build!r}')

        self.build_system.makefile = makefile
        if arch_flags is not None:
            # Without a known architecture the compiler flags are left
            # untouched
            self.build_system.cxxflags = arch_flags

    @sanity_function
    def valid_build(self):
        '''Always pass; no additional check is applied to the build.'''
        return True




# [docs] -- link marker left over from the rendered HTML page, not code
@rfm.simple_test
class gpu_burn_check(rfm.RunOnlyRegressionTest):
    '''GPU burn benchmark.

    This benchmark continuously runs GEMM, either single or double precision,
    on a selected set of GPUs on the node where the benchmark runs.

    The floating point precision of the computations, the duration of the
    benchmark as well as the list of GPU devices that the benchmark will run
    on can be controlled through test variables.

    This benchmark tries to build the benchmark code through the
    :class:`gpu_burn_build` fixture.

    This benchmark sets the
    :attr:`~reframe.core.pipeline.RegressionTest.num_gpus_per_node` test
    attribute, if not already set, based on the number of devices with ``type
    == 'gpu'`` defined in the corresponding partition configuration.
    Similarly, this benchmark will use the ``arch`` device configuration
    attribute to set the :attr:`gpu_arch` variable, if this is not already set
    by the user.

    .. list-table:: Summary
       :widths: 10 10 20 20 20 20
       :header-rows: 1

       * - Variables
         - Parameters
         - Metrics
         - Fixtures
         - System features
         - Environment features
       * - - :attr:`use_dp`
           - :attr:`duration`
           - :attr:`devices`
         - *None*
         - - :obj:`gpu_perf_min`
           - :obj:`gpu_temp_max`
         - - :class:`gpu_burn_build` :obj:`[E]`
         - ``+gpu``
         - ``+cuda`` OR ``+hip``

    '''

    #: Use double-precision arithmetic when running the benchmark.
    #:
    #: :type: :class:`bool`
    #: :default: ``True``
    use_dp = variable(typ.Bool, value=True)

    #: Duration of the benchmark in seconds.
    #:
    #: :type: :class:`int`
    #: :default: ``10``
    duration = variable(int, value=10)

    #: List of device IDs to run the benchmark on.
    #:
    #: If empty, the benchmark will run on all the available devices.
    #:
    #: :type: :class:`List[int]`
    #: :default: ``[]``
    devices = variable(typ.List[int], value=[])

    num_tasks = 1
    num_tasks_per_node = 1

    descr = 'GPU burn test'
    build_system = 'Make'
    executable = 'gpu_burn.x'

    # The fixture to build the benchmark
    #
    # :type: :class:`gpu_burn_build`
    # :scope: *environment*
    gpu_burn_binaries = fixture(gpu_burn_build, scope='environment')

    valid_systems = ['+gpu']
    valid_prog_environs = ['+cuda', '+hip']

    @run_before('run')
    def set_exec_opts(self):
        '''Translate the test variables into command-line options.

        ``-d`` is added when :attr:`use_dp` is set and ``-D`` followed by a
        comma-separated list of IDs is added when :attr:`devices` is not
        empty. The :attr:`duration` is always appended last.
        '''
        if self.use_dp:
            self.executable_opts += ['-d']

        if self.devices:
            self.executable_opts += ['-D',
                                     ','.join(str(x) for x in self.devices)]

        self.executable_opts += [str(self.duration)]

    @run_before('run')
    def add_exec_prefix(self):
        '''Prefix the executable with the stage directory of the fixture.'''
        self.executable = os.path.join(self.gpu_burn_binaries.stagedir,
                                       self.executable)

    @run_before('run')
    def set_num_gpus_per_node(self):
        '''Set the number of GPUs per node, unless it is already set.

        The value is taken from the ``num_devices`` attribute of the first
        GPU device of the current partition; if the partition defines no GPU
        devices, the attribute is left untouched.
        '''
        if self.num_gpus_per_node is not None:
            return

        gpu_devices = self.current_partition.select_devices('gpu')
        if gpu_devices:
            self.num_gpus_per_node = gpu_devices[0].num_devices

    @sanity_function
    def assert_sanity(self):
        '''Check that every selected device is reported as ``OK``.

        The number of ``GPU <id>(OK)`` entries found in the standard output
        must match the device count printed in the
        ``==> devices selected (<n>)`` line.
        '''
        num_gpus_detected = sn.extractsingle(
            r'==> devices selected \((\d+)\)', self.stdout, 1, int
        )
        return sn.assert_eq(
            sn.count(sn.findall(r'GPU\s+\d+\(OK\)', self.stdout)),
            num_gpus_detected
        )

    def _extract_metric(self, metric):
        '''Extract one metric for every ``OK`` device from the output.

        :arg metric: The name of the capturing group to extract, either
            ``'perf'`` (the value reported in GF/s) or ``'temp'`` (the value
            reported in Celsius).
        :returns: The values matched in the standard output, converted to
            :class:`float`.
        '''
        return sn.extractall(
            r'GPU\s+\d+\(OK\):\s+(?P<perf>\S+)\s+GF/s\s+'
            r'(?P<temp>\S+)\s+Celsius', self.stdout, metric, float
        )

    @performance_function('Gflop/s')
    def gpu_perf_min(self):
        '''Lowest performance recorded among all the selected devices.'''
        return sn.min(self._extract_metric('perf'))

    @performance_function('degC')
    def gpu_temp_max(self):
        '''Maximum temperature recorded among all the selected devices.'''
        return sn.max(self._extract_metric('temp'))