geopm 3.1.1.dev579+g75d9c8b9
GEOPM - Global Extensible Open Power Manager
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
LevelZeroDevicePoolImp.hpp
Go to the documentation of this file.
1/*
2 * Copyright (c) 2015 - 2025 Intel Corporation
3 * SPDX-License-Identifier: BSD-3-Clause
4 */
5
6#ifndef LEVELZERODEVICEPOOLIMP_HPP_INCLUDE
7#define LEVELZERODEVICEPOOLIMP_HPP_INCLUDE
8
9#include <string>
10#include <cstdint>
11#include <map>
12
14#include "LevelZero.hpp"
15
16#include "geopm_time.h"
17
18namespace geopm
19{
21 {
22 public:
25 virtual ~LevelZeroDevicePoolImp() = default;
26 int num_gpu(int domain_type) const override;
27 double frequency_status(int domain, unsigned int domain_idx,
28 int l0_domain) const override;
29 double frequency_efficient(int domain, unsigned int domain_idx,
30 int l0_domain) const override;
31 double frequency_min(int domain, unsigned int domain_idx,
32 int l0_domain) const override;
33 double frequency_max(int domain, unsigned int domain_idx,
34 int l0_domain) const override;
35 double frequency_step(int domain, unsigned int domain_idx,
36 int l0_domain) const override;
37 uint32_t frequency_throttle_reasons(int domain, unsigned int domain_idx,
38 int l0_domain) const override;
39 std::pair <double, double> frequency_range(int domain,
40 unsigned int domain_idx,
41 int l0_domain) const override;
42 double temperature_max(int domain, unsigned int domain_idx,
43 int l0_domain) const override;
44 std::pair<uint64_t, uint64_t> active_time_pair(int domain,
45 unsigned int device_idx,
46 int l0_domain) const override;
47 double active_time(int domain, unsigned int device_idx,
48 int l0_domain) const override;
49 double active_time_timestamp(int domain, unsigned int device_idx,
50 int l0_domain) const override;
51 int32_t power_limit_tdp(int domain, unsigned int domain_idx,
52 int l0_domain) const override;
53 int32_t power_limit_min(int domain, unsigned int domain_idx,
54 int l0_domain) const override;
55 int32_t power_limit_max(int domain, unsigned int domain_idx,
56 int l0_domain) const override;
57 std::pair<uint64_t, uint64_t> energy_pair(int domain, unsigned int domain_idx,
58 int l0_domain) const override;
59 uint64_t energy(int domain, unsigned int domain_idx, int l0_domain) const override;
60 uint64_t energy_timestamp(int domain, unsigned int domain_idx,
61 int l0_domain) const override;
62 double performance_factor(int domain,
63 unsigned int domain_idx,
64 int l0_domain) const override;
65 void frequency_control(int domain, unsigned int domain_idx,
66 int l0_domain, double range_min,
67 double range_max) const override;
68 void performance_factor_control(int domain, unsigned int domain_idx,
69 int l0_domain,
70 double setting) const override;
71 double ras_reset_count_correctable(int domain, unsigned int domain_idx,
72 int l0_domain) const override;
73 double ras_programming_errcount_correctable(int domain, unsigned int domain_idx,
74 int l0_domain) const override;
75 double ras_driver_errcount_correctable(int domain, unsigned int domain_idx,
76 int l0_domain) const override;
77 double ras_compute_errcount_correctable(int domain, unsigned int domain_idx,
78 int l0_domain) const override;
79 double ras_noncompute_errcount_correctable(int domain, unsigned int domain_idx,
80 int l0_domain) const override;
81 double ras_cache_errcount_correctable(int domain, unsigned int domain_idx,
82 int l0_domain) const override;
83 double ras_display_errcount_correctable(int domain, unsigned int domain_idx,
84 int l0_domain) const override;
85 double ras_reset_count_uncorrectable(int domain, unsigned int domain_idx,
86 int l0_domain) const override;
87 double ras_programming_errcount_uncorrectable(int domain, unsigned int domain_idx,
88 int l0_domain) const override;
89 double ras_driver_errcount_uncorrectable(int domain, unsigned int domain_idx,
90 int l0_domain) const override;
91 double ras_compute_errcount_uncorrectable(int domain, unsigned int domain_idx,
92 int l0_domain) const override;
93 double ras_noncompute_errcount_uncorrectable(int domain, unsigned int domain_idx,
94 int l0_domain) const override;
95 double ras_cache_errcount_uncorrectable(int domain, unsigned int domain_idx,
96 int l0_domain) const override;
97 double ras_display_errcount_uncorrectable(int domain, unsigned int domain_idx,
98 int l0_domain) const override;
99 private:
100 const LevelZero &m_levelzero;
101
102 void check_idx_range(int domain, unsigned int domain_idx) const;
103 void check_domain_exists(int size, const char *func, int line) const;
104 std::pair<unsigned int, unsigned int> subdevice_device_conversion(unsigned int idx) const;
105 mutable std::map<int, std::vector<uint64_t> > m_active_time_last; // Map from l0_domain to vector over gpu chips
106 mutable std::map<int, std::vector<uint64_t> > m_active_time_rollover; // Map from l0_domain to vector over gpu chips
107 };
108}
109#endif
Definition LevelZeroDevicePool.hpp:19
Definition LevelZeroDevicePoolImp.hpp:21
double frequency_min(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero device minimum frequency in MHz.
Definition LevelZeroDevicePool.cpp:127
void performance_factor_control(int domain, unsigned int domain_idx, int l0_domain, double setting) const override
Set performance factor for LevelZero device.
Definition LevelZeroDevicePool.cpp:493
double ras_compute_errcount_correctable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of correctable errors that have occurred in the compute accelerator...
Definition LevelZeroDevicePool.cpp:568
double ras_display_errcount_correctable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of correctable errors that have occurred in the display.
Definition LevelZeroDevicePool.cpp:622
double ras_driver_errcount_correctable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of low level driver communication correctable errors that have occurred.
Definition LevelZeroDevicePool.cpp:550
double frequency_status(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero device actual frequency in MHz.
Definition LevelZeroDevicePool.cpp:91
uint32_t frequency_throttle_reasons(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero device frequency throttle reasons.
Definition LevelZeroDevicePool.cpp:191
double temperature_max(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero domain maximum temperature in Celsius.
Definition LevelZeroDevicePool.cpp:229
double performance_factor(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero device performance factor.
Definition LevelZeroDevicePool.cpp:457
double ras_cache_errcount_correctable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of correctable errors that have occurred in caches (L1/L3/register ...
Definition LevelZeroDevicePool.cpp:604
double ras_cache_errcount_uncorrectable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of uncorrectable errors that have occurred in caches (L1/L3/registe...
Definition LevelZeroDevicePool.cpp:732
LevelZeroDevicePoolImp()
Definition LevelZeroDevicePool.cpp:31
double frequency_step(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero device frequency step in MHz.
Definition LevelZeroDevicePool.cpp:163
double ras_compute_errcount_uncorrectable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of uncorrectable errors that have occurred in the compute accelerat...
Definition LevelZeroDevicePool.cpp:696
double frequency_efficient(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero device efficient frequency in MHz.
Definition LevelZeroDevicePool.cpp:109
void frequency_control(int domain, unsigned int domain_idx, int l0_domain, double range_min, double range_max) const override
Set min and max frequency for LevelZero device.
Definition LevelZeroDevicePool.cpp:473
virtual ~LevelZeroDevicePoolImp()=default
std::pair< double, double > frequency_range(int domain, unsigned int domain_idx, int l0_domain) const override
Definition LevelZeroDevicePool.cpp:209
uint64_t energy(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero device energy in microjoules.
Definition LevelZeroDevicePool.cpp:430
int32_t power_limit_min(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero device minimum power limit in milliwatts.
Definition LevelZeroDevicePool.cpp:328
double frequency_max(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero device maximum frequency in MHz.
Definition LevelZeroDevicePool.cpp:145
double ras_noncompute_errcount_uncorrectable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of uncorrectable errors that have occurred in the fixed-function ac...
Definition LevelZeroDevicePool.cpp:714
double active_time_timestamp(int domain, unsigned int device_idx, int l0_domain) const override
Get the LevelZero device timestamp for the active time value in microseconds.
Definition LevelZeroDevicePool.cpp:283
double ras_programming_errcount_correctable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of correctable hardware exceptions generated by the way workloads h...
Definition LevelZeroDevicePool.cpp:532
std::pair< uint64_t, uint64_t > energy_pair(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero device energy in microjoules and timestamp in microseconds.
Definition LevelZeroDevicePool.cpp:369
double ras_reset_count_uncorrectable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of uncorrectable accelerator engine resets attempted by the driver.
Definition LevelZeroDevicePool.cpp:642
double ras_reset_count_correctable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of correctable accelerator engine resets attempted by the driver.
Definition LevelZeroDevicePool.cpp:514
double ras_programming_errcount_uncorrectable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of uncorrectable hardware exceptions generated by the way workloads...
Definition LevelZeroDevicePool.cpp:660
uint64_t energy_timestamp(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero device energy timestamp in microseconds.
Definition LevelZeroDevicePool.cpp:399
double active_time(int domain, unsigned int device_idx, int l0_domain) const override
Get the LevelZero device active time in microseconds.
Definition LevelZeroDevicePool.cpp:304
int32_t power_limit_max(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero device maximum power limit in milliwatts.
Definition LevelZeroDevicePool.cpp:341
double ras_driver_errcount_uncorrectable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of low level driver communication uncorrectable errors have occurre...
Definition LevelZeroDevicePool.cpp:678
double ras_noncompute_errcount_correctable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of correctable errors that have occurred in the fixed-function acce...
Definition LevelZeroDevicePool.cpp:586
int32_t power_limit_tdp(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero device default power limit in milliwatts.
Definition LevelZeroDevicePool.cpp:355
std::pair< uint64_t, uint64_t > active_time_pair(int domain, unsigned int device_idx, int l0_domain) const override
Get the LevelZero device active time and timestamp in microseconds.
Definition LevelZeroDevicePool.cpp:248
int num_gpu(int domain_type) const override
Number of GPUs on the platform.
Definition LevelZeroDevicePool.cpp:36
double ras_display_errcount_uncorrectable(int domain, unsigned int domain_idx, int l0_domain) const override
Get the LevelZero count of number of uncorrectable errors that have occurred in the display.
Definition LevelZeroDevicePool.cpp:750
Definition LevelZero.hpp:18
Definition Agg.cpp:20
const LevelZero & levelzero()
Definition LevelZero.cpp:29