6 #ifndef LEVELZEROIMP_HPP_INCLUDE
7 #define LEVELZEROIMP_HPP_INCLUDE
11 #include <level_zero/ze_api.h>
12 #include <level_zero/zes_api.h>
25 int num_gpu(
void)
const override;
26 int num_gpu(
int domain)
const override;
28 int domain)
const override;
30 int l0_domain,
int l0_domain_idx)
const override;
32 int l0_domain,
int l0_domain_idx)
const override;
33 double frequency_min(
unsigned int l0_device_idx,
int l0_domain,
34 int l0_domain_idx)
const override;
35 double frequency_max(
unsigned int l0_device_idx,
int l0_domain,
36 int l0_domain_idx)
const override;
39 int l0_domain_idx)
const override;
41 int l0_domain_idx)
const override;
44 int l0_domain_idx)
const override;
46 int l0_domain)
const override;
48 int l0_domain_idx)
const override;
51 int l0_domain,
int l0_domain_idx)
const override;
52 uint64_t
active_time(
unsigned int l0_device_idx,
int l0_domain,
53 int l0_domain_idx)
const override;
55 int l0_domain,
int l0_domain_idx)
const override;
57 int l0_domain)
const override;
58 std::pair<uint64_t, uint64_t>
energy_pair(
int geopm_domain,
unsigned int l0_device_idx,
59 int l0_domain_idx)
const override;
60 uint64_t
energy(
int geopm_domain,
unsigned int l0_device_idx,
61 int l0_domain,
int l0_domain_idx)
const override;
63 unsigned int l0_device_idx,
65 int l0_domain_idx)
const override;
67 int l0_domain)
const override;
69 int l0_domain,
int l0_domain_idx)
const override;
79 double range_max)
const override;
84 double setting)
const override;
86 int ras_domain_count(
unsigned int l0_device_idx,
int l0_domain)
const override;
89 int l0_domain_idx)
const override;
92 int l0_domain_idx)
const override;
95 int l0_domain_idx)
const override;
98 int l0_domain_idx)
const override;
101 int l0_domain_idx)
const override;
104 int l0_domain_idx)
const override;
107 int l0_domain_idx)
const override;
110 int l0_domain_idx)
const override;
113 int l0_domain_idx)
const override;
116 int l0_domain_idx)
const override;
119 int l0_domain_idx)
const override;
122 int l0_domain_idx)
const override;
125 int l0_domain_idx)
const override;
128 int l0_domain_idx)
const override;
132 M_ERROR_TYPE_CORRECTABLE,
133 M_ERROR_TYPE_UNCORRECTABLE,
137 struct m_frequency_s {
141 double efficient = 0;
143 uint32_t throttle_reasons = 0;
145 struct m_power_limit_s {
151 struct m_subdevice_s {
153 std::vector<std::vector<zes_freq_handle_t> > freq_domain;
154 std::vector<std::vector<zes_temp_handle_t> > temp_domain_max;
155 std::vector<std::vector<zes_engine_handle_t> > engine_domain;
156 mutable std::vector<std::vector<uint64_t> > cached_timestamp;
159 std::vector<std::vector<zes_perf_handle_t>> perf_domain;
161 uint32_t num_subdevice_power_domain;
162 std::vector<zes_pwr_handle_t> power_domain;
163 mutable std::vector<uint64_t> cached_energy_timestamp;
171 std::vector<zes_ras_handle_t> ras_domain;
175 struct m_device_info_s {
176 zes_device_handle_t device_handle;
177 ze_device_properties_t property;
178 uint32_t m_num_subdevice;
179 std::vector<zes_device_handle_t> subdevice_handle;
185 m_subdevice_s subdevice;
188 uint32_t num_device_power_domain;
189 zes_pwr_handle_t power_domain;
190 mutable uint64_t cached_energy_timestamp;
193 void ras_domain_cache(
unsigned int l0_device_idx);
194 void frequency_domain_cache(
unsigned int l0_device_idx);
195 void power_domain_cache(
unsigned int l0_device_idx);
196 void perf_domain_cache(
unsigned int l0_device_idx);
197 void engine_domain_cache(
unsigned int l0_device_idx);
198 void temperature_domain_cache(
unsigned int l0_device_idx);
199 void check_ze_result(ze_result_t ze_result,
int error, std::string message,
202 std::pair<double, double> frequency_min_max(
unsigned int l0_device_idx,
203 int l0_domain,
int l0_domain_idx)
const;
205 m_power_limit_s power_limit_default(
unsigned int l0_device_idx)
const;
206 uint64_t ras_status_helper(
unsigned int l0_device_idx,
209 zes_ras_error_cat_t errorcat,
210 int errortype)
const;
212 m_frequency_s frequency_status_helper(
unsigned int l0_device_idx,
213 int l0_domain,
int l0_domain_idx)
const;
216 uint32_t m_num_gpu_subdevice;
218 std::vector<ze_driver_handle_t> m_levelzero_driver;
219 std::vector<m_device_info_s> m_devices;
Definition: LevelZero.hpp:18
Definition: LevelZeroImp.hpp:21
virtual ~LevelZeroImp()=default
int frequency_domain_count(unsigned int l0_device_idx, int domain) const override
Get the number of LevelZero frequency domains of a certain type.
Definition: LevelZero.cpp:633
double ras_programming_errcount_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of correctable hardware exceptions generated by the way workloads h...
Definition: LevelZero.cpp:690
uint64_t active_time(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device active time in microseconds.
Definition: LevelZero.cpp:924
LevelZeroImp()
Definition: LevelZero.cpp:34
double ras_reset_count_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of uncorrectable accelerator engine resets attempted by the driver.
Definition: LevelZero.cpp:738
double frequency_efficient(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device efficient frequency in MHz.
Definition: LevelZero.cpp:820
std::pair< uint64_t, uint64_t > energy_pair(int geopm_domain, unsigned int l0_device_idx, int l0_domain_idx) const override
Get the LevelZero device energy and timestamp in microjoules and microseconds.
Definition: LevelZero.cpp:969
int num_gpu(void) const override
Number of GPUs on the platform.
Definition: LevelZero.cpp:590
double ras_compute_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of uncorrectable errors that have occurred in the compute accelerat...
Definition: LevelZero.cpp:762
double frequency_status(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device actual frequency in MHz.
Definition: LevelZero.cpp:814
double temperature_max(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device maximum temperature in Celsius.
Definition: LevelZero.cpp:906
int ras_domain_count(unsigned int l0_device_idx, int l0_domain) const override
Get the number of LevelZero RAS domains of a certain type.
Definition: LevelZero.cpp:666
std::pair< uint64_t, uint64_t > active_time_pair(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device active time and timestamp in microseconds.
Definition: LevelZero.cpp:930
double ras_noncompute_errcount_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of correctable errors that have occurred in the fixed-function acce...
Definition: LevelZero.cpp:714
int power_domain_count(int geopm_domain, unsigned int l0_device_idx, int l0_domain) const override
Get the number of LevelZero power domains of a certain type.
Definition: LevelZero.cpp:617
double ras_noncompute_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of uncorrectable errors that have occurred in the fixed-function ac...
Definition: LevelZero.cpp:770
double ras_display_errcount_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of correctable errors that have occurred in the display.
Definition: LevelZero.cpp:730
double ras_display_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of uncorrectable errors that have occurred in the display.
Definition: LevelZero.cpp:786
int temperature_domain_count(unsigned int l0_device_idx, int l0_domain) const override
Get the number of LevelZero temperature domains.
Definition: LevelZero.cpp:648
int engine_domain_count(unsigned int l0_device_idx, int domain) const override
Get the number of LevelZero engine domains.
Definition: LevelZero.cpp:638
void performance_factor_control(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx, double setting) const override
Set the performance factor for the LevelZero device.
Definition: LevelZero.cpp:1070
int32_t power_limit_max(unsigned int l0_device_idx) const override
Get the LevelZero device maximum power limit in milliwatts.
Definition: LevelZero.cpp:1021
int32_t power_limit_tdp(unsigned int l0_device_idx) const override
Get the LevelZero device default power limit in milliwatts.
Definition: LevelZero.cpp:1003
uint64_t active_time_timestamp(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the cached LevelZero device timestamp for the active time value in microseconds.
Definition: LevelZero.cpp:918
uint64_t energy_timestamp(int geopm_domain, unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device energy cached timestamp in microseconds.
Definition: LevelZero.cpp:948
double ras_cache_errcount_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of correctable errors that have occurred in caches (L1/L3/register ...
Definition: LevelZero.cpp:722
std::pair< double, double > frequency_range(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device minimum and maximum frequency control range in MHz.
Definition: LevelZero.cpp:894
double performance_factor(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the performance factor value of various LevelZero domains.
Definition: LevelZero.cpp:653
uint32_t frequency_throttle_reasons(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device frequency throttle reasons.
Definition: LevelZero.cpp:826
double ras_cache_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of uncorrectable errors that have occurred in caches (L1/L3/registe...
Definition: LevelZero.cpp:778
double frequency_min(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device minimum frequency in MHz.
Definition: LevelZero.cpp:852
std::vector< double > frequency_supported(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device supported frequencies in MHz.
Definition: LevelZero.cpp:876
double ras_reset_count_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of correctable accelerator engine resets attempted by the driver.
Definition: LevelZero.cpp:682
int32_t power_limit_min(unsigned int l0_device_idx) const override
Get the LevelZero device minimum power limit in milliwatts.
Definition: LevelZero.cpp:1012
double frequency_max(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device maximum frequency in MHz.
Definition: LevelZero.cpp:858
double ras_compute_errcount_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of correctable errors that have occurred in the compute accelerator...
Definition: LevelZero.cpp:706
void frequency_control(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx, double range_min, double range_max) const override
Set min and max frequency for LevelZero device.
Definition: LevelZero.cpp:1046
double ras_driver_errcount_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of low level driver communication correctable errors that have occurred.
Definition: LevelZero.cpp:698
int performance_domain_count(unsigned int l0_device_idx, int l0_domain) const override
Get the number of LevelZero perf domains of a certain type.
Definition: LevelZero.cpp:643
double ras_programming_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of uncorrectable hardware exceptions generated by the way workloads...
Definition: LevelZero.cpp:746
double ras_driver_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of low level driver communication uncorrectable errors have occurre...
Definition: LevelZero.cpp:754
uint64_t energy(int geopm_domain, unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device energy in microjoules.
Definition: LevelZero.cpp:963