geopm  3.1.1.dev296+g5916b956
GEOPM - Global Extensible Open Power Manager
LevelZeroImp.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2024 Intel Corporation
3  * SPDX-License-Identifier: BSD-3-Clause
4  */
5 
6 #ifndef LEVELZEROIMP_HPP_INCLUDE
7 #define LEVELZEROIMP_HPP_INCLUDE
8 
9 #include <string>
10 
11 #include <level_zero/ze_api.h>
12 #include <level_zero/zes_api.h>
13 
14 #include "LevelZero.hpp"
15 
16 #include "geopm_time.h"
17 
18 namespace geopm
19 {
20  class LevelZeroImp : public LevelZero
21  {
22  public:
23  LevelZeroImp();
24  virtual ~LevelZeroImp() = default;
25  int num_gpu(void) const override;
26  int num_gpu(int domain) const override;
27  int frequency_domain_count(unsigned int l0_device_idx,
28  int domain) const override;
29  double frequency_status(unsigned int l0_device_idx,
30  int l0_domain, int l0_domain_idx) const override;
31  double frequency_efficient(unsigned int l0_device_idx,
32  int l0_domain, int l0_domain_idx) const override;
33  double frequency_min(unsigned int l0_device_idx, int l0_domain,
34  int l0_domain_idx) const override;
35  double frequency_max(unsigned int l0_device_idx, int l0_domain,
36  int l0_domain_idx) const override;
37  std::vector<double> frequency_supported(unsigned int l0_device_idx,
38  int l0_domain,
39  int l0_domain_idx) const override;
40  uint32_t frequency_throttle_reasons(unsigned int l0_device_idx, int l0_domain,
41  int l0_domain_idx) const override;
42  std::pair<double, double> frequency_range(unsigned int l0_device_idx,
43  int l0_domain,
44  int l0_domain_idx) const override;
45  int temperature_domain_count(unsigned int l0_device_idx,
46  int l0_domain) const override;
47  double temperature_max(unsigned int l0_device_idx, int l0_domain,
48  int l0_domain_idx) const override;
49  int engine_domain_count(unsigned int l0_device_idx, int domain) const override;
50  std::pair<uint64_t, uint64_t> active_time_pair(unsigned int l0_device_idx,
51  int l0_domain, int l0_domain_idx) const override;
52  uint64_t active_time(unsigned int l0_device_idx, int l0_domain,
53  int l0_domain_idx) const override;
54  uint64_t active_time_timestamp(unsigned int l0_device_idx,
55  int l0_domain, int l0_domain_idx) const override;
56  int power_domain_count(int geopm_domain, unsigned int l0_device_idx,
57  int l0_domain) const override;
58  std::pair<uint64_t, uint64_t> energy_pair(int geopm_domain, unsigned int l0_device_idx,
59  int l0_domain_idx) const override;
60  uint64_t energy(int geopm_domain, unsigned int l0_device_idx,
61  int l0_domain, int l0_domain_idx) const override;
62  uint64_t energy_timestamp(int geopm_domain,
63  unsigned int l0_device_idx,
64  int l0_domain,
65  int l0_domain_idx) const override;
66  int performance_domain_count(unsigned int l0_device_idx,
67  int l0_domain) const override;
68  double performance_factor(unsigned int l0_device_idx,
69  int l0_domain, int l0_domain_idx) const override;
70 
71  int32_t power_limit_tdp(unsigned int l0_device_idx) const override;
72  int32_t power_limit_min(unsigned int l0_device_idx) const override;
73  int32_t power_limit_max(unsigned int l0_device_idx) const override;
74 
75  void frequency_control(unsigned int l0_device_idx,
76  int l0_domain,
77  int l0_domain_idx,
78  double range_min,
79  double range_max) const override;
80 
81  void performance_factor_control(unsigned int l0_device_idx,
82  int l0_domain,
83  int l0_domain_idx,
84  double setting) const override;
85 
86  int ras_domain_count(unsigned int l0_device_idx,int l0_domain) const override;
87  double ras_reset_count_correctable(unsigned int l0_device_idx,
88  int l0_domain,
89  int l0_domain_idx) const override;
90  double ras_programming_errcount_correctable(unsigned int l0_device_idx,
91  int l0_domain,
92  int l0_domain_idx) const override;
93  double ras_driver_errcount_correctable(unsigned int l0_device_idx,
94  int l0_domain,
95  int l0_domain_idx) const override;
96  double ras_compute_errcount_correctable(unsigned int l0_device_idx,
97  int l0_domain,
98  int l0_domain_idx) const override;
99  double ras_noncompute_errcount_correctable(unsigned int l0_device_idx,
100  int l0_domain,
101  int l0_domain_idx) const override;
102  double ras_cache_errcount_correctable(unsigned int l0_device_idx,
103  int l0_domain,
104  int l0_domain_idx) const override;
105  double ras_display_errcount_correctable(unsigned int l0_device_idx,
106  int l0_domain,
107  int l0_domain_idx) const override;
108  double ras_reset_count_uncorrectable(unsigned int l0_device_idx,
109  int l0_domain,
110  int l0_domain_idx) const override;
111  double ras_programming_errcount_uncorrectable(unsigned int l0_device_idx,
112  int l0_domain,
113  int l0_domain_idx) const override;
114  double ras_driver_errcount_uncorrectable(unsigned int l0_device_idx,
115  int l0_domain,
116  int l0_domain_idx) const override;
117  double ras_compute_errcount_uncorrectable(unsigned int l0_device_idx,
118  int l0_domain,
119  int l0_domain_idx) const override;
120  double ras_noncompute_errcount_uncorrectable(unsigned int l0_device_idx,
121  int l0_domain,
122  int l0_domain_idx) const override;
123  double ras_cache_errcount_uncorrectable(unsigned int l0_device_idx,
124  int l0_domain,
125  int l0_domain_idx) const override;
126  double ras_display_errcount_uncorrectable(unsigned int l0_device_idx,
127  int l0_domain,
128  int l0_domain_idx) const override;
129 
130  private:
131  enum m_error_type {
132  M_ERROR_TYPE_CORRECTABLE,
133  M_ERROR_TYPE_UNCORRECTABLE,
134  M_NUM_ERROR_TYPE,
135  };
136 
137  struct m_frequency_s {
138  double voltage = 0;
139  double request = 0;
140  double tdp = 0;
141  double efficient = 0;
142  double actual = 0;
143  uint32_t throttle_reasons = 0;
144  };
145  struct m_power_limit_s {
146  int32_t tdp = 0;
147  int32_t min = 0;
148  int32_t max = 0;
149  };
150 
151  struct m_subdevice_s {
152  // These are indexed first by enum geopm_levelzero_domain_e, then by subdevice
153  std::vector<std::vector<zes_freq_handle_t> > freq_domain;
154  std::vector<std::vector<zes_temp_handle_t> > temp_domain_max;
155  std::vector<std::vector<zes_engine_handle_t> > engine_domain;
156  mutable std::vector<std::vector<uint64_t> > cached_timestamp;
157 
158  //uint32_t num_subdevice_perf_domain;
159  std::vector<std::vector<zes_perf_handle_t>> perf_domain;
160 
161  uint32_t num_subdevice_power_domain;
162  std::vector<zes_pwr_handle_t> power_domain;
163  mutable std::vector<uint64_t> cached_energy_timestamp;
164 
165  // Note: As of LevelZero ver 1.9, RAS counters can't be neatly
166  // categorized as being specific to compute/memory domains,
167  // so assume L0_domain_type = M_DOMAIN_ALL.
168 
169  // The RAS counters are indexed first by subdevice,
170  // then by error set type (correctable vs uncorrectable)
171  std::vector<zes_ras_handle_t> ras_domain;
172 
173  };
174 
175  struct m_device_info_s {
176  zes_device_handle_t device_handle;
177  ze_device_properties_t property;
178  uint32_t m_num_subdevice;
179  std::vector<zes_device_handle_t> subdevice_handle;
180 
181  // Sub-device domain tracking. Because LevelZero returns ALL handles for a
182  // 'class' (freq, power, etc.) regardless of subdevice, it is easier to track
183  // this as class.domain.subdevice, where domain is compute/memory. This avoids
184  // an additional step of sorting handles to determine how many there are per subdevice.
185  m_subdevice_s subdevice;
186 
187  // Device/Package domains
188  uint32_t num_device_power_domain;
189  zes_pwr_handle_t power_domain;
190  mutable uint64_t cached_energy_timestamp;
191  };
192 
193  void ras_domain_cache(unsigned int l0_device_idx);
194  void frequency_domain_cache(unsigned int l0_device_idx);
195  void power_domain_cache(unsigned int l0_device_idx);
196  void perf_domain_cache(unsigned int l0_device_idx);
197  void engine_domain_cache(unsigned int l0_device_idx);
198  void temperature_domain_cache(unsigned int l0_device_idx);
199  void check_ze_result(ze_result_t ze_result, int error, std::string message,
200  int line) const;
201 
202  std::pair<double, double> frequency_min_max(unsigned int l0_device_idx,
203  int l0_domain, int l0_domain_idx) const;
204 
205  m_power_limit_s power_limit_default(unsigned int l0_device_idx) const;
206  uint64_t ras_status_helper(unsigned int l0_device_idx,
207  int l0_domain,
208  int l0_domain_idx,
209  zes_ras_error_cat_t errorcat,
210  int errortype) const;
211 
212  m_frequency_s frequency_status_helper(unsigned int l0_device_idx,
213  int l0_domain, int l0_domain_idx) const;
214 
215  uint32_t m_num_gpu;
216  uint32_t m_num_gpu_subdevice;
217 
218  std::vector<ze_driver_handle_t> m_levelzero_driver;
219  std::vector<m_device_info_s> m_devices;
220  };
221 }
222 #endif
Definition: LevelZero.hpp:18
Definition: LevelZeroImp.hpp:21
virtual ~LevelZeroImp()=default
int frequency_domain_count(unsigned int l0_device_idx, int domain) const override
Get the number of LevelZero frequency domains of a certain type.
Definition: LevelZero.cpp:633
double ras_programming_errcount_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of correctable hardware exceptions generated by the way workloads h...
Definition: LevelZero.cpp:690
uint64_t active_time(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device active time in microseconds.
Definition: LevelZero.cpp:924
LevelZeroImp()
Definition: LevelZero.cpp:34
double ras_reset_count_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of uncorrectable accelerator engine resets attempted by the driver.
Definition: LevelZero.cpp:738
double frequency_efficient(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device efficient frequency in MHz.
Definition: LevelZero.cpp:820
std::pair< uint64_t, uint64_t > energy_pair(int geopm_domain, unsigned int l0_device_idx, int l0_domain_idx) const override
Get the LevelZero device energy and timestamp in microjoules and microseconds.
Definition: LevelZero.cpp:969
int num_gpu(void) const override
Number of GPUs on the platform.
Definition: LevelZero.cpp:590
double ras_compute_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of uncorrectable errors that have occurred in the compute accelerat...
Definition: LevelZero.cpp:762
double frequency_status(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device actual frequency in MHz.
Definition: LevelZero.cpp:814
double temperature_max(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device maximum temperature in Celsius.
Definition: LevelZero.cpp:906
int ras_domain_count(unsigned int l0_device_idx, int l0_domain) const override
Get the number of LevelZero RAS domains of a certain type.
Definition: LevelZero.cpp:666
std::pair< uint64_t, uint64_t > active_time_pair(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device active time and timestamp in microseconds.
Definition: LevelZero.cpp:930
double ras_noncompute_errcount_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of correctable errors that have occurred in the fixed-function acce...
Definition: LevelZero.cpp:714
int power_domain_count(int geopm_domain, unsigned int l0_device_idx, int l0_domain) const override
Get the number of LevelZero power domains of a certain type.
Definition: LevelZero.cpp:617
double ras_noncompute_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of uncorrectable errors that have occurred in the fixed-function ac...
Definition: LevelZero.cpp:770
double ras_display_errcount_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of correctable errors that have occurred in the display.
Definition: LevelZero.cpp:730
double ras_display_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of uncorrectable errors that have occurred in the display.
Definition: LevelZero.cpp:786
int temperature_domain_count(unsigned int l0_device_idx, int l0_domain) const override
Get the number of LevelZero temperature domains.
Definition: LevelZero.cpp:648
int engine_domain_count(unsigned int l0_device_idx, int domain) const override
Get the number of LevelZero engine domains.
Definition: LevelZero.cpp:638
void performance_factor_control(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx, double setting) const override
Set the performance factor for the LevelZero device.
Definition: LevelZero.cpp:1070
int32_t power_limit_max(unsigned int l0_device_idx) const override
Get the LevelZero device maximum power limit in milliwatts.
Definition: LevelZero.cpp:1021
int32_t power_limit_tdp(unsigned int l0_device_idx) const override
Get the LevelZero device default power limit in milliwatts.
Definition: LevelZero.cpp:1003
uint64_t active_time_timestamp(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the cached LevelZero device timestamp for the active time value in microseconds.
Definition: LevelZero.cpp:918
uint64_t energy_timestamp(int geopm_domain, unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device energy cached timestamp in microseconds.
Definition: LevelZero.cpp:948
double ras_cache_errcount_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of correctable errors that have occurred in caches (L1/L3/register ...
Definition: LevelZero.cpp:722
std::pair< double, double > frequency_range(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device minimum and maximum frequency control range in MHz.
Definition: LevelZero.cpp:894
double performance_factor(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the performance factor value of various LevelZero domains.
Definition: LevelZero.cpp:653
uint32_t frequency_throttle_reasons(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device frequency throttle reasons.
Definition: LevelZero.cpp:826
double ras_cache_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of uncorrectable errors that have occurred in caches (L1/L3/registe...
Definition: LevelZero.cpp:778
double frequency_min(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device minimum frequency in MHz.
Definition: LevelZero.cpp:852
std::vector< double > frequency_supported(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device supported frequencies in MHz.
Definition: LevelZero.cpp:876
double ras_reset_count_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of correctable accelerator engine resets attempted by the driver.
Definition: LevelZero.cpp:682
int32_t power_limit_min(unsigned int l0_device_idx) const override
Get the LevelZero device minimum power limit in milliwatts.
Definition: LevelZero.cpp:1012
double frequency_max(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device maximum frequency in MHz.
Definition: LevelZero.cpp:858
double ras_compute_errcount_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of correctable errors that have occurred in the compute accelerator...
Definition: LevelZero.cpp:706
void frequency_control(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx, double range_min, double range_max) const override
Set min and max frequency for LevelZero device.
Definition: LevelZero.cpp:1046
double ras_driver_errcount_correctable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of low level driver communication correctable errors that have occurred.
Definition: LevelZero.cpp:698
int performance_domain_count(unsigned int l0_device_idx, int l0_domain) const override
Get the number of LevelZero perf domains of a certain type.
Definition: LevelZero.cpp:643
double ras_programming_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of uncorrectable hardware exceptions generated by the way workloads...
Definition: LevelZero.cpp:746
double ras_driver_errcount_uncorrectable(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of low level driver communication uncorrectable errors have occurre...
Definition: LevelZero.cpp:754
uint64_t energy(int geopm_domain, unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device energy in microjoules.
Definition: LevelZero.cpp:963
Definition: Agg.cpp:20