geopm  3.1.0
GEOPM - Global Extensible Open Power Manager
All Classes Namespaces Files Functions Variables Enumerations Enumerator Friends Macros Pages
LevelZeroImp.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2024 Intel Corporation
3  * SPDX-License-Identifier: BSD-3-Clause
4  */
5 
6 #ifndef LEVELZEROIMP_HPP_INCLUDE
7 #define LEVELZEROIMP_HPP_INCLUDE
8 
9 #include <string>
10 
11 #include <level_zero/ze_api.h>
12 #include <level_zero/zes_api.h>
13 
14 #include "LevelZero.hpp"
15 
16 #include "geopm_time.h"
17 
18 namespace geopm
19 {
/// Concrete implementation of the LevelZero interface: every public member
/// function below is declared override against the LevelZero base class.
/// NOTE(review): std::vector, std::pair, std::array and the fixed-width integer
/// types are used in this class, but <string> is the only standard header this
/// file includes directly; presumably the others arrive through LevelZero.hpp
/// or the level_zero headers -- confirm.
20  class LevelZeroImp : public LevelZero
21  {
22  public:
23  LevelZeroImp();
24  virtual ~LevelZeroImp() = default;
    /// Number of GPUs on the platform.
25  int num_gpu(void) const override;
    /// NOTE(review): presumably the GPU count for the given domain -- confirm
    /// against the base class documentation.
26  int num_gpu(int domain) const override;
    /// Number of LevelZero RAS domains of a certain type.
27  int ras_domain_count(unsigned int l0_device_idx,
28  int l0_domain) const override;
    /// Count of accelerator engine resets attempted by the driver.
29  double ras_reset_count(unsigned int l0_device_idx,
30  int l0_domain, int l0_domain_idx) const override;
    /// Count of hardware exceptions attributed to how workloads programmed the
    /// device (NOTE(review): the generated brief is truncated -- confirm wording).
31  double ras_programming_errcount(unsigned int l0_device_idx,
32  int l0_domain, int l0_domain_idx) const override;
    /// Count of low level driver communication errors that have occurred.
33  double ras_driver_errcount(unsigned int l0_device_idx,
34  int l0_domain, int l0_domain_idx) const override;
    /// Count of errors that have occurred in the compute accelerator hardware.
35  double ras_compute_errcount(unsigned int l0_device_idx,
36  int l0_domain, int l0_domain_idx) const override;
    /// Count of errors that have occurred in the fixed-function accelerator
    /// hardware (NOTE(review): the generated brief is truncated -- confirm wording).
37  double ras_noncompute_errcount(unsigned int l0_device_idx,
38  int l0_domain, int l0_domain_idx) const override;
    /// Count of errors that have occurred in caches.
39  double ras_cache_errcount(unsigned int l0_device_idx,
40  int l0_domain, int l0_domain_idx) const override;
    /// Count of errors that have occurred in the display.
41  double ras_display_errcount(unsigned int l0_device_idx,
42  int l0_domain, int l0_domain_idx) const override;
    /// Number of LevelZero frequency domains of a certain type.
43  int frequency_domain_count(unsigned int l0_device_idx,
44  int domain) const override;
    /// Actual device frequency in MHz.
45  double frequency_status(unsigned int l0_device_idx,
46  int l0_domain, int l0_domain_idx) const override;
    /// Efficient device frequency in MHz.
47  double frequency_efficient(unsigned int l0_device_idx,
48  int l0_domain, int l0_domain_idx) const override;
    /// Minimum device frequency in MHz.
49  double frequency_min(unsigned int l0_device_idx, int l0_domain,
50  int l0_domain_idx) const override;
    /// Maximum device frequency in MHz.
51  double frequency_max(unsigned int l0_device_idx, int l0_domain,
52  int l0_domain_idx) const override;
    /// Supported device frequencies in MHz.
53  std::vector<double> frequency_supported(unsigned int l0_device_idx,
54  int l0_domain,
55  int l0_domain_idx) const override;
    /// Device frequency throttle reasons, returned as a 32-bit unsigned value.
56  uint32_t frequency_throttle_reasons(unsigned int l0_device_idx, int l0_domain,
57  int l0_domain_idx) const override;
    /// Minimum and maximum frequency control range in MHz.
58  std::pair<double, double> frequency_range(unsigned int l0_device_idx,
59  int l0_domain,
60  int l0_domain_idx) const override;
    /// Number of LevelZero temperature domains.
61  int temperature_domain_count(unsigned int l0_device_idx,
62  int l0_domain) const override;
    /// Maximum device temperature in Celsius.
63  double temperature_max(unsigned int l0_device_idx, int l0_domain,
64  int l0_domain_idx) const override;
    /// Number of LevelZero engine domains.
65  int engine_domain_count(unsigned int l0_device_idx, int domain) const override;
    /// Device active time and its timestamp, both in microseconds.
66  std::pair<uint64_t, uint64_t> active_time_pair(unsigned int l0_device_idx,
67  int l0_domain, int l0_domain_idx) const override;
    /// Device active time in microseconds.
68  uint64_t active_time(unsigned int l0_device_idx, int l0_domain,
69  int l0_domain_idx) const override;
    /// Cached timestamp for the active time value, in microseconds.
70  uint64_t active_time_timestamp(unsigned int l0_device_idx,
71  int l0_domain, int l0_domain_idx) const override;
    /// Number of LevelZero power domains of a certain type.
72  int power_domain_count(int geopm_domain, unsigned int l0_device_idx,
73  int l0_domain) const override;
    /// Device energy in microjoules paired with its timestamp in microseconds.
    /// Unlike energy() and energy_timestamp(), this takes no l0_domain argument.
74  std::pair<uint64_t, uint64_t> energy_pair(int geopm_domain, unsigned int l0_device_idx,
75  int l0_domain_idx) const override;
    /// Device energy in microjoules.
76  uint64_t energy(int geopm_domain, unsigned int l0_device_idx,
77  int l0_domain, int l0_domain_idx) const override;
    /// Cached timestamp for the device energy value, in microseconds.
78  uint64_t energy_timestamp(int geopm_domain,
79  unsigned int l0_device_idx,
80  int l0_domain,
81  int l0_domain_idx) const override;
    /// Number of LevelZero perf domains of a certain type.
82  int performance_domain_count(unsigned int l0_device_idx,
83  int l0_domain) const override;
    /// Performance factor value of the selected LevelZero domain.
84  double performance_factor(unsigned int l0_device_idx,
85  int l0_domain, int l0_domain_idx) const override;
86 
    /// Default device power limit in milliwatts.
87  int32_t power_limit_tdp(unsigned int l0_device_idx) const override;
    /// Minimum device power limit in milliwatts.
88  int32_t power_limit_min(unsigned int l0_device_idx) const override;
    /// Maximum device power limit in milliwatts.
89  int32_t power_limit_max(unsigned int l0_device_idx) const override;
90 
    /// Set the minimum and maximum frequency for the LevelZero device.
    /// NOTE(review): range_min and range_max are presumably in MHz like the
    /// frequency getters -- confirm.
91  void frequency_control(unsigned int l0_device_idx, int l0_domain,
92  int l0_domain_idx, double range_min,
93  double range_max) const override;
94 
    /// Set the performance factor for the LevelZero device.
95  void performance_factor_control(unsigned int l0_device_idx,
96  int l0_domain,
97  int l0_domain_idx,
98  double setting) const override;
99 
100  private:
    /// Frequency-related values; every field defaults to 0.
    /// This is the return type of frequency_status_helper().
101  struct m_frequency_s {
102  double voltage = 0;
103  double request = 0;
104  double tdp = 0;
105  double efficient = 0;
106  double actual = 0;
107  uint32_t throttle_reasons = 0;
108  };
    /// Power limit values (tdp, min, max); every field defaults to 0.
    /// This is the return type of power_limit_default().
    /// NOTE(review): presumably milliwatts, matching the public
    /// power_limit_tdp/min/max getters -- confirm.
109  struct m_power_limit_s {
110  int32_t tdp = 0;
111  int32_t min = 0;
112  int32_t max = 0;
113  };
114 
    /// Sysman handles and cached timestamps tracked at subdevice scope.
    /// The cached_* members are mutable, so const member functions may modify them.
115  struct m_subdevice_s {
116  // These are enum geopm_levelzero_domain_e indexed, then subdevice indexed
117  std::vector<std::vector<zes_ras_handle_t> > ras_domain;
118  std::vector<std::vector<zes_freq_handle_t> > freq_domain;
119  std::vector<std::vector<zes_temp_handle_t> > temp_domain_max;
120  std::vector<std::vector<zes_engine_handle_t> > engine_domain;
121  mutable std::vector<std::vector<uint64_t> > cached_timestamp;
122 
123  //uint32_t num_subdevice_perf_domain;
124  std::vector<std::vector<zes_perf_handle_t>> perf_domain;
125 
    // NOTE(review): num_subdevice_power_domain has no default initializer here;
    // it must be assigned before it is read.
126  uint32_t num_subdevice_power_domain;
127  std::vector<zes_pwr_handle_t> power_domain;
128  mutable std::vector<uint64_t> cached_energy_timestamp;
129 
130  };
131 
    /// Per-device record; this is the element type of m_devices.
    /// NOTE(review): the scalar members and handles below have no default
    /// initializers in this declaration; they must be assigned before being read.
132  struct m_device_info_s {
133  zes_device_handle_t device_handle;
134  ze_device_properties_t property;
135  uint32_t m_num_subdevice;
136  std::vector<zes_device_handle_t> subdevice_handle;
137 
138  // Sub-Device domain tracking. Because levelzero returns ALL handles for a
139  // 'class' (freq, power, etc) regardless of subdevice it is easier to track
140  // this as class.domain.subdevice where domain is compute/memory. This avoids
141  // an additional step of sorting handles to determine how many per subdevice
142  m_subdevice_s subdevice;
143 
144  // Device/Package domains
145  uint32_t num_device_power_domain;
146  zes_pwr_handle_t power_domain;
    // mutable: may be modified from const member functions.
147  mutable uint64_t cached_energy_timestamp;
148  };
149 
    // Non-const per-device caching routines, one per handle class.
    // NOTE(review): presumably these populate the handle vectors stored in
    // m_devices -- confirm in LevelZero.cpp.
150  void ras_domain_cache(unsigned int l0_device_idx);
151  void frequency_domain_cache(unsigned int l0_device_idx);
152  void power_domain_cache(unsigned int l0_device_idx);
153  void perf_domain_cache(unsigned int l0_device_idx);
154  void engine_domain_cache(unsigned int l0_device_idx);
155  void temperature_domain_cache(unsigned int l0_device_idx);
    // NOTE(review): presumably reports a failing ze_result_t using the given
    // error code, message and source line -- confirm in LevelZero.cpp.
    // The message is taken by value.
156  void check_ze_result(ze_result_t ze_result, int error, std::string message,
157  int line) const;
158 
    // Helper returning a pair of doubles for the selected frequency domain.
    // NOTE(review): presumably (min, max) as the name suggests -- confirm.
159  std::pair<double, double> frequency_min_max(unsigned int l0_device_idx,
160  int l0_domain, int l0_domain_idx) const;
161 
    // Helper returning the tdp/min/max power limit values for a device.
162  m_power_limit_s power_limit_default(unsigned int l0_device_idx) const;
    // Helper returning one 64-bit counter per RAS error category
    // (array length is ZES_MAX_RAS_ERROR_CATEGORY_COUNT).
163  std::array<uint64_t, ZES_MAX_RAS_ERROR_CATEGORY_COUNT> ras_status_helper(unsigned int l0_device_idx,
164  int l0_domain,
165  int l0_domain_idx) const;
    // Helper returning the m_frequency_s values for the selected frequency domain.
166  m_frequency_s frequency_status_helper(unsigned int l0_device_idx,
167  int l0_domain, int l0_domain_idx) const;
168 
    // NOTE(review): these two counters have no default initializers here;
    // presumably they are set by the constructor -- confirm.
169  uint32_t m_num_gpu;
170  uint32_t m_num_gpu_subdevice;
171 
    // Driver handles and the per-device records.
172  std::vector<ze_driver_handle_t> m_levelzero_driver;
173  std::vector<m_device_info_s> m_devices;
174  };
175 }
176 #endif
Definition: LevelZero.hpp:18
Definition: LevelZeroImp.hpp:21
virtual ~LevelZeroImp()=default
int frequency_domain_count(unsigned int l0_device_idx, int domain) const override
Get the number of LevelZero frequency domains of a certain type.
Definition: LevelZero.cpp:576
uint64_t active_time(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device active time in microseconds.
Definition: LevelZero.cpp:785
LevelZeroImp()
Definition: LevelZero.cpp:26
double ras_driver_errcount(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of low-level driver communication errors that have occurred.
Definition: LevelZero.cpp:626
double frequency_efficient(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device efficient frequency in MHz.
Definition: LevelZero.cpp:681
std::pair< uint64_t, uint64_t > energy_pair(int geopm_domain, unsigned int l0_device_idx, int l0_domain_idx) const override
Get the LevelZero device energy and timestamp in microjoules and microseconds.
Definition: LevelZero.cpp:830
int num_gpu(void) const override
Number of GPUs on the platform.
Definition: LevelZero.cpp:533
double ras_cache_errcount(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of errors that have occurred in caches (L1/L3/register file/shared ...
Definition: LevelZero.cpp:644
double frequency_status(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device actual frequency in MHz.
Definition: LevelZero.cpp:675
double temperature_max(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device maximum temperature in Celsius.
Definition: LevelZero.cpp:767
double ras_display_errcount(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of errors that have occurred in the display.
Definition: LevelZero.cpp:650
int ras_domain_count(unsigned int l0_device_idx, int l0_domain) const override
Get the number of LevelZero RAS domains of a certain type.
Definition: LevelZero.cpp:608
std::pair< uint64_t, uint64_t > active_time_pair(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device active time and timestamp in microseconds.
Definition: LevelZero.cpp:791
int power_domain_count(int geopm_domain, unsigned int l0_device_idx, int l0_domain) const override
Get the number of LevelZero power domains of a certain type.
Definition: LevelZero.cpp:560
int temperature_domain_count(unsigned int l0_device_idx, int l0_domain) const override
Get the number of LevelZero temperature domains.
Definition: LevelZero.cpp:591
int engine_domain_count(unsigned int l0_device_idx, int domain) const override
Get the number of LevelZero engine domains.
Definition: LevelZero.cpp:581
double ras_programming_errcount(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of hardware exceptions generated by the way workloads have programm...
Definition: LevelZero.cpp:620
void performance_factor_control(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx, double setting) const override
Set the performance factor for the LevelZero device.
Definition: LevelZero.cpp:931
int32_t power_limit_max(unsigned int l0_device_idx) const override
Get the LevelZero device maximum power limit in milliwatts.
Definition: LevelZero.cpp:882
int32_t power_limit_tdp(unsigned int l0_device_idx) const override
Get the LevelZero device default power limit in milliwatts.
Definition: LevelZero.cpp:864
uint64_t active_time_timestamp(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the cached LevelZero device timestamp for the active time value in microseconds.
Definition: LevelZero.cpp:779
uint64_t energy_timestamp(int geopm_domain, unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device energy cached timestamp in microseconds.
Definition: LevelZero.cpp:809
std::pair< double, double > frequency_range(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device minimum and maximum frequency control range in MHz.
Definition: LevelZero.cpp:755
double performance_factor(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the performance factor value of various LevelZero domains.
Definition: LevelZero.cpp:596
double ras_reset_count(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of accelerator engine resets attempted by the driver.
Definition: LevelZero.cpp:614
uint32_t frequency_throttle_reasons(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device frequency throttle reasons.
Definition: LevelZero.cpp:687
double frequency_min(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device minimum frequency in MHz.
Definition: LevelZero.cpp:713
std::vector< double > frequency_supported(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device supported frequencies in MHz.
Definition: LevelZero.cpp:737
double ras_compute_errcount(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of errors that have occurred in the compute accelerator hardware.
Definition: LevelZero.cpp:632
double ras_noncompute_errcount(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero count of number of errors that have occurred in the fixed-function accelerator hard...
Definition: LevelZero.cpp:638
int32_t power_limit_min(unsigned int l0_device_idx) const override
Get the LevelZero device minimum power limit in milliwatts.
Definition: LevelZero.cpp:873
double frequency_max(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device maximum frequency in MHz.
Definition: LevelZero.cpp:719
void frequency_control(unsigned int l0_device_idx, int l0_domain, int l0_domain_idx, double range_min, double range_max) const override
Set min and max frequency for LevelZero device.
Definition: LevelZero.cpp:907
int performance_domain_count(unsigned int l0_device_idx, int l0_domain) const override
Get the number of LevelZero perf domains of a certain type.
Definition: LevelZero.cpp:586
uint64_t energy(int geopm_domain, unsigned int l0_device_idx, int l0_domain, int l0_domain_idx) const override
Get the LevelZero device energy in microjoules.
Definition: LevelZero.cpp:824
Definition: Agg.cpp:20