The upshot is that setting up the perf runtime library a second (and each subsequent) time appears to claim fresh Intel x86 hardware counters, which can quickly exhaust the supply. Once the counters are exhausted, read() returns counters with no values (i.e. 0) but reports no error.
In the following example code, our add_perf_event() function is initially called several times (here four times) with different parameters, together forming a perf event group; add_perf_event() is called once for each performance counter to be monitored. We then measure the performance of some software of interest multiple times (e.g. when it is used in a loop). Rather than close(fd) and repeat the sequence of add_perf_event() calls, we leave fd open, allowing us to reuse the perf runtime data we have set up, and only repeat the PERF_EVENT_IOC_RESET, PERF_EVENT_IOC_ENABLE, (code to be measured), PERF_EVENT_IOC_DISABLE sequence. Here we never call close(fd) and instead rely on Linux to clean up when the whole process exits.
//https://classes.engineering.wustl.edu/cse522/man-pages/perf_event_open.2.pdf
#include <errno.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
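glibc provides no perf_event_open() wrapper, so the code below assumes the usual syscall stub from the perf_event_open(2) man page cited above:

static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
                            int cpu, int group_fd, unsigned long flags) {
  return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}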
enum {nr=4};
//The fields must match pe.read_format in add_perf_event() below: only
//PERF_FORMAT_GROUP is set, so the optional fields stay commented out,
//otherwise read() returns fewer bytes than sizeof(struct read_format).
struct read_format {
  uint64_t nr;             /* The number of events, only if PERF_FORMAT_GROUP */
  //uint64_t time_enabled; /* only if PERF_FORMAT_TOTAL_TIME_ENABLED */
  //uint64_t time_running; /* only if PERF_FORMAT_TOTAL_TIME_RUNNING */
  struct {
    uint64_t value;        /* The value of the event */
    //uint64_t id;         /* only if PERF_FORMAT_ID */
  } values[nr];
};
int add_perf_event(const int fd, const unsigned int type,
                   const unsigned long long int config) {
  struct perf_event_attr pe;
  memset(&pe, 0, sizeof(pe));
  pe.type = type;     //e.g. PERF_TYPE_HW_CACHE or PERF_TYPE_HARDWARE or PERF_TYPE_SOFTWARE ...
  pe.size = sizeof(pe);
  pe.config = config; //e.g. mask | PERF_COUNT_HW_CACHE_L1D or PERF_COUNT_HW_INSTRUCTIONS or PERF_COUNT_SW_CPU_CLOCK ...
  pe.disabled = (fd == -1)? 1 : 0; //disable only on group leader, fd == -1
  pe.exclude_kernel = 1;
  pe.exclude_hv = 1;
  pe.read_format = PERF_FORMAT_GROUP; //| PERF_FORMAT_ID; //| PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
  const int ret = perf_event_open(&pe, 0, -1, fd, 0);
  if (ret == -1) {
    fprintf(stderr, "Error opening %d %llx\n", fd, config);
    exit(EXIT_FAILURE);
  }
  return (fd == -1)? ret : fd; //if first call, this is our group leader
}
int fd = -1; //will be perf runtime library file descriptor, -1 says perf info not yet set up
if (fd == -1) { //first time
  int f_; //error checking only
  //from cwperson.. L1D and L1I good for read miss
  const int r = PERF_COUNT_HW_CACHE_RESULT_MISS;
  //const int r = PERF_COUNT_HW_CACHE_RESULT_ACCESS; also valid on L1D read and write
  const int o = PERF_COUNT_HW_CACHE_OP_READ;
  //const int t = PERF_COUNT_HW_CACHE_L1D;
  //const int t = PERF_COUNT_HW_CACHE_L1I;
  const unsigned long long mask = 0 | (o << 8) | (r << 16);
  fd = add_perf_event(fd, PERF_TYPE_HW_CACHE, mask | PERF_COUNT_HW_CACHE_L1D);
  assert(fd != -1);
  f_ = add_perf_event(fd, PERF_TYPE_HW_CACHE, mask | PERF_COUNT_HW_CACHE_L1I);
  assert(f_ == fd);
  f_ = add_perf_event(fd, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS);
  assert(f_ == fd);
  f_ = add_perf_event(fd, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK);
  assert(f_ == fd);
}
ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); //reset every counter in the group, not just the leader
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
/* code to be measured */
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
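If the code of interest is exercised repeatedly, only that sequence is repeated around each run; a minimal sketch of the reuse pattern (repeats and run_code_of_interest() are hypothetical placeholders):

for (int i = 0; i < repeats; i++) {                      //repeats: hypothetical iteration count
  ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);  //zero all counters in the group
  ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);                   //enabling the leader starts the whole group
  run_code_of_interest();                                //hypothetical: the software being measured
  ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);                  //stop counting
  /* read(fd, ...) the group's counters here, as shown next */
}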
struct read_format perf_data;
memset(&perf_data, 0, sizeof(perf_data));
const ssize_t ret = read(fd, &perf_data, sizeof(perf_data));
assert(errno == 0); //ENOSPC may mean buffer too small
assert(ret == (ssize_t)sizeof(perf_data));
//NB do not close(fd); we reuse the perf set-up on the next measurement
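With PERF_FORMAT_GROUP, read() returns one value per event in the order the events were added to the group, so perf_data can be decoded directly; a minimal sketch (the labels are ours, matching the four add_perf_event() calls above):

static const char *label[nr] = {"L1D read miss", "L1I read miss",
                                "instructions", "cpu clock"};
assert(perf_data.nr == nr); //one value per event in the group
for (unsigned int i = 0; i < nr; i++)
  printf("%-14s %llu\n", label[i], (unsigned long long)perf_data.values[i].value);
//If every value reads 0, the hardware counters may be exhausted (see above);
//enabling PERF_FORMAT_TOTAL_TIME_ENABLED/RUNNING and comparing the two times
//shows whether the group was ever scheduled.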
mask value | cache                    | operation                     | result
         0 | PERF_COUNT_HW_CACHE_L1D  | PERF_COUNT_HW_CACHE_OP_READ   | PERF_COUNT_HW_CACHE_RESULT_ACCESS
         2 | PERF_COUNT_HW_CACHE_LL   | PERF_COUNT_HW_CACHE_OP_READ   | PERF_COUNT_HW_CACHE_RESULT_ACCESS
         3 | PERF_COUNT_HW_CACHE_DTLB | PERF_COUNT_HW_CACHE_OP_READ   | PERF_COUNT_HW_CACHE_RESULT_ACCESS
         4 | PERF_COUNT_HW_CACHE_ITLB | PERF_COUNT_HW_CACHE_OP_READ   | PERF_COUNT_HW_CACHE_RESULT_ACCESS
         5 | PERF_COUNT_HW_CACHE_BPU  | PERF_COUNT_HW_CACHE_OP_READ   | PERF_COUNT_HW_CACHE_RESULT_ACCESS
         6 | PERF_COUNT_HW_CACHE_NODE | PERF_COUNT_HW_CACHE_OP_READ   | PERF_COUNT_HW_CACHE_RESULT_ACCESS
       256 | PERF_COUNT_HW_CACHE_L1D  | PERF_COUNT_HW_CACHE_OP_WRITE  | PERF_COUNT_HW_CACHE_RESULT_ACCESS
       258 | PERF_COUNT_HW_CACHE_LL   | PERF_COUNT_HW_CACHE_OP_WRITE  | PERF_COUNT_HW_CACHE_RESULT_ACCESS
       259 | PERF_COUNT_HW_CACHE_DTLB | PERF_COUNT_HW_CACHE_OP_WRITE  | PERF_COUNT_HW_CACHE_RESULT_ACCESS
       262 | PERF_COUNT_HW_CACHE_NODE | PERF_COUNT_HW_CACHE_OP_WRITE  | PERF_COUNT_HW_CACHE_RESULT_ACCESS
     65536 | PERF_COUNT_HW_CACHE_L1D  | PERF_COUNT_HW_CACHE_OP_READ   | PERF_COUNT_HW_CACHE_RESULT_MISS
     65537 | PERF_COUNT_HW_CACHE_L1I  | PERF_COUNT_HW_CACHE_OP_READ   | PERF_COUNT_HW_CACHE_RESULT_MISS
     65538 | PERF_COUNT_HW_CACHE_LL   | PERF_COUNT_HW_CACHE_OP_READ   | PERF_COUNT_HW_CACHE_RESULT_MISS
     65539 | PERF_COUNT_HW_CACHE_DTLB | PERF_COUNT_HW_CACHE_OP_READ   | PERF_COUNT_HW_CACHE_RESULT_MISS
     65540 | PERF_COUNT_HW_CACHE_ITLB | PERF_COUNT_HW_CACHE_OP_READ   | PERF_COUNT_HW_CACHE_RESULT_MISS
     65541 | PERF_COUNT_HW_CACHE_BPU  | PERF_COUNT_HW_CACHE_OP_READ   | PERF_COUNT_HW_CACHE_RESULT_MISS
     65542 | PERF_COUNT_HW_CACHE_NODE | PERF_COUNT_HW_CACHE_OP_READ   | PERF_COUNT_HW_CACHE_RESULT_MISS
     65794 | PERF_COUNT_HW_CACHE_LL   | PERF_COUNT_HW_CACHE_OP_WRITE  | PERF_COUNT_HW_CACHE_RESULT_MISS
     65795 | PERF_COUNT_HW_CACHE_DTLB | PERF_COUNT_HW_CACHE_OP_WRITE  | PERF_COUNT_HW_CACHE_RESULT_MISS
     65798 | PERF_COUNT_HW_CACHE_NODE | PERF_COUNT_HW_CACHE_OP_WRITE  | PERF_COUNT_HW_CACHE_RESULT_MISS
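Each mask is cache | (op << 8) | (result << 16), per the perf_event_open(2) man page, so the arithmetic can be wrapped in a small helper. A minimal sketch (CACHE_CONFIG is our name, not part of the example above):

//Build a PERF_TYPE_HW_CACHE config value from its three components.
#define CACHE_CONFIG(cache, op, result) ((cache) | ((op) << 8) | ((result) << 16))

//For example, mask 65539 (DTLB read misses) is
//CACHE_CONFIG(PERF_COUNT_HW_CACHE_DTLB, PERF_COUNT_HW_CACHE_OP_READ, PERF_COUNT_HW_CACHE_RESULT_MISS)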