diff --git a/19-perf/perf.c b/19-perf/perf.c
index 17cc202..f570465 100644
--- a/19-perf/perf.c
+++ b/19-perf/perf.c
@@ -54,26 +54,63 @@ int sys_perf_event_open(struct perf_event_attr *attr,
 // the group leader. The function returns an id that can be used in
 // combination with perf_event_get.
 perf_event_id perf_event_add(struct perf_handle *p, int type, int config) {
-    // FIXME: Create event with perf_event_open
-    // FIXME: Get perf_event_id with PERF_EVENT_IOC_ID
-    return -1;
+    struct perf_event_attr attr;
+
+    memset(&attr, 0, sizeof(struct perf_event_attr)); // start from an all-zero attribute set
+    attr.type = type;
+    attr.size = sizeof(struct perf_event_attr);
+    attr.config = config;
+    attr.disabled = 1; // created disabled; perf_event_start() enables the group later
+    attr.exclude_kernel = 1;
+    attr.exclude_hv = 1;
+    attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; // group read with ids; perf_event_get() looks values up by id
+    int fd = sys_perf_event_open(&attr, 0, -1,
+                             p->group_fd > 0 ? p->group_fd : -1, // join the recorded group, or pass -1 if none is recorded yet
+                             0);
+    if (fd < 0) die("perf_event_open");
+    if (p->group_fd <= 0) // group_fd <= 0 is treated as "no leader yet": remember this fd as the group fd
+        p->group_fd = fd;
+
+    p->nevents ++; // perf_event_stop() sizes its read buffer from this count
+
+    perf_event_id id;
+    if (ioctl(fd, PERF_EVENT_IOC_ID, &id) < 0) // fetch the id of the event behind fd
+        die("perf/IOC_ID");
+    return id;
 }
 
 // Resets and starts the perf measurement
 void perf_event_start(struct perf_handle *p) {
-    // FIXME: PERF_EVENT_IOC_{RESET, ENABLE}
+    // Reset, then enable the whole event group; die() if either ioctl fails.
+    if (ioctl(p->group_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) < 0) die("perf/IOC_RESET");
+    if (ioctl(p->group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) < 0) die("perf/IOC_ENABLE");
 }
 
 // Stops the perf measurement and reads out the event
 void perf_event_stop(struct perf_handle *p) {
-    // FIXME: PERF_EVENT_IOC_DISABLE
-    // FIXME: Read event from the group_fd into an allocated buffer
+    // Stop the tracing for the whole event group
+    if (ioctl(p->group_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) < 0)
+        die("perf/IOC_DISABLE");
+    // Allocate the read buffer on first use: one u64 header plus two u64 per event.
+    if (p->rf == NULL) {
+        p->rf_size = sizeof(uint64_t) + 2 * p->nevents * sizeof(uint64_t);
+        p->rf = malloc(p->rf_size);
+        if (p->rf == NULL) die("malloc");
+    }
+    // Get the events from the kernel. Our buffer should be sized exactly right.
+    if (read(p->group_fd, p->rf, p->rf_size) < 0)
+        die("read");
 }
 
 
 // After the measurement, this helper extracts the event counter for
 // the given perf_event_id (which was returned by perf_event_add)
 uint64_t perf_event_get(struct perf_handle *p, perf_event_id id) {
+    // Scan the last group read for the entry with this id. The NULL check covers
+    // a call made before perf_event_stop() has allocated and filled p->rf.
+    for (unsigned i = 0; p->rf != NULL && i < p->rf->nr; i++) {
+        if (p->rf->values[i].id == id) return p->rf->values[i].value;
+    }
     return -1;
 }
 
@@ -96,16 +133,42 @@ int main(int argc, char* argv[]) {
     size_t msize = sizeof(double) * dim * dim;
     printf("matrix_size: %.2f MiB\n", msize / (1024.0 * 1024.0));
 
-    // We provide you with two matrix multiply implementations (see
-    // matrix.c). The optimized variant was created by Ulrich Drepper
-    // and is optimized for cache usage. For a detailed discussion,
-    // we refer you to
-    //
-    // "What Every Programmer Should Know About Memory", Ulrich Drepper, 2007,
-    // https://www.akkadia.org/drepper/cpumemory.pdf
+    // Create and initialize a new perf handle
+    struct perf_handle p;
+    memset(&p, 0, sizeof(p));
+
+    // Create the three perf events we monitor for the matrix multiplication
+    // algorithms: retired instructions, CPU cycles, and cache misses
+    perf_event_id id_instrs =
+        perf_event_add(&p, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS);
+    perf_event_id id_cycles =
+        perf_event_add(&p, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES);
+    perf_event_id id_cache_miss =
+        perf_event_add(&p, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES);
 
-    matrixmul_drepper(dim, A, B, C0);
-    matrixmul_naive(dim, A, B, C1);
+    // Define an anonymous struct type to make our measurement code easier to read
+    struct {
+        char *name;
+        void (*func)(unsigned, double *, double *, double*);
+        double *result;
+    } algorithms[] = {
+        {"drepper", &matrixmul_drepper, C0},
+        {"naive",   &matrixmul_naive,   C1},
+    };
+
+    for (unsigned i = 0; i < ARRAY_SIZE(algorithms); i++) {
+        // Execute the matrix multiplication under perf tracing
+        perf_event_start(&p);
+        algorithms[i].func(dim, A, B, algorithms[i].result);
+        perf_event_stop(&p);
+
+        // Print out the results as a single line
+        double instrs = perf_event_get(&p, id_instrs) / 1e6;
+        double cycles = perf_event_get(&p, id_cycles) / 1e6;
+        double misses = perf_event_get(&p, id_cache_miss) / 1e6;
+        printf("%-10s %8.2fM instr, %8.2f instr-per-cycle, %8.2f miss-per-instr\n",
+               algorithms[i].name, instrs, instrs/cycles, misses / instrs);
+    }
 
     // Sanity Checking: are both result matrices equal (with a margin of 0.1%) ?
     for (unsigned i = 0; i < (dim*dim); i++) {