#ifndef PK_PKFUNCINSTR_H #define PK_PKFUNCINSTR_H #include "./pktmr.h" /* deleteme */ #define PK_FUNCINSTR_CHILDREN_INCREMENT_COUNT 8 struct pk_funcinstr; struct pk_funcinstr { void *fn; struct pk_tmr tmr; struct pk_funcinstr *parent; struct pk_funcinstr *first_child; struct pk_funcinstr **children; size_t n_children; size_t r_children; }; void pk_funcinstr_init(); void pk_funcinstr_teardown(); #if defined(__cplusplus) extern "C" { #endif #if defined(__clang__) // clang #elif defined(__GNUC__) || defined(__GNUG__) void __cyg_profile_func_enter(void* this_fn, void* call_site); void __cyg_profile_func_exit(void* this_fn, void* call_site); #else // other #endif #if defined(__cplusplus) } // extern "C" #endif #endif /* PK_PKFUNCINSTR_H */ #if defined(PK_IMPL_FUNCINSTR) #include #include #include #include // TODO 2025-06-02 JCB // There's some speed improvements that can be made here by growing faster. // OR use some type of bucket // - might be a good chance to isolate some of the pkmem logic #define PK_FUNCINSTR_BKT_START_COUNT 4 #define PK_FUNCINSTR_BKT_GROW_AMOUNT 4 #define PK_FUNCINSTR_BKT_DATA_COUNT 255 struct pk_funcinstr_bkt { uint8_t used_count; uint8_t guard_enter; uint8_t guard_exit; struct timespec reset_time; struct pk_funcinstr data[PK_FUNCINSTR_BKT_DATA_COUNT+1]; }; struct pk_funcinstr_mstr { mtx_t mtx; struct timespec reset_time; struct pk_funcinstr_bkt **buckets; size_t r_buckets; size_t n_buckets; }; // if NULL, get a new bucket (or alloc if full). if !NULL, existing thread static thread_local struct pk_funcinstr_bkt *pk_funcinstr_thrd_bkt = NULL; // last function call (should be NULL or parent of current) static thread_local struct pk_funcinstr *pk_funcinstr_thrd_instr = NULL; static struct pk_funcinstr_mstr thrd_mstr; __attribute__((no_instrument_function)) void pk_funcinstr_init() { assert(thrd_mstr.reset_time.tv_sec == 0); assert(thrd_mstr.reset_time.tv_nsec == 0); assert(thrd_mstr.buckets == NULL); assert(thrd_mstr.r_buckets == 0); assert(thrd_mstr.n_buckets == 0); mtx_init(&thrd_mstr.mtx, mtx_plain); thrd_mstr.r_buckets = PK_FUNCINSTR_BKT_START_COUNT; thrd_mstr.buckets = (struct pk_funcinstr_bkt**)aligned_alloc(alignof(struct pk_funcinstr_bkt *), (sizeof(struct pk_funcinstr_bkt *) * PK_FUNCINSTR_BKT_START_COUNT)); memset(thrd_mstr.buckets, 0, (sizeof(struct pk_funcinstr_bkt *) * PK_FUNCINSTR_BKT_START_COUNT)); clock_gettime(PK_TMR_CLOCK, &thrd_mstr.reset_time); } __attribute__((no_instrument_function)) void pk_funcinstr_teardown() { size_t i, k; mtx_lock(&thrd_mstr.mtx); for (i = 0; i < thrd_mstr.n_buckets; ++i) { struct pk_funcinstr_bkt *bkt = thrd_mstr.buckets[i]; for (k = 0; k < bkt->used_count; ++k) { free(bkt->data[k].children); } } free(thrd_mstr.buckets); thrd_mstr.reset_time.tv_sec = 0; thrd_mstr.reset_time.tv_nsec = 0; thrd_mstr.buckets = NULL; thrd_mstr.r_buckets = 0; thrd_mstr.n_buckets = 0; mtx_unlock(&thrd_mstr.mtx); mtx_destroy(&thrd_mstr.mtx); } #if defined(__clang__) // TODO clang XRay // Come up with pk macros since XRay requires attributes to instrument? #elif defined(__GNUC__) || defined(__GNUG__) #ifndef __USE_GNU #define __USE_GNU #endif #if defined(__cplusplus) #include #endif #include #include #include __attribute__((no_instrument_function)) bool pk_funcinstr_detect_not_initialized() { if (thrd_mstr.buckets == NULL) return true; if (thrd_mstr.r_buckets == 0) return true; return false; } __attribute__((no_instrument_function)) void pk_funcinstr_detect_and_handle_reset() { bool should_hard_reset = false; bool should_reset = pk_funcinstr_thrd_bkt == NULL; if (pk_funcinstr_thrd_bkt != NULL) { should_reset = pk_funcinstr_thrd_bkt->used_count == PK_FUNCINSTR_BKT_DATA_COUNT; should_hard_reset = thrd_mstr.reset_time.tv_sec > pk_funcinstr_thrd_bkt->reset_time.tv_sec; should_hard_reset = should_hard_reset || (thrd_mstr.reset_time.tv_sec == pk_funcinstr_thrd_bkt->reset_time.tv_sec && thrd_mstr.reset_time.tv_nsec > pk_funcinstr_thrd_bkt->reset_time.tv_nsec); } if (should_hard_reset) { pk_funcinstr_thrd_bkt = NULL; pk_funcinstr_thrd_instr = NULL; should_reset = true; } if (should_reset) { if (thrd_mstr.n_buckets == thrd_mstr.r_buckets) { mtx_lock(&thrd_mstr.mtx); thrd_mstr.r_buckets += PK_FUNCINSTR_BKT_GROW_AMOUNT; struct pk_funcinstr_bkt **buckets = (struct pk_funcinstr_bkt**)aligned_alloc(alignof(void *), sizeof(void *) * thrd_mstr.r_buckets); memcpy(buckets, thrd_mstr.buckets, sizeof(void *) * (thrd_mstr.n_buckets)); memset((char*)buckets + (sizeof(void *) * (thrd_mstr.n_buckets)), 0, (sizeof(void *) * thrd_mstr.r_buckets) - sizeof(void *) * (thrd_mstr.n_buckets)); free(thrd_mstr.buckets); thrd_mstr.buckets = buckets; mtx_unlock(&thrd_mstr.mtx); } struct pk_funcinstr_bkt *bkt = (struct pk_funcinstr_bkt *)aligned_alloc(alignof(struct pk_funcinstr_bkt), sizeof(struct pk_funcinstr_bkt)); bkt->used_count = 0; bkt->guard_enter = 0; bkt->guard_exit = 0; bkt->reset_time.tv_sec = 0; bkt->reset_time.tv_nsec = 0; if (pk_funcinstr_thrd_bkt != NULL) { pk_funcinstr_thrd_bkt->guard_enter = 0; pk_funcinstr_thrd_bkt->guard_exit = 0; } pk_funcinstr_thrd_bkt = bkt; mtx_lock(&thrd_mstr.mtx); thrd_mstr.buckets[thrd_mstr.n_buckets++] = bkt; mtx_unlock(&thrd_mstr.mtx); clock_gettime(PK_TMR_CLOCK, &pk_funcinstr_thrd_bkt->reset_time); } } __attribute__((no_instrument_function)) bool pk_funcinstr_should_early_exit() { if (pk_funcinstr_thrd_bkt->guard_enter != 0) return true; if (pk_funcinstr_thrd_bkt->guard_exit != 0) return true; return false; } __attribute__((no_instrument_function)) struct pk_funcinstr *pk_funcinstr_create_funcinstr(void *this_fn) { struct pk_funcinstr *funcinstr = &pk_funcinstr_thrd_bkt->data[pk_funcinstr_thrd_bkt->used_count]; pk_funcinstr_thrd_bkt->used_count++; pk_tmr_start(funcinstr->tmr); funcinstr->fn = this_fn; funcinstr->parent = pk_funcinstr_thrd_instr; funcinstr->children = NULL; funcinstr->first_child = NULL; funcinstr->n_children = 0; funcinstr->r_children = 0; if (pk_funcinstr_thrd_instr != NULL) { if (pk_funcinstr_thrd_instr->first_child == NULL) { // avoid an malloc if n_children will only == 1 pk_funcinstr_thrd_instr->first_child = funcinstr; } else { if (pk_funcinstr_thrd_instr->n_children == pk_funcinstr_thrd_instr->r_children) { pk_funcinstr_thrd_instr->r_children += PK_FUNCINSTR_CHILDREN_INCREMENT_COUNT; struct pk_funcinstr **children = (struct pk_funcinstr **)aligned_alloc(alignof(void *), sizeof(void *) * pk_funcinstr_thrd_instr->r_children); memset((char*)children + (sizeof(void *) * (pk_funcinstr_thrd_instr->n_children)), 0, (sizeof(void *) * pk_funcinstr_thrd_instr->r_children) - sizeof(void *) * (pk_funcinstr_thrd_instr->n_children)); if (pk_funcinstr_thrd_instr->children != NULL) { memcpy(children, pk_funcinstr_thrd_instr->children, sizeof(void *) * pk_funcinstr_thrd_instr->n_children); free(pk_funcinstr_thrd_instr->children); } pk_funcinstr_thrd_instr->children = children; if (pk_funcinstr_thrd_instr->n_children == 0) { pk_funcinstr_thrd_instr->children[0] = pk_funcinstr_thrd_instr->first_child; pk_funcinstr_thrd_instr->n_children++; } } pk_funcinstr_thrd_instr->children[pk_funcinstr_thrd_instr->n_children] = funcinstr; pk_funcinstr_thrd_instr->n_children++; } } return funcinstr; } __attribute__((no_instrument_function)) void __cyg_profile_func_enter(void* this_fn, void* call_site) { (void)call_site; if (pk_funcinstr_detect_not_initialized()) return; pk_funcinstr_detect_and_handle_reset(); if (pk_funcinstr_should_early_exit()) return; pk_funcinstr_thrd_bkt->guard_enter++; pk_funcinstr_thrd_instr = pk_funcinstr_create_funcinstr(this_fn); pk_funcinstr_thrd_bkt->guard_enter = 0; } __attribute__((no_instrument_function)) void __cyg_profile_func_exit(void* this_fn, void* call_site) { (void)call_site; if (pk_funcinstr_detect_not_initialized()) return; pk_funcinstr_detect_and_handle_reset(); if (pk_funcinstr_should_early_exit()) return; if (pk_funcinstr_thrd_instr == NULL) return; // exit called before enter? pk_funcinstr_thrd_bkt->guard_exit++; Dl_info info; if (this_fn != pk_funcinstr_thrd_instr->fn) { int64_t i = (int64_t)pk_funcinstr_thrd_bkt->used_count - 1; for (; i > -1; --i) { if (pk_funcinstr_thrd_bkt->data[i].fn == this_fn) { if (pk_funcinstr_thrd_bkt->data[i].tmr.e.tv_sec == 0) { pk_funcinstr_thrd_instr = &pk_funcinstr_thrd_bkt->data[i]; break; } } } } if (this_fn != pk_funcinstr_thrd_instr->fn) { if (pk_funcinstr_thrd_instr->parent == NULL) { struct pk_tmr tmr = pk_funcinstr_thrd_instr->tmr; pk_funcinstr_thrd_instr = pk_funcinstr_create_funcinstr(this_fn); pk_funcinstr_thrd_instr->tmr = tmr; fprintf(stdout, "[pkfuncinstr] func mismatch; Parent func? Duration not accurate."); } else { fprintf(stderr, "[pkfuncinstr] func mismatch. Last: '"); if (dladdr(pk_funcinstr_thrd_instr->fn, &info) != 0) { fprintf(stderr, "%s", info.dli_sname); } else { fprintf(stderr, "(unknown)"); } fprintf(stderr, "'. Current: '"); if (dladdr(this_fn, &info) != 0) { fprintf(stderr, "%s'.\n", info.dli_sname); } else { fprintf(stderr, "(unknown)'.\n"); } pk_funcinstr_thrd_bkt->guard_exit=0; return; } } pk_tmr_stop(pk_funcinstr_thrd_instr->tmr); if (dladdr(this_fn, &info) != 0) { int depth = 0; // TODO track depth in a better way struct pk_funcinstr *p = pk_funcinstr_thrd_instr->parent; while (p != NULL) { depth += 1; p = p->parent; } char *demangled = NULL; if (info.dli_sname != NULL) { #if defined(__cplusplus) demangled = abi::__cxa_demangle(info.dli_sname, NULL, NULL, NULL); #endif } fprintf(stdout, "[pkfuncinstr] %p %*s %s took %.6f ms\n" ,this_fn ,depth, "" ,demangled != NULL ? demangled : info.dli_sname != NULL ? info.dli_sname : "???" ,pk_tmr_duration_dbl_mili(pk_funcinstr_thrd_instr->tmr) ); if (demangled != NULL) free(demangled); } pk_funcinstr_thrd_bkt->guard_exit=0; pk_funcinstr_thrd_instr = pk_funcinstr_thrd_instr->parent; } #else // other #endif #endif /* PK_IMPL_FUNCINSTR */