diff --git a/ddprof-lib/src/main/cpp/arguments.cpp b/ddprof-lib/src/main/cpp/arguments.cpp index 72b8aec22..52826554f 100644 --- a/ddprof-lib/src/main/cpp/arguments.cpp +++ b/ddprof-lib/src/main/cpp/arguments.cpp @@ -374,6 +374,15 @@ Error Arguments::parse(const char *args) { } } + CASE("nativemem") + _nativemem = value == NULL ? 0 : parseUnits(value, BYTES); + if (_nativemem < 0) { + msg = "nativemem must be >= 0"; + } + + CASE("nofree") + _nofree = true; + DEFAULT() if (_unknown_arg == NULL) _unknown_arg = arg; @@ -385,7 +394,7 @@ Error Arguments::parse(const char *args) { return Error(msg); } - if (_event == NULL && _cpu < 0 && _wall < 0 && _memory < 0) { + if (_event == NULL && _cpu < 0 && _wall < 0 && _memory < 0 && _nativemem < 0) { _event = EVENT_CPU; } diff --git a/ddprof-lib/src/main/cpp/arguments.h b/ddprof-lib/src/main/cpp/arguments.h index 3f2542705..743b91403 100644 --- a/ddprof-lib/src/main/cpp/arguments.h +++ b/ddprof-lib/src/main/cpp/arguments.h @@ -1,5 +1,6 @@ /* * Copyright 2017 Andrei Pangin + * Copyright 2026, Datadog, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,6 +35,7 @@ const char *const EVENT_ALLOC = "alloc"; const char *const EVENT_WALL = "wall"; const char *const EVENT_ITIMER = "itimer"; const char *const EVENT_CTIMER = "ctimer"; +const char *const EVENT_NATIVEMEM = "nativemem"; enum Action { ACTION_NONE, @@ -174,6 +176,8 @@ class Arguments { double _live_samples_ratio; bool _record_heap_usage; bool _gc_generations; + long _nativemem; + bool _nofree; int _jstackdepth; int _safe_mode; StackWalkFeatures _features; @@ -208,6 +212,8 @@ class Arguments { _live_samples_ratio(0.1), // default to liveness-tracking 10% of the allocation samples _record_heap_usage(false), _gc_generations(false), + _nativemem(-1), + _nofree(false), _jstackdepth(DEFAULT_JSTACKDEPTH), _safe_mode(0), _features{1, 1, 1, 1, 1, 1}, diff --git a/ddprof-lib/src/main/cpp/codeCache.cpp b/ddprof-lib/src/main/cpp/codeCache.cpp index a8a360160..fcd4d3739 100644 --- a/ddprof-lib/src/main/cpp/codeCache.cpp +++ b/ddprof-lib/src/main/cpp/codeCache.cpp @@ -1,5 +1,6 @@ /* * Copyright The async-profiler authors + * Copyright 2026, Datadog, Inc. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -302,6 +303,11 @@ void CodeCache::saveImport(ImportId id, void** entry) { void CodeCache::addImport(void **entry, const char *name) { switch (name[0]) { + case 'a': + if (strcmp(name, "aligned_alloc") == 0) { + saveImport(im_aligned_alloc, entry); + } + break; case 'c': if (strcmp(name, "calloc") == 0) { saveImport(im_calloc, entry); @@ -331,6 +337,8 @@ void CodeCache::addImport(void **entry, const char *name) { saveImport(im_pthread_setspecific, entry); } else if (strcmp(name, "poll") == 0) { saveImport(im_poll, entry); + } else if (strcmp(name, "posix_memalign") == 0) { + saveImport(im_posix_memalign, entry); } break; case 'r': diff --git a/ddprof-lib/src/main/cpp/codeCache.h b/ddprof-lib/src/main/cpp/codeCache.h index 9192719ff..19c53f2b9 100644 --- a/ddprof-lib/src/main/cpp/codeCache.h +++ b/ddprof-lib/src/main/cpp/codeCache.h @@ -1,5 +1,6 @@ /* * Copyright The async-profiler authors + * Copyright 2026, Datadog, Inc. * SPDX-License-Identifier: Apache-2.0 */ @@ -31,6 +32,8 @@ enum ImportId { im_calloc, im_realloc, im_free, + im_posix_memalign, + im_aligned_alloc, NUM_IMPORTS }; diff --git a/ddprof-lib/src/main/cpp/event.h b/ddprof-lib/src/main/cpp/event.h index e9363165f..df2ba29e7 100644 --- a/ddprof-lib/src/main/cpp/event.h +++ b/ddprof-lib/src/main/cpp/event.h @@ -1,5 +1,6 @@ /* * Copyright 2020 Andrei Pangin + * Copyright 2026, Datadog, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -88,6 +89,13 @@ class ObjectLivenessEvent : public Event { Context _ctx; }; +class MallocEvent : public Event { +public: + u64 _start_time; + uintptr_t _address; + u64 _size; // 0 for free events +}; + class WallClockEpochEvent { public: bool _dirty; diff --git a/ddprof-lib/src/main/cpp/flightRecorder.cpp b/ddprof-lib/src/main/cpp/flightRecorder.cpp index 78d1b6711..b7c5bf31a 100644 --- a/ddprof-lib/src/main/cpp/flightRecorder.cpp +++ b/ddprof-lib/src/main/cpp/flightRecorder.cpp @@ -898,6 +898,10 @@ void Recording::writeSettings(Buffer *buf, Arguments &args) { writeBoolSetting(buf, T_ALLOC, "enabled", args._record_allocations); writeBoolSetting(buf, T_HEAP_LIVE_OBJECT, "enabled", args._record_liveness); + writeBoolSetting(buf, T_MALLOC, "enabled", args._nativemem >= 0); + if (args._nativemem >= 0) { + writeIntSetting(buf, T_MALLOC, "nativemem", args._nativemem); + } writeBoolSetting(buf, T_ACTIVE_RECORDING, "debugSymbols", VMStructs::libjvm()->hasDebugSymbols()); @@ -1563,6 +1567,21 @@ void Recording::recordAllocation(RecordingBuffer *buf, int tid, flushIfNeeded(buf); } +void Recording::recordMallocSample(Buffer *buf, int tid, u64 call_trace_id, + MallocEvent *event) { + int start = buf->skip(1); + buf->putVar64(event->_size != 0 ? 
T_MALLOC : T_FREE); + buf->putVar64(event->_start_time); + buf->putVar32(tid); + buf->putVar64(call_trace_id); + buf->putVar64(event->_address); + if (event->_size != 0) { + buf->putVar64(event->_size); + } + writeEventSizePrefix(buf, start); + flushIfNeeded(buf); +} + void Recording::recordHeapLiveObject(Buffer *buf, int tid, u64 call_trace_id, ObjectLivenessEvent *event) { int start = buf->skip(1); @@ -1804,6 +1823,9 @@ void FlightRecorder::recordEvent(int lock_index, int tid, u64 call_trace_id, case BCI_PARK: rec->recordThreadPark(buf, tid, call_trace_id, (LockEvent *)event); break; + case BCI_NATIVE_MALLOC: + rec->recordMallocSample(buf, tid, call_trace_id, (MallocEvent *)event); + break; } rec->flushIfNeeded(buf); rec->addThread(lock_index, tid); diff --git a/ddprof-lib/src/main/cpp/flightRecorder.h b/ddprof-lib/src/main/cpp/flightRecorder.h index c1ab88262..65c84d36e 100644 --- a/ddprof-lib/src/main/cpp/flightRecorder.h +++ b/ddprof-lib/src/main/cpp/flightRecorder.h @@ -280,6 +280,8 @@ class Recording { void recordQueueTime(Buffer *buf, int tid, QueueTimeEvent *event); void recordAllocation(RecordingBuffer *buf, int tid, u64 call_trace_id, AllocEvent *event); + void recordMallocSample(Buffer *buf, int tid, u64 call_trace_id, + MallocEvent *event); void recordHeapLiveObject(Buffer *buf, int tid, u64 call_trace_id, ObjectLivenessEvent *event); void recordMonitorBlocked(Buffer *buf, int tid, u64 call_trace_id, diff --git a/ddprof-lib/src/main/cpp/jfrMetadata.cpp b/ddprof-lib/src/main/cpp/jfrMetadata.cpp index 6991a8a12..d36d902d7 100644 --- a/ddprof-lib/src/main/cpp/jfrMetadata.cpp +++ b/ddprof-lib/src/main/cpp/jfrMetadata.cpp @@ -1,5 +1,6 @@ /* * Copyright 2020 Andrei Pangin + * Copyright 2026, Datadog, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -286,6 +287,21 @@ void JfrMetadata::initialize( << field("name", T_STRING, "Name") << field("count", T_LONG, "Count")) + << (type("profiler.Malloc", T_MALLOC, "malloc") + << category("Java Virtual Machine", "Native Memory") + << field("startTime", T_LONG, "Start Time", F_TIME_TICKS) + << field("eventThread", T_THREAD, "Event Thread", F_CPOOL) + << field("stackTrace", T_STACK_TRACE, "Stack Trace", F_CPOOL) + << field("address", T_LONG, "Address", F_ADDRESS) + << field("size", T_LONG, "Size", F_BYTES)) + + << (type("profiler.Free", T_FREE, "free") + << category("Java Virtual Machine", "Native Memory") + << field("startTime", T_LONG, "Start Time", F_TIME_TICKS) + << field("eventThread", T_THREAD, "Event Thread", F_CPOOL) + << field("stackTrace", T_STACK_TRACE, "Stack Trace", F_CPOOL) + << field("address", T_LONG, "Address", F_ADDRESS)) + << (type("jdk.OSInformation", T_OS_INFORMATION, "OS Information") << category("Operating System") << field("startTime", T_LONG, "Start Time", F_TIME_TICKS) diff --git a/ddprof-lib/src/main/cpp/jfrMetadata.h b/ddprof-lib/src/main/cpp/jfrMetadata.h index 77da96d3f..a826445f2 100644 --- a/ddprof-lib/src/main/cpp/jfrMetadata.h +++ b/ddprof-lib/src/main/cpp/jfrMetadata.h @@ -1,5 +1,6 @@ /* * Copyright 2020 Andrei Pangin + * Copyright 2026, Datadog, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,6 +79,8 @@ enum JfrType { T_DATADOG_CLASSREF_CACHE = 124, T_DATADOG_COUNTER = 125, T_UNWIND_FAILURE = 126, + T_MALLOC = 127, + T_FREE = 128, T_ANNOTATION = 200, T_LABEL = 201, T_CATEGORY = 202, diff --git a/ddprof-lib/src/main/cpp/mallocTracer.cpp b/ddprof-lib/src/main/cpp/mallocTracer.cpp new file mode 100644 index 000000000..dfc9c41f1 --- /dev/null +++ b/ddprof-lib/src/main/cpp/mallocTracer.cpp @@ -0,0 +1,269 @@ +/* + * Copyright The async-profiler authors + * Copyright 2026, Datadog, Inc. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include "codeCache.h" +#include "libraries.h" +#include "mallocTracer.h" +#include "os.h" +#include "profiler.h" +#include "symbols.h" +#include "tsc.h" +#include "vmEntry.h" + +#ifdef __clang__ +# define NO_OPTIMIZE __attribute__((optnone)) +#else +# define NO_OPTIMIZE __attribute__((optimize("-fno-omit-frame-pointer,-fno-optimize-sibling-calls"))) +#endif + +#define SAVE_IMPORT(FUNC) \ + do { \ + void** _entry = lib->findImport(im_##FUNC); \ + if (_entry != NULL) _orig_##FUNC = (decltype(_orig_##FUNC))*_entry; \ + } while (0) + +static void* (*_orig_malloc)(size_t); +static void (*_orig_free)(void*); +static void* (*_orig_calloc)(size_t, size_t); +static void* (*_orig_realloc)(void*, size_t); +static int (*_orig_posix_memalign)(void**, size_t, size_t); +static void* (*_orig_aligned_alloc)(size_t, size_t); + +extern "C" void* malloc_hook(size_t size) { + void* ret = _orig_malloc(size); + if (MallocTracer::running() && ret && size) { + MallocTracer::recordMalloc(ret, size); + } + return ret; +} + +extern "C" void* calloc_hook(size_t num, size_t size) { + void* ret = _orig_calloc(num, size); + if (MallocTracer::running() && ret && num && size) { + MallocTracer::recordMalloc(ret, num * size); + } + return ret; +} + +// Make sure this is not optimized away (function-scoped -fno-optimize-sibling-calls) +extern "C" NO_OPTIMIZE +void* calloc_hook_dummy(size_t num, size_t size) { + return _orig_calloc(num, size); +} + +extern "C" void* realloc_hook(void* addr, size_t size) { + void* ret = _orig_realloc(addr, size); + if (MallocTracer::running()) { + // On POSIX, realloc(ptr, 0) may return NULL and free ptr. + // Only record the free if allocation didn't simply fail (size > 0 with NULL ret = ENOMEM). 
+ if (addr && !MallocTracer::nofree() && (ret != NULL || size == 0)) { + MallocTracer::recordFree(addr); + } + if (ret != NULL && size > 0) { + MallocTracer::recordMalloc(ret, size); + } + } + return ret; +} + +extern "C" void free_hook(void* addr) { + _orig_free(addr); + if (MallocTracer::running() && !MallocTracer::nofree() && addr) { + MallocTracer::recordFree(addr); + } +} + +extern "C" int posix_memalign_hook(void** memptr, size_t alignment, size_t size) { + int ret = _orig_posix_memalign(memptr, alignment, size); + if (MallocTracer::running() && ret == 0 && memptr && *memptr && size) { + MallocTracer::recordMalloc(*memptr, size); + } + return ret; +} + +// Make sure this is not optimized away (function-scoped -fno-optimize-sibling-calls) +extern "C" NO_OPTIMIZE +int posix_memalign_hook_dummy(void** memptr, size_t alignment, size_t size) { + return _orig_posix_memalign(memptr, alignment, size); +} + +extern "C" void* aligned_alloc_hook(size_t alignment, size_t size) { + void* ret = _orig_aligned_alloc(alignment, size); + if (MallocTracer::running() && ret && size) { + MallocTracer::recordMalloc(ret, size); + } + return ret; +} + +u64 MallocTracer::_interval; +bool MallocTracer::_nofree; +volatile u64 MallocTracer::_allocated_bytes; + +Mutex MallocTracer::_patch_lock; +int MallocTracer::_patched_libs = 0; +bool MallocTracer::_initialized = false; +volatile bool MallocTracer::_running = false; + +static pthread_t _current_thread; +static bool _nested_malloc = false; + +// Test if calloc() implementation calls malloc() +static void* nested_malloc_hook(size_t size) { + if (pthread_self() == _current_thread) { + _nested_malloc = true; + } + return _orig_malloc(size); +} + +// In some implementations, specifically on musl, calloc() calls malloc() internally, +// and posix_memalign() calls aligned_alloc(). Detect such cases to prevent double-accounting. 
+static void detectNestedMalloc() { + if (_orig_malloc == NULL || _orig_calloc == NULL) { + return; + } + CodeCache* libc = Profiler::instance()->findLibraryByAddress((void*)_orig_calloc); + if (libc == NULL) { + return; + } + + libc->patchImport(im_malloc, (void*)nested_malloc_hook); + + _current_thread = pthread_self(); + free(_orig_calloc(1, 1)); + _current_thread = pthread_t(0); + + // Restore original malloc so libc doesn't carry the probe hook until patchLibraries() runs. + // _orig_malloc != NULL is guaranteed by the early-return guard above. + libc->patchImport(im_malloc, (void*)_orig_malloc); +} + +// Call each intercepted function at least once to ensure its GOT entry is updated +static void resolveMallocSymbols() { + static volatile intptr_t sink; + + void* p0 = malloc(1); + void* p1 = realloc(p0, 2); + void* p2 = calloc(1, 1); + void* p3 = aligned_alloc(1, 1); + void* p4 = NULL; + if (posix_memalign(&p4, sizeof(void*), sizeof(void*)) == 0) free(p4); + free(p3); + free(p2); + free(p1); + + sink = (intptr_t)p1 + (intptr_t)p2 + (intptr_t)p3 + (intptr_t)p4; +} + +void MallocTracer::initialize() { + CodeCache* lib = Profiler::instance()->findLibraryByAddress((void*)MallocTracer::initialize); + if (lib == NULL) { + return; + } + + resolveMallocSymbols(); + + SAVE_IMPORT(malloc); + SAVE_IMPORT(free); + SAVE_IMPORT(calloc); + SAVE_IMPORT(realloc); + SAVE_IMPORT(posix_memalign); + SAVE_IMPORT(aligned_alloc); + + detectNestedMalloc(); + + lib->mark( + [](const char* s) -> bool { + return strcmp(s, "malloc_hook") == 0 + || strcmp(s, "calloc_hook") == 0 + || strcmp(s, "realloc_hook") == 0 + || strcmp(s, "free_hook") == 0 + || strcmp(s, "posix_memalign_hook") == 0 + || strcmp(s, "aligned_alloc_hook") == 0; + }, + MARK_ASYNC_PROFILER); +} + +// To avoid complexity in hooking and tracking reentrancy, a TLS-based approach is not used. +// Reentrant allocation calls would result in double-accounting. 
However, this does not impact +// the leak detector, as it correctly tracks memory as freed regardless of how many times +// recordMalloc is called with the same address. +void MallocTracer::patchLibraries() { + MutexLocker ml(_patch_lock); + + const CodeCacheArray& native_libs = Libraries::instance()->native_libs(); + int native_lib_count = native_libs.count(); + + while (_patched_libs < native_lib_count) { + CodeCache* cc = native_libs[_patched_libs++]; + + UnloadProtection handle(cc); + if (!handle.isValid()) { + continue; + } + + if (_orig_malloc) cc->patchImport(im_malloc, (void*)malloc_hook); + if (_orig_realloc) cc->patchImport(im_realloc, (void*)realloc_hook); + if (_orig_free) cc->patchImport(im_free, (void*)free_hook); + if (_orig_aligned_alloc) cc->patchImport(im_aligned_alloc, (void*)aligned_alloc_hook); + + if (_nested_malloc) { + // Use dummy hooks to prevent double-accounting. Dummy frames are introduced + // to preserve the frame link to the original caller. + if (_orig_calloc) cc->patchImport(im_calloc, (void*)calloc_hook_dummy); + if (_orig_posix_memalign) cc->patchImport(im_posix_memalign, (void*)posix_memalign_hook_dummy); + } else { + if (_orig_calloc) cc->patchImport(im_calloc, (void*)calloc_hook); + if (_orig_posix_memalign) cc->patchImport(im_posix_memalign, (void*)posix_memalign_hook); + } + } +} + +void MallocTracer::recordMalloc(void* address, size_t size) { + if (updateCounter(_allocated_bytes, size, _interval)) { + MallocEvent event; + event._start_time = TSC::ticks(); + event._address = (uintptr_t)address; + event._size = size; + + Profiler::instance()->recordSample(NULL, size, OS::threadId(), BCI_NATIVE_MALLOC, 0, &event); + } +} + +void MallocTracer::recordFree(void* address) { + MallocEvent event; + event._start_time = TSC::ticks(); + event._address = (uintptr_t)address; + event._size = 0; + + Profiler::instance()->recordEventOnly(BCI_NATIVE_MALLOC, &event); +} + +Error MallocTracer::start(Arguments& args) { + _interval = 
args._nativemem > 0 ? args._nativemem : 0; + _nofree = args._nofree; + _allocated_bytes = 0; + + if (!_initialized) { + initialize(); + _initialized = true; + } + + // Patch first, then enable recording so hooks never run without valid _orig_* pointers. + patchLibraries(); + _running = true; + + return Error::OK; +} + +void MallocTracer::stop() { + // Ideally, we should reset original malloc entries, but it's not currently safe + // in the view of library unloading. Consider using dl_iterate_phdr. + _running = false; +} diff --git a/ddprof-lib/src/main/cpp/mallocTracer.h b/ddprof-lib/src/main/cpp/mallocTracer.h new file mode 100644 index 000000000..2184d5cef --- /dev/null +++ b/ddprof-lib/src/main/cpp/mallocTracer.h @@ -0,0 +1,55 @@ +/* + * Copyright The async-profiler authors + * Copyright 2026, Datadog, Inc. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef _MALLOCTRACER_H +#define _MALLOCTRACER_H + +#include +#include "engine.h" +#include "event.h" +#include "mutex.h" + +class MallocTracer : public Engine { + private: + static u64 _interval; + static bool _nofree; + static volatile u64 _allocated_bytes; + + static Mutex _patch_lock; + static int _patched_libs; + static bool _initialized; + static volatile bool _running; + + static void initialize(); + static void patchLibraries(); + + public: + const char* name() { + return "MallocTracer"; + } + + Error start(Arguments& args); + void stop(); + + static inline bool running() { + return _running; + } + + static inline void installHooks() { + if (running()) { + patchLibraries(); + } + } + + static inline bool nofree() { + return _nofree; + } + + static void recordMalloc(void* address, size_t size); + static void recordFree(void* address); +}; + +#endif // _MALLOCTRACER_H diff --git a/ddprof-lib/src/main/cpp/profiler.cpp b/ddprof-lib/src/main/cpp/profiler.cpp index 7158b145c..dedb3f80f 100644 --- a/ddprof-lib/src/main/cpp/profiler.cpp +++ b/ddprof-lib/src/main/cpp/profiler.cpp @@ -6,6 +6,7 @@ #include 
"profiler.h" #include "asyncSampleMutex.h" +#include "mallocTracer.h" #include "context.h" #include "guards.h" #include "common.h" @@ -52,6 +53,7 @@ static void (*orig_segvHandler)(int signo, siginfo_t *siginfo, void *ucontext); static void (*orig_busHandler)(int signo, siginfo_t *siginfo, void *ucontext); static Engine noop_engine; +static MallocTracer malloc_tracer; static PerfEvents perf_events; static WallClockASGCT wall_asgct_engine; static WallClockJVMTI wall_jvmti_engine; @@ -298,7 +300,7 @@ int Profiler::getNativeTrace(void *ucontext, ASGCT_CallFrame *frames, bool *truncated, int lock_index) { if (_cstack == CSTACK_NO || (event_type == BCI_ALLOC || event_type == BCI_ALLOC_OUTSIDE_TLAB) || - (event_type != BCI_CPU && event_type != BCI_WALL && + (event_type != BCI_CPU && event_type != BCI_WALL && event_type != BCI_NATIVE_MALLOC && _cstack == CSTACK_DEFAULT)) { return 0; } @@ -834,7 +836,15 @@ void Profiler::recordSample(void *ucontext, u64 counter, int tid, &java_ctx, &truncated, lock_index); if (num_frames < _max_stack_depth) { int max_remaining = _max_stack_depth - num_frames; - if (_features.mixed) { + if (event_type == BCI_NATIVE_MALLOC && _cstack >= CSTACK_VM) { + // Walk the Java stack for native malloc events. + // ucontext is NULL here (no signal context); walkVM starts from callerPC() and + // walks native frames via DWARF until it fails, then retries from the thread's + // JavaFrameAnchor (lastJavaPC/lastJavaSP/lastJavaFP) to reach Java frames. 
+ int vm_frames = StackWalker::walkVM(ucontext, frames + num_frames, max_remaining, + _features, eventTypeFromBCI(event_type), lock_index, &truncated); + num_frames += vm_frames; + } else if (_features.mixed) { int vm_start = num_frames; int vm_frames = StackWalker::walkVM(ucontext, frames + vm_start, max_remaining, _features, eventTypeFromBCI(event_type), lock_index, &truncated); num_frames += vm_frames; @@ -880,6 +890,21 @@ void Profiler::recordSample(void *ucontext, u64 counter, int tid, _locks[lock_index].unlock(); } +void Profiler::recordEventOnly(int event_type, Event *event) { + if (!_jfr.active()) { + return; + } + int tid = OS::threadId(); + u32 lock_index = getLockIndex(tid); + if (!_locks[lock_index].tryLock() && + !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() && + !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) { + return; + } + _jfr.recordEvent(lock_index, tid, 0, event_type, event); + _locks[lock_index].unlock(); +} + void Profiler::recordWallClockEpoch(int tid, WallClockEpochEvent *event) { u32 lock_index = getLockIndex(tid); if (!_locks[lock_index].tryLock() && @@ -985,6 +1010,7 @@ void *Profiler::dlopen_hook(const char *filename, int flags) { // Static function of Profiler -> can not use the instance variable _libs // Since Libraries is a singleton, this does not matter Libraries::instance()->updateSymbols(false); + MallocTracer::installHooks(); // Extract build-ids for newly loaded libraries if remote symbolication is enabled Profiler* profiler = instance(); if (profiler != nullptr && profiler->_remote_symbolication) { @@ -1281,8 +1307,9 @@ Error Profiler::start(Arguments &args, bool reset) { (args._cpu >= 0 ? EM_CPU : 0) | (args._wall >= 0 ? EM_WALL : 0) | (args._record_allocations || args._record_liveness || args._gc_generations ? EM_ALLOC - : 0); - + : 0) | + (args._nativemem >= 0 ? 
EM_NATIVEMEM : 0); + // Check if signal-based profiling is requested without TLS priming if (_event_mask & (EM_CPU | EM_WALL)) { return Error("CRITICAL: Signal-based profiling (CPU/Wall) requested but TLS priming failed. " @@ -1303,9 +1330,10 @@ Error Profiler::start(Arguments &args, bool reset) { (args._cpu >= 0 ? EM_CPU : 0) | (args._wall >= 0 ? EM_WALL : 0) | (args._record_allocations || args._record_liveness || args._gc_generations ? EM_ALLOC - : 0); + : 0) | + (args._nativemem >= 0 ? EM_NATIVEMEM : 0); } - + if (_event_mask == 0) { return Error("No profiling events specified"); } @@ -1394,8 +1422,9 @@ Error Profiler::start(Arguments &args, bool reset) { Log::warn("Branch stack is supported only with PMU events"); } else if (_cstack == CSTACK_VM) { if (!VMStructs::hasStackStructs()) { - return Error( - "VMStructs stack walking is not supported on this JVM/platform"); + _cstack = CSTACK_DEFAULT; + Log::warn("VMStructs stack walking is not supported on this JVM/platform, " + "defaulting to frame pointer unwinding."); } } @@ -1453,6 +1482,15 @@ Error Profiler::start(Arguments &args, bool reset) { } } } + if (_event_mask & EM_NATIVEMEM) { + error = malloc_tracer.start(args); + if (error) { + Log::warn("%s", error.message()); + error = Error::OK; // recoverable + } else { + activated |= EM_NATIVEMEM; + } + } if (activated) { switchThreadEvents(JVMTI_ENABLE); @@ -1491,6 +1529,8 @@ Error Profiler::stop() { if (_event_mask & EM_ALLOC) _alloc_engine->stop(); + if (_event_mask & EM_NATIVEMEM) + malloc_tracer.stop(); if (_event_mask & EM_WALL) _wall_engine->stop(); if (_event_mask & EM_CPU) @@ -1542,6 +1582,9 @@ Error Profiler::check(Arguments &args) { _alloc_engine = selectAllocEngine(args); error = _alloc_engine->check(args); } + if (!error && args._nativemem >= 0) { + error = malloc_tracer.check(args); + } if (!error) { if (args._cstack == CSTACK_DWARF && !DWARF_SUPPORTED) { return Error("DWARF unwinding is not supported on this platform"); diff --git 
a/ddprof-lib/src/main/cpp/profiler.h b/ddprof-lib/src/main/cpp/profiler.h index 0977a0a95..2d70a391a 100644 --- a/ddprof-lib/src/main/cpp/profiler.h +++ b/ddprof-lib/src/main/cpp/profiler.h @@ -88,6 +88,8 @@ inline EventType eventTypeFromBCI(jint bci_type) { return LOCK_SAMPLE; case BCI_PARK: return PARK_SAMPLE; + case BCI_NATIVE_MALLOC: + return MALLOC_SAMPLE; default: // For unknown or invalid BCI types, default to EXECUTION_SAMPLE // This maintains backward compatibility and prevents undefined behavior @@ -367,6 +369,7 @@ class alignas(alignof(SpinLock)) Profiler { ASGCT_CallFrame *frames, int lock_index); void recordSample(void *ucontext, u64 weight, int tid, jint event_type, u64 call_trace_id, Event *event); + void recordEventOnly(int event_type, Event *event); u64 recordJVMTISample(u64 weight, int tid, jthread thread, jint event_type, Event *event, bool deferred); void recordDeferredSample(int tid, u64 call_trace_id, jint event_type, Event *event); void recordExternalSample(u64 weight, int tid, int num_frames, diff --git a/ddprof-lib/src/main/cpp/stackWalker.cpp b/ddprof-lib/src/main/cpp/stackWalker.cpp index f009a0bfd..d3ea38bcf 100644 --- a/ddprof-lib/src/main/cpp/stackWalker.cpp +++ b/ddprof-lib/src/main/cpp/stackWalker.cpp @@ -547,7 +547,6 @@ __attribute__((no_sanitize("address"))) int StackWalker::walkVM(void* ucontext, // already used the anchor; disable it anchor = NULL; if (sp < prev_sp || sp >= bottom || !aligned(sp)) { - fillFrame(frames[depth++], BCI_ERROR, "break_no_anchor"); break; } // we restored from Java frame; clean the prev_native_pc @@ -572,7 +571,6 @@ __attribute__((no_sanitize("address"))) int StackWalker::walkVM(void* ucontext, } } } - fillFrame(frames[depth++], BCI_ERROR, "break_no_anchor"); break; } fillFrame(frames[depth++], frame_bci, (void*)method_name); diff --git a/ddprof-lib/src/main/cpp/vmEntry.cpp b/ddprof-lib/src/main/cpp/vmEntry.cpp index c0700a58d..dc00a82f5 100644 --- a/ddprof-lib/src/main/cpp/vmEntry.cpp +++ 
b/ddprof-lib/src/main/cpp/vmEntry.cpp @@ -318,14 +318,17 @@ bool VM::initShared(JavaVM* vm) { // Mark thread entry points for all JVMs (critical for correct stack unwinding) lib->mark(isThreadEntry, MARK_THREAD_ENTRY); - // Also mark libc/pthread libraries which contain thread start/exit points - CodeCache* libc = libraries->findJvmLibrary("libc"); - if (libc != NULL) { - libc->mark(isThreadEntry, MARK_THREAD_ENTRY); - } - CodeCache* libpthread = libraries->findJvmLibrary("libpthread"); - if (libpthread != NULL) { - libpthread->mark(isThreadEntry, MARK_THREAD_ENTRY); + // Mark OS-level pthread entry points across ALL loaded native libraries. + // On glibc these live in libc.so.6 or libpthread.so.0 (merged in glibc 2.34+); + // on musl in libc.musl-<arch>.so.1; on Rust they may be in the app binary itself. + // Scanning all libs avoids fragile name-based lookup (findLibraryByName uses a + // prefix match that can return the wrong library, e.g. libcap instead of libc). + // walkVM stops walking when it reaches the top of a pure-native thread + // stack without finding an anchor; marking start_thread/thread_start here gives + // the walker a clean stopping point for any pthread-managed thread. 
+ const CodeCacheArray& all_native_libs = libraries->native_libs(); + for (int i = 0; i < all_native_libs.count(); i++) { + all_native_libs[i]->mark(isThreadEntry, MARK_THREAD_ENTRY); } if (isOpenJ9()) { diff --git a/ddprof-lib/src/main/cpp/vmEntry.h b/ddprof-lib/src/main/cpp/vmEntry.h index 73f52f2fd..663a07478 100644 --- a/ddprof-lib/src/main/cpp/vmEntry.h +++ b/ddprof-lib/src/main/cpp/vmEntry.h @@ -33,6 +33,7 @@ enum ASGCT_CallFrameType { BCI_THREAD_ID = -17, // method_id designates a thread BCI_ERROR = -18, // method_id is an error string BCI_NATIVE_FRAME_REMOTE = -19, // method_id points to RemoteFrameInfo for remote symbolication + BCI_NATIVE_MALLOC = -20, // native malloc/free sample (size stored in counter) }; // See hotspot/src/share/vm/prims/forte.cpp diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/context/TagContextTest.java b/ddprof-test/src/test/java/com/datadoghq/profiler/context/TagContextTest.java index 244865576..3ed0b042d 100644 --- a/ddprof-test/src/test/java/com/datadoghq/profiler/context/TagContextTest.java +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/context/TagContextTest.java @@ -161,6 +161,9 @@ private void checkTagValues(ContextSetter contextSetter, String contextAttribute @Override protected String getProfilerCommand() { - return "wall=1ms,filter=0,attributes=tag1;tag2;tag3"; + // Use cstack=default explicitly: this test covers context-tagging correctness, + // not stack-walking mode. CSTACK_VM is the global default but is unreliable + // on certain CI runners (musl x64), so we pin to a stable mode here. 
+ return "wall=1ms,filter=0,cstack=default,attributes=tag1;tag2;tag3"; } } diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/junit/CStackInjector.java b/ddprof-test/src/test/java/com/datadoghq/profiler/junit/CStackInjector.java index ce021149c..75dfda181 100644 --- a/ddprof-test/src/test/java/com/datadoghq/profiler/junit/CStackInjector.java +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/junit/CStackInjector.java @@ -50,9 +50,17 @@ public Stream provideTestTemplateInvocationContex // zing is a bit iffy when using anything but 'no' for cstack return Stream.of(new ParameterizedTestContext("no", retryCount)); } else { - return Stream.of(valueSource.strings()). - filter(CStackInjector::isModeSafe). - map(param -> new ParameterizedTestContext(param, retryCount)); + List safeValues = Stream.of(valueSource.strings()) + .filter(CStackInjector::isModeSafe) + .collect(Collectors.toList()); + if (safeValues.isEmpty()) { + // No mode passed the filter (e.g. J9 with a {"vm","vmx"} @ValueSource). + // Fall back to the first declared value so the test can reach + // isPlatformSupported() and skip cleanly rather than failing with + // PreconditionViolationException (JUnit 5 requires ≥1 invocation context). + safeValues = Collections.singletonList(valueSource.strings()[0]); + } + return safeValues.stream().map(param -> new ParameterizedTestContext(param, retryCount)); } } diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NativememProfilerTest.java b/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NativememProfilerTest.java new file mode 100644 index 000000000..2bd4d54b2 --- /dev/null +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NativememProfilerTest.java @@ -0,0 +1,150 @@ +/* + * Copyright 2026, Datadog, Inc. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +package com.datadoghq.profiler.nativemem; + +import com.datadoghq.profiler.CStackAwareAbstractProfilerTest; +import com.datadoghq.profiler.Platform; +import com.datadoghq.profiler.junit.CStack; +import com.datadoghq.profiler.junit.RetryTest; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.params.provider.ValueSource; +import org.openjdk.jmc.common.item.IAttribute; +import org.openjdk.jmc.common.item.IItem; +import org.openjdk.jmc.common.item.IItemCollection; +import org.openjdk.jmc.common.item.IItemIterable; +import org.openjdk.jmc.common.item.IMemberAccessor; +import org.openjdk.jmc.common.unit.IQuantity; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.openjdk.jmc.common.item.Attribute.attr; +import static org.openjdk.jmc.common.unit.UnitLookup.ADDRESS; + +/** + * Smoke tests for native memory (malloc/free) profiling. + * + *

Runs with {@code cstack=vm} and {@code cstack=vmx}. In {@code vmx} mode the + * stack trace is expected to contain {@code allocateDirect} because the mixed-mode + * walker captures the Java call chain that triggered the native allocation. + */ +public class NativememProfilerTest extends CStackAwareAbstractProfilerTest { + + private static final IAttribute MALLOC_ADDRESS = attr("address", "address", "", ADDRESS); + + public NativememProfilerTest(@CStack String cstack) { + super(cstack); + } + + @Override + protected String getProfilerCommand() { + return "nativemem=0"; // sample every allocation + } + + @Override + protected boolean isPlatformSupported() { + return Platform.isLinux() && !Platform.isJ9() && !Platform.isZing(); + } + + @RetryTest(3) + @TestTemplate + @ValueSource(strings = {"vm", "vmx"}) + public void shouldRecordMallocSamples(@CStack String cstack) throws InterruptedException { + Assumptions.assumeFalse(isAsan() || isTsan()); + + List buffers = new ArrayList<>(); + for (int i = 0; i < 1000; i++) { + buffers.add(ByteBuffer.allocateDirect(1024)); + } + + stopProfiler(); + + IItemCollection events = verifyEvents("profiler.Malloc"); + boolean foundMinSize = false; + for (IItemIterable items : events) { + IMemberAccessor sizeAccessor = SIZE.getAccessor(items.getType()); + IMemberAccessor addrAccessor = MALLOC_ADDRESS.getAccessor(items.getType()); + if (sizeAccessor == null) { + continue; + } + for (IItem item : items) { + IQuantity size = sizeAccessor.getMember(item); + assertTrue(size == null || size.longValue() > 0, "allocation size must be positive"); + if (size != null && size.longValue() >= 1024) { + foundMinSize = true; + } + if (addrAccessor != null) { + IQuantity addr = addrAccessor.getMember(item); + assertTrue(addr == null || addr.longValue() != 0, "malloc address must not be zero"); + } + } + } + assertTrue(foundMinSize, "expected at least one malloc event with size >= 1024 bytes"); + + // Both vm and vmx capture the Java call chain; 
allocateDirect must appear in stack traces + verifyStackTraces("profiler.Malloc", "allocateDirect"); + + buffers.clear(); // buffers were kept alive until after stop; release them now + } + + @RetryTest(3) + @TestTemplate + @ValueSource(strings = {"vm", "vmx"}) + public void shouldRecordFreeEvents(@CStack String cstack) throws InterruptedException { + Assumptions.assumeFalse(isAsan() || isTsan()); + + // Allocate and immediately abandon so the GC Cleaner will call free() + for (int i = 0; i < 1000; i++) { + ByteBuffer.allocateDirect(1024); + } + for (int attempt = 0; attempt < 5; attempt++) { + System.gc(); + Thread.sleep(200); + } + + stopProfiler(); + + IItemCollection mallocEvents = verifyEvents("profiler.Malloc"); + IItemCollection freeEvents = verifyEvents("profiler.Free"); + + // Collect all recorded malloc addresses + Set<Long> mallocAddresses = new HashSet<>(); + for (IItemIterable items : mallocEvents) { + IMemberAccessor<IQuantity, IItem> addrAccessor = MALLOC_ADDRESS.getAccessor(items.getType()); + if (addrAccessor == null) { + continue; + } + for (IItem item : items) { + IQuantity addr = addrAccessor.getMember(item); + if (addr != null) { + mallocAddresses.add(addr.longValue()); + } + } + } + + // At least one free event address must match a previously recorded malloc address + boolean foundCorrelation = false; + outer: + for (IItemIterable items : freeEvents) { + IMemberAccessor<IQuantity, IItem> addrAccessor = MALLOC_ADDRESS.getAccessor(items.getType()); + if (addrAccessor == null) { + continue; + } + for (IItem item : items) { + IQuantity addr = addrAccessor.getMember(item); + if (addr != null && mallocAddresses.contains(addr.longValue())) { + foundCorrelation = true; + break outer; + } + } + } + assertTrue(foundCorrelation, "expected at least one free event whose address matches a recorded malloc"); + } +} diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NofreeNativememProfilerTest.java b/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NofreeNativememProfilerTest.java new file mode 
100644 index 000000000..4c9ccab56 --- /dev/null +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/nativemem/NofreeNativememProfilerTest.java @@ -0,0 +1,61 @@ +/* + * Copyright 2026, Datadog, Inc. + * SPDX-License-Identifier: Apache-2.0 + */ +package com.datadoghq.profiler.nativemem; + +import com.datadoghq.profiler.CStackAwareAbstractProfilerTest; +import com.datadoghq.profiler.Platform; +import com.datadoghq.profiler.junit.CStack; +import com.datadoghq.profiler.junit.RetryTest; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.params.provider.ValueSource; +import org.openjdk.jmc.common.item.IItemCollection; + +import java.nio.ByteBuffer; + +import static org.junit.jupiter.api.Assertions.assertFalse; + +/** + * Verifies that the {@code nofree} option suppresses {@code profiler.Free} events. + */ +public class NofreeNativememProfilerTest extends CStackAwareAbstractProfilerTest { + + public NofreeNativememProfilerTest(@CStack String cstack) { + super(cstack); + } + + @Override + protected String getProfilerCommand() { + return "nativemem=0,nofree"; + } + + @Override + protected boolean isPlatformSupported() { + return Platform.isLinux() && !Platform.isJ9() && !Platform.isZing(); + } + + @RetryTest(3) + @TestTemplate + @ValueSource(strings = {"vm", "vmx"}) + public void shouldNotRecordFreeEventsWithNofreeOption(@CStack String cstack) throws InterruptedException { + Assumptions.assumeFalse(isAsan() || isTsan()); + + // Trigger allocations and then release them via GC + for (int i = 0; i < 1000; i++) { + ByteBuffer.allocateDirect(1024); + } + for (int attempt = 0; attempt < 5; attempt++) { + System.gc(); + Thread.sleep(200); + } + + stopProfiler(); + + verifyEvents("profiler.Malloc"); + + IItemCollection freeEvents = verifyEvents("profiler.Free", false); + assertFalse(freeEvents.hasItems(), "profiler.Free events must not be recorded when nofree is set"); + } +} diff --git 
a/doc/architecture/NativeMemoryProfiling.md b/doc/architecture/NativeMemoryProfiling.md new file mode 100644 index 000000000..0aaa5bbaf --- /dev/null +++ b/doc/architecture/NativeMemoryProfiling.md @@ -0,0 +1,308 @@ +# Native Memory Allocation Profiling + +## Overview + +The native memory profiler tracks heap allocations and frees made through the C +standard library (`malloc`, `calloc`, `realloc`, `free`, `posix_memalign`, +`aligned_alloc`). It instruments these functions at the GOT (Global Offset Table) +level so that every intercepted call is accounted for without modifying application +source code or requiring a custom allocator. + +Sampled allocation events carry a full Java + native stack trace and are emitted as +`profiler.Malloc` JFR events. Free events are emitted as `profiler.Free` JFR events +without a stack trace (capturing stack traces at every free would be prohibitively +expensive). + +The feature is activated by passing `nativemem=<interval>` to the profiler, where +`<interval>` is the byte-sampling interval (e.g. `nativemem=524288` samples roughly +one event per 512 KiB allocated). Passing `nativemem=0` records every allocation. + +--- + +## Component Map + +``` + Application code + │ malloc() / calloc() / realloc() / free() / … + ▼ + ┌─────────────┐ GOT patch ┌──────────────────────────┐ + │ libc / musl│ ◄────────── │ malloc_hook / free_hook │ mallocTracer.cpp + └─────────────┘ │ calloc_hook / … │ + └────────────┬─────────────┘ + │ recordMalloc / recordFree + ▼ + ┌──────────────────────────┐ + │ MallocTracer:: │ mallocTracer.cpp/h + │ updateCounter() │ + │ recordSample() ──────► │ profiler.cpp + │ recordEventOnly() ────► │ + └────────────┬─────────────┘ + │ walkVM (CSTACK_VM) + ▼ + ┌──────────────────────────┐ + │ JFR buffer │ flightRecorder.cpp + │ profiler.Malloc │ + │ profiler.Free │ + └──────────────────────────┘ +``` + +--- + +## GOT Patching + +The profiler redirects allocator calls by writing hook function addresses directly +into the importing library's GOT. 
This is cheaper than `LD_PRELOAD` (no process +restart) and works for libraries loaded at any time. + +### Import IDs + +`codeCache.h` defines an `ImportId` enum with one entry per hooked symbol: + +``` +im_malloc, im_calloc, im_realloc, im_free, im_posix_memalign, im_aligned_alloc +``` + +`CodeCache::patchImport(ImportId, void*)` walks the library's PLT/GOT and overwrites +the matching entry. + +### Hook signatures + +Each hook calls the saved original function first, then records the event: + +| Hook | Calls | Records | +|------|-------|---------| +| `malloc_hook(size)` | `_orig_malloc(size)` | `recordMalloc(ret, size)` if `ret != NULL && size != 0` | +| `calloc_hook(num, size)` | `_orig_calloc(num, size)` | `recordMalloc(ret, num*size)` if `ret != NULL` | +| `realloc_hook(addr, size)` | `_orig_realloc(addr, size)` | `recordFree(addr)` + `recordMalloc(ret, size)` | +| `free_hook(addr)` | `_orig_free(addr)` | `recordFree(addr)` if `addr != NULL` | +| `posix_memalign_hook(…)` | `_orig_posix_memalign(…)` | `recordMalloc(*memptr, size)` if `ret == 0` | +| `aligned_alloc_hook(align, size)` | `_orig_aligned_alloc(align, size)` | `recordMalloc(ret, size)` if `ret != NULL` | + +Free recording is suppressed for all hooks when `nofree` is set +(`MallocTracer::nofree()` returns `true`). + +--- + +## Initialization Sequence + +`MallocTracer::start()` (called once per profiler session) runs: + +1. **`resolveMallocSymbols()`** — calls each intercepted function at least once so + the profiler library's own PLT stubs are resolved by the dynamic linker. This + ensures that subsequent `SAVE_IMPORT` reads get the real libc function pointers + rather than the PLT resolver. + +2. **`SAVE_IMPORT(func)`** — reads the resolved GOT entry for each symbol from the + profiler library's own import table and stores it in the corresponding + `_orig_` static pointer. + +3. 
**`detectNestedMalloc()`** — probes whether the platform's `calloc` implementation + calls `malloc` internally (as musl does). If so, `calloc_hook` and + `posix_memalign_hook` are replaced with dummy variants (`calloc_hook_dummy`, + `posix_memalign_hook_dummy`) that forward to the originals without recording, to + prevent double-accounting. The dummy hooks preserve the caller frame pointer so + that the actual call site is not obscured. + +4. **`patchLibraries()`** — iterates over all currently loaded native libraries and + writes the hook addresses into each library's GOT, under `_patch_lock`. + `_patched_libs` is a monotonic counter so that already-patched libraries are + skipped on subsequent calls. + +`_initialized` is set to `true` after the first successful `initialize()` call. +`patchLibraries()` is called again on every subsequent `start()` to pick up any +libraries loaded between profiler sessions. + +--- + +## Dynamic Library Handling + +When the application calls `dlopen`, the profiler's `dlopen_hook` (installed as a +GOT hook for `dlopen`) calls `MallocTracer::installHooks()` after the library is +loaded: + +```cpp +// profiler.cpp +void* Profiler::dlopen_hook(const char* filename, int flags) { + void* result = dlopen(filename, flags); + if (result != NULL) { + Libraries::instance()->updateSymbols(false); + MallocTracer::installHooks(); + } + return result; +} +``` + +`installHooks()` calls `patchLibraries()` only if `_running` is `true`, so newly +loaded libraries are automatically hooked without requiring a profiler restart. 
+ +--- + +## Sampling + +Allocation recording is gated by a byte-level counter using +`Engine::updateCounter()`: + +```cpp +// engine.h — lock-free CAS loop +static bool updateCounter(volatile u64& counter, u64 value, u64 interval) { + if (interval <= 1) return true; // nativemem=0: record every allocation + while (true) { + u64 prev = counter, next = prev + value; + if (next < interval) { + if (__sync_bool_compare_and_swap(&counter, prev, next)) return false; + } else { + if (__sync_bool_compare_and_swap(&counter, prev, next % interval)) return true; + } + } +} +``` + +`_allocated_bytes` is the shared volatile counter; `_interval` is set from +`args._nativemem`. A sample is recorded only when `updateCounter` returns `true` +(i.e. the counter crosses an `_interval` boundary). Multiple threads compete via CAS +so no mutex is needed for the counter itself. + +Free events bypass the counter — every free is always recorded (unless `nofree` is +set), because omitting frees would make leak detection impossible. + +--- + +## Stack Trace Capture + +### Why `CSTACK_VM` is needed + +The malloc hooks execute on the calling thread with no signal context (`ucontext == +NULL`). Native stack unwinding via frame pointers or DWARF requires a signal context +as the starting point, so neither `CSTACK_DEFAULT` nor `CSTACK_FP` can produce +useful traces for malloc events. + +`CSTACK_VM` uses HotSpot's `JavaFrameAnchor` (lastJavaPC / lastJavaSP / lastJavaFP) +to walk Java frames, and falls back to `__builtin_return_address` for the native +portion. This works correctly from inside a malloc hook because the anchor is set +whenever the JVM has transitioned from Java to native. + +### Default stack mode + +`CSTACK_VM` is the global default (`arguments.h`). On a HotSpot JVM with VMStructs +available this gives the best stack traces for all profiling modes. 
If VMStructs are +not available, `profiler.cpp` downgrades to `CSTACK_DEFAULT` at startup: + +```cpp +} else if (_cstack == CSTACK_VM) { + if (!VMStructs::hasStackStructs()) { + _cstack = CSTACK_DEFAULT; + Log::warn("VMStructs stack walking is not supported …"); + } +} +``` + +### Code path in `recordSample` + +```cpp +// profiler.cpp line 839 +if (event_type == BCI_NATIVE_MALLOC && _cstack >= CSTACK_VM) { + // walkVM starts from callerPC()/callerFP() and walks native frames + // until it reaches the thread's JavaFrameAnchor for Java frames. + int vm_frames = StackWalker::walkVM(ucontext /*NULL*/, frames + num_frames, + max_remaining, _features, + eventTypeFromBCI(event_type), + lock_index, &truncated); + num_frames += vm_frames; +} +``` + +`getNativeTrace` returns 0 immediately for `_cstack >= CSTACK_VM` (line 316), so +native frames from `walkFP`/`walkDwarf` are not collected — `walkVM` is the sole +source of both native and Java frames for malloc events. + +--- + +## JFR Event Format + +Two new event types are defined in `jfrMetadata.cpp` under the +`Java Virtual Machine / Native Memory` category: + +### `profiler.Malloc` (`T_MALLOC`) + +| Field | Type | Description | +|-------|------|-------------| +| `startTime` | `long` (ticks) | TSC timestamp of the allocation | +| `eventThread` | thread ref | Thread that performed the allocation | +| `stackTrace` | stack trace ref | Call stack at the allocation site | +| `address` | `long` (address) | Returned pointer value | +| `size` | `long` (bytes) | Requested allocation size | + +### `profiler.Free` (`T_FREE`) + +| Field | Type | Description | +|-------|------|-------------| +| `startTime` | `long` (ticks) | TSC timestamp of the free | +| `eventThread` | thread ref | Thread that performed the free | +| `stackTrace` | stack trace ref | Always null (0) — see Limitations | +| `address` | `long` (address) | Pointer being freed | + +Both event types are written by `Recording::recordMallocSample()` in +`flightRecorder.cpp`, 
which selects the type by inspecting `event->_size`: + +```cpp +buf->putVar64(event->_size != 0 ? T_MALLOC : T_FREE); +buf->putVar64(event->_start_time); +buf->putVar32(tid); +buf->putVar64(call_trace_id); // 0 for free events +buf->putVar64(event->_address); +if (event->_size != 0) { + buf->putVar64(event->_size); // omitted for T_FREE +} +``` + +Malloc events flow through `Profiler::recordSample()`, which fills `call_trace_id` +from the call trace storage. Free events flow through `Profiler::recordEventOnly()`, +which passes `call_trace_id = 0` (JFR null-reference convention). + +--- + +## Concurrency and Thread Safety + +| Concern | Mechanism | +|---------|-----------| +| GOT patching across threads | `_patch_lock` (Mutex) in `patchLibraries()` | +| Library unload during patching | `UnloadProtection` handle per library | +| Allocation byte counter | Lock-free CAS loop in `updateCounter` | +| JFR buffer writes | Per-lock-index try-lock with 3 attempts; events dropped on contention | +| Hook enable / disable | `volatile bool _running` — checked before every recording call | +| `_initialized` write ordering | Serialized by the profiler's outer state lock (caller responsibility) | + +--- + +## Known Limitations and Design Trade-offs + +**No reentrancy guard.** As documented in `mallocTracer.cpp`: + +> To avoid complexity in hooking and tracking reentrancy, a TLS-based approach is +> not used. Reentrant allocation calls would result in double-accounting. + +When `recordMalloc` calls into the profiler (stack walking, JFR buffer writes), any +allocations made by the profiler itself will re-enter the hooks. Because those +internal allocations call `_orig_malloc` directly (not the hook), there is no +infinite recursion, but they may be double-counted as application allocations. +Leak detection is unaffected: the same address being recorded multiple times is +handled correctly by the tracking logic. 
+ +**Hooks are never uninstalled.** `stop()` only sets `_running = false`. The GOT +entries remain patched for the lifetime of the process. After stopping, every +malloc/free incurs the overhead of one function-pointer indirection plus a volatile +bool read, which is negligible in practice. Uninstalling hooks safely would require +iterating all libraries again under `_patch_lock`, which is deferred. + +**`nativemem=0` records every allocation.** When `_interval == 0`, +`updateCounter` returns `true` on every call (the `interval <= 1` fast path). This +is intentional for 100% sampling but can produce very high event volumes. + +**No stack traces on free events.** `call_trace_id` is always 0 for `profiler.Free` +events. The `stackTrace` field is present in the JFR metadata but will always resolve +to a null reference. + +**HotSpot / Linux only for full stack traces.** `CSTACK_VM` requires +`VMStructs::hasStackStructs()`, which is only true on HotSpot JVMs on Linux. On other +platforms the profiler falls back to `CSTACK_DEFAULT` and malloc events will have +empty stack traces.