From d1e320d9dd5c4186c1ce0acec5bc293c37a09b5e Mon Sep 17 00:00:00 2001 From: Oliver Rietmann Date: Sat, 14 Mar 2026 21:43:47 +0100 Subject: [PATCH 1/5] Write benchmark to CSV file --- GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 81 +++++++ GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Definitions/Parameters/GPUParameters.csv | 226 +++++++++--------- GPU/GPUTracking/Standalone/cmake/config.cmake | 6 +- dependencies/FindO2GPU.cmake | 10 +- 5 files changed, 205 insertions(+), 119 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 409c28b8bf328..8491974bda331 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -35,6 +35,8 @@ #include #include +#include +#include #ifndef _WIN32 #include @@ -212,6 +214,38 @@ int32_t GPUReconstructionCPU::ExitDevice() return 0; } +namespace { + void write_header(std::ostream& stream) { + stream << "type,count,name,kernel (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; + } + + struct Row { + std::string type = ""; + std::string name = ""; + double kernel_time = -1.0; + double cpu_time = -1.0; + double total_time = -1.0; + size_t memSize = 0; + uint32_t count = 0; + + void write(std::ostream& stream, uint32_t statNEvents) { + double scale = 1000000.0 / statNEvents; + stream << type << ","; + if (count != 0) stream << count; + stream << "," << name << "," << uint32_t(kernel_time * scale) << ","; + if (cpu_time != -1.0) stream << uint32_t(cpu_time * scale); + stream << ","; + if (cpu_time != -1.0 && total_time != -1.0) stream << uint32_t(cpu_time / total_time *100) / 100.0; + stream << ","; + if (total_time != -1.0) stream << uint32_t(total_time * scale); + stream << ","; + if (memSize != 0 && count != 0) stream << uint32_t(memSize / kernel_time * 1e-6) * 1e-3 << "," << memSize / statNEvents << "," << memSize / statNEvents / count; + else stream << ",,"; + stream << std::endl; + } + }; +} + int32_t GPUReconstructionCPU::RunChains() { mMemoryScalers->temporaryFactor = 1.; @@ -264,6 +298,16 @@ int32_t GPUReconstructionCPU::RunChains() double kernelTotal = 0; std::vector kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.); + std::ofstream benchmarkCSV; + if (!GetProcessingSettings().timingCSV.empty()) { + benchmarkCSV.open(GetProcessingSettings().timingCSV, std::ios::out | std::ios::app); + if (!benchmarkCSV.is_open()) { + GPUError("Could not open timing CSV file '%s' for writing", GetProcessingSettings().timingCSV.c_str()); + } else if (mNEventsProcessed == 1) { + write_header(benchmarkCSV); + } + } + if (GetProcessingSettings().debugLevel >= 1) { for (uint32_t i = 0; i < mTimers.size(); i++) { double time = 0; @@ -285,9 +329,16 @@ int32_t GPUReconstructionCPU::RunChains() kernelStepTimes[stepNum] += time; } char bandwidth[256] = ""; + Row task_row; + task_row.type = 'K'; + task_row.name = mTimers[i]->name.c_str(); + task_row.kernel_time = time; + task_row.count = mTimers[i]->count; if (mTimers[i]->memSize && mStatNEvents && time != 0.) { + task_row.memSize = mTimers[i]->memSize; snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count); } + if (benchmarkCSV.is_open()) task_row.write(benchmarkCSV, mStatNEvents); printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth); if (GetProcessingSettings().resetTimers) { mTimers[i]->count = 0; @@ -298,14 +349,34 @@ int32_t GPUReconstructionCPU::RunChains() if (GetProcessingSettings().recoTaskTiming) { for (int32_t i = 0; i < gpudatatypes::N_RECO_STEPS; i++) { if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) { + Row reco_step_row; + reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)"; + reco_step_row.kernel_time = kernelStepTimes[i]; + reco_step_row.cpu_time = mTimersRecoSteps[i].timerCPU; + reco_step_row.total_time = mTimersRecoSteps[i].timerTotal.GetElapsedTime(); + if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents); printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks", gpudatatypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime()); } if (mTimersRecoSteps[i].bytesToGPU) { + Row reco_step_row; + reco_step_row.type = 'D'; + reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)"; + reco_step_row.kernel_time = mTimersRecoSteps[i].timerToGPU.GetElapsedTime(); + reco_step_row.memSize = mTimersRecoSteps[i].bytesToGPU; + reco_step_row.count = mTimersRecoSteps[i].countToGPU; + if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents); printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToGPU / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].countToGPU); } if (mTimersRecoSteps[i].bytesToHost) { + Row reco_step_row; + reco_step_row.type = 'D'; + reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)"; + reco_step_row.kernel_time = mTimersRecoSteps[i].timerToHost.GetElapsedTime(); + reco_step_row.memSize = mTimersRecoSteps[i].bytesToHost; + reco_step_row.count = mTimersRecoSteps[i].countToHost; + if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents); printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost); } @@ -321,13 +392,23 @@ int32_t GPUReconstructionCPU::RunChains() } for (int32_t i = 0; i < gpudatatypes::N_GENERAL_STEPS; i++) { if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) { + Row general_step_row; + general_step_row.name = gpudatatypes::GENERAL_STEP_NAMES[i]; + general_step_row.kernel_time = mTimersGeneralSteps[i].GetElapsedTime(); + if (benchmarkCSV.is_open()) general_step_row.write(benchmarkCSV, mStatNEvents); printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents); } } + Row wall_row; + wall_row.name = "Wall"; if (GetProcessingSettings().debugLevel >= 1) { + wall_row.kernel_time = kernelTotal; mStatKernelTime = kernelTotal * 1000000 / mStatNEvents; printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str()); } + wall_row.cpu_time = mStatCPUTime; + wall_row.total_time = mStatWallTime * mStatNEvents / 1000000; + if (benchmarkCSV.is_open()) wall_row.write(benchmarkCSV, mStatNEvents); printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str()); } else if (GetProcessingSettings().debugLevel >= 0) { GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str()); diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 57cb1371a4aa0..06c0d8f344af1 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -329,6 +329,7 @@ AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)") AddOption(debugMask, uint32_t, (1 << 18) - 1, "debugMask", 0, "Mask for debug output dumps to file") AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6") +AddOption(timingCSV, std::string, "", "", 0, "CSV filename to append the benchmark results. Verbosity determined by parameter --debug.") AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures") AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks") AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1)) diff --git a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv index fc27de72ea2f1..f240402acc19c 100644 --- a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv +++ b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv @@ -1,113 +1,113 @@ -Architecture,default,default_cpu,MI100,VEGA,TAHITI,TESLA,FERMI,PASCAL,KEPLER,AMPERE,TURING -,,,,,,,,,,, -CORE:,,,,,,,,,,, -WARP_SIZE,32,,64,64,32,32,32,32,32,32,32 -THREAD_COUNT_DEFAULT,256,,256,256,,,,,,512,512 -,,,,,,,,,,, -LB:,,,,,,,,,,, -GPUTPCCreateTrackingData,256,,"[256, 7]","[192, 2]",,,,,,384,256 -GPUTPCTrackletConstructor,256,,"[768, 8]","[512, 10]","[256, 2]","[256, 1]","[256, 2]","[1024, 2]","[512, 4]","[256, 2]","[256, 2]" -GPUTPCTrackletSelector,256,,"[384, 5]","[192, 10]","[256, 3]","[256, 1]","[256, 3]","[512, 4]","[256, 3]","[192, 3]","[192, 3]" -GPUTPCNeighboursFinder,256,,"[192, 8]","[960, 8]",256,256,256,512,256,"[640, 1]","[640, 1]" -GPUTPCNeighboursCleaner,256,,"[128, 5]","[384, 9]",256,256,256,256,256,512,512 -GPUTPCExtrapolationTracking,256,,"[256, 7]","[256, 2]",,,,,,"[128, 4]","[192, 2]" -GPUTRDTrackerKernels_gpuVersion,512,,,,,,,,,, -GPUTPCCreateOccupancyMap_fill,256,,,,,,,,,, -GPUTPCCreateOccupancyMap_fold,256,,,,,,,,,, -GPUTRDTrackerKernels_o2Version,512,,,,,,,,,, -GPUTPCCompressionKernels_step0attached,256,,"[128, 1]","[64, 2]",,,,,,"[64, 2]",128 -GPUTPCCompressionKernels_step1unattached,256,,"[512, 2]","[512, 2]",,,,,,"[512, 3]","[512, 2]" -GPUTPCDecompressionKernels_step0attached,256,,"[128, 2]","[128, 2]",,,,,,"[32, 1]","[32, 1]" -GPUTPCDecompressionKernels_step1unattached,256,,"[64, 2]","[64, 2]",,,,,,"[32, 1]","[32, 1]" -GPUTPCDecompressionUtilKernels_sortPerSectorRow,256,,,,,,,,,, -GPUTPCDecompressionUtilKernels_countFilteredClusters,256,,,,,,,,,, -GPUTPCDecompressionUtilKernels_storeFilteredClusters,256,,,,,,,,,, -GPUTPCCFDecodeZS,"[128, 4]",,"[64, 4]","[64, 1]",,,,,,"[64, 10]","[64, 8]" -GPUTPCCFDecodeZSLink,"""GPUCA_WARP_SIZE""",,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" -GPUTPCCFDecodeZSDenseLink,"""GPUCA_WARP_SIZE""",,"[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 14]",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" -GPUTPCCFGather,"[1024, 1]",,"[1024, 5]","[1024, 1]",,,,,,"[1024, 1]","[1024, 1]" -COMPRESSION_GATHER,1024,,1024,1024,,,,,,1024,1024 -GPUTPCGMMergerTrackFit,256,,"[192, 2]","[64, 7]",,,,,,"[64, 4]","[32, 8]" -GPUTPCGMMergerFollowLoopers,256,,"[256, 5]","[256, 4]",,,,,,"[64, 12]","[128, 4]" -GPUTPCGMMergerSectorRefit,256,,"[64, 4]","[256, 2]",,,,,,"[32, 6]","[64, 5]" -GPUTPCGMMergerUnpackResetIds,256,,256,256,,,,,,256,256 -GPUTPCGMMergerUnpackGlobal,256,,256,256,,,,,,256,256 -GPUTPCGMMergerResolve_step0,256,,512,256,,,,,,256,256 -GPUTPCGMMergerResolve_step1,256,,512,256,,,,,,256,256 -GPUTPCGMMergerResolve_step2,256,,512,256,,,,,,256,256 -GPUTPCGMMergerResolve_step3,256,,512,256,,,,,,256,256 -GPUTPCGMMergerResolve_step4,256,,512,256,,,,,,"[256, 4]","[256, 4]" -GPUTPCGMMergerClearLinks,256,,256,256,,,,,,256,256 -GPUTPCGMMergerMergeWithinPrepare,256,,256,256,,,,,,256,256 -GPUTPCGMMergerMergeSectorsPrepare,256,,256,256,,,,,,"[256, 2]","[256, 2]" -GPUTPCGMMergerMergeBorders_step0,256,,512,256,,,,,,192,192 -GPUTPCGMMergerMergeBorders_step2,256,,512,256,,,,,,"[64, 2]",256 -GPUTPCGMMergerMergeCE,256,,512,256,,,,,,256,256 -GPUTPCGMMergerLinkExtrapolatedTracks,256,,256,256,,,,,,256,256 -GPUTPCGMMergerCollect,256,,"[768, 1]","[1024, 1]",,,,,,"[256, 2]","[128, 2]" -GPUTPCGMMergerSortTracksPrepare,256,,256,256,,,,,,256,256 -GPUTPCGMMergerPrepareForFit_step0,256,,256,256,,,,,,256,256 -GPUTPCGMMergerPrepareForFit_step1,256,,256,256,,,,,,256,256 -GPUTPCGMMergerPrepareForFit_step2,256,,256,256,,,,,,256,256 -GPUTPCGMMergerFinalize_step0,256,,,256,,,,,,, -GPUTPCGMMergerFinalize_step1,256,,,256,,,,,,, -GPUTPCGMMergerFinalize_step2,256,,,256,,,,,,, -GPUTPCGMMergerMergeLoopers_step0,256,,,,,,,,,, -GPUTPCGMMergerMergeLoopers_step1,256,,,,,,,,,, -GPUTPCGMMergerMergeLoopers_step2,256,,,,,,,,,, -GPUTPCGMO2Output_prepare,256,,,,,,,,,, -GPUTPCGMO2Output_output,256,,,,,,,,,, -GPUTPCStartHitsFinder,256,,"[1024, 2]","[1024, 7]",256,256,256,256,256,512,512 -GPUTPCStartHitsSorter,256,,"[1024, 5]","[512, 7]",256,256,256,256,256,"[512, 1]","[512, 1]" -GPUTPCCFCheckPadBaseline,576,,"[576, 2]","[576, 2]",,,,,,"[576, 2]", -GPUTPCCFChargeMapFiller_fillIndexMap,512,,512,512,,,,,,448, -GPUTPCCFChargeMapFiller_fillFromDigits,512,,512,512,,,,,,448, -GPUTPCCFChargeMapFiller_findFragmentStart,512,,512,512,,,,,,448, -GPUTPCCFPeakFinder,512,,"[512, 9]","[512, 4]",,,,,,128, -GPUTPCCFNoiseSuppression,512,,512,512,,,,,,448, -GPUTPCCFDeconvolution,512,,"[512, 5]","[512, 5]",,,,,,384, -GPUTPCCFClusterizer,512,,"[448, 3]","[512, 2]",,,,,,448, -GPUTPCNNClusterizerKernels,512,,,,,,,,,, -GPUTrackingRefitKernel_mode0asGPU,256,,,,,,,,,, -GPUTrackingRefitKernel_mode1asTrackParCov,256,,,,,,,,,, -GPUMemClean16,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,, -GPUitoa,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,, -GPUTPCCFNoiseSuppression_noiseSuppression,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,, -GPUTPCCFNoiseSuppression_updatePeaks,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,, -GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCNNClusterizerKernels_fillInputNNCPU,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCNNClusterizerKernels_fillInputNNGPU,1024,,,,,,,,,, -GPUTPCNNClusterizerKernels_determineClass1Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCNNClusterizerKernels_determineClass2Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCNNClusterizerKernels_publishClass1Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCNNClusterizerKernels_publishClass2Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCNNClusterizerKernels_publishDeconvolutionFlags,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCCFStreamCompaction_scanStart,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, -GPUTPCCFStreamCompaction_scanUp,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, -GPUTPCCFStreamCompaction_scanTop,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, -GPUTPCCFStreamCompaction_scanDown,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, -GPUTPCCFStreamCompaction_compactDigits,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, -GPUTPCCompressionGatherKernels_unbuffered,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, -GPUTPCCompressionGatherKernels_buffered32,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, -GPUTPCCompressionGatherKernels_buffered64,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, -GPUTPCCompressionGatherKernels_buffered128,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, -GPUTPCCompressionGatherKernels_multiBlock,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, -GPUTPCGMMergerFinalize_0,256,,256,,,,,,,256,256 -GPUTPCGMMergerFinalize_1,256,,256,,,,,,,256,256 -GPUTPCGMMergerFinalize_2,256,,256,,,,,,,256,256 -,,,,,,,,,,, -PAR:,,,,,,,,,,, -AMD_EUS_PER_CU,0,0,4,4,,,,,,, -SORT_STARTHITS,1,0,,,,,,,,, -NEIGHBOURS_FINDER_MAX_NNEIGHUP,6,0,10,4,,,,,,4,4 -NEIGHBOURS_FINDER_UNROLL_GLOBAL,4,0,4,2,,,,,,, -NEIGHBOURS_FINDER_UNROLL_SHARED,1,0,0,0,,,,,,, -TRACKLET_SELECTOR_HITS_REG_SIZE,12,0,9,27,,,,,,20,20 -ALTERNATE_BORDER_SORT,0,0,1,1,,,,,,1,1 -SORT_BEFORE_FIT,0,0,1,1,,,,,,1,1 -NO_ATOMIC_PRECHECK,0,0,1,1,,,,,,1,1 -DEDX_STORAGE_TYPE,"""float""","""float""","""uint16_t""","""uint16_t""",,,,,,"""uint16_t""","""uint16_t""" -MERGER_INTERPOLATION_ERROR_TYPE,"""float""","""float""","""half""","""half""",,,,,,"""half""","""half""" -COMP_GATHER_KERNEL,0,0,4,4,,,,,,4,4 -COMP_GATHER_MODE,2,0,3,3,,,,,,3,3 -CF_SCAN_WORKGROUP_SIZE,512,0,,,,,,,,, +Architecture,default,default_cpu,MI100,MI210,RDNA3,VEGA,TAHITI,TESLA,FERMI,PASCAL,KEPLER,AMPERE,TURING,HOPPER +,,,,,,,,,,,,,, +CORE:,,,,,,,,,,,,,, +WARP_SIZE,32,,64,64,32,64,32,32,32,32,32,32,32,32 +THREAD_COUNT_DEFAULT,256,,256,256,256,256,,,,,,512,512,512 +,,,,,,,,,,,,,, +LB:,,,,,,,,,,,,,, +GPUTPCCreateTrackingData,256,,"[256, 7]","[256, 7]","[256, 7]","[192, 2]",,,,,,384,256,256 +GPUTPCTrackletConstructor,256,,"[768, 8]","[768, 8]","[768, 8]","[512, 10]","[256, 2]","[256, 1]","[256, 2]","[1024, 2]","[512, 4]","[256, 2]","[256, 2]","[256, 2]" +GPUTPCTrackletSelector,256,,"[384, 5]","[384, 5]","[384, 5]","[192, 10]","[256, 3]","[256, 1]","[256, 3]","[512, 4]","[256, 3]","[192, 3]","[192, 3]","[192, 3]" +GPUTPCNeighboursFinder,256,,"[192, 8]","[192, 8]","[192, 8]","[960, 8]",256,256,256,512,256,"[640, 1]","[640, 1]","[640, 1]" +GPUTPCNeighboursCleaner,256,,"[128, 5]","[128, 5]","[128, 5]","[384, 9]",256,256,256,256,256,512,512,512 +GPUTPCExtrapolationTracking,256,,"[256, 7]","[256, 7]","[256, 7]","[256, 2]",,,,,,"[128, 4]","[192, 2]","[192, 2]" +GPUTRDTrackerKernels_gpuVersion,512,,,,,,,,,,,,, +GPUTPCCreateOccupancyMap_fill,256,,,,,,,,,,,,, +GPUTPCCreateOccupancyMap_fold,256,,,,,,,,,,,,, +GPUTRDTrackerKernels_o2Version,512,,,,,,,,,,,,, +GPUTPCCompressionKernels_step0attached,256,,"[128, 1]","[128, 1]","[128, 1]","[64, 2]",,,,,,"[64, 2]",128,128 +GPUTPCCompressionKernels_step1unattached,256,,"[512, 2]","[512, 2]","[512, 2]","[512, 2]",,,,,,"[512, 3]","[512, 2]","[512, 2]" +GPUTPCDecompressionKernels_step0attached,256,,"[128, 2]","[128, 2]","[128, 2]","[128, 2]",,,,,,"[32, 1]","[32, 1]","[32, 1]" +GPUTPCDecompressionKernels_step1unattached,256,,"[64, 2]","[64, 2]","[64, 2]","[64, 2]",,,,,,"[32, 1]","[32, 1]","[32, 1]" +GPUTPCDecompressionUtilKernels_sortPerSectorRow,256,,,,,,,,,,,,, +GPUTPCDecompressionUtilKernels_countFilteredClusters,256,,,,,,,,,,,,, +GPUTPCDecompressionUtilKernels_storeFilteredClusters,256,,,,,,,,,,,,, +GPUTPCCFDecodeZS,"[128, 4]",,"[64, 4]","[64, 4]","[64, 4]","[64, 1]",,,,,,"[64, 10]","[64, 8]","[64, 8]" +GPUTPCCFDecodeZSLink,"""GPUCA_WARP_SIZE""",,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" +GPUTPCCFDecodeZSDenseLink,"""GPUCA_WARP_SIZE""",,"[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 14]",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" +GPUTPCCFGather,"[1024, 1]",,"[1024, 5]","[1024, 5]","[1024, 5]","[1024, 1]",,,,,,"[1024, 1]","[1024, 1]","[1024, 1]" +COMPRESSION_GATHER,1024,,1024,1024,1024,1024,,,,,,1024,1024,1024 +GPUTPCGMMergerTrackFit,256,,"[192, 2]","[192, 2]","[192, 2]","[64, 7]",,,,,,"[64, 4]","[32, 8]","[32, 8]" +GPUTPCGMMergerFollowLoopers,256,,"[256, 5]","[256, 5]","[256, 5]","[256, 4]",,,,,,"[64, 12]","[128, 4]","[128, 4]" +GPUTPCGMMergerSectorRefit,256,,"[64, 4]","[64, 4]","[64, 4]","[256, 2]",,,,,,"[32, 6]","[64, 5]","[64, 5]" +GPUTPCGMMergerUnpackResetIds,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerUnpackGlobal,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerResolve_step0,256,,512,512,512,256,,,,,,256,256,256 +GPUTPCGMMergerResolve_step1,256,,512,512,512,256,,,,,,256,256,256 +GPUTPCGMMergerResolve_step2,256,,512,512,512,256,,,,,,256,256,256 +GPUTPCGMMergerResolve_step3,256,,512,512,512,256,,,,,,256,256,256 +GPUTPCGMMergerResolve_step4,256,,512,512,512,256,,,,,,"[256, 4]","[256, 4]","[256, 4]" +GPUTPCGMMergerClearLinks,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerMergeWithinPrepare,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerMergeSectorsPrepare,256,,256,256,256,256,,,,,,"[256, 2]","[256, 2]","[256, 2]" +GPUTPCGMMergerMergeBorders_step0,256,,512,512,512,256,,,,,,192,192,192 +GPUTPCGMMergerMergeBorders_step2,256,,512,512,512,256,,,,,,"[64, 2]",256,256 +GPUTPCGMMergerMergeCE,256,,512,512,512,256,,,,,,256,256,256 +GPUTPCGMMergerLinkExtrapolatedTracks,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerCollect,256,,"[768, 1]","[768, 1]","[768, 1]","[1024, 1]",,,,,,"[256, 2]","[128, 2]","[128, 2]" +GPUTPCGMMergerSortTracksPrepare,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerPrepareForFit_step0,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerPrepareForFit_step1,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerPrepareForFit_step2,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerFinalize_step0,256,,,,,256,,,,,,,, +GPUTPCGMMergerFinalize_step1,256,,,,,256,,,,,,,, +GPUTPCGMMergerFinalize_step2,256,,,,,256,,,,,,,, +GPUTPCGMMergerMergeLoopers_step0,256,,,,,,,,,,,,, +GPUTPCGMMergerMergeLoopers_step1,256,,,,,,,,,,,,, +GPUTPCGMMergerMergeLoopers_step2,256,,,,,,,,,,,,, +GPUTPCGMO2Output_prepare,256,,,,,,,,,,,,, +GPUTPCGMO2Output_output,256,,,,,,,,,,,,, +GPUTPCStartHitsFinder,256,,"[1024, 2]","[1024, 2]","[1024, 2]","[1024, 7]",256,256,256,256,256,512,512,512 +GPUTPCStartHitsSorter,256,,"[1024, 5]","[1024, 5]","[1024, 5]","[512, 7]",256,256,256,256,256,"[512, 1]","[512, 1]","[512, 1]" +GPUTPCCFCheckPadBaseline,576,,"[576, 2]","[576, 2]","[576, 2]","[576, 2]",,,,,,"[576, 2]",, +GPUTPCCFChargeMapFiller_fillIndexMap,512,,512,512,512,512,,,,,,448,, +GPUTPCCFChargeMapFiller_fillFromDigits,512,,512,512,512,512,,,,,,448,, +GPUTPCCFChargeMapFiller_findFragmentStart,512,,512,512,512,512,,,,,,448,, +GPUTPCCFPeakFinder,512,,"[512, 9]","[512, 9]","[512, 9]","[512, 4]",,,,,,128,, +GPUTPCCFNoiseSuppression,512,,512,512,512,512,,,,,,448,, +GPUTPCCFDeconvolution,512,,"[512, 5]","[512, 5]","[512, 5]","[512, 5]",,,,,,384,, +GPUTPCCFClusterizer,512,,"[448, 3]","[448, 3]","[448, 3]","[512, 2]",,,,,,448,, +GPUTPCNNClusterizerKernels,512,,,,,,,,,,,,, +GPUTrackingRefitKernel_mode0asGPU,256,,,,,,,,,,,,, +GPUTrackingRefitKernel_mode1asTrackParCov,256,,,,,,,,,,,,, +GPUMemClean16,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,, +GPUitoa,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,, +GPUTPCCFNoiseSuppression_noiseSuppression,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,, +GPUTPCCFNoiseSuppression_updatePeaks,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_fillInputNNCPU,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_fillInputNNGPU,1024,,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_determineClass1Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_determineClass2Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_publishClass1Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_publishClass2Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_publishDeconvolutionFlags,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCCFStreamCompaction_scanStart,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, +GPUTPCCFStreamCompaction_scanUp,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, +GPUTPCCFStreamCompaction_scanTop,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, +GPUTPCCFStreamCompaction_scanDown,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, +GPUTPCCFStreamCompaction_compactDigits,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, +GPUTPCCompressionGatherKernels_unbuffered,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, +GPUTPCCompressionGatherKernels_buffered32,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, +GPUTPCCompressionGatherKernels_buffered64,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, +GPUTPCCompressionGatherKernels_buffered128,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, +GPUTPCCompressionGatherKernels_multiBlock,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, +GPUTPCGMMergerFinalize_0,256,,256,256,256,,,,,,,256,256,256 +GPUTPCGMMergerFinalize_1,256,,256,256,256,,,,,,,256,256,256 +GPUTPCGMMergerFinalize_2,256,,256,256,256,,,,,,,256,256,256 +,,,,,,,,,,,,,, +PAR:,,,,,,,,,,,,,, +AMD_EUS_PER_CU,0,0,4,4,4,4,,,,,,,, +SORT_STARTHITS,1,0,,,,,,,,,,,, +NEIGHBOURS_FINDER_MAX_NNEIGHUP,6,0,10,10,10,4,,,,,,4,4,4 +NEIGHBOURS_FINDER_UNROLL_GLOBAL,4,0,4,4,4,2,,,,,,,, +NEIGHBOURS_FINDER_UNROLL_SHARED,1,0,0,0,0,0,,,,,,,, +TRACKLET_SELECTOR_HITS_REG_SIZE,12,0,9,9,9,27,,,,,,20,20,20 +ALTERNATE_BORDER_SORT,0,0,1,1,1,1,,,,,,1,1,1 +SORT_BEFORE_FIT,0,0,1,1,1,1,,,,,,1,1,1 +NO_ATOMIC_PRECHECK,0,0,1,1,1,1,,,,,,1,1,1 +DEDX_STORAGE_TYPE,"""float""","""float""","""uint16_t""","""uint16_t""","""uint16_t""","""uint16_t""",,,,,,"""uint16_t""","""uint16_t""","""uint16_t""" +MERGER_INTERPOLATION_ERROR_TYPE,"""float""","""float""","""half""","""half""","""half""","""half""",,,,,,"""half""","""half""","""half""" +COMP_GATHER_KERNEL,0,0,4,4,4,4,,,,,,4,4,4 +COMP_GATHER_MODE,2,0,3,3,3,3,,,,,,3,3,3 +CF_SCAN_WORKGROUP_SIZE,512,0,,,,,,,,,,,, diff --git a/GPU/GPUTracking/Standalone/cmake/config.cmake b/GPU/GPUTracking/Standalone/cmake/config.cmake index 9355311db617c..abdfc52c460e4 100644 --- a/GPU/GPUTracking/Standalone/cmake/config.cmake +++ b/GPU/GPUTracking/Standalone/cmake/config.cmake @@ -19,7 +19,7 @@ set(GPUCA_CONFIG_VC 1) set(GPUCA_CONFIG_FMT 1) set(GPUCA_CONFIG_ROOT 1) set(GPUCA_CONFIG_ONNX 0) -set(GPUCA_BUILD_EVENT_DISPLAY 1) +set(GPUCA_BUILD_EVENT_DISPLAY 0) set(GPUCA_BUILD_EVENT_DISPLAY_FREETYPE 1) set(GPUCA_BUILD_EVENT_DISPLAY_VULKAN 1) set(GPUCA_BUILD_EVENT_DISPLAY_WAYLAND 1) @@ -32,8 +32,8 @@ set(GPUCA_BUILD_DEBUG_HOSTONLY 0) set(GPUCA_DETERMINISTIC_MODE 0) # OFF / NO_FAST_MATH / OPTO2 / GPU / WHOLEO2 #set(GPUCA_CUDA_GCCBIN c++-14) #set(GPUCA_OPENCL_CLANGBIN clang-20) -set(HIP_AMDGPUTARGET "default") # "gfx906;gfx908;gfx90a" -set(CUDA_COMPUTETARGET "default") # 86 89 +set(HIP_AMDGPUTARGET "gfx1100") # "gfx906;gfx908;gfx90a" +#set(CUDA_COMPUTETARGET "default") # 86 89 #set(GPUCA_CUDA_COMPILE_MODE perkernel) # onefile / perkernel / rtc #set(GPUCA_HIP_COMPILE_MODE perkernel) #set(GPUCA_RTC_NO_COMPILED_KERNELS 1) diff --git a/dependencies/FindO2GPU.cmake b/dependencies/FindO2GPU.cmake index 3e8f012fea4b5..0aeae438b7187 100644 --- a/dependencies/FindO2GPU.cmake +++ b/dependencies/FindO2GPU.cmake @@ -52,7 +52,9 @@ function(detect_gpu_arch backend) # Detect GPU architecture, optionally filterri set(CUDA_FIRST_TARGET 86) message(STATUS "CUDA_COMPUTETARGET not set, defaulting CUDA optimization for architecture ${CUDA_FIRST_TARGET}") endif() - if(CUDA_FIRST_TARGET GREATER_EQUAL 86) + if(CUDA_FIRST_TARGET GREATER_EQUAL 89) + set(CUDA_TARGET HOPPER) + elseif(CUDA_FIRST_TARGET GREATER_EQUAL 86) set(CUDA_TARGET AMPERE) elseif(CUDA_FIRST_TARGET GREATER_EQUAL 75) set(CUDA_TARGET TURING) @@ -75,7 +77,9 @@ function(detect_gpu_arch backend) # Detect GPU architecture, optionally filterri endif() string(TOLOWER "${HIP_FIRST_TARGET}" HIP_FIRST_TARGET) string(REGEX MATCH "....$" HIP_FIRST_TARGET_PADDED "0000${HIP_FIRST_TARGET}") - if(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "1000") + if(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "1100") + set(HIP_TARGET RDNA3) + elseif(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "1000") set(HIP_TARGET RDNA) elseif(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "090a") set(HIP_TARGET MI210) @@ -400,4 +404,4 @@ endif() set(O2GPU_FOUND TRUE) if (NOT GPUCA_FINDO2GPU_CHECK_ONLY) include("${CMAKE_CURRENT_LIST_DIR}/../GPU/GPUTracking/cmake/kernel_helpers.cmake") -endif() +endif() \ No newline at end of file From 61a9e88d4a0269c0b1d78dddbdf2e714257bd069 Mon Sep 17 00:00:00 2001 From: Oliver Rietmann Date: Tue, 17 Mar 2026 15:20:55 +0100 Subject: [PATCH 2/5] Write markdown style to terminal --- GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 155 +++++++++++------- 1 file changed, 98 insertions(+), 57 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 8491974bda331..6db667ea0886f 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -35,8 +35,10 @@ #include #include -#include #include +#include +#include + #ifndef _WIN32 #include @@ -215,35 +217,62 @@ int32_t GPUReconstructionCPU::ExitDevice() } namespace { - void write_header(std::ostream& stream) { - stream << "type,count,name,kernel (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; +void writeHeaderMarkdown(std::ostream& stream) { + stream << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n"; + stream << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n"; +} + +void writeHeaderCSV(std::ostream& stream) { + stream << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; +} + +struct Row { + char type = ' '; + std::string name; + uint32_t count = 0; + double gpu_time = -1.0; + double cpu_time = -1.0; + double total_time = -1.0; + uint32_t memSize = 0; + uint32_t statNEvents; + + void writeMarkdown(std::ostream& stream) { + double scale = 1000000.0 / statNEvents; + stream << "| " << type << " | "; + if (count != 0) stream << std::format("{:6} |", count); + else stream << " |"; + stream << std::format(" {:42}|", name); + if (gpu_time != -1.0) stream << std::format("{:10.0f} |", gpu_time * scale); + else stream << " |"; + if (cpu_time != -1.0) stream << std::format("{:10.0f} |", cpu_time * scale); + else stream << " |"; + if (cpu_time != -1.0 && total_time != -1.0) stream << std::format("{:8.2f} |", cpu_time / total_time); + else stream << " |"; + if (total_time != -1.0) stream << std::format("{:10.0f} |", total_time * scale); + else stream << " |"; + if (memSize != 0 && count != 0) stream << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); + else stream << " | | |"; + stream << std::endl; } - struct Row { - std::string type = ""; - std::string name = ""; - double kernel_time = -1.0; - double cpu_time = -1.0; - double total_time = -1.0; - size_t memSize = 0; - uint32_t count = 0; - - void write(std::ostream& stream, uint32_t statNEvents) { - double scale = 1000000.0 / statNEvents; - stream << type << ","; - if (count != 0) stream << count; - stream << "," << name << "," << uint32_t(kernel_time * scale) << ","; - if (cpu_time != -1.0) stream << uint32_t(cpu_time * scale); - stream << ","; - if (cpu_time != -1.0 && total_time != -1.0) stream << uint32_t(cpu_time / total_time *100) / 100.0; - stream << ","; - if (total_time != -1.0) stream << uint32_t(total_time * scale); - stream << ","; - if (memSize != 0 && count != 0) stream << uint32_t(memSize / kernel_time * 1e-6) * 1e-3 << "," << memSize / statNEvents << "," << memSize / statNEvents / count; - else stream << ",,"; - stream << std::endl; - } - }; + void writeCSV(std::ostream& stream) { + double scale = 1000000.0 / statNEvents; + stream << type << ","; + if (count != 0) stream << count; + stream << "," << name << ","; + if (gpu_time != -1.0) stream << std::format("{:.0f}", gpu_time * scale); + stream << ","; + if (cpu_time != -1.0) stream << std::format("{:.0f}", cpu_time * scale); + stream << ","; + if (cpu_time != -1.0 && total_time != -1.0) stream << std::format("{:.2f}", cpu_time / total_time); + stream << ","; + if (total_time != -1.0) stream << std::format("{:.0f}", total_time * scale); + stream << ","; + if (memSize != 0 && count != 0) stream << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); + else stream << ",,"; + stream << std::endl; + } +}; } int32_t GPUReconstructionCPU::RunChains() @@ -290,7 +319,7 @@ int32_t GPUReconstructionCPU::RunChains() PrintMemoryOverview(); } - mStatWallTime = (mTimerTotal.GetElapsedTime() * 1000000. / mStatNEvents); + mStatWallTime = mTimerTotal.GetElapsedTime(); std::string nEventReport; if (GetProcessingSettings().debugLevel >= 0 && mStatNEvents > 1) { nEventReport += " (avergage of " + std::to_string(mStatNEvents) + " runs)"; @@ -304,11 +333,12 @@ int32_t GPUReconstructionCPU::RunChains() if (!benchmarkCSV.is_open()) { GPUError("Could not open timing CSV file '%s' for writing", GetProcessingSettings().timingCSV.c_str()); } else if (mNEventsProcessed == 1) { - write_header(benchmarkCSV); + writeHeaderCSV(benchmarkCSV); } } if (GetProcessingSettings().debugLevel >= 1) { + writeHeaderMarkdown(std::cout); for (uint32_t i = 0; i < mTimers.size(); i++) { double time = 0; if (mTimers[i] == nullptr) { @@ -328,18 +358,19 @@ int32_t GPUReconstructionCPU::RunChains() int32_t stepNum = getRecoStepNum(mTimers[i]->step); kernelStepTimes[stepNum] += time; } - char bandwidth[256] = ""; Row task_row; task_row.type = 'K'; task_row.name = mTimers[i]->name.c_str(); - task_row.kernel_time = time; + task_row.gpu_time = time; task_row.count = mTimers[i]->count; + task_row.statNEvents = mStatNEvents; if (mTimers[i]->memSize && mStatNEvents && time != 0.) { task_row.memSize = mTimers[i]->memSize; - snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count); } - if (benchmarkCSV.is_open()) task_row.write(benchmarkCSV, mStatNEvents); - printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth); + if (benchmarkCSV.is_open()) { + task_row.writeCSV(benchmarkCSV); + } + task_row.writeMarkdown(std::cout); if (GetProcessingSettings().resetTimers) { mTimers[i]->count = 0; mTimers[i]->memSize = 0; @@ -351,34 +382,40 @@ int32_t GPUReconstructionCPU::RunChains() if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) { Row reco_step_row; reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)"; - reco_step_row.kernel_time = kernelStepTimes[i]; + reco_step_row.gpu_time = kernelStepTimes[i]; reco_step_row.cpu_time = mTimersRecoSteps[i].timerCPU; reco_step_row.total_time = mTimersRecoSteps[i].timerTotal.GetElapsedTime(); - if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents); - printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks", - gpudatatypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime()); + reco_step_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + reco_step_row.writeCSV(benchmarkCSV); + } + reco_step_row.writeMarkdown(std::cout); } if (mTimersRecoSteps[i].bytesToGPU) { Row reco_step_row; reco_step_row.type = 'D'; reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)"; - reco_step_row.kernel_time = mTimersRecoSteps[i].timerToGPU.GetElapsedTime(); + reco_step_row.gpu_time = mTimersRecoSteps[i].timerToGPU.GetElapsedTime(); reco_step_row.memSize = mTimersRecoSteps[i].bytesToGPU; reco_step_row.count = mTimersRecoSteps[i].countToGPU; - if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents); - printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents, - mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToGPU / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].countToGPU); + reco_step_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + reco_step_row.writeCSV(benchmarkCSV); + } + reco_step_row.writeMarkdown(std::cout); } if (mTimersRecoSteps[i].bytesToHost) { Row reco_step_row; reco_step_row.type = 'D'; reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)"; - reco_step_row.kernel_time = mTimersRecoSteps[i].timerToHost.GetElapsedTime(); + reco_step_row.gpu_time = mTimersRecoSteps[i].timerToHost.GetElapsedTime(); reco_step_row.memSize = mTimersRecoSteps[i].bytesToHost; reco_step_row.count = mTimersRecoSteps[i].countToHost; - if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents); - printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents, - mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost); + reco_step_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + reco_step_row.writeCSV(benchmarkCSV); + } + reco_step_row.writeMarkdown(std::cout); } if (GetProcessingSettings().resetTimers) { mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0; @@ -394,24 +431,28 @@ int32_t GPUReconstructionCPU::RunChains() if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) { Row general_step_row; general_step_row.name = gpudatatypes::GENERAL_STEP_NAMES[i]; - general_step_row.kernel_time = mTimersGeneralSteps[i].GetElapsedTime(); - if (benchmarkCSV.is_open()) general_step_row.write(benchmarkCSV, mStatNEvents); - printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents); + general_step_row.gpu_time = mTimersGeneralSteps[i].GetElapsedTime(); + general_step_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + general_step_row.writeCSV(benchmarkCSV); + } + general_step_row.writeMarkdown(std::cout); } } Row wall_row; wall_row.name = "Wall"; if (GetProcessingSettings().debugLevel >= 1) { - wall_row.kernel_time = kernelTotal; - mStatKernelTime = kernelTotal * 1000000 / mStatNEvents; - printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str()); + wall_row.gpu_time = kernelTotal; } wall_row.cpu_time = mStatCPUTime; - wall_row.total_time = mStatWallTime * mStatNEvents / 1000000; - if (benchmarkCSV.is_open()) wall_row.write(benchmarkCSV, mStatNEvents); - printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str()); + wall_row.total_time = mStatWallTime; + wall_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + wall_row.writeCSV(benchmarkCSV); + } + wall_row.writeMarkdown(std::cout); } else if (GetProcessingSettings().debugLevel >= 0) { - GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str()); + GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime * 1000000 / mStatNEvents, nEventReport.c_str()); } if (GetProcessingSettings().resetTimers) { mStatNEvents = 0; From 0a93b3af81d74b66263a8c6080816820aef6593a Mon Sep 17 00:00:00 2001 From: Oliver Rietmann Date: Tue, 17 Mar 2026 16:01:29 +0100 Subject: [PATCH 3/5] Fix overwriting of --PROCresetTimers --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 +- GPU/GPUTracking/Standalone/Benchmark/standalone.cxx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 06c0d8f344af1..3209b98547d75 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -335,7 +335,7 @@ AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole re AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1)) AddOption(showOutputStat, bool, false, "", 0, "Print some track output statistics") AddOption(runCompressionStatistics, bool, false, "compressionStat", 0, "Run statistics and verification for cluster compression") -AddOption(resetTimers, int8_t, 1, "", 0, "Reset timers every event") +AddOption(resetTimers, int8_t, 0, "", 0, "Reset timers every event") AddOption(deviceTimers, bool, true, "", 0, "Use device timers instead of host-based time measurement") AddOption(keepAllMemory, bool, false, "", 0, "Allocate all memory on both device and host, and do not reuse") AddOption(keepDisplayMemory, bool, false, "", 0, "Like keepAllMemory, but only for memory required for event display") diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index a2e74c45fcb86..ed35bf0b281bc 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -627,7 +627,7 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU if (configStandalone.runs > 1) { printf("Run %d (thread %d)\n", iteration + 1, threadId); } - recUse->SetResetTimers(iRun < configStandalone.runsInit); + recUse->SetResetTimers(iRun < configStandalone.runsInit || configStandalone.proc.resetTimers); if (configStandalone.outputcontrolmem) { recUse->SetOutputControl(threadId ? outputmemoryPipeline.get() : outputmemory.get(), configStandalone.outputcontrolmem); } @@ -685,7 +685,7 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0; } chainTrackingAsync->mIOPtrs.clustersNative = nullptr; - recAsync->SetResetTimers(iRun < configStandalone.runsInit); + recAsync->SetResetTimers(iRun < configStandalone.runsInit || configStandalone.proc.resetTimers); tmpRetVal = recAsync->RunChains(); if (tmpRetVal == 0 || tmpRetVal == 2) { OutputStat(chainTrackingAsync, nullptr, nullptr); From d1456c5a673858c7801be39f34815ade008f5c8d Mon Sep 17 00:00:00 2001 From: Oliver Rietmann Date: Tue, 17 Mar 2026 16:21:06 +0100 Subject: [PATCH 4/5] Revert GPU Paramter and CMake Files --- .../Definitions/Parameters/GPUParameters.csv | 226 +++++++++--------- GPU/GPUTracking/Standalone/cmake/config.cmake | 6 +- dependencies/FindO2GPU.cmake | 10 +- 3 files changed, 119 insertions(+), 123 deletions(-) diff --git a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv index f240402acc19c..fc27de72ea2f1 100644 --- a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv +++ b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv @@ -1,113 +1,113 @@ -Architecture,default,default_cpu,MI100,MI210,RDNA3,VEGA,TAHITI,TESLA,FERMI,PASCAL,KEPLER,AMPERE,TURING,HOPPER -,,,,,,,,,,,,,, -CORE:,,,,,,,,,,,,,, -WARP_SIZE,32,,64,64,32,64,32,32,32,32,32,32,32,32 -THREAD_COUNT_DEFAULT,256,,256,256,256,256,,,,,,512,512,512 -,,,,,,,,,,,,,, -LB:,,,,,,,,,,,,,, -GPUTPCCreateTrackingData,256,,"[256, 7]","[256, 7]","[256, 7]","[192, 2]",,,,,,384,256,256 -GPUTPCTrackletConstructor,256,,"[768, 8]","[768, 8]","[768, 8]","[512, 10]","[256, 2]","[256, 1]","[256, 2]","[1024, 2]","[512, 4]","[256, 2]","[256, 2]","[256, 2]" -GPUTPCTrackletSelector,256,,"[384, 5]","[384, 5]","[384, 5]","[192, 10]","[256, 3]","[256, 1]","[256, 3]","[512, 4]","[256, 3]","[192, 3]","[192, 3]","[192, 3]" -GPUTPCNeighboursFinder,256,,"[192, 8]","[192, 8]","[192, 8]","[960, 8]",256,256,256,512,256,"[640, 1]","[640, 1]","[640, 1]" -GPUTPCNeighboursCleaner,256,,"[128, 5]","[128, 5]","[128, 5]","[384, 9]",256,256,256,256,256,512,512,512 -GPUTPCExtrapolationTracking,256,,"[256, 7]","[256, 7]","[256, 7]","[256, 2]",,,,,,"[128, 4]","[192, 2]","[192, 2]" -GPUTRDTrackerKernels_gpuVersion,512,,,,,,,,,,,,, -GPUTPCCreateOccupancyMap_fill,256,,,,,,,,,,,,, -GPUTPCCreateOccupancyMap_fold,256,,,,,,,,,,,,, -GPUTRDTrackerKernels_o2Version,512,,,,,,,,,,,,, -GPUTPCCompressionKernels_step0attached,256,,"[128, 1]","[128, 1]","[128, 1]","[64, 2]",,,,,,"[64, 2]",128,128 -GPUTPCCompressionKernels_step1unattached,256,,"[512, 2]","[512, 2]","[512, 2]","[512, 2]",,,,,,"[512, 3]","[512, 2]","[512, 2]" -GPUTPCDecompressionKernels_step0attached,256,,"[128, 2]","[128, 2]","[128, 2]","[128, 2]",,,,,,"[32, 1]","[32, 1]","[32, 1]" -GPUTPCDecompressionKernels_step1unattached,256,,"[64, 2]","[64, 2]","[64, 2]","[64, 2]",,,,,,"[32, 1]","[32, 1]","[32, 1]" -GPUTPCDecompressionUtilKernels_sortPerSectorRow,256,,,,,,,,,,,,, -GPUTPCDecompressionUtilKernels_countFilteredClusters,256,,,,,,,,,,,,, -GPUTPCDecompressionUtilKernels_storeFilteredClusters,256,,,,,,,,,,,,, -GPUTPCCFDecodeZS,"[128, 4]",,"[64, 4]","[64, 4]","[64, 4]","[64, 1]",,,,,,"[64, 10]","[64, 8]","[64, 8]" -GPUTPCCFDecodeZSLink,"""GPUCA_WARP_SIZE""",,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" -GPUTPCCFDecodeZSDenseLink,"""GPUCA_WARP_SIZE""",,"[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 14]",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" -GPUTPCCFGather,"[1024, 1]",,"[1024, 5]","[1024, 5]","[1024, 5]","[1024, 1]",,,,,,"[1024, 1]","[1024, 1]","[1024, 1]" -COMPRESSION_GATHER,1024,,1024,1024,1024,1024,,,,,,1024,1024,1024 -GPUTPCGMMergerTrackFit,256,,"[192, 2]","[192, 2]","[192, 2]","[64, 7]",,,,,,"[64, 4]","[32, 8]","[32, 8]" -GPUTPCGMMergerFollowLoopers,256,,"[256, 5]","[256, 5]","[256, 5]","[256, 4]",,,,,,"[64, 12]","[128, 4]","[128, 4]" -GPUTPCGMMergerSectorRefit,256,,"[64, 4]","[64, 4]","[64, 4]","[256, 2]",,,,,,"[32, 6]","[64, 5]","[64, 5]" -GPUTPCGMMergerUnpackResetIds,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerUnpackGlobal,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerResolve_step0,256,,512,512,512,256,,,,,,256,256,256 -GPUTPCGMMergerResolve_step1,256,,512,512,512,256,,,,,,256,256,256 -GPUTPCGMMergerResolve_step2,256,,512,512,512,256,,,,,,256,256,256 -GPUTPCGMMergerResolve_step3,256,,512,512,512,256,,,,,,256,256,256 -GPUTPCGMMergerResolve_step4,256,,512,512,512,256,,,,,,"[256, 4]","[256, 4]","[256, 4]" -GPUTPCGMMergerClearLinks,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerMergeWithinPrepare,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerMergeSectorsPrepare,256,,256,256,256,256,,,,,,"[256, 2]","[256, 2]","[256, 2]" -GPUTPCGMMergerMergeBorders_step0,256,,512,512,512,256,,,,,,192,192,192 -GPUTPCGMMergerMergeBorders_step2,256,,512,512,512,256,,,,,,"[64, 2]",256,256 -GPUTPCGMMergerMergeCE,256,,512,512,512,256,,,,,,256,256,256 -GPUTPCGMMergerLinkExtrapolatedTracks,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerCollect,256,,"[768, 1]","[768, 1]","[768, 1]","[1024, 1]",,,,,,"[256, 2]","[128, 2]","[128, 2]" -GPUTPCGMMergerSortTracksPrepare,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerPrepareForFit_step0,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerPrepareForFit_step1,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerPrepareForFit_step2,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerFinalize_step0,256,,,,,256,,,,,,,, -GPUTPCGMMergerFinalize_step1,256,,,,,256,,,,,,,, -GPUTPCGMMergerFinalize_step2,256,,,,,256,,,,,,,, -GPUTPCGMMergerMergeLoopers_step0,256,,,,,,,,,,,,, -GPUTPCGMMergerMergeLoopers_step1,256,,,,,,,,,,,,, -GPUTPCGMMergerMergeLoopers_step2,256,,,,,,,,,,,,, -GPUTPCGMO2Output_prepare,256,,,,,,,,,,,,, -GPUTPCGMO2Output_output,256,,,,,,,,,,,,, -GPUTPCStartHitsFinder,256,,"[1024, 2]","[1024, 2]","[1024, 2]","[1024, 7]",256,256,256,256,256,512,512,512 -GPUTPCStartHitsSorter,256,,"[1024, 5]","[1024, 5]","[1024, 5]","[512, 7]",256,256,256,256,256,"[512, 1]","[512, 1]","[512, 1]" -GPUTPCCFCheckPadBaseline,576,,"[576, 2]","[576, 2]","[576, 2]","[576, 2]",,,,,,"[576, 2]",, -GPUTPCCFChargeMapFiller_fillIndexMap,512,,512,512,512,512,,,,,,448,, -GPUTPCCFChargeMapFiller_fillFromDigits,512,,512,512,512,512,,,,,,448,, -GPUTPCCFChargeMapFiller_findFragmentStart,512,,512,512,512,512,,,,,,448,, -GPUTPCCFPeakFinder,512,,"[512, 9]","[512, 9]","[512, 9]","[512, 4]",,,,,,128,, -GPUTPCCFNoiseSuppression,512,,512,512,512,512,,,,,,448,, -GPUTPCCFDeconvolution,512,,"[512, 5]","[512, 5]","[512, 5]","[512, 5]",,,,,,384,, -GPUTPCCFClusterizer,512,,"[448, 3]","[448, 3]","[448, 3]","[512, 2]",,,,,,448,, -GPUTPCNNClusterizerKernels,512,,,,,,,,,,,,, -GPUTrackingRefitKernel_mode0asGPU,256,,,,,,,,,,,,, -GPUTrackingRefitKernel_mode1asTrackParCov,256,,,,,,,,,,,,, -GPUMemClean16,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,, -GPUitoa,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,, -GPUTPCCFNoiseSuppression_noiseSuppression,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,, -GPUTPCCFNoiseSuppression_updatePeaks,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_fillInputNNCPU,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_fillInputNNGPU,1024,,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_determineClass1Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_determineClass2Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_publishClass1Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_publishClass2Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_publishDeconvolutionFlags,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCCFStreamCompaction_scanStart,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, -GPUTPCCFStreamCompaction_scanUp,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, -GPUTPCCFStreamCompaction_scanTop,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, -GPUTPCCFStreamCompaction_scanDown,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, -GPUTPCCFStreamCompaction_compactDigits,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, -GPUTPCCompressionGatherKernels_unbuffered,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, -GPUTPCCompressionGatherKernels_buffered32,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, -GPUTPCCompressionGatherKernels_buffered64,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, -GPUTPCCompressionGatherKernels_buffered128,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, -GPUTPCCompressionGatherKernels_multiBlock,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, -GPUTPCGMMergerFinalize_0,256,,256,256,256,,,,,,,256,256,256 -GPUTPCGMMergerFinalize_1,256,,256,256,256,,,,,,,256,256,256 -GPUTPCGMMergerFinalize_2,256,,256,256,256,,,,,,,256,256,256 -,,,,,,,,,,,,,, -PAR:,,,,,,,,,,,,,, -AMD_EUS_PER_CU,0,0,4,4,4,4,,,,,,,, -SORT_STARTHITS,1,0,,,,,,,,,,,, -NEIGHBOURS_FINDER_MAX_NNEIGHUP,6,0,10,10,10,4,,,,,,4,4,4 -NEIGHBOURS_FINDER_UNROLL_GLOBAL,4,0,4,4,4,2,,,,,,,, -NEIGHBOURS_FINDER_UNROLL_SHARED,1,0,0,0,0,0,,,,,,,, -TRACKLET_SELECTOR_HITS_REG_SIZE,12,0,9,9,9,27,,,,,,20,20,20 -ALTERNATE_BORDER_SORT,0,0,1,1,1,1,,,,,,1,1,1 -SORT_BEFORE_FIT,0,0,1,1,1,1,,,,,,1,1,1 -NO_ATOMIC_PRECHECK,0,0,1,1,1,1,,,,,,1,1,1 -DEDX_STORAGE_TYPE,"""float""","""float""","""uint16_t""","""uint16_t""","""uint16_t""","""uint16_t""",,,,,,"""uint16_t""","""uint16_t""","""uint16_t""" -MERGER_INTERPOLATION_ERROR_TYPE,"""float""","""float""","""half""","""half""","""half""","""half""",,,,,,"""half""","""half""","""half""" -COMP_GATHER_KERNEL,0,0,4,4,4,4,,,,,,4,4,4 -COMP_GATHER_MODE,2,0,3,3,3,3,,,,,,3,3,3 -CF_SCAN_WORKGROUP_SIZE,512,0,,,,,,,,,,,, +Architecture,default,default_cpu,MI100,VEGA,TAHITI,TESLA,FERMI,PASCAL,KEPLER,AMPERE,TURING +,,,,,,,,,,, +CORE:,,,,,,,,,,, +WARP_SIZE,32,,64,64,32,32,32,32,32,32,32 +THREAD_COUNT_DEFAULT,256,,256,256,,,,,,512,512 +,,,,,,,,,,, +LB:,,,,,,,,,,, +GPUTPCCreateTrackingData,256,,"[256, 7]","[192, 2]",,,,,,384,256 +GPUTPCTrackletConstructor,256,,"[768, 8]","[512, 10]","[256, 2]","[256, 1]","[256, 2]","[1024, 2]","[512, 4]","[256, 2]","[256, 2]" +GPUTPCTrackletSelector,256,,"[384, 5]","[192, 10]","[256, 3]","[256, 1]","[256, 3]","[512, 4]","[256, 3]","[192, 3]","[192, 3]" +GPUTPCNeighboursFinder,256,,"[192, 8]","[960, 8]",256,256,256,512,256,"[640, 1]","[640, 1]" +GPUTPCNeighboursCleaner,256,,"[128, 5]","[384, 9]",256,256,256,256,256,512,512 +GPUTPCExtrapolationTracking,256,,"[256, 7]","[256, 2]",,,,,,"[128, 4]","[192, 2]" +GPUTRDTrackerKernels_gpuVersion,512,,,,,,,,,, +GPUTPCCreateOccupancyMap_fill,256,,,,,,,,,, +GPUTPCCreateOccupancyMap_fold,256,,,,,,,,,, +GPUTRDTrackerKernels_o2Version,512,,,,,,,,,, +GPUTPCCompressionKernels_step0attached,256,,"[128, 1]","[64, 2]",,,,,,"[64, 2]",128 +GPUTPCCompressionKernels_step1unattached,256,,"[512, 2]","[512, 2]",,,,,,"[512, 3]","[512, 2]" +GPUTPCDecompressionKernels_step0attached,256,,"[128, 2]","[128, 2]",,,,,,"[32, 1]","[32, 1]" +GPUTPCDecompressionKernels_step1unattached,256,,"[64, 2]","[64, 2]",,,,,,"[32, 1]","[32, 1]" +GPUTPCDecompressionUtilKernels_sortPerSectorRow,256,,,,,,,,,, +GPUTPCDecompressionUtilKernels_countFilteredClusters,256,,,,,,,,,, +GPUTPCDecompressionUtilKernels_storeFilteredClusters,256,,,,,,,,,, +GPUTPCCFDecodeZS,"[128, 4]",,"[64, 4]","[64, 1]",,,,,,"[64, 10]","[64, 8]" +GPUTPCCFDecodeZSLink,"""GPUCA_WARP_SIZE""",,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" +GPUTPCCFDecodeZSDenseLink,"""GPUCA_WARP_SIZE""",,"[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 14]",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" +GPUTPCCFGather,"[1024, 1]",,"[1024, 5]","[1024, 1]",,,,,,"[1024, 1]","[1024, 1]" +COMPRESSION_GATHER,1024,,1024,1024,,,,,,1024,1024 +GPUTPCGMMergerTrackFit,256,,"[192, 2]","[64, 7]",,,,,,"[64, 4]","[32, 8]" +GPUTPCGMMergerFollowLoopers,256,,"[256, 5]","[256, 4]",,,,,,"[64, 12]","[128, 4]" +GPUTPCGMMergerSectorRefit,256,,"[64, 4]","[256, 2]",,,,,,"[32, 6]","[64, 5]" +GPUTPCGMMergerUnpackResetIds,256,,256,256,,,,,,256,256 +GPUTPCGMMergerUnpackGlobal,256,,256,256,,,,,,256,256 +GPUTPCGMMergerResolve_step0,256,,512,256,,,,,,256,256 +GPUTPCGMMergerResolve_step1,256,,512,256,,,,,,256,256 +GPUTPCGMMergerResolve_step2,256,,512,256,,,,,,256,256 +GPUTPCGMMergerResolve_step3,256,,512,256,,,,,,256,256 +GPUTPCGMMergerResolve_step4,256,,512,256,,,,,,"[256, 4]","[256, 4]" +GPUTPCGMMergerClearLinks,256,,256,256,,,,,,256,256 +GPUTPCGMMergerMergeWithinPrepare,256,,256,256,,,,,,256,256 +GPUTPCGMMergerMergeSectorsPrepare,256,,256,256,,,,,,"[256, 2]","[256, 2]" +GPUTPCGMMergerMergeBorders_step0,256,,512,256,,,,,,192,192 +GPUTPCGMMergerMergeBorders_step2,256,,512,256,,,,,,"[64, 2]",256 +GPUTPCGMMergerMergeCE,256,,512,256,,,,,,256,256 +GPUTPCGMMergerLinkExtrapolatedTracks,256,,256,256,,,,,,256,256 +GPUTPCGMMergerCollect,256,,"[768, 1]","[1024, 1]",,,,,,"[256, 2]","[128, 2]" +GPUTPCGMMergerSortTracksPrepare,256,,256,256,,,,,,256,256 +GPUTPCGMMergerPrepareForFit_step0,256,,256,256,,,,,,256,256 +GPUTPCGMMergerPrepareForFit_step1,256,,256,256,,,,,,256,256 +GPUTPCGMMergerPrepareForFit_step2,256,,256,256,,,,,,256,256 +GPUTPCGMMergerFinalize_step0,256,,,256,,,,,,, +GPUTPCGMMergerFinalize_step1,256,,,256,,,,,,, +GPUTPCGMMergerFinalize_step2,256,,,256,,,,,,, +GPUTPCGMMergerMergeLoopers_step0,256,,,,,,,,,, +GPUTPCGMMergerMergeLoopers_step1,256,,,,,,,,,, +GPUTPCGMMergerMergeLoopers_step2,256,,,,,,,,,, +GPUTPCGMO2Output_prepare,256,,,,,,,,,, +GPUTPCGMO2Output_output,256,,,,,,,,,, +GPUTPCStartHitsFinder,256,,"[1024, 2]","[1024, 7]",256,256,256,256,256,512,512 +GPUTPCStartHitsSorter,256,,"[1024, 5]","[512, 7]",256,256,256,256,256,"[512, 1]","[512, 1]" +GPUTPCCFCheckPadBaseline,576,,"[576, 2]","[576, 2]",,,,,,"[576, 2]", +GPUTPCCFChargeMapFiller_fillIndexMap,512,,512,512,,,,,,448, +GPUTPCCFChargeMapFiller_fillFromDigits,512,,512,512,,,,,,448, +GPUTPCCFChargeMapFiller_findFragmentStart,512,,512,512,,,,,,448, +GPUTPCCFPeakFinder,512,,"[512, 9]","[512, 4]",,,,,,128, +GPUTPCCFNoiseSuppression,512,,512,512,,,,,,448, +GPUTPCCFDeconvolution,512,,"[512, 5]","[512, 5]",,,,,,384, +GPUTPCCFClusterizer,512,,"[448, 3]","[512, 2]",,,,,,448, +GPUTPCNNClusterizerKernels,512,,,,,,,,,, +GPUTrackingRefitKernel_mode0asGPU,256,,,,,,,,,, +GPUTrackingRefitKernel_mode1asTrackParCov,256,,,,,,,,,, +GPUMemClean16,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,, +GPUitoa,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,, +GPUTPCCFNoiseSuppression_noiseSuppression,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,, +GPUTPCCFNoiseSuppression_updatePeaks,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,, +GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCNNClusterizerKernels_fillInputNNCPU,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCNNClusterizerKernels_fillInputNNGPU,1024,,,,,,,,,, +GPUTPCNNClusterizerKernels_determineClass1Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCNNClusterizerKernels_determineClass2Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCNNClusterizerKernels_publishClass1Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCNNClusterizerKernels_publishClass2Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCNNClusterizerKernels_publishDeconvolutionFlags,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCCFStreamCompaction_scanStart,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, +GPUTPCCFStreamCompaction_scanUp,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, +GPUTPCCFStreamCompaction_scanTop,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, +GPUTPCCFStreamCompaction_scanDown,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, +GPUTPCCFStreamCompaction_compactDigits,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, +GPUTPCCompressionGatherKernels_unbuffered,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, +GPUTPCCompressionGatherKernels_buffered32,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, +GPUTPCCompressionGatherKernels_buffered64,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, +GPUTPCCompressionGatherKernels_buffered128,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, +GPUTPCCompressionGatherKernels_multiBlock,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, +GPUTPCGMMergerFinalize_0,256,,256,,,,,,,256,256 +GPUTPCGMMergerFinalize_1,256,,256,,,,,,,256,256 +GPUTPCGMMergerFinalize_2,256,,256,,,,,,,256,256 +,,,,,,,,,,, +PAR:,,,,,,,,,,, +AMD_EUS_PER_CU,0,0,4,4,,,,,,, +SORT_STARTHITS,1,0,,,,,,,,, +NEIGHBOURS_FINDER_MAX_NNEIGHUP,6,0,10,4,,,,,,4,4 +NEIGHBOURS_FINDER_UNROLL_GLOBAL,4,0,4,2,,,,,,, +NEIGHBOURS_FINDER_UNROLL_SHARED,1,0,0,0,,,,,,, +TRACKLET_SELECTOR_HITS_REG_SIZE,12,0,9,27,,,,,,20,20 +ALTERNATE_BORDER_SORT,0,0,1,1,,,,,,1,1 +SORT_BEFORE_FIT,0,0,1,1,,,,,,1,1 +NO_ATOMIC_PRECHECK,0,0,1,1,,,,,,1,1 +DEDX_STORAGE_TYPE,"""float""","""float""","""uint16_t""","""uint16_t""",,,,,,"""uint16_t""","""uint16_t""" +MERGER_INTERPOLATION_ERROR_TYPE,"""float""","""float""","""half""","""half""",,,,,,"""half""","""half""" +COMP_GATHER_KERNEL,0,0,4,4,,,,,,4,4 +COMP_GATHER_MODE,2,0,3,3,,,,,,3,3 +CF_SCAN_WORKGROUP_SIZE,512,0,,,,,,,,, diff --git a/GPU/GPUTracking/Standalone/cmake/config.cmake b/GPU/GPUTracking/Standalone/cmake/config.cmake index abdfc52c460e4..9355311db617c 100644 --- a/GPU/GPUTracking/Standalone/cmake/config.cmake +++ b/GPU/GPUTracking/Standalone/cmake/config.cmake @@ -19,7 +19,7 @@ set(GPUCA_CONFIG_VC 1) set(GPUCA_CONFIG_FMT 1) set(GPUCA_CONFIG_ROOT 1) set(GPUCA_CONFIG_ONNX 0) -set(GPUCA_BUILD_EVENT_DISPLAY 0) +set(GPUCA_BUILD_EVENT_DISPLAY 1) set(GPUCA_BUILD_EVENT_DISPLAY_FREETYPE 1) set(GPUCA_BUILD_EVENT_DISPLAY_VULKAN 1) set(GPUCA_BUILD_EVENT_DISPLAY_WAYLAND 1) @@ -32,8 +32,8 @@ set(GPUCA_BUILD_DEBUG_HOSTONLY 0) set(GPUCA_DETERMINISTIC_MODE 0) # OFF / NO_FAST_MATH / OPTO2 / GPU / WHOLEO2 #set(GPUCA_CUDA_GCCBIN c++-14) #set(GPUCA_OPENCL_CLANGBIN clang-20) -set(HIP_AMDGPUTARGET "gfx1100") # "gfx906;gfx908;gfx90a" -#set(CUDA_COMPUTETARGET "default") # 86 89 +set(HIP_AMDGPUTARGET "default") # "gfx906;gfx908;gfx90a" +set(CUDA_COMPUTETARGET "default") # 86 89 #set(GPUCA_CUDA_COMPILE_MODE perkernel) # onefile / perkernel / rtc #set(GPUCA_HIP_COMPILE_MODE perkernel) #set(GPUCA_RTC_NO_COMPILED_KERNELS 1) diff --git a/dependencies/FindO2GPU.cmake b/dependencies/FindO2GPU.cmake index 0aeae438b7187..3e8f012fea4b5 100644 --- a/dependencies/FindO2GPU.cmake +++ b/dependencies/FindO2GPU.cmake @@ -52,9 +52,7 @@ function(detect_gpu_arch backend) # Detect GPU architecture, optionally filterri set(CUDA_FIRST_TARGET 86) message(STATUS "CUDA_COMPUTETARGET not set, defaulting CUDA optimization for architecture ${CUDA_FIRST_TARGET}") endif() - if(CUDA_FIRST_TARGET GREATER_EQUAL 89) - set(CUDA_TARGET HOPPER) - elseif(CUDA_FIRST_TARGET GREATER_EQUAL 86) + if(CUDA_FIRST_TARGET GREATER_EQUAL 86) set(CUDA_TARGET AMPERE) elseif(CUDA_FIRST_TARGET GREATER_EQUAL 75) set(CUDA_TARGET TURING) @@ -77,9 +75,7 @@ function(detect_gpu_arch backend) # Detect GPU architecture, optionally filterri endif() string(TOLOWER "${HIP_FIRST_TARGET}" HIP_FIRST_TARGET) string(REGEX MATCH "....$" HIP_FIRST_TARGET_PADDED "0000${HIP_FIRST_TARGET}") - if(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "1100") - set(HIP_TARGET RDNA3) - elseif(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "1000") + if(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "1000") set(HIP_TARGET RDNA) elseif(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "090a") set(HIP_TARGET MI210) @@ -404,4 +400,4 @@ endif() set(O2GPU_FOUND TRUE) if (NOT GPUCA_FINDO2GPU_CHECK_ONLY) include("${CMAKE_CURRENT_LIST_DIR}/../GPU/GPUTracking/cmake/kernel_helpers.cmake") -endif() \ No newline at end of file +endif() From 408e62473290956202f2f6d350015eb337c98cbd Mon Sep 17 00:00:00 2001 From: Oliver Rietmann Date: Tue, 17 Mar 2026 16:58:35 +0100 Subject: [PATCH 5/5] clang-format --- GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 81 ++++++++++++------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 6db667ea0886f..dc96c2a238d1a 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -39,7 +39,6 @@ #include #include - #ifndef _WIN32 #include #endif @@ -216,14 +215,17 @@ int32_t GPUReconstructionCPU::ExitDevice() return 0; } -namespace { -void writeHeaderMarkdown(std::ostream& stream) { - stream << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n"; - stream << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n"; +namespace +{ +void writeHeaderMarkdown(std::ostream& stream) +{ + stream << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n"; + stream << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n"; } -void writeHeaderCSV(std::ostream& stream) { - stream << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; +void writeHeaderCSV(std::ostream& stream) +{ + stream << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; } struct Row { @@ -236,44 +238,65 @@ struct Row { uint32_t memSize = 0; uint32_t statNEvents; - void writeMarkdown(std::ostream& stream) { + void writeMarkdown(std::ostream& stream) + { double scale = 1000000.0 / statNEvents; stream << "| " << type << " | "; - if (count != 0) stream << std::format("{:6} |", count); - else stream << " |"; + if (count != 0) + stream << std::format("{:6} |", count); + else + stream << " |"; stream << std::format(" {:42}|", name); - if (gpu_time != -1.0) stream << std::format("{:10.0f} |", gpu_time * scale); - else stream << " |"; - if (cpu_time != -1.0) stream << std::format("{:10.0f} |", cpu_time * scale); - else stream << " |"; - if (cpu_time != -1.0 && total_time != -1.0) stream << std::format("{:8.2f} |", cpu_time / total_time); - else stream << " |"; - if (total_time != -1.0) stream << std::format("{:10.0f} |", total_time * scale); - else stream << " |"; - if (memSize != 0 && count != 0) stream << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); - else stream << " | | |"; + if (gpu_time != -1.0) + stream << std::format("{:10.0f} |", gpu_time * scale); + else + stream << " |"; + if (cpu_time != -1.0) + stream << std::format("{:10.0f} |", cpu_time * scale); + else + stream << " |"; + if (cpu_time != -1.0 && total_time != -1.0) + stream << std::format("{:8.2f} |", cpu_time / total_time); + else + stream << " |"; + if (total_time != -1.0) + stream << std::format("{:10.0f} |", total_time * scale); + else + stream << " |"; + if (memSize != 0 && count != 0) + stream << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); + else + stream << " | | |"; stream << std::endl; } - void writeCSV(std::ostream& stream) { + void writeCSV(std::ostream& stream) + { double scale = 1000000.0 / statNEvents; stream << type << ","; - if (count != 0) stream << count; + if (count != 0) + stream << count; stream << "," << name << ","; - if (gpu_time != -1.0) stream << std::format("{:.0f}", gpu_time * scale); + if (gpu_time != -1.0) + stream << std::format("{:.0f}", gpu_time * scale); stream << ","; - if (cpu_time != -1.0) stream << std::format("{:.0f}", cpu_time * scale); + if (cpu_time != -1.0) + stream << std::format("{:.0f}", cpu_time * scale); stream << ","; - if (cpu_time != -1.0 && total_time != -1.0) stream << std::format("{:.2f}", cpu_time / total_time); + if (cpu_time != -1.0 && total_time != -1.0) + stream << std::format("{:.2f}", cpu_time / total_time); stream << ","; - if (total_time != -1.0) stream << std::format("{:.0f}", total_time * scale); + if (total_time != -1.0) + stream << std::format("{:.0f}", total_time * scale); stream << ","; - if (memSize != 0 && count != 0) stream << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); - else stream << ",,"; + if (memSize != 0 && count != 0) + stream << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); + else + stream << ",,"; stream << std::endl; } }; -} +} // namespace int32_t GPUReconstructionCPU::RunChains() {