diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 409c28b8bf328..dc96c2a238d1a 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -35,6 +35,9 @@ #include #include +#include +#include +#include #ifndef _WIN32 #include @@ -212,6 +215,89 @@ int32_t GPUReconstructionCPU::ExitDevice() return 0; } +namespace +{ +void writeHeaderMarkdown(std::ostream& stream) +{ + stream << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n"; + stream << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n"; +} + +void writeHeaderCSV(std::ostream& stream) +{ + stream << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; +} + +struct Row { + char type = ' '; + std::string name; + uint32_t count = 0; + double gpu_time = -1.0; + double cpu_time = -1.0; + double total_time = -1.0; + uint32_t memSize = 0; + uint32_t statNEvents; + + void writeMarkdown(std::ostream& stream) + { + double scale = 1000000.0 / statNEvents; + stream << "| " << type << " | "; + if (count != 0) + stream << std::format("{:6} |", count); + else + stream << " |"; + stream << std::format(" {:42}|", name); + if (gpu_time != -1.0) + stream << std::format("{:10.0f} |", gpu_time * scale); + else + stream << " |"; + if (cpu_time != -1.0) + stream << std::format("{:10.0f} |", cpu_time * scale); + else + stream << " |"; + if (cpu_time != -1.0 && total_time != -1.0) + stream << std::format("{:8.2f} |", cpu_time / total_time); + else + stream << " |"; + if (total_time != -1.0) + stream << std::format("{:10.0f} |", total_time * scale); + else + stream << " |"; + if (memSize != 0 && count != 0) + stream << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); + else + stream << " | | |"; + stream << std::endl; + } + + void writeCSV(std::ostream& stream) + { + double scale = 1000000.0 / statNEvents; + stream << type << ","; + if (count != 0) + stream << count; + stream << "," << name << ","; + if (gpu_time != -1.0) + stream << std::format("{:.0f}", gpu_time * scale); + stream << ","; + if (cpu_time != -1.0) + stream << std::format("{:.0f}", cpu_time * scale); + stream << ","; + if (cpu_time != -1.0 && total_time != -1.0) + stream << std::format("{:.2f}", cpu_time / total_time); + stream << ","; + if (total_time != -1.0) + stream << std::format("{:.0f}", total_time * scale); + stream << ","; + if (memSize != 0 && count != 0) + stream << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); + else + stream << ",,"; + stream << std::endl; + } +}; +} // namespace + int32_t GPUReconstructionCPU::RunChains() { mMemoryScalers->temporaryFactor = 1.; @@ -256,7 +342,7 @@ int32_t GPUReconstructionCPU::RunChains() PrintMemoryOverview(); } - mStatWallTime = (mTimerTotal.GetElapsedTime() * 1000000. / mStatNEvents); + mStatWallTime = mTimerTotal.GetElapsedTime(); std::string nEventReport; if (GetProcessingSettings().debugLevel >= 0 && mStatNEvents > 1) { nEventReport += " (avergage of " + std::to_string(mStatNEvents) + " runs)"; @@ -264,7 +350,18 @@ int32_t GPUReconstructionCPU::RunChains() double kernelTotal = 0; std::vector kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.); + std::ofstream benchmarkCSV; + if (!GetProcessingSettings().timingCSV.empty()) { + benchmarkCSV.open(GetProcessingSettings().timingCSV, std::ios::out | std::ios::app); + if (!benchmarkCSV.is_open()) { + GPUError("Could not open timing CSV file '%s' for writing", GetProcessingSettings().timingCSV.c_str()); + } else if (mNEventsProcessed == 1) { + writeHeaderCSV(benchmarkCSV); + } + } + if (GetProcessingSettings().debugLevel >= 1) { + writeHeaderMarkdown(std::cout); for (uint32_t i = 0; i < mTimers.size(); i++) { double time = 0; if (mTimers[i] == nullptr) { @@ -284,11 +381,19 @@ int32_t GPUReconstructionCPU::RunChains() int32_t stepNum = getRecoStepNum(mTimers[i]->step); kernelStepTimes[stepNum] += time; } - char bandwidth[256] = ""; + Row task_row; + task_row.type = 'K'; + task_row.name = mTimers[i]->name.c_str(); + task_row.gpu_time = time; + task_row.count = mTimers[i]->count; + task_row.statNEvents = mStatNEvents; if (mTimers[i]->memSize && mStatNEvents && time != 0.) { - snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count); + task_row.memSize = mTimers[i]->memSize; + } + if (benchmarkCSV.is_open()) { + task_row.writeCSV(benchmarkCSV); } - printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth); + task_row.writeMarkdown(std::cout); if (GetProcessingSettings().resetTimers) { mTimers[i]->count = 0; mTimers[i]->memSize = 0; @@ -298,16 +403,42 @@ int32_t GPUReconstructionCPU::RunChains() if (GetProcessingSettings().recoTaskTiming) { for (int32_t i = 0; i < gpudatatypes::N_RECO_STEPS; i++) { if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) { - printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks", - gpudatatypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime()); + Row reco_step_row; + reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)"; + reco_step_row.gpu_time = kernelStepTimes[i]; + reco_step_row.cpu_time = mTimersRecoSteps[i].timerCPU; + reco_step_row.total_time = mTimersRecoSteps[i].timerTotal.GetElapsedTime(); + reco_step_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + reco_step_row.writeCSV(benchmarkCSV); + } + reco_step_row.writeMarkdown(std::cout); } if (mTimersRecoSteps[i].bytesToGPU) { - printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents, - mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToGPU / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].countToGPU); + Row reco_step_row; + reco_step_row.type = 'D'; + reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)"; + reco_step_row.gpu_time = mTimersRecoSteps[i].timerToGPU.GetElapsedTime(); + reco_step_row.memSize = mTimersRecoSteps[i].bytesToGPU; + reco_step_row.count = mTimersRecoSteps[i].countToGPU; + reco_step_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + reco_step_row.writeCSV(benchmarkCSV); + } + reco_step_row.writeMarkdown(std::cout); } if (mTimersRecoSteps[i].bytesToHost) { - printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents, - mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost); + Row reco_step_row; + reco_step_row.type = 'D'; + reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)"; + reco_step_row.gpu_time = mTimersRecoSteps[i].timerToHost.GetElapsedTime(); + reco_step_row.memSize = mTimersRecoSteps[i].bytesToHost; + reco_step_row.count = mTimersRecoSteps[i].countToHost; + reco_step_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + reco_step_row.writeCSV(benchmarkCSV); + } + reco_step_row.writeMarkdown(std::cout); } if (GetProcessingSettings().resetTimers) { mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0; @@ -321,16 +452,30 @@ int32_t GPUReconstructionCPU::RunChains() } for (int32_t i = 0; i < gpudatatypes::N_GENERAL_STEPS; i++) { if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) { - printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents); + Row general_step_row; + general_step_row.name = gpudatatypes::GENERAL_STEP_NAMES[i]; + general_step_row.gpu_time = mTimersGeneralSteps[i].GetElapsedTime(); + general_step_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + general_step_row.writeCSV(benchmarkCSV); + } + general_step_row.writeMarkdown(std::cout); } } + Row wall_row; + wall_row.name = "Wall"; if (GetProcessingSettings().debugLevel >= 1) { - mStatKernelTime = kernelTotal * 1000000 / mStatNEvents; - printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str()); + wall_row.gpu_time = kernelTotal; + } + wall_row.cpu_time = mStatCPUTime; + wall_row.total_time = mStatWallTime; + wall_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + wall_row.writeCSV(benchmarkCSV); } - printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str()); + wall_row.writeMarkdown(std::cout); } else if (GetProcessingSettings().debugLevel >= 0) { - GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str()); + GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime * 1000000 / mStatNEvents, nEventReport.c_str()); } if (GetProcessingSettings().resetTimers) { mStatNEvents = 0; diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 57cb1371a4aa0..3209b98547d75 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -329,12 +329,13 @@ AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)") AddOption(debugMask, uint32_t, (1 << 18) - 1, "debugMask", 0, "Mask for debug output dumps to file") AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6") +AddOption(timingCSV, std::string, "", "", 0, "CSV filename to append the benchmark results. Verbosity determined by parameter --debug.") AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures") AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks") AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1)) AddOption(showOutputStat, bool, false, "", 0, "Print some track output statistics") AddOption(runCompressionStatistics, bool, false, "compressionStat", 0, "Run statistics and verification for cluster compression") -AddOption(resetTimers, int8_t, 1, "", 0, "Reset timers every event") +AddOption(resetTimers, int8_t, 0, "", 0, "Reset timers every event") AddOption(deviceTimers, bool, true, "", 0, "Use device timers instead of host-based time measurement") AddOption(keepAllMemory, bool, false, "", 0, "Allocate all memory on both device and host, and do not reuse") AddOption(keepDisplayMemory, bool, false, "", 0, "Like keepAllMemory, but only for memory required for event display") diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index a2e74c45fcb86..ed35bf0b281bc 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -627,7 +627,7 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU if (configStandalone.runs > 1) { printf("Run %d (thread %d)\n", iteration + 1, threadId); } - recUse->SetResetTimers(iRun < configStandalone.runsInit); + recUse->SetResetTimers(iRun < configStandalone.runsInit || configStandalone.proc.resetTimers); if (configStandalone.outputcontrolmem) { recUse->SetOutputControl(threadId ? outputmemoryPipeline.get() : outputmemory.get(), configStandalone.outputcontrolmem); } @@ -685,7 +685,7 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0; } chainTrackingAsync->mIOPtrs.clustersNative = nullptr; - recAsync->SetResetTimers(iRun < configStandalone.runsInit); + recAsync->SetResetTimers(iRun < configStandalone.runsInit || configStandalone.proc.resetTimers); tmpRetVal = recAsync->RunChains(); if (tmpRetVal == 0 || tmpRetVal == 2) { OutputStat(chainTrackingAsync, nullptr, nullptr);