diff --git a/GPU/Common/GPUCommonAlgorithm.h b/GPU/Common/GPUCommonAlgorithm.h index db57e7ec06d4b..be88973561e0a 100644 --- a/GPU/Common/GPUCommonAlgorithm.h +++ b/GPU/Common/GPUCommonAlgorithm.h @@ -354,6 +354,11 @@ GPUdi() uint8_t warp_broadcast_FUNC(uint8_t v, int32_t i) #define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v) #define warp_broadcast(v, i) warp_broadcast_FUNC(v, i) +[[nodiscard]] GPUdi() int32_t work_group_count(bool pred) +{ + return work_group_reduce_add((int32_t)pred); +} + #elif (defined(__CUDACC__) || defined(__HIPCC__)) // CUDA and HIP work the same way using cub, need just different header @@ -416,6 +421,16 @@ GPUdi() T warp_broadcast_FUNC(T v, int32_t i) #endif } +[[nodiscard]] GPUdi() bool work_group_any(bool pred) +{ + return __syncthreads_or(pred); +} + +[[nodiscard]] GPUdi() uint32_t work_group_count(bool pred) +{ + return __syncthreads_count(pred); +} + #else // Trivial implementation for the CPU @@ -449,6 +464,16 @@ GPUdi() T warp_broadcast(T v, int32_t i) return v; } +[[nodiscard]] GPUdi() bool work_group_any(bool pred) +{ + return pred; +} + +[[nodiscard]] GPUdi() uint32_t work_group_count(bool pred) +{ + return pred; +} + #endif #ifdef GPUCA_ALGORITHM_STD diff --git a/GPU/GPUTracking/DataTypes/GPUTPCExtraADC.h b/GPU/GPUTracking/DataTypes/GPUTPCExtraADC.h new file mode 100644 index 0000000000000..28fa072b9401c --- /dev/null +++ b/GPU/GPUTracking/DataTypes/GPUTPCExtraADC.h @@ -0,0 +1,25 @@ +// Copyright 2019-2026 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCExtraADC.h +/// \author Felix Weiglhofer + +#include "GPUDefConstantsAndSettings.h" +#include "DataFormatsTPC/Digit.h" +#include +#include + +namespace o2::gpu +{ +struct GPUTPCExtraADC { + std::array, GPUCA_NSECTORS> digitsBySector; +}; +} // namespace o2::gpu diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index e34af48d7a85e..51fc8cf46bbbe 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -112,6 +112,8 @@ AddOptionRTC(trackletMinSharedNormFactor, float, 0.f, "", 0, "Max shared defined AddOptionRTC(maxTimeBinAboveThresholdIn1000Bin, uint16_t, 500, "", 0, "Except pad from cluster finding if total number of charges in a fragment is above this baseline (disable = 0)") AddOptionRTC(maxConsecTimeBinAboveThreshold, uint16_t, 200, "", 0, "Except pad from cluster finding if number of consecutive charges in a fragment is above this baseline (disable = 0)") AddOptionRTC(noisyPadSaturationThreshold, uint16_t, 700, "", 0, "Threshold where a timebin is considered saturated, disabling the noisy pad check for that pad") +AddOptionRTC(hipTailFilter, uint8_t, 0, "", 0, "Enable Highly Ionising Particle tail filter in CheckPadBaseline (0 = disable, 1 = filter tails)") +AddOptionRTC(hipTailFilterThreshold, uint16_t, 150, "", 0, "Threshold that must be exceeded for a timebin to be counted towards Highly Ionising Particle tail") AddOptionRTC(occupancyMapTimeBins, uint16_t, 16, "", 0, "Number of timebins per histogram bin of occupancy map (0 = disable occupancy map)") AddOptionRTC(occupancyMapTimeBinsAverage, uint16_t, 0, "", 0, "Number of timebins +/- to use for the averaging") AddOptionRTC(trackFitCovLimit, uint16_t, 1000, "", 0, "Abort fit when y/z cov exceed the limit") diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h index fd75136f51d76..5bcdfee2607c7 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.h +++ b/GPU/GPUTracking/Global/GPUChainTracking.h @@ -18,7 +18,6 @@ #include "GPUChain.h" #include "GPUDataTypesIO.h" #include "GPUDataTypesConfig.h" -#include #include #include #include @@ -70,6 +69,7 @@ struct CfFragment; class GPUTPCClusterFinder; struct GPUSettingsProcessing; struct GPUSettingsRec; +struct GPUTPCExtraADC; class GPUChainTracking : public GPUChain { @@ -302,13 +302,15 @@ class GPUChainTracking : public GPUChain int32_t RunChainFinalize(); void OutputSanityCheck(); int32_t RunTPCTrackingSectors_internal(); - int32_t RunTPCClusterizer_prepare(bool restorePointers); + int32_t RunTPCClusterizer_prepare(bool restorePointers, const GPUTPCExtraADC& extraADCs); #ifdef GPUCA_TPC_GEOMETRY_O2 - std::pair RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane); + std::pair RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane, const GPUTPCExtraADC& extraADCs); void RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int32_t stage, bool doGPU, int32_t lane); std::pair TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment); std::pair TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment& fragment); void TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment& fragment); + void TPCClusterizerTransferExtraADC(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs); + void TPCClusterizerCheckExtraADCZeros(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs); #endif void RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSector, int8_t mergeMode, GPUReconstruction::krnlDeviceType deviceType); void RunTPCTrackingMerger_Resolve(int8_t useOrigTrackParam, int8_t mergeAll, GPUReconstruction::krnlDeviceType deviceType); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index bf6577cfd929e..d3479cb6f0637 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -17,6 +17,7 @@ #include "GPUChainTrackingDebug.h" #include "GPULogging.h" #include "GPUO2DataTypes.h" +#include "GPUTPCExtraADC.h" #include "GPUMemorySizeScalers.h" #include "GPUTrackingInputProvider.h" #include "GPUNewCalibValues.h" @@ -56,10 +57,13 @@ #include "utils/VcShim.h" #include "utils/strtag.h" -#include +#include "utils/vecpod.h" #include +#include #include +// #define INSERT_SATURATED_SIGNALS + using namespace o2::gpu; using namespace o2::tpc; using namespace o2::tpc::constants; @@ -155,11 +159,171 @@ void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfF } } +void GPUChainTracking::TPCClusterizerTransferExtraADC(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs) +{ + const int32_t iSector = clusterer.mISector; + const auto& fragment = clusterer.mPmemory->fragment; + const auto& digits = extraADCs.digitsBySector[iSector]; + + if (fragment.index != 0) { + return; + } + + if (digits.empty()) { + return; + } + + const size_t chargeMapSize = TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen); + const size_t chargeMapSizeBytes = chargeMapSize * sizeof(PackedCharge); + + vecpod chargeMapHostData; + chargeMapHostData.resize(chargeMapSize); + + CfArray2D chargeMapHost(reinterpret_cast(chargeMapHostData.data())); + + vecpod extraPositions; + extraPositions.reserve(digits.size()); + + GPUMemCpy(RecoStep::TPCClusterFinding, chargeMapHostData.data(), clustererShadow.mPchargeMap, chargeMapSizeBytes, lane, false); + SynchronizeStream(lane); + + for (const auto& d : digits) { + if (!fragment.contains(d.getTimeStamp())) { + continue; + } + + CfChargePos pos{(tpccf::Row)d.getRow(), (tpccf::Pad)d.getPad(), (tpccf::TPCFragmentTime)(d.getTimeStamp() - fragment.start)}; + chargeMapHost[pos] = PackedCharge(d.getChargeFloat()); + + extraPositions.push_back(pos); + } + + GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPchargeMap, chargeMapHostData.data(), chargeMapSizeBytes, lane, true); + + const size_t nPositions = clusterer.mPmemory->counters.nPositions; + const size_t extraPositionsOffset = nPositions - extraPositions.size(); + GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPpositions + extraPositionsOffset, extraPositions.data(), extraPositions.size() * sizeof(CfChargePos), lane, true); +} + +void GPUChainTracking::TPCClusterizerCheckExtraADCZeros(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs) +{ + const int32_t iSector = clusterer.mISector; + const auto& fragment = clusterer.mPmemory->fragment; + const auto& digits = extraADCs.digitsBySector[iSector]; + + if (fragment.index != 0) { + return; + } + + if (digits.empty()) { + return; + } + + const size_t chargeMapSize = TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen); + const size_t chargeMapSizeBytes = chargeMapSize * sizeof(PackedCharge); + + vecpod chargeMapHostData; + chargeMapHostData.resize(chargeMapSize); + + CfArray2D chargeMapHost(reinterpret_cast(chargeMapHostData.data())); + + GPUMemCpy(RecoStep::TPCClusterFinding, chargeMapHostData.data(), clustererShadow.mPchargeMap, chargeMapSizeBytes, lane, false); + SynchronizeStream(lane); + + size_t nNonZeroADCs = 0; + + for (const auto& d : digits) { + if (!fragment.contains(d.getTimeStamp())) { + continue; + } + + CfChargePos pos{(tpccf::Row)d.getRow(), (tpccf::Pad)d.getPad(), (tpccf::TPCFragmentTime)(d.getTimeStamp() - fragment.start)}; + + auto adc = chargeMapHost[pos].unpack(); + + if (adc != 0) { + nNonZeroADCs++; + } + } + + if (nNonZeroADCs > 0) { + GPUInfo("Non Zero ADCs: %zu", nNonZeroADCs); + } else { + GPUInfo("Cleared all extra ADC values!", nNonZeroADCs); + } +} + namespace { struct TPCCFDecodeScanTmp { int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, pageCounter; }; + +// Additional ADC values must be generated at start of clusterizer +// This is required, so enough memory is allocated for the charge points +// And ADCs can be injected by "simply" +// -> copying chargeMap + chargePositions to host +// -> writing additional adcs to chargeMap + positions +// -> copying values to device +GPUTPCExtraADC GenerateSaturatedSignals(size_t seed = 42) +{ + constexpr int32_t MinTailLength = 50; + constexpr int32_t MaxTailLength = 200; + constexpr int32_t TailWidth = 3; // Assume tails are 3 pads wide at the moment + + constexpr GPUTPCGeometry geo; + + GPUTPCExtraADC adcs; + + const int32_t nHIPs = 50; + const int32_t firstTB = 0; // Place all HIPs in first fragment for now + const int32_t lastTB = 4000 - MaxTailLength; // Don't allow cut off tails at fragment borders + const int32_t tailADC = 250; // charge should decrease over time, but for now just hardcode ADC above the threshold + + std::mt19937 gen{seed}; + std::uniform_int_distribution<> randomRow(0, GPUCA_ROW_COUNT - 1); + std::uniform_int_distribution<> randomTB(firstTB, lastTB); + std::uniform_int_distribution<> randomTailLength(MinTailLength, MaxTailLength); + // std::normal_distribution<> tailLengthNoise(8, 2.0); + + for (int32_t iHIP = 0; iHIP < nHIPs; iHIP++) { + + const int32_t row = randomRow(gen); + const int32_t nPads = geo.NPads(row); + std::uniform_int_distribution<> randomPad(0, nPads - 1); + + const int32_t basePad = randomPad(gen); + const int32_t baseTb = randomTB(gen); + + auto& digits = adcs.digitsBySector[0]; + + const int32_t tailLength = randomTailLength(gen); + + for (int32_t dPad = -TailWidth; dPad <= TailWidth; dPad++) { + const int32_t iPad = basePad + dPad; + if (iPad < 0 || iPad >= nPads) { + continue; + } + + for (int32_t dTime = 0; dTime < tailLength; dTime++) { + const int32_t iTime = baseTb + dTime; + + if (iTime >= 4000) { + break; + } + + const auto adc = dTime == 0 && dPad == 0 ? 1023 : tailADC; + + digits.emplace_back(0, adc, row, iPad, iTime); + } + } + } + + GPUInfo("Generated %zu ADCs!", adcs.digitsBySector[0].size()); + + return adcs; +} + } // namespace std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment) @@ -437,13 +601,16 @@ void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clust } } -std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane) +std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane, const GPUTPCExtraADC& extraADCs) { bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding; if (mCFContext->abandonTimeframe) { return {0, 0}; } - const auto& retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment); + auto retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment); + if (fragment.index == 0) { + retVal.first += extraADCs.digitsBySector[iSector].size(); + } if (doGPU) { GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; @@ -473,7 +640,7 @@ std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int return retVal; } -int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) +int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers, const GPUTPCExtraADC& extraADCs) { bool doGPU = mRec->GetRecoStepsGPU() & gpudatatypes::RecoStep::TPCClusterFinding; if (restorePointers) { @@ -569,7 +736,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) mCFContext->fragmentFirst = CfFragment{std::max(mCFContext->tpcMaxTimeBin + 1, maxFragmentLen), maxFragmentLen}; for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) { if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) { - mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector); + mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector, extraADCs); } } @@ -595,7 +762,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) mRec->PushNonPersistentMemory(qStr2Tag("TPCCLUST")); const auto& threadContext = GetThreadContext(); const bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding; - if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) { + + GPUTPCExtraADC extraADCs; +#ifdef INSERT_SATURATED_SIGNALS + extraADCs = GenerateSaturatedSignals(); +#endif + + if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer, extraADCs)) { return 1; } if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) { @@ -625,7 +798,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) SetupGPUProcessor(&processors()->tpcClusterer[iSector], true); // Now we allocate } if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) { - RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) + RunTPCClusterizer_prepare(true, extraADCs); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) } if (doGPU && mIOPtrs.tpcZS) { @@ -769,6 +942,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) const bool propagateMCLabels = buildNativeHost && GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC; const bool sortClusters = buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4); + if (GetProcessingSettings().runMC && (!processors()->ioPtrs.tpcPackedDigits || !processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC)) { + GPUWarning("Requested to process MC labels, but no labels present"); + } + auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr; mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor; @@ -938,7 +1115,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) f = mCFContext->fragmentFirst; } if (nextSector < NSECTORS && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSector] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) { - mCFContext->nextPos[nextSector] = RunTPCClusterizer_transferZS(nextSector, f, GetProcessingSettings().nTPCClustererLanes + lane); + mCFContext->nextPos[nextSector] = RunTPCClusterizer_transferZS(nextSector, f, GetProcessingSettings().nTPCClustererLanes + lane, extraADCs); } } GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; @@ -949,9 +1126,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (!mIOPtrs.tpcZS) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); } - if (DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererDigits, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile)) { - clusterer.DumpChargeMap(*mDebugFile, "Charges"); - } + + TPCClusterizerTransferExtraADC(clusterer, clustererShadow, lane, extraADCs); if (propagateMCLabels) { runKernel({GetGrid(clusterer.mPmemory->counters.nDigitsInFragment, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}); @@ -960,6 +1136,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0); checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true); checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter; + // TODO Move hipTailFilter flag to ProcessingSettings? + // TODO Add some warning when re enabling pad filter with this flag, so it's not just silently enabled when disabling was requested + checkForNoisyPads |= rec()->GetParam().rec.tpc.hipTailFilter; + + if (rec()->GetParam().rec.tpc.hipTailFilter && !doGPU) { + GPUError("HIP tail filter enabled, but this is currently not supported on CPU"); + } if (checkForNoisyPads) { const int32_t nBlocks = GPUTPCCFCheckPadBaseline::GetNBlocks(doGPU); @@ -968,6 +1151,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) getKernelTimer(RecoStep::TPCClusterFinding, iSector, TPC_PADS_IN_SECTOR * fragment.lengthWithoutOverlap() * sizeof(PackedCharge), false); } + DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererDigits, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile); + // Avoid additional sync when also dumping digits + const bool debugSyncChargeMap = !(GetProcessingSettings().debugMask & GPUChainTrackingDebugFlags::TPCClustererDigits); + // DumpChargeMap should run after noisy pad filter to avoid yet another dump of intermediate data. When chargemap without pad filter is required, disable pad filter instead. + DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, debugSyncChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Charges"); + + TPCClusterizerCheckExtraADCZeros(clusterer, clustererShadow, lane, extraADCs); + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); if (DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererPeaks, clusterer, &GPUTPCClusterFinder::DumpPeaks, *mDebugFile)) { clusterer.DumpPeakMap(*mDebugFile, "Peaks"); @@ -1144,7 +1335,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if(!nn_settings.nnClusterizerApplyCfDeconvolution) { // If it is already applied don't do it twice, otherwise apply now runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true); } - DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMapSplit, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 if (nn_settings.nnClusterizerVerbosity > 3) { LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")"; @@ -1155,7 +1346,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #endif } else { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true); - DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMapSplit, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0); } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 89d47d0e1b86c..8d8d9f722c3cf 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -16,6 +16,7 @@ #include "GPUChainTrackingDebug.h" #include "GPULogging.h" #include "GPUO2DataTypes.h" +#include "GPUTPCExtraADC.h" #include "GPUTrackingInputProvider.h" #include "GPUTPCCFChainContext.h" #include "TPCClusterDecompressor.h" @@ -63,7 +64,7 @@ int32_t GPUChainTracking::RunTPCCompression() if (mPipelineFinalizationCtx && GetProcessingSettings().doublePipelineClusterizer) { SynchronizeEventAndRelease(mEvents->single); auto* foreignChain = (GPUChainTracking*)GetNextChainInQueue(); - foreignChain->RunTPCClusterizer_prepare(false); + foreignChain->RunTPCClusterizer_prepare(false, {}); foreignChain->mCFContext->ptrClusterNativeSave = processorsShadow()->ioPtrs.clustersNative; } #endif diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebug.h b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h index a0be9d833d5a9..39e48f9f14d9a 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingDebug.h +++ b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h @@ -45,7 +45,8 @@ enum GPUChainTrackingDebugFlags : uint32_t { TPCClustererPeaks = 1 << 19, TPCClustererSuppressedPeaks = 1 << 20, TPCClustererChargeMap = 1 << 21, - TPCClustererZeroedCharges = 1 << 22 + TPCClustererChargeMapSplit = 1 << 22, + TPCClustererZeroedCharges = 1 << 23 }; template diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx index 33ed089890bc4..b4d012bbb82fa 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx @@ -21,9 +21,88 @@ #include "utils/VcShim.h" #endif +#if 0 +#define DPRINT(...) printf(__VA_ARGS__) +#define DPRINTB(...) \ + if (iThread == 0) \ + printf(__VA_ARGS__) +#define DPRINTB_IF(test, ...) \ + if (iThread == 0 && (test)) \ + printf(__VA_ARGS__) +#else +#define DPRINT(...) ((void)0) +#define DPRINTB(...) ((void)0) +#define DPRINTB_IF(test, ...) ((void)0) +#endif + using namespace o2::gpu; using namespace o2::gpu::tpccf; +using Kernel = GPUTPCCFCheckPadBaseline; + +// Collect tails marked for closing across the workgroup using a prefix scan, +// then cooperatively zero the charge map entries for each closed tail. +// Caller must set acc.activeHIPTail.end before calling if the tail is open. +static GPUdi() uint16_t CloseHIPTails( + Kernel::GPUSharedMemory& smem, + int32_t iThread, int32_t nThreads, + int16_t iPadHandle, + CfChargePos basePos, + CfArray2D& chargeMap, + Kernel::PadChargeAccu& acc, + bool shouldCloseTail) +{ + uint16_t nClosedTails = work_group_count(shouldCloseTail); + + if (nClosedTails > 0) { + int16_t iClosedTail = work_group_scan_inclusive_add((int16_t)shouldCloseTail) - 1; + if (shouldCloseTail) { + smem.tailsClosedPad[iClosedTail] = iPadHandle; + smem.tailsClosed[iClosedTail] = acc.activeHIPTail; + acc.activeHIPTail.Reset(); + } + + GPUbarrier(); + } + + // TODO: performance improvement -> parallelize this loop across tails + for (uint16_t iTail = 0; iTail < nClosedTails; iTail++) { + const auto tailPad = smem.tailsClosedPad[iTail]; + const auto tail = smem.tailsClosed[iTail]; + + for (uint16_t iTime = iThread; iTime < tail.Length(); iTime += nThreads) { + chargeMap[basePos.delta({tailPad, int16_t(tail.start + iTime)})] = PackedCharge{0}; + } + } + + return nClosedTails; +} + +template +static GPUdi() void ScanCachedCharges(Kernel::GPUSharedMemory& smem, uint16_t timeOffset, uint16_t pad, Charge hipTailThreshold, Kernel::PadChargeAccu& acc) +{ + for (int32_t i = 0; i < Kernel::NumOfCachedTBs; i++) { + const Charge qs = smem.charges[i][pad]; + acc.totalCharges += qs > 0; + acc.consecCharges = qs > 0 ? acc.consecCharges + 1 : 0; + acc.maxConsecCharges = CAMath::Max(acc.consecCharges, acc.maxConsecCharges); + acc.maxCharge = CAMath::Max(qs, acc.maxCharge); + + if constexpr (CheckHIPTrigger) { + if (qs >= Charge(Kernel::MaxADC)) { + acc.HIPtb = timeOffset + i; + smem.tails[pad] = {acc.HIPtb, 0}; // Broadcast HIP start TB to neighboring pads / threads + } + } + + if constexpr (CheckHIPTailEnd) { + if (qs < hipTailThreshold) { + acc.activeHIPTail.end = timeOffset + i; + } + } + } +} + template <> GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) { @@ -48,51 +127,131 @@ GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t } const CfFragment& fragment = clusterer.mPmemory->fragment; + const bool hipFilterOn = clusterer.Param().rec.tpc.hipTailFilter; + const Charge hipTailThreshold = clusterer.Param().rec.tpc.hipTailFilterThreshold; CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); const auto iRow = iBlock; const auto rowinfo = GetRowInfo(iRow); const CfChargePos basePos{(Row)iRow, 0, 0}; - int32_t totalCharges = 0; - int32_t consecCharges = 0; - int32_t maxConsecCharges = 0; - Charge maxCharge = 0; + PadChargeAccu acc; const int16_t iPadOffset = iThread % MaxNPadsPerRow; const int16_t iTimeOffset = iThread / MaxNPadsPerRow; const int16_t iPadHandle = iThread; const bool handlePad = iPadHandle < rowinfo.nPads; + if (iPadHandle < MaxNPadsPerRow) { + smem.tails[iPadHandle] = {-1, -1}; + } + GPUbarrier(); + const auto firstTB = fragment.firstNonOverlapTimeBin(); const auto lastTB = fragment.lastNonOverlapTimeBin(); - for (auto t = firstTB; t < lastTB; t += NumOfCachedTBs) { + for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs) { - const TPCFragmentTime iTime = t + iTimeOffset; + const TPCFragmentTime iTimeLoad = t + iTimeOffset; - const CfChargePos pos = basePos.delta({iPadOffset, iTime}); + const CfChargePos pos = basePos.delta({iPadOffset, iTimeLoad}); - smem.charges[iTimeOffset][iPadOffset] = iTime < lastTB && iPadOffset < rowinfo.nPads ? chargeMap[pos].unpack() : 0; + const Charge ql = iTimeLoad < lastTB && iPadOffset < rowinfo.nPads ? chargeMap[pos].unpack() : 0; + smem.charges[iTimeOffset][iPadOffset] = ql; - GPUbarrier(); + const bool hasHIPTrigger = hipFilterOn && work_group_any(ql >= MaxADC); + + acc.HIPtb = -1; if (handlePad) { - for (int32_t i = 0; i < NumOfCachedTBs; i++) { - const Charge q = smem.charges[i][iPadHandle]; - totalCharges += (q > 0); - consecCharges = (q > 0) ? consecCharges + 1 : 0; - maxConsecCharges = CAMath::Max(consecCharges, maxConsecCharges); - maxCharge = CAMath::Max(q, maxCharge); + + // TODO: is this really necessary? + // Why is the old version so much slower, when we just add short branches to the loop??? + if (!hasHIPTrigger) [[likely]] { + if (!acc.activeHIPTail.IsOpen()) { + ScanCachedCharges(smem, t, iPadHandle, hipTailThreshold, acc); + } else { + ScanCachedCharges(smem, t, iPadHandle, hipTailThreshold, acc); + } + } else { + if (!acc.activeHIPTail.IsOpen()) { + ScanCachedCharges(smem, t, iPadHandle, hipTailThreshold, acc); + } else { + ScanCachedCharges(smem, t, iPadHandle, hipTailThreshold, acc); + } } } GPUbarrier(); - } + + if (hasHIPTrigger) [[unlikely]] { + + DPRINTB("%d: Trigger!\n", iBlock); + + if (handlePad && acc.HIPtb < 0) { + + // Search neighboring pads for trigger + for (int16_t i = -3; i < 0; i++) { + const auto p = iPadHandle + i; + if (p > -1) { + acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb); + } + } + + for (int16_t i = 1; i < 4; i++) { + const auto p = iPadHandle + i; + if (p < MaxNPadsPerRow) { + acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb); + } + } + } + + bool shouldCloseTail = acc.HIPtb > -1 && acc.activeHIPTail.HasValue(); + if (shouldCloseTail && acc.activeHIPTail.IsOpen()) { + DPRINT("%d: end = %d\n", iThread, acc.HIPtb); + acc.activeHIPTail.end = acc.HIPtb; + } + + CloseHIPTails(smem, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail); + + GPUbarrier(); // TODO: not needed? Debug only + + if (acc.HIPtb > -1) { + DPRINT("%d: start = %d\n", iThread, acc.HIPtb); + acc.activeHIPTail.SetOpen(acc.HIPtb); + } + + // Clear smem between iterations to prevent stale entries + if (handlePad) { + smem.tails[iPadHandle].Reset(); + } + + GPUbarrier(); // TODO: not needed? Debug only + + } // if (hipTriggerFound) + + } // for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs) if (handlePad) { - updatePadBaseline(rowinfo.globalPadOffset + iPadOffset, clusterer, totalCharges, maxConsecCharges, maxCharge); + updatePadBaseline(rowinfo.globalPadOffset + iPadOffset, clusterer, acc.totalCharges, acc.maxConsecCharges, acc.maxCharge); } + + // --- Close remaining tails + const bool shouldCloseTail = acc.activeHIPTail.HasValue(); + + // Call `work_group_any` here, instead of always counting. + // This is important as `work_group_count` is a lot slower + // and has a lot of overhead if no HIPs were found. + if (work_group_any(shouldCloseTail)) { + if (shouldCloseTail && acc.activeHIPTail.IsOpen()) { + acc.activeHIPTail.end = lastTB; + } + + [[maybe_unused]] const uint16_t nClosedTails = CloseHIPTails(smem, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail); + + DPRINTB_IF(nClosedTails > 0, "%d: Close remaining tails (%d)\n", iBlock, nClosedTails); + } + #endif } @@ -106,7 +265,7 @@ GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineCPU(int32_t nBlocks, int32_t int32_t padsPerRow; CfChargePos basePos = padToCfChargePos(basePad, clusterer, padsPerRow); - if (not basePos.valid()) { + if (!basePos.valid()) { return; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h index a71f1358a73a6..1593ea5dd5df3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h @@ -15,6 +15,11 @@ /// Kernel identifies noisy TPC pads by analyzing charge patterns over time. /// A pad is marked noisy if it exceeds thresholds for total or consecutive /// time bins with charge, unless the charge exceeds a saturation threshold. +/// +/// Optionally detects Highly Ionising Particle (HIP) tails: when a saturated +/// ADC value (1023) is found, the tail region on the triggering pad and its +/// neighbors is zeroed in the charge map until charges drop below a +/// configurable threshold. #ifndef O2_GPU_GPU_TPC_CF_CHECK_PAD_BASELINE_H #define O2_GPU_GPU_TPC_CF_CHECK_PAD_BASELINE_H @@ -44,10 +49,54 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate // Rounding up to a multiple of PadsPerCacheline ensures iThread / MaxNPadsPerRow < NumOfCachedTBs // for all threads, avoiding out-of-bounds access. MaxNPadsPerRow = CAMath::nextMultipleOf(GPUTPCGeometry::MaxNPadsPerRow()), + + MaxADC = 1023, + + NThreads = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline), + }; + + union HipTailRange { + struct { + int16_t start; + int16_t end; + }; + // uint32_t asU32; + + // Be careful with using default initialized values. + // Need default constructor, so can be placed in shared memory. + // Might be zero initialized, but invalid tail needs start = end = -1 instead. + GPUdDefault() HipTailRange() = default; + GPUdi() HipTailRange(int16_t st, int16_t e) : start(st), end(e) {} + + GPUdi() bool HasValue() const { return start > -1; } + GPUdi() bool IsOpen() const { return start > -1 && end < 0; } + + GPUdi() void SetOpen(int16_t st) + { + start = st; + end = -1; + } + + GPUdi() int16_t Length() const { return end - start; } + + GPUdi() void Reset() { start = end = -1; } }; - struct GPUSharedMemory { + struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64 { tpccf::Charge charges[NumOfCachedTBs][MaxNPadsPerRow]; + HipTailRange tails[MaxNPadsPerRow]; + uint8_t tailsClosedPad[MaxNPadsPerRow]; + HipTailRange tailsClosed[MaxNPadsPerRow]; + }; + + // Accumulated values from scanning cached charges in a pad + struct PadChargeAccu { + int32_t totalCharges = 0; + int32_t consecCharges = 0; + int32_t maxConsecCharges = 0; + tpccf::Charge maxCharge = 0; + int16_t HIPtb = -1; + HipTailRange activeHIPTail{-1, -1}; }; typedef GPUTPCClusterFinder processorType; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx index e20f5d8b0f074..c266e9883d227 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx @@ -477,7 +477,10 @@ GPUd() void GPUTPCCFDecodeZSLinkBase::WriteCharge(processorType& clusterer, floa CfChargePos pos(padAndRow.getRow(), padAndRow.getPad(), localTime); positions[positionOffset] = pos; - charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad()); + // Only apply gain correction if ADC not fully saturated + if (charge < 1023.f) { + charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad()); + } chargeMap[pos] = PackedCharge(charge); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h index b8ff90f511057..6246a0366ddf7 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h @@ -132,7 +132,7 @@ class GPUTPCCFDecodeZSLink : public GPUTPCCFDecodeZSLinkBase { public: // constants for decoding - static inline constexpr int32_t DECODE_BITS = o2::tpc::TPCZSHDRV2::TPC_ZS_NBITS_V34; + static inline constexpr int32_t DECODE_BITS = tpc::TPCZSHDRV2::TPC_ZS_NBITS_V34; static inline constexpr float DECODE_BITS_FACTOR = 1.f / (1 << (DECODE_BITS - 10)); static inline constexpr uint32_t DECODE_MASK = (1 << DECODE_BITS) - 1; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx index d676cf9cd3887..b2a6cde9ff763 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx @@ -15,7 +15,6 @@ #include "GPUTPCClusterFinder.h" #include "GPUReconstruction.h" #include "CfArray2D.h" -#include "DataFormatsTPC/Digit.h" #include "DataFormatsTPC/ClusterNative.h" #include "GPUSettings.h" diff --git a/GPU/GPUTracking/utils/VcShim.h b/GPU/GPUTracking/utils/VcShim.h index 21a9a6a5c95c2..2bbc1d471bbbb 100644 --- a/GPU/GPUTracking/utils/VcShim.h +++ b/GPU/GPUTracking/utils/VcShim.h @@ -19,7 +19,7 @@ #ifndef GPUCA_NO_VC -#include +#include // IWYU pragma: export #else