diff --git a/GPU/Common/GPUCommonAlgorithm.h b/GPU/Common/GPUCommonAlgorithm.h
index db57e7ec06d4b..be88973561e0a 100644
--- a/GPU/Common/GPUCommonAlgorithm.h
+++ b/GPU/Common/GPUCommonAlgorithm.h
@@ -354,6 +354,11 @@ GPUdi() uint8_t warp_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)
 #define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v)
 #define warp_broadcast(v, i) warp_broadcast_FUNC(v, i)
 
+[[nodiscard]] GPUdi() int32_t work_group_count(bool pred)
+{
+  return work_group_reduce_add((int32_t)pred);
+}
+
 #elif (defined(__CUDACC__) || defined(__HIPCC__))
 // CUDA and HIP work the same way using cub, need just different header
 
@@ -416,6 +421,16 @@ GPUdi() T warp_broadcast_FUNC(T v, int32_t i)
 #endif
 }
 
+[[nodiscard]] GPUdi() bool work_group_any(bool pred)
+{
+  return __syncthreads_or(pred);
+}
+
+[[nodiscard]] GPUdi() uint32_t work_group_count(bool pred)
+{
+  return __syncthreads_count(pred);
+}
+
 #else
 // Trivial implementation for the CPU
 
@@ -449,6 +464,16 @@ GPUdi() T warp_broadcast(T v, int32_t i)
   return v;
 }
 
+[[nodiscard]] GPUdi() bool work_group_any(bool pred)
+{
+  return pred;
+}
+
+[[nodiscard]] GPUdi() uint32_t work_group_count(bool pred)
+{
+  return pred;
+}
+
 #endif
 
 #ifdef GPUCA_ALGORITHM_STD
diff --git a/GPU/GPUTracking/DataTypes/GPUTPCExtraADC.h b/GPU/GPUTracking/DataTypes/GPUTPCExtraADC.h
new file mode 100644
index 0000000000000..28fa072b9401c
--- /dev/null
+++ b/GPU/GPUTracking/DataTypes/GPUTPCExtraADC.h
@@ -0,0 +1,25 @@
+// Copyright 2019-2026 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCExtraADC.h
+/// \author Felix Weiglhofer
+
+#include "GPUDefConstantsAndSettings.h"
+#include "DataFormatsTPC/Digit.h"
+#include <array>
+#include <vector>
+
+namespace o2::gpu
+{
+struct GPUTPCExtraADC {
+  std::array<std::vector<tpc::Digit>, GPUCA_NSECTORS> digitsBySector;
+};
+} // namespace o2::gpu
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index e34af48d7a85e..51fc8cf46bbbe 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -112,6 +112,8 @@ AddOptionRTC(trackletMinSharedNormFactor, float, 0.f, "", 0, "Max shared defined
 AddOptionRTC(maxTimeBinAboveThresholdIn1000Bin, uint16_t, 500, "", 0, "Except pad from cluster finding if total number of charges in a fragment is above this baseline (disable = 0)")
 AddOptionRTC(maxConsecTimeBinAboveThreshold, uint16_t, 200, "", 0, "Except pad from cluster finding if number of consecutive charges in a fragment is above this baseline (disable = 0)")
 AddOptionRTC(noisyPadSaturationThreshold, uint16_t, 700, "", 0, "Threshold where a timebin is considered saturated, disabling the noisy pad check for that pad")
+AddOptionRTC(hipTailFilter, uint8_t, 0, "", 0, "Enable Highly Ionising Particle tail filter in CheckPadBaseline (0 = disable, 1 = filter tails)")
+AddOptionRTC(hipTailFilterThreshold, uint16_t, 150, "", 0, "Threshold that must be exceeded for a timebin to be counted towards Highly Ionising Particle tail")
 AddOptionRTC(occupancyMapTimeBins, uint16_t, 16, "", 0, "Number of timebins per histogram bin of occupancy map (0 = disable occupancy map)")
 AddOptionRTC(occupancyMapTimeBinsAverage, uint16_t, 0, "", 0, "Number of timebins +/- to use for the averaging")
 AddOptionRTC(trackFitCovLimit, uint16_t, 1000, "", 0, "Abort fit when y/z cov exceed the limit")
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h
index fd75136f51d76..5bcdfee2607c7 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.h
+++ b/GPU/GPUTracking/Global/GPUChainTracking.h
@@ -18,7 +18,6 @@
 #include "GPUChain.h"
 #include "GPUDataTypesIO.h"
 #include "GPUDataTypesConfig.h"
-#include <atomic>
 #include <mutex>
 #include <functional>
 #include <array>
@@ -70,6 +69,7 @@ struct CfFragment;
 class GPUTPCClusterFinder;
 struct GPUSettingsProcessing;
 struct GPUSettingsRec;
+struct GPUTPCExtraADC;
 
 class GPUChainTracking : public GPUChain
 {
@@ -302,13 +302,15 @@ class GPUChainTracking : public GPUChain
   int32_t RunChainFinalize();
   void OutputSanityCheck();
   int32_t RunTPCTrackingSectors_internal();
-  int32_t RunTPCClusterizer_prepare(bool restorePointers);
+  int32_t RunTPCClusterizer_prepare(bool restorePointers, const GPUTPCExtraADC& extraADCs);
 #ifdef GPUCA_TPC_GEOMETRY_O2
-  std::pair<uint32_t, uint32_t> RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane);
+  std::pair<uint32_t, uint32_t> RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane, const GPUTPCExtraADC& extraADCs);
   void RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int32_t stage, bool doGPU, int32_t lane);
   std::pair<uint32_t, uint32_t> TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment);
   std::pair<uint32_t, uint32_t> TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment& fragment);
   void TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment& fragment);
+  void TPCClusterizerTransferExtraADC(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs);
+  void TPCClusterizerCheckExtraADCZeros(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs);
 #endif
   void RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSector, int8_t mergeMode, GPUReconstruction::krnlDeviceType deviceType);
   void RunTPCTrackingMerger_Resolve(int8_t useOrigTrackParam, int8_t mergeAll, GPUReconstruction::krnlDeviceType deviceType);
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index bf6577cfd929e..d3479cb6f0637 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -17,6 +17,7 @@
 #include "GPUChainTrackingDebug.h"
 #include "GPULogging.h"
 #include "GPUO2DataTypes.h"
+#include "GPUTPCExtraADC.h"
 #include "GPUMemorySizeScalers.h"
 #include "GPUTrackingInputProvider.h"
 #include "GPUNewCalibValues.h"
@@ -56,10 +57,13 @@
 
 #include "utils/VcShim.h"
 #include "utils/strtag.h"
-#include <fstream>
+#include "utils/vecpod.h"
 #include <numeric>
+#include <random>
 #include <vector>
 
+// #define INSERT_SATURATED_SIGNALS
+
 using namespace o2::gpu;
 using namespace o2::tpc;
 using namespace o2::tpc::constants;
@@ -155,11 +159,171 @@ void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfF
   }
 }
 
+void GPUChainTracking::TPCClusterizerTransferExtraADC(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs)
+{
+  const int32_t iSector = clusterer.mISector;
+  const auto& fragment = clusterer.mPmemory->fragment;
+  const auto& digits = extraADCs.digitsBySector[iSector];
+
+  if (fragment.index != 0) {
+    return;
+  }
+
+  if (digits.empty()) {
+    return;
+  }
+
+  const size_t chargeMapSize = TPCMapMemoryLayout<PackedCharge>::items(GetProcessingSettings().overrideClusterizerFragmentLen);
+  const size_t chargeMapSizeBytes = chargeMapSize * sizeof(PackedCharge);
+
+  vecpod<uint16_t> chargeMapHostData;
+  chargeMapHostData.resize(chargeMapSize);
+
+  CfArray2D<PackedCharge> chargeMapHost(reinterpret_cast<PackedCharge*>(chargeMapHostData.data()));
+
+  vecpod<CfChargePos> extraPositions;
+  extraPositions.reserve(digits.size());
+
+  GPUMemCpy(RecoStep::TPCClusterFinding, chargeMapHostData.data(), clustererShadow.mPchargeMap, chargeMapSizeBytes, lane, false);
+  SynchronizeStream(lane);
+
+  for (const auto& d : digits) {
+    if (!fragment.contains(d.getTimeStamp())) {
+      continue;
+    }
+
+    CfChargePos pos{(tpccf::Row)d.getRow(), (tpccf::Pad)d.getPad(), (tpccf::TPCFragmentTime)(d.getTimeStamp() - fragment.start)};
+    chargeMapHost[pos] = PackedCharge(d.getChargeFloat());
+
+    extraPositions.push_back(pos);
+  }
+
+  GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPchargeMap, chargeMapHostData.data(), chargeMapSizeBytes, lane, true);
+
+  const size_t nPositions = clusterer.mPmemory->counters.nPositions;
+  const size_t extraPositionsOffset = nPositions - extraPositions.size();
+  GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPpositions + extraPositionsOffset, extraPositions.data(), extraPositions.size() * sizeof(CfChargePos), lane, true);
+}
+
+void GPUChainTracking::TPCClusterizerCheckExtraADCZeros(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs)
+{
+  const int32_t iSector = clusterer.mISector;
+  const auto& fragment = clusterer.mPmemory->fragment;
+  const auto& digits = extraADCs.digitsBySector[iSector];
+
+  if (fragment.index != 0) {
+    return;
+  }
+
+  if (digits.empty()) {
+    return;
+  }
+
+  const size_t chargeMapSize = TPCMapMemoryLayout<PackedCharge>::items(GetProcessingSettings().overrideClusterizerFragmentLen);
+  const size_t chargeMapSizeBytes = chargeMapSize * sizeof(PackedCharge);
+
+  vecpod<uint16_t> chargeMapHostData;
+  chargeMapHostData.resize(chargeMapSize);
+
+  CfArray2D<PackedCharge> chargeMapHost(reinterpret_cast<PackedCharge*>(chargeMapHostData.data()));
+
+  GPUMemCpy(RecoStep::TPCClusterFinding, chargeMapHostData.data(), clustererShadow.mPchargeMap, chargeMapSizeBytes, lane, false);
+  SynchronizeStream(lane);
+
+  size_t nNonZeroADCs = 0;
+
+  for (const auto& d : digits) {
+    if (!fragment.contains(d.getTimeStamp())) {
+      continue;
+    }
+
+    CfChargePos pos{(tpccf::Row)d.getRow(), (tpccf::Pad)d.getPad(), (tpccf::TPCFragmentTime)(d.getTimeStamp() - fragment.start)};
+
+    auto adc = chargeMapHost[pos].unpack();
+
+    if (adc != 0) {
+      nNonZeroADCs++;
+    }
+  }
+
+  if (nNonZeroADCs > 0) {
+    GPUInfo("Non Zero ADCs: %zu", nNonZeroADCs);
+  } else {
+    GPUInfo("Cleared all extra ADC values!", nNonZeroADCs);
+  }
+}
+
 namespace
 {
 struct TPCCFDecodeScanTmp {
   int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, pageCounter;
 };
+
+// Additional ADC values must be generated at start of clusterizer
+// This is required, so enough memory is allocated for the charge points
+// And ADCs can be injected by "simply"
+//  -> copying chargeMap + chargePositions to host
+//  -> writing additional adcs to chargeMap + positions
+//  -> copying values to device
+GPUTPCExtraADC GenerateSaturatedSignals(size_t seed = 42)
+{
+  constexpr int32_t MinTailLength = 50;
+  constexpr int32_t MaxTailLength = 200;
+  constexpr int32_t TailWidth = 3; // Assume tails are 3 pads wide at the moment
+
+  constexpr GPUTPCGeometry geo;
+
+  GPUTPCExtraADC adcs;
+
+  const int32_t nHIPs = 50;
+  const int32_t firstTB = 0;                   // Place all HIPs in first fragment for now
+  const int32_t lastTB = 4000 - MaxTailLength; // Don't allow cut off tails at fragment borders
+  const int32_t tailADC = 250;                 // charge should decrease over time, but for now just hardcode ADC above the threshold
+
+  std::mt19937 gen{seed};
+  std::uniform_int_distribution<> randomRow(0, GPUCA_ROW_COUNT - 1);
+  std::uniform_int_distribution<> randomTB(firstTB, lastTB);
+  std::uniform_int_distribution<> randomTailLength(MinTailLength, MaxTailLength);
+  // std::normal_distribution<> tailLengthNoise(8, 2.0);
+
+  for (int32_t iHIP = 0; iHIP < nHIPs; iHIP++) {
+
+    const int32_t row = randomRow(gen);
+    const int32_t nPads = geo.NPads(row);
+    std::uniform_int_distribution<> randomPad(0, nPads - 1);
+
+    const int32_t basePad = randomPad(gen);
+    const int32_t baseTb = randomTB(gen);
+
+    auto& digits = adcs.digitsBySector[0];
+
+    const int32_t tailLength = randomTailLength(gen);
+
+    for (int32_t dPad = -TailWidth; dPad <= TailWidth; dPad++) {
+      const int32_t iPad = basePad + dPad;
+      if (iPad < 0 || iPad >= nPads) {
+        continue;
+      }
+
+      for (int32_t dTime = 0; dTime < tailLength; dTime++) {
+        const int32_t iTime = baseTb + dTime;
+
+        if (iTime >= 4000) {
+          break;
+        }
+
+        const auto adc = dTime == 0 && dPad == 0 ? 1023 : tailADC;
+
+        digits.emplace_back(0, adc, row, iPad, iTime);
+      }
+    }
+  }
+
+  GPUInfo("Generated %zu ADCs!", adcs.digitsBySector[0].size());
+
+  return adcs;
+}
+
 } // namespace
 
 std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment)
@@ -437,13 +601,16 @@ void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clust
   }
 }
 
-std::pair<uint32_t, uint32_t> GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane)
+std::pair<uint32_t, uint32_t> GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane, const GPUTPCExtraADC& extraADCs)
 {
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
   if (mCFContext->abandonTimeframe) {
     return {0, 0};
   }
-  const auto& retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment);
+  auto retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment);
+  if (fragment.index == 0) {
+    retVal.first += extraADCs.digitsBySector[iSector].size();
+  }
   if (doGPU) {
     GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
     GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
@@ -473,7 +640,7 @@ std::pair<uint32_t, uint32_t> GPUChainTracking::RunTPCClusterizer_transferZS(int
   return retVal;
 }
 
-int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
+int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers, const GPUTPCExtraADC& extraADCs)
 {
   bool doGPU = mRec->GetRecoStepsGPU() & gpudatatypes::RecoStep::TPCClusterFinding;
   if (restorePointers) {
@@ -569,7 +736,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
   mCFContext->fragmentFirst = CfFragment{std::max<int32_t>(mCFContext->tpcMaxTimeBin + 1, maxFragmentLen), maxFragmentLen};
   for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) {
     if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
-      mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector);
+      mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector, extraADCs);
     }
   }
 
@@ -595,7 +762,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   mRec->PushNonPersistentMemory(qStr2Tag("TPCCLUST"));
   const auto& threadContext = GetThreadContext();
   const bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
-  if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) {
+
+  GPUTPCExtraADC extraADCs;
+#ifdef INSERT_SATURATED_SIGNALS
+  extraADCs = GenerateSaturatedSignals();
+#endif
+
+  if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer, extraADCs)) {
     return 1;
   }
   if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) {
@@ -625,7 +798,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     SetupGPUProcessor(&processors()->tpcClusterer[iSector], true); // Now we allocate
   }
   if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
-    RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline)
+    RunTPCClusterizer_prepare(true, extraADCs); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline)
   }
 
   if (doGPU && mIOPtrs.tpcZS) {
@@ -769,6 +942,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   const bool propagateMCLabels = buildNativeHost && GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC;
   const bool sortClusters = buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4);
 
+  if (GetProcessingSettings().runMC && (!processors()->ioPtrs.tpcPackedDigits || !processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC)) {
+    GPUWarning("Requested to process MC labels, but no labels present");
+  }
+
   auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr;
 
   mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor;
@@ -938,7 +1115,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             f = mCFContext->fragmentFirst;
           }
           if (nextSector < NSECTORS && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSector] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) {
-            mCFContext->nextPos[nextSector] = RunTPCClusterizer_transferZS(nextSector, f, GetProcessingSettings().nTPCClustererLanes + lane);
+            mCFContext->nextPos[nextSector] = RunTPCClusterizer_transferZS(nextSector, f, GetProcessingSettings().nTPCClustererLanes + lane, extraADCs);
           }
         }
         GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
@@ -949,9 +1126,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         if (!mIOPtrs.tpcZS) {
           runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::fillFromDigits>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
         }
-        if (DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererDigits, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile)) {
-          clusterer.DumpChargeMap(*mDebugFile, "Charges");
-        }
+
+        TPCClusterizerTransferExtraADC(clusterer, clustererShadow, lane, extraADCs);
 
         if (propagateMCLabels) {
           runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::fillIndexMap>({GetGrid(clusterer.mPmemory->counters.nDigitsInFragment, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}});
@@ -960,6 +1136,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0);
         checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true);
         checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter;
+        // TODO Move hipTailFilter flag to ProcessingSettings?
+        // TODO Add some warning when re enabling pad filter with this flag, so it's not just silently enabled when disabling was requested
+        checkForNoisyPads |= rec()->GetParam().rec.tpc.hipTailFilter;
+
+        if (rec()->GetParam().rec.tpc.hipTailFilter && !doGPU) {
+          GPUError("HIP tail filter enabled, but this is currently not supported on CPU");
+        }
 
         if (checkForNoisyPads) {
           const int32_t nBlocks = GPUTPCCFCheckPadBaseline::GetNBlocks(doGPU);
@@ -968,6 +1151,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           getKernelTimer<GPUTPCCFCheckPadBaseline>(RecoStep::TPCClusterFinding, iSector, TPC_PADS_IN_SECTOR * fragment.lengthWithoutOverlap() * sizeof(PackedCharge), false);
         }
 
+        DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererDigits, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile);
+        // Avoid additional sync when also dumping digits
+        const bool debugSyncChargeMap = !(GetProcessingSettings().debugMask & GPUChainTrackingDebugFlags::TPCClustererDigits);
+        // DumpChargeMap should run after noisy pad filter to avoid yet another dump of intermediate data. When chargemap without pad filter is required, disable pad filter instead.
+        DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, debugSyncChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Charges");
+
+        TPCClusterizerCheckExtraADCZeros(clusterer, clustererShadow, lane, extraADCs);
+
         runKernel<GPUTPCCFPeakFinder>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
         if (DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererPeaks, clusterer, &GPUTPCClusterFinder::DumpPeaks, *mDebugFile)) {
           clusterer.DumpPeakMap(*mDebugFile, "Peaks");
@@ -1144,7 +1335,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             if(!nn_settings.nnClusterizerApplyCfDeconvolution) { // If it is already applied don't do it twice, otherwise apply now
               runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
             }
-            DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
+            DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMapSplit, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
             runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
             if (nn_settings.nnClusterizerVerbosity > 3) {
               LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
@@ -1155,7 +1346,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 #endif
         } else {
           runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
-          DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
+          DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMapSplit, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
           runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0);
         }
 
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 89d47d0e1b86c..8d8d9f722c3cf 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -16,6 +16,7 @@
 #include "GPUChainTrackingDebug.h"
 #include "GPULogging.h"
 #include "GPUO2DataTypes.h"
+#include "GPUTPCExtraADC.h"
 #include "GPUTrackingInputProvider.h"
 #include "GPUTPCCFChainContext.h"
 #include "TPCClusterDecompressor.h"
@@ -63,7 +64,7 @@ int32_t GPUChainTracking::RunTPCCompression()
   if (mPipelineFinalizationCtx && GetProcessingSettings().doublePipelineClusterizer) {
     SynchronizeEventAndRelease(mEvents->single);
     auto* foreignChain = (GPUChainTracking*)GetNextChainInQueue();
-    foreignChain->RunTPCClusterizer_prepare(false);
+    foreignChain->RunTPCClusterizer_prepare(false, {});
     foreignChain->mCFContext->ptrClusterNativeSave = processorsShadow()->ioPtrs.clustersNative;
   }
 #endif
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebug.h b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h
index a0be9d833d5a9..39e48f9f14d9a 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingDebug.h
+++ b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h
@@ -45,7 +45,8 @@ enum GPUChainTrackingDebugFlags : uint32_t {
   TPCClustererPeaks = 1 << 19,
   TPCClustererSuppressedPeaks = 1 << 20,
   TPCClustererChargeMap = 1 << 21,
-  TPCClustererZeroedCharges = 1 << 22
+  TPCClustererChargeMapSplit = 1 << 22,
+  TPCClustererZeroedCharges = 1 << 23
 };
 
 template <class T, class S, typename... Args>
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
index 33ed089890bc4..b4d012bbb82fa 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
@@ -21,9 +21,88 @@
 #include "utils/VcShim.h"
 #endif
 
+#if 0
+#define DPRINT(...) printf(__VA_ARGS__)
+#define DPRINTB(...) \
+  if (iThread == 0)  \
+  printf(__VA_ARGS__)
+#define DPRINTB_IF(test, ...) \
+  if (iThread == 0 && (test)) \
+  printf(__VA_ARGS__)
+#else
+#define DPRINT(...) ((void)0)
+#define DPRINTB(...) ((void)0)
+#define DPRINTB_IF(test, ...) ((void)0)
+#endif
+
 using namespace o2::gpu;
 using namespace o2::gpu::tpccf;
 
+using Kernel = GPUTPCCFCheckPadBaseline;
+
+// Collect tails marked for closing across the workgroup using a prefix scan,
+// then cooperatively zero the charge map entries for each closed tail.
+// Caller must set acc.activeHIPTail.end before calling if the tail is open.
+static GPUdi() uint16_t CloseHIPTails(
+  Kernel::GPUSharedMemory& smem,
+  int32_t iThread, int32_t nThreads,
+  int16_t iPadHandle,
+  CfChargePos basePos,
+  CfArray2D<PackedCharge>& chargeMap,
+  Kernel::PadChargeAccu& acc,
+  bool shouldCloseTail)
+{
+  uint16_t nClosedTails = work_group_count(shouldCloseTail);
+
+  if (nClosedTails > 0) {
+    int16_t iClosedTail = work_group_scan_inclusive_add((int16_t)shouldCloseTail) - 1;
+    if (shouldCloseTail) {
+      smem.tailsClosedPad[iClosedTail] = iPadHandle;
+      smem.tailsClosed[iClosedTail] = acc.activeHIPTail;
+      acc.activeHIPTail.Reset();
+    }
+
+    GPUbarrier();
+  }
+
+  // TODO: performance improvement -> parallelize this loop across tails
+  for (uint16_t iTail = 0; iTail < nClosedTails; iTail++) {
+    const auto tailPad = smem.tailsClosedPad[iTail];
+    const auto tail = smem.tailsClosed[iTail];
+
+    for (uint16_t iTime = iThread; iTime < tail.Length(); iTime += nThreads) {
+      chargeMap[basePos.delta({tailPad, int16_t(tail.start + iTime)})] = PackedCharge{0};
+    }
+  }
+
+  return nClosedTails;
+}
+
+template <bool CheckHIPTrigger, bool CheckHIPTailEnd>
+static GPUdi() void ScanCachedCharges(Kernel::GPUSharedMemory& smem, uint16_t timeOffset, uint16_t pad, Charge hipTailThreshold, Kernel::PadChargeAccu& acc)
+{
+  for (int32_t i = 0; i < Kernel::NumOfCachedTBs; i++) {
+    const Charge qs = smem.charges[i][pad];
+    acc.totalCharges += qs > 0;
+    acc.consecCharges = qs > 0 ? acc.consecCharges + 1 : 0;
+    acc.maxConsecCharges = CAMath::Max(acc.consecCharges, acc.maxConsecCharges);
+    acc.maxCharge = CAMath::Max<Charge>(qs, acc.maxCharge);
+
+    if constexpr (CheckHIPTrigger) {
+      if (qs >= Charge(Kernel::MaxADC)) {
+        acc.HIPtb = timeOffset + i;
+        smem.tails[pad] = {acc.HIPtb, 0}; // Broadcast HIP start TB to neighboring pads / threads
+      }
+    }
+
+    if constexpr (CheckHIPTailEnd) {
+      if (qs < hipTailThreshold) {
+        acc.activeHIPTail.end = timeOffset + i;
+      }
+    }
+  }
+}
+
 template <>
 GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
 {
@@ -48,51 +127,131 @@ GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t
   }
 
   const CfFragment& fragment = clusterer.mPmemory->fragment;
+  const bool hipFilterOn = clusterer.Param().rec.tpc.hipTailFilter;
+  const Charge hipTailThreshold = clusterer.Param().rec.tpc.hipTailFilterThreshold;
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
 
   const auto iRow = iBlock;
   const auto rowinfo = GetRowInfo(iRow);
   const CfChargePos basePos{(Row)iRow, 0, 0};
 
-  int32_t totalCharges = 0;
-  int32_t consecCharges = 0;
-  int32_t maxConsecCharges = 0;
-  Charge maxCharge = 0;
+  PadChargeAccu acc;
 
   const int16_t iPadOffset = iThread % MaxNPadsPerRow;
   const int16_t iTimeOffset = iThread / MaxNPadsPerRow;
   const int16_t iPadHandle = iThread;
   const bool handlePad = iPadHandle < rowinfo.nPads;
 
+  if (iPadHandle < MaxNPadsPerRow) {
+    smem.tails[iPadHandle] = {-1, -1};
+  }
+  GPUbarrier();
+
   const auto firstTB = fragment.firstNonOverlapTimeBin();
   const auto lastTB = fragment.lastNonOverlapTimeBin();
 
-  for (auto t = firstTB; t < lastTB; t += NumOfCachedTBs) {
+  for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs) {
 
-    const TPCFragmentTime iTime = t + iTimeOffset;
+    const TPCFragmentTime iTimeLoad = t + iTimeOffset;
 
-    const CfChargePos pos = basePos.delta({iPadOffset, iTime});
+    const CfChargePos pos = basePos.delta({iPadOffset, iTimeLoad});
 
-    smem.charges[iTimeOffset][iPadOffset] = iTime < lastTB && iPadOffset < rowinfo.nPads ? chargeMap[pos].unpack() : 0;
+    const Charge ql = iTimeLoad < lastTB && iPadOffset < rowinfo.nPads ? chargeMap[pos].unpack() : 0;
+    smem.charges[iTimeOffset][iPadOffset] = ql;
 
-    GPUbarrier();
+    const bool hasHIPTrigger = hipFilterOn && work_group_any(ql >= MaxADC);
+
+    acc.HIPtb = -1;
 
     if (handlePad) {
-      for (int32_t i = 0; i < NumOfCachedTBs; i++) {
-        const Charge q = smem.charges[i][iPadHandle];
-        totalCharges += (q > 0);
-        consecCharges = (q > 0) ? consecCharges + 1 : 0;
-        maxConsecCharges = CAMath::Max(consecCharges, maxConsecCharges);
-        maxCharge = CAMath::Max<Charge>(q, maxCharge);
+
+      // TODO: is this really necessary?
+      // Why is the old version so much slower, when we just add short branches to the loop???
+      if (!hasHIPTrigger) [[likely]] {
+        if (!acc.activeHIPTail.IsOpen()) {
+          ScanCachedCharges<false, false>(smem, t, iPadHandle, hipTailThreshold, acc);
+        } else {
+          ScanCachedCharges<false, true>(smem, t, iPadHandle, hipTailThreshold, acc);
+        }
+      } else {
+        if (!acc.activeHIPTail.IsOpen()) {
+          ScanCachedCharges<true, false>(smem, t, iPadHandle, hipTailThreshold, acc);
+        } else {
+          ScanCachedCharges<true, true>(smem, t, iPadHandle, hipTailThreshold, acc);
+        }
       }
     }
 
     GPUbarrier();
-  }
+
+    if (hasHIPTrigger) [[unlikely]] {
+
+      DPRINTB("%d: Trigger!\n", iBlock);
+
+      if (handlePad && acc.HIPtb < 0) {
+
+        // Search neighboring pads for trigger
+        for (int16_t i = -3; i < 0; i++) {
+          const auto p = iPadHandle + i;
+          if (p > -1) {
+            acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb);
+          }
+        }
+
+        for (int16_t i = 1; i < 4; i++) {
+          const auto p = iPadHandle + i;
+          if (p < MaxNPadsPerRow) {
+            acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb);
+          }
+        }
+      }
+
+      bool shouldCloseTail = acc.HIPtb > -1 && acc.activeHIPTail.HasValue();
+      if (shouldCloseTail && acc.activeHIPTail.IsOpen()) {
+        DPRINT("%d: end = %d\n", iThread, acc.HIPtb);
+        acc.activeHIPTail.end = acc.HIPtb;
+      }
+
+      CloseHIPTails(smem, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail);
+
+      GPUbarrier(); // TODO: not needed? Debug only
+
+      if (acc.HIPtb > -1) {
+        DPRINT("%d: start = %d\n", iThread, acc.HIPtb);
+        acc.activeHIPTail.SetOpen(acc.HIPtb);
+      }
+
+      // Clear smem between iterations to prevent stale entries
+      if (handlePad) {
+        smem.tails[iPadHandle].Reset();
+      }
+
+      GPUbarrier(); // TODO: not needed? Debug only
+
+    } // if (hipTriggerFound)
+
+  } // for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs)
 
   if (handlePad) {
-    updatePadBaseline(rowinfo.globalPadOffset + iPadOffset, clusterer, totalCharges, maxConsecCharges, maxCharge);
+    updatePadBaseline(rowinfo.globalPadOffset + iPadOffset, clusterer, acc.totalCharges, acc.maxConsecCharges, acc.maxCharge);
   }
+
+  // --- Close remaining tails
+  const bool shouldCloseTail = acc.activeHIPTail.HasValue();
+
+  // Call `work_group_any` here, instead of always counting.
+  // This is important as `work_group_count` is a lot slower
+  // and has a lot of overhead if no HIPs were found.
+  if (work_group_any(shouldCloseTail)) {
+    if (shouldCloseTail && acc.activeHIPTail.IsOpen()) {
+      acc.activeHIPTail.end = lastTB;
+    }
+
+    [[maybe_unused]] const uint16_t nClosedTails = CloseHIPTails(smem, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail);
+
+    DPRINTB_IF(nClosedTails > 0, "%d: Close remaining tails (%d)\n", iBlock, nClosedTails);
+  }
+
 #endif
 }
 
@@ -106,7 +265,7 @@ GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineCPU(int32_t nBlocks, int32_t
   int32_t padsPerRow;
   CfChargePos basePos = padToCfChargePos<PadsPerCacheline>(basePad, clusterer, padsPerRow);
 
-  if (not basePos.valid()) {
+  if (!basePos.valid()) {
     return;
   }
 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h
index a71f1358a73a6..1593ea5dd5df3 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h
@@ -15,6 +15,11 @@
 /// Kernel identifies noisy TPC pads by analyzing charge patterns over time.
 /// A pad is marked noisy if it exceeds thresholds for total or consecutive
 /// time bins with charge, unless the charge exceeds a saturation threshold.
+///
+/// Optionally detects Highly Ionising Particle (HIP) tails: when a saturated
+/// ADC value (1023) is found, the tail region on the triggering pad and its
+/// neighbors is zeroed in the charge map until charges drop below a
+/// configurable threshold.
 
 #ifndef O2_GPU_GPU_TPC_CF_CHECK_PAD_BASELINE_H
 #define O2_GPU_GPU_TPC_CF_CHECK_PAD_BASELINE_H
@@ -44,10 +49,54 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
     // Rounding up to a multiple of PadsPerCacheline ensures iThread / MaxNPadsPerRow < NumOfCachedTBs
     // for all threads, avoiding out-of-bounds access.
     MaxNPadsPerRow = CAMath::nextMultipleOf<PadsPerCacheline>(GPUTPCGeometry::MaxNPadsPerRow()),
+
+    MaxADC = 1023,
+
+    NThreads = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline),
+  };
+
+  union HipTailRange {
+    struct {
+      int16_t start;
+      int16_t end;
+    };
+    // uint32_t asU32;
+
+    // Be careful with using default initialized values.
+    // Need default constructor, so can be placed in shared memory.
+    // Might be zero initialized, but invalid tail needs start = end = -1 instead.
+    GPUdDefault() HipTailRange() = default;
+    GPUdi() HipTailRange(int16_t st, int16_t e) : start(st), end(e) {}
+
+    GPUdi() bool HasValue() const { return start > -1; }
+    GPUdi() bool IsOpen() const { return start > -1 && end < 0; }
+
+    GPUdi() void SetOpen(int16_t st)
+    {
+      start = st;
+      end = -1;
+    }
+
+    GPUdi() int16_t Length() const { return end - start; }
+
+    GPUdi() void Reset() { start = end = -1; }
   };
 
-  struct GPUSharedMemory {
+  struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64<int16_t, NThreads> {
     tpccf::Charge charges[NumOfCachedTBs][MaxNPadsPerRow];
+    HipTailRange tails[MaxNPadsPerRow];
+    uint8_t tailsClosedPad[MaxNPadsPerRow];
+    HipTailRange tailsClosed[MaxNPadsPerRow];
+  };
+
+  // Accumulated values from scanning cached charges in a pad
+  struct PadChargeAccu {
+    int32_t totalCharges = 0;
+    int32_t consecCharges = 0;
+    int32_t maxConsecCharges = 0;
+    tpccf::Charge maxCharge = 0;
+    int16_t HIPtb = -1;
+    HipTailRange activeHIPTail{-1, -1};
   };
 
   typedef GPUTPCClusterFinder processorType;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx
index e20f5d8b0f074..c266e9883d227 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx
@@ -477,7 +477,10 @@ GPUd() void GPUTPCCFDecodeZSLinkBase::WriteCharge(processorType& clusterer, floa
   CfChargePos pos(padAndRow.getRow(), padAndRow.getPad(), localTime);
   positions[positionOffset] = pos;
 
-  charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad());
+  // Only apply gain correction if ADC not fully saturated
+  if (charge < 1023.f) {
+    charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad());
+  }
 
   chargeMap[pos] = PackedCharge(charge);
 }
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h
index b8ff90f511057..6246a0366ddf7 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h
@@ -132,7 +132,7 @@ class GPUTPCCFDecodeZSLink : public GPUTPCCFDecodeZSLinkBase
 {
  public:
   // constants for decoding
-  static inline constexpr int32_t DECODE_BITS = o2::tpc::TPCZSHDRV2::TPC_ZS_NBITS_V34;
+  static inline constexpr int32_t DECODE_BITS = tpc::TPCZSHDRV2::TPC_ZS_NBITS_V34;
   static inline constexpr float DECODE_BITS_FACTOR = 1.f / (1 << (DECODE_BITS - 10));
   static inline constexpr uint32_t DECODE_MASK = (1 << DECODE_BITS) - 1;
 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx
index d676cf9cd3887..b2a6cde9ff763 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx
@@ -15,7 +15,6 @@
 #include "GPUTPCClusterFinder.h"
 #include "GPUReconstruction.h"
 #include "CfArray2D.h"
-#include "DataFormatsTPC/Digit.h"
 #include "DataFormatsTPC/ClusterNative.h"
 #include "GPUSettings.h"
 
diff --git a/GPU/GPUTracking/utils/VcShim.h b/GPU/GPUTracking/utils/VcShim.h
index 21a9a6a5c95c2..2bbc1d471bbbb 100644
--- a/GPU/GPUTracking/utils/VcShim.h
+++ b/GPU/GPUTracking/utils/VcShim.h
@@ -19,7 +19,7 @@
 
 #ifndef GPUCA_NO_VC
 
-#include <Vc/Vc>
+#include <Vc/Vc> // IWYU pragma: export
 
 #else