[llvm] [AMDGPU] Add new llvm.amdgcn.subgroup.shuffle intrinsic (PR #167372)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 11 02:05:18 PST 2025
================
@@ -7269,6 +7269,83 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
return DAG.getBitcast(VT, UnrolledLaneOp);
}
+static SDValue lowerSubgroupShuffle(const SITargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ unsigned ValSize = VT.getSizeInBits();
+ SDLoc SL(N);
+
+ SDValue Value = N->getOperand(1);
+ SDValue Index = N->getOperand(2);
+
+ // ds_bpermute takes a byte address, so the lane index must be multiplied by 4
+ SDValue ShiftAmount = DAG.getTargetConstant(2, SL, MVT::i32);
+ SDValue ShiftedIndex = DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index,
+ ShiftAmount);
+
+ // The bpermute intrinsics operate on i32, so bitcast other types to i32
+ SDValue Value32 = Value;
+ if (ValSize != 32 || VT.isFloatingPoint())
+ Value32 = DAG.getBitcast(MVT::i32, Value);
+
+ auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
+ SmallVector<SDValue> IntrinArgs) -> SDValue {
+ SmallVector<SDValue> Operands(1);
+ Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
+ Operands.append(IntrinArgs);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
+ };
+
+ if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
+ // If we can bpermute across the whole wave, then just do that
+ SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
+ {ShiftedIndex, Value32});
+ return DAG.getBitcast(VT, BPermute);
+ } else {
+ assert(TLI.getSubtarget()->isWave64());
+
+ // Otherwise, we need to make use of whole wave mode
+ SDValue PoisonVal = DAG.getPOISON(Value32->getValueType(0));
+ SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
+
+ // Set inactive lanes to poison
----------------
jayfoad wrote:
You don't need to do this. I would expect amdgcn.set.inactive(poison) to be optimized out anyway.
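A minimal IR sketch of the expected fold (my illustration, not code from the patch; the overloaded .i32 name follows the usual intrinsic naming convention):

    declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)

    define i32 @shuffle_input(i32 %value) {
      ; With poison as the inactive-lane value, the call adds no
      ; information: replacing poison in the inactive lanes with
      ; %value is a legal refinement, so the whole call is expected
      ; to fold away, leaving just %value.
      %v = call i32 @llvm.amdgcn.set.inactive.i32(i32 %value, i32 poison)
      ret i32 %v
    }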
https://github.com/llvm/llvm-project/pull/167372