[llvm] [AMDGPU] Add new llvm.amdgcn.wave.shuffle intrinsic (PR #167372)

Wed Nov 19 09:28:47 PST 2025

================
@@ -7269,6 +7269,83 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
   return DAG.getBitcast(VT, UnrolledLaneOp);
 }
 
+static SDValue lowerSubgroupShuffle(const SITargetLowering &TLI, SDNode *N,
+                                    SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  unsigned ValSize = VT.getSizeInBits();
+  SDLoc SL(N);
+
+  SDValue Value = N->getOperand(1);
+  SDValue Index = N->getOperand(2);
+
+  // ds_bpermute requires index to be multiplied by 4
+  SDValue ShiftAmount = DAG.getTargetConstant(2, SL, MVT::i32);
+  SDValue ShiftedIndex = DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index,
+                                   ShiftAmount);
+
+  // Intrinsics will require i32 to operate on
+  SDValue Value32 = Value;
+  if ((ValSize != 32) || (VT.isFloatingPoint()))
+    Value32 = DAG.getBitcast(MVT::i32, Value);
+
+  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
+                                   SmallVector<SDValue> IntrinArgs) -> SDValue {
+    SmallVector<SDValue> Operands(1);
+    Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
+    Operands.append(IntrinArgs);
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
+  };
+
+  if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
+    // If we can bpermute across the whole wave, then just do that
+    SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
+                                     {ShiftedIndex, Value32});
+    return DAG.getBitcast(VT, BPermute);
+  } else {
----------------
saxlungs wrote:

Refactored

https://github.com/llvm/llvm-project/pull/167372