[llvm-branch-commits] [llvm] AMDGPU: Make vector_shuffle legal for v2i32 with v_pk_mov_b32 (PR #123684)

Tue Jan 21 01:34:33 PST 2025

================
@@ -489,6 +489,90 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
   CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
 }
 
+void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+
+  // TODO: Handle 16-bit element vectors with even aligned masks.
+  if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
+      VT.getVectorNumElements() != 2) {
+    SelectCode(N);
+    return;
+  }
+
+  auto *SVN = cast<ShuffleVectorSDNode>(N);
+
+  SDValue Src0 = SVN->getOperand(0);
+  SDValue Src1 = SVN->getOperand(1);
+  ArrayRef<int> Mask = SVN->getMask();
+  SDLoc DL(N);
+
+  assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
+         Mask[0] < 4 && Mask[1] < 4);
+
+  SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
+  SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
+  unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
+  unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
+
+  if (Mask[0] < 0) {
+    Src0SubReg = Src1SubReg;
+    MachineSDNode *ImpDef =
+        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+    VSrc0 = SDValue(ImpDef, 0);
+  }
+
+  if (Mask[1] < 0) {
+    Src1SubReg = Src0SubReg;
+    MachineSDNode *ImpDef =
+        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+    VSrc1 = SDValue(ImpDef, 0);
+  }
+
+  // SGPR case needs to lower to copies.
+  //
+  // Also use subregister extract when we can directly blend the registers with
+  // a simple subregister copy.
+  //
+  // TODO: Maybe we should fold this out earlier
+  if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
+      Src1SubReg == AMDGPU::sub0) {
+    // The low element of the result always comes from src0.
+    // The high element of the result always comes from src1.
+    // op_sel selects the high half of src0.
+    // op_sel_hi selects the high half of src1.
+
+    unsigned Src0OpSel =
+        Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
+    unsigned Src1OpSel =
+        Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
----------------
arsenm wrote:

I'm not sure this is correctly encoded. I'm confused by how op_sel and op_sel_hi are supposed to be represented. We set fields in the source modifiers. I guess this should probably be OP_SEL_1?

https://github.com/llvm/llvm-project/pull/123684