[llvm-branch-commits] [llvm] AMDGPU: Make vector_shuffle legal for v2i32 with v_pk_mov_b32 (PR #123684)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jan 20 20:33:16 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
For VALU shuffles, this saves an instruction in some case.
---
Patch is 285.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123684.diff
19 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+114)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h (+1)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+7)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll (+21-28)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll (+17-23)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll (+34-50)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll (+112-160)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll (+21-28)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll (+17-23)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll (+34-50)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll (+112-160)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll (+21-28)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll (+17-23)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll (+34-50)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll (+112-160)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll (+500-287)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll (+500-287)
- (modified) llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll (+48-48)
- (modified) llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll (+1-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 6d5c3b5e0742b3..8d03fde8911242 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -489,6 +489,90 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}
+void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+
+ // TODO: Handle 16-bit element vectors with even aligned masks.
+ if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
+ VT.getVectorNumElements() != 2) {
+ SelectCode(N);
+ return;
+ }
+
+ auto *SVN = cast<ShuffleVectorSDNode>(N);
+
+ SDValue Src0 = SVN->getOperand(0);
+ SDValue Src1 = SVN->getOperand(1);
+ ArrayRef<int> Mask = SVN->getMask();
+ SDLoc DL(N);
+
+ assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
+ Mask[0] < 4 && Mask[1] < 4);
+
+ SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
+ SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
+ unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
+ unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
+
+ if (Mask[0] < 0) {
+ Src0SubReg = Src1SubReg;
+ MachineSDNode *ImpDef =
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+ VSrc0 = SDValue(ImpDef, 0);
+ }
+
+ if (Mask[1] < 0) {
+ Src1SubReg = Src0SubReg;
+ MachineSDNode *ImpDef =
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+ VSrc1 = SDValue(ImpDef, 0);
+ }
+
+ // SGPR case needs to lower to copies.
+ //
+ // Also use subregister extract when we can directly blend the registers with
+ // a simple subregister copy.
+ //
+ // TODO: Maybe we should fold this out earlier
+ if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
+ Src1SubReg == AMDGPU::sub0) {
+ // The low element of the result always comes from src0.
+ // The high element of the result always comes from src1.
+ // op_sel selects the high half of src0.
+ // op_sel_hi selects the high half of src1.
+
+ unsigned Src0OpSel =
+ Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
+ unsigned Src1OpSel =
+ Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
+
+ SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
+ SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
+ SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+ CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
+ {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
+ ZeroMods, // clamp
+ ZeroMods, // op_sel
+ ZeroMods, // op_sel_hi
+ ZeroMods, // neg_lo
+ ZeroMods}); // neg_hi
+ return;
+ }
+
+ SDValue ResultElt0 =
+ CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
+ SDValue ResultElt1 =
+ CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
+
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+ ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+ ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
+ CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
+}
+
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
@@ -562,6 +646,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectBuildVector(N, RegClassID);
return;
}
+ case ISD::VECTOR_SHUFFLE:
+ SelectVectorShuffle(N);
+ return;
case ISD::BUILD_PAIR: {
SDValue RC, SubReg0, SubReg1;
SDLoc DL(N);
@@ -3101,6 +3188,33 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
}
Mods = VecMods;
+ } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ Src.getNumOperands() == 2) {
+
+ // TODO: We should repeat the build_vector source check above for the
+ // vector_shuffle for negates and casts of individual elements.
+
+ auto *SVN = cast<ShuffleVectorSDNode>(Src);
+ ArrayRef<int> Mask = SVN->getMask();
+
+ if (Mask[0] < 2 && Mask[1] < 2) {
+ // src1 should be undef.
+ SDValue ShuffleSrc = SVN->getOperand(0);
+
+ if (ShuffleSrc.getOpcode() == ISD::FNEG) {
+ ShuffleSrc = ShuffleSrc.getOperand(0);
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ }
+
+ if (Mask[0] == 1)
+ Mods |= SISrcMods::OP_SEL_0;
+ if (Mask[1] == 1)
+ Mods |= SISrcMods::OP_SEL_1;
+
+ Src = ShuffleSrc;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+ }
}
// Packed instructions do not have abs modifiers.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 7e61eb470622f1..7dcd208a9cdd41 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -86,6 +86,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
protected:
void SelectBuildVector(SDNode *N, unsigned RegClassID);
+ void SelectVectorShuffle(SDNode *N);
private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6cf5774fc53b06..1aeca7f370aa1b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -422,6 +422,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
{MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
Expand);
+ if (Subtarget->hasPkMovB32()) {
+ // TODO: 16-bit element vectors should be legal with even aligned elements.
+ // TODO: Can be legal with wider source types than the result with
+ // subregister extracts.
+ setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
+ }
+
setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
Custom);
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll
index 3410b067fb5b4e..47b15a032dedb8 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll
@@ -171,15 +171,14 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -187,15 +186,15 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -274,27 +273,24 @@ define void @v_shuffle_v2f32_v2f32__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2f32_v2f32__3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_2:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -447,27 +443,24 @@ define void @v_shuffle_v2f32_v2f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2f32_v2f32__1_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
index 7edb6939f884c1..3960a59b65e63e 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
@@ -632,10 +632,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -645,10 +644,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -765,13 +763,12 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v5, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v3
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -786,9 +783,8 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v3
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
@@ -1480,10 +1476,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1493,10 +1488,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v3, 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll
index ea02b31bff04fd..46090aef4a0df5 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll
@@ -335,13 +335,12 @@ define void @v_shuffle_v2f32_v4f32__7_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v5
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -356,9 +355,8 @@ define void @v_shuffle_v2f32_v4f32__7_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:5]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v5
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=v"()
@@ -447,8 +445,7 @@ define void @v_shuffle_v2f32_v4f32__7_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[4:7]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, v7
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -463,8 +460,8 @@ define void @v_shuffle_v2f32_v4f32__7_2(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[4:7]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: v_mov_b32_e32 v0, v7
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -637,8 +634,7 @@ define void @v_shuffle_v2f32_v4f32__7_6(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -650,8 +646,7 @@ define void @v_shuffle_v2f32_v4f32__7_6(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] op_sel_hi:[0,0]
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -809,9 +804,8 @@ define void @v_shuffle_v2f32_v4f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -822,9 +816,8 @@ define void @v_shuffle_v2f32_v4f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] op_sel_hi:[0,0]
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x float> asm "; def $0", "=...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/123684
More information about the llvm-branch-commits
mailing list