[llvm-branch-commits] [llvm] WIP: AMDGPU: Implement getRegSequenceLikeInputs for v_pk_mov_b32 (PR #125657)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Feb 4 01:28:53 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
Changes:
In principle we need this analysis to avoid regressions from using v_pk_mov_b32
when shuffling to physical register inputs. However, as it stands it only
introduces regressions, since it decomposes every useful case where we benefit
from the instruction.
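For context, here is a minimal standalone sketch of the decomposition this hook performs. It is plain C++ rather than the actual SIInstrInfo/TargetInstrInfo code; the names `decomposePkMov`, `Input`, `SrcHalf`, and `DstHalf` are illustrative only, and the modifier bit values (1 << 2 for OP_SEL_0, 12/8 for the example operands) are assumptions mirroring what the patch reads from `SISrcMods`. The point it shows is that each 64-bit source supplies one 32-bit half of the result, with OP_SEL_0 selecting that source's high half, which is what makes v_pk_mov_b32 behave like a REG_SEQUENCE.

```cpp
// Standalone sketch only: models the op_sel-driven decomposition of
// v_pk_mov_b32 into REG_SEQUENCE-like inputs; this is not the LLVM API.
#include <cstdio>
#include <vector>

enum SubReg { Sub0 = 0, Sub1 = 1 };   // low / high 32-bit half of a 64-bit pair
constexpr unsigned OpSel0 = 1u << 2;  // assumed bit value of SISrcMods::OP_SEL_0

struct Input {
  unsigned Reg;    // source register (just an id here)
  SubReg SrcHalf;  // which half of that source is read
  SubReg DstHalf;  // which half of the result it supplies
};

// dst.sub0 always comes from src0 and dst.sub1 from src1; OP_SEL_0 in each
// source's modifiers selects that source's high half instead of its low half.
std::vector<Input> decomposePkMov(unsigned Src0Reg, unsigned Mods0,
                                  unsigned Src1Reg, unsigned Mods1) {
  return {
      {Src0Reg, (Mods0 & OpSel0) ? Sub1 : Sub0, Sub0},
      {Src1Reg, (Mods1 & OpSel0) ? Sub1 : Sub0, Sub1},
  };
}

int main() {
  // 12 = OP_SEL_0 | OP_SEL_1 (read src0's high half); 8 = OP_SEL_1 only
  // (read src1's low half), as in the first case of the .mir test below.
  for (const Input &I : decomposePkMov(/*Src0Reg=*/100, /*Mods0=*/12,
                                       /*Src1Reg=*/101, /*Mods1=*/8))
    std::printf("result.sub%d <- reg%u.sub%d\n", I.DstHalf, I.Reg, I.SrcHalf);
  return 0;
}
```

With the 12/8 modifier pair this prints `result.sub0 <- reg100.sub1` and `result.sub1 <- reg101.sub0`, the same decomposition the updated CHECK lines in the .mir test expect.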
---
Patch is 379.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125657.diff
13 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+34)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+3)
- (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll (+121-89)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll (+143-129)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll (+466-343)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll (+121-89)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll (+143-129)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll (+466-343)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll (+121-89)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll (+143-129)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll (+466-343)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 35667801c809d5..3cae838321885d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9725,6 +9725,40 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
return nullptr;
}
+bool SIInstrInfo::getRegSequenceLikeInputs(
+ const MachineInstr &MI, unsigned DefIdx,
+ SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const {
+ assert(MI.getOpcode() == AMDGPU::V_PK_MOV_B32 &&
+ "v_pk_mov_b32 is the only reg-sequence like instruction");
+ assert(DefIdx == 0);
+
+ unsigned Src0Mods = MI.getOperand(1).getImm();
+ const MachineOperand &Src0 = MI.getOperand(2);
+ unsigned Src1Mods = MI.getOperand(3).getImm();
+ const MachineOperand &Src1 = MI.getOperand(4);
+
+ unsigned SubReg0 =
+ Src0Mods & SISrcMods::OP_SEL_0 ? AMDGPU::sub1 : AMDGPU::sub0;
+ unsigned SubReg1 =
+ Src1Mods & SISrcMods::OP_SEL_0 ? AMDGPU::sub1 : AMDGPU::sub0;
+
+ if (!Src0.isUndef()) {
+ // The result's sub0 is taken from src0.
+ SubReg0 = RI.composeSubRegIndices(Src0.getSubReg(), SubReg0);
+ InputRegs.push_back(
+ RegSubRegPairAndIdx(Src0.getReg(), SubReg0, AMDGPU::sub0));
+ }
+
+ if (!Src1.isUndef()) {
+ // The result's sub1 is taken from src1.
+ SubReg1 = RI.composeSubRegIndices(Src1.getSubReg(), SubReg1);
+ InputRegs.push_back(
+ RegSubRegPairAndIdx(Src1.getReg(), SubReg1, AMDGPU::sub1));
+ }
+
+ return true;
+}
+
unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr &MI,
unsigned *PredCost) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 933935a86f9f98..425ff77e8cdc3f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1437,6 +1437,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
int FrameIndex,
LiveIntervals *LIS = nullptr,
VirtRegMap *VRM = nullptr) const override;
+ bool getRegSequenceLikeInputs(
+ const MachineInstr &MI, unsigned DefIdx,
+ SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const override;
unsigned getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr &MI,
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 1afd68767cd3ba..a6f8035e93b182 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1107,7 +1107,7 @@ let isCommutable = 1, isReMaterializable = 1 in {
defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
} // End SubtargetPredicate = HasPackedFP32Ops
- let SubtargetPredicate = HasPkMovB32 in
+ let SubtargetPredicate = HasPkMovB32, isRegSequence = 1 in
defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
} // End isCommutable = 1, isReMaterializable = 1
diff --git a/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir
index 90291221e8e178..602fc29bf1db36 100644
--- a/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir
@@ -15,8 +15,8 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; CHECK-NEXT: $vgpr4 = COPY [[COPY2]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY3]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
@@ -49,8 +49,8 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; CHECK-NEXT: $vgpr4 = COPY [[COPY4]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY5]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
@@ -118,7 +118,7 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; CHECK-NEXT: $vgpr4 = COPY [[COPY4]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY5]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
@@ -154,7 +154,7 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
; CHECK-NEXT: $vgpr4 = COPY [[COPY4]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY5]]
@@ -191,7 +191,7 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, undef [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
; CHECK-NEXT: $vgpr4 = COPY [[COPY4]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY5]]
@@ -229,7 +229,7 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, undef [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; CHECK-NEXT: $vgpr4 = COPY [[COPY4]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY5]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
@@ -304,8 +304,8 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]].sub0_sub1, 8, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; CHECK-NEXT: $vgpr4 = COPY [[COPY6]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY7]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
@@ -345,8 +345,8 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, [[REG_SEQUENCE]].sub0_sub1, 12, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
; CHECK-NEXT: $vgpr4 = COPY [[COPY6]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY7]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
@@ -388,8 +388,8 @@ body: |
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]].sub2_sub3, 12, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3
; CHECK-NEXT: $vgpr4 = COPY [[COPY8]]
; CHECK-NEXT: $vgpr5 = COPY [[COPY9]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll
index 9e3c044a76295f..fafdac7ffac62f 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll
@@ -176,7 +176,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
@@ -191,8 +192,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
@@ -565,15 +566,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -581,15 +583,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -672,26 +675,31 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: v_mov_b32_e32 v2, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -865,7 +873,8 @@ define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -878,7 +887,8 @@ define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -962,7 +972,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -978,7 +989,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1126,13 +1138,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1142,13 +1155,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
; GFX940-NEXT: ;;#ASMSTART
-; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ; def v[4:5]
; GFX940-NEXT: ;;#ASMEND
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
@@ -1289,15 +1303,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ; def v[4:5]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0]
; GFX90A-NEXT: v_mov_b32_e32 v0, v1
-; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
@@ -1305,15 +1320,16 @@ define void @v_shuffle_v4f...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/125657