[llvm] [SelectionDAG] Fold undemanded operand to UNDEF for VECTOR_SHUFFLE (PR #145524)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 24 08:00:11 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-selectiondag
Author: Björn Pettersson (bjope)
<details>
<summary>Changes</summary>
Always let SimplifyDemandedVectorElts fold either side of a VECTOR_SHUFFLE to UNDEF if no elements are demanded from that side.
When the operand has a single use, SimplifyDemandedVectorElts could already do this, but when the operand had multiple uses, the shuffle's use of it was not eliminated.
---
Full diff: https://github.com/llvm/llvm-project/pull/145524.diff
6 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+13)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll (+7-9)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll (+6-9)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll (+7-9)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll (+6-9)
- (modified) llvm/test/CodeGen/X86/vec_int_to_fp.ll (+11-13)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 66717135c9adf..e40a592ecb57c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3587,6 +3587,19 @@ bool TargetLowering::SimplifyDemandedVectorElts(
DemandedRHS.setBit(M - NumElts);
}
+ // If either side isn't demanded, replace it by UNDEF. We handle this
+ // explicitly here to also simplify in case of multiple uses (in
+ // contrast to the SimplifyDemandedVectorElts calls below).
+ bool FoldLHS = !DemandedLHS && !LHS.isUndef();
+ bool FoldRHS = !DemandedRHS && !RHS.isUndef();
+ if (FoldLHS || FoldRHS) {
+ LHS = FoldLHS ? TLO.DAG.getUNDEF(LHS.getValueType()) : LHS;
+ RHS = FoldRHS ? TLO.DAG.getUNDEF(RHS.getValueType()) : RHS;
+ SDValue NewOp =
+ TLO.DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, ShuffleMask);
+ return TLO.CombineTo(Op, NewOp);
+ }
+
// See if we can simplify either shuffle operand.
APInt UndefLHS, ZeroLHS;
APInt UndefRHS, ZeroRHS;
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
index 008e19b620520..5914253b5f58e 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll
@@ -1228,51 +1228,49 @@ define void @v_shuffle_v3bf16_v2bf16__3_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v2
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v2, s[16:17]
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v2, s[16:17]
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v2
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_alignbit_b32 v2, s0, v2, 16
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x bfloat> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
index 99c9480adc410..cd4dbe93e8a11 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll
@@ -1928,48 +1928,45 @@ define void @v_shuffle_v3bf16_v3bf16__5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v3, v2, s[16:17]
-; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v4, v3, s[16:17]
-; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v4, v3, s[0:1]
-; GFX942-NEXT: global_store_short v4, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x bfloat> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
index e34becc1065ff..99cb8a38f57c3 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll
@@ -1228,51 +1228,49 @@ define void @v_shuffle_v3f16_v2f16__3_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v1
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v2
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16
-; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v0, v2, s[16:17]
-; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v1
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16
-; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v0, v2, s[16:17]
-; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f16_v2f16__3_u_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v1
; GFX942-NEXT: ;;#ASMEND
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v2
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_alignbit_b32 v2, s0, v2, 16
-; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX942-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX942-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x half> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
index 84d42c882494c..0854ff2ebfc5d 100644
--- a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll
@@ -1928,48 +1928,45 @@ define void @v_shuffle_v3f16_v3f16__5_u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4
; GFX900-NEXT: global_store_dword v3, v2, s[16:17]
-; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4
; GFX90A-NEXT: global_store_dword v4, v3, s[16:17]
-; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v3f16_v3f16__5_u_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[0:1]
; GFX942-NEXT: ;;#ASMEND
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX942-NEXT: ;;#ASMSTART
; GFX942-NEXT: ; def v[2:3]
; GFX942-NEXT: ;;#ASMEND
+; GFX942-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4
; GFX942-NEXT: global_store_dword v4, v3, s[0:1]
-; GFX942-NEXT: global_store_short v4, v0, s[0:1] offset:4
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <4 x half> asm "; def $0", "=v"()
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 62ab5d82bfbb6..910dd1ee6c419 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -2099,21 +2099,19 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq $1, %xmm2
; SSE41-NEXT: por %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2ss %rax, %xmm3
+; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: addps %xmm1, %xmm2
-; SSE41-NEXT: xorps %xmm3, %xmm3
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[2,3]
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],zero,zero
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: addps %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: uitofp_4i64_to_4f32_undef:
``````````
</details>
https://github.com/llvm/llvm-project/pull/145524
More information about the llvm-commits
mailing list