[llvm] AMDGPU: Replace insertelement undef with poison in cases with manual updates (PR #130898)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 12 06:33:59 PDT 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/130898
From 7b07d29f2e2aea74a3dfb660fcad393c764ca422 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 12 Mar 2025 12:32:31 +0700
Subject: [PATCH] AMDGPU: Replace insertelement undef with poison in cases with
manual updates
A few tests needed manual intervention rather than a purely mechanical
replacement. fcanonicalize.f16.ll is directly sensitive to undef vs. poison,
and the promote-alloca tests also had CHECK lines updated.
---
llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 16 +++++++-------
.../AMDGPU/promote-alloca-array-aggregate.ll | 6 ++---
.../AMDGPU/promote-alloca-loadstores.ll | 22 +++++++++----------
3 files changed, 22 insertions(+), 22 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index e72f3d3ce993a..d48b75a666db7 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -238,7 +238,7 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %ins0 = insertelement <2 x half> undef, half %lo, i32 0
+ %ins0 = insertelement <2 x half> poison, half %lo, i32 0
%ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
ret <2 x half> %canonicalized
@@ -2581,7 +2581,7 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec = insertelement <2 x half> undef, half %val, i32 0
+ %vec = insertelement <2 x half> poison, half %val, i32 0
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
}
@@ -2622,7 +2622,7 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec = insertelement <2 x half> undef, half %val, i32 1
+ %vec = insertelement <2 x half> poison, half %val, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
}
@@ -2785,7 +2785,7 @@ define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 2.0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec0 = insertelement <2 x half> undef, half %val, i32 0
+ %vec0 = insertelement <2 x half> poison, half %val, i32 0
%vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
ret <2 x half> %canonicalized
@@ -2829,7 +2829,7 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, 2.0, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec0 = insertelement <2 x half> undef, half 2.0, i32 0
+ %vec0 = insertelement <2 x half> poison, half 2.0, i32 0
%vec1 = insertelement <2 x half> %vec0, half %val, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
ret <2 x half> %canonicalized
@@ -2925,7 +2925,7 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec = insertelement <4 x half> undef, half %val, i32 0
+ %vec = insertelement <4 x half> poison, half %val, i32 0
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec)
ret <4 x half> %canonicalized
}
@@ -2977,7 +2977,7 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec0 = insertelement <4 x half> undef, half %val0, i32 0
+ %vec0 = insertelement <4 x half> poison, half %val0, i32 0
%vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1)
ret <4 x half> %canonicalized
@@ -3035,7 +3035,7 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0
; GFX11-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec0 = insertelement <4 x half> undef, half %val0, i32 0
+ %vec0 = insertelement <4 x half> poison, half %val0, i32 0
%vec1 = insertelement <4 x half> %vec0, half %val1, i32 2
%vec2 = insertelement <4 x half> %vec1, half %val2, i32 3
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec2)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
index 2c3cb1e6a5e6e..a4a8a985df0bf 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -134,7 +134,7 @@ define amdgpu_vs void @promote_load_from_store_aggr_varoff(<4 x i32> %input) {
; CHECK-NEXT: [[FOO3_UNPACK2:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[FOO3_UNPACK2]], i32 2
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 [[FOO3_UNPACK2]]
-; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x i32> %input, i32 [[TMP2]], i64 3
+; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x i32> [[INPUT:%.*]], i32 [[TMP2]], i64 3
; CHECK-NEXT: store <4 x i32> [[FOO12]], ptr addrspace(1) @pv1, align 16
; CHECK-NEXT: ret void
;
@@ -344,7 +344,7 @@ define amdgpu_ps void @promote_double_aggr() #0 {
; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[FOO5_FCA_1_EXTRACT]], [[FOO5_FCA_1_EXTRACT]]
; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[FOO10]], [[FOO5_FCA_1_EXTRACT]]
; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
-; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0
+; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> poison, float [[FOO17]], i32 0
; CHECK-NEXT: [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
; CHECK-NEXT: [[FOO20:%.*]] = insertelement <4 x float> [[FOO19]], float [[FOO17]], i32 2
; CHECK-NEXT: [[FOO21:%.*]] = insertelement <4 x float> [[FOO20]], float [[FOO17]], i32 3
@@ -370,7 +370,7 @@ define amdgpu_ps void @promote_double_aggr() #0 {
%foo15 = load double, ptr addrspace(5) %foo14
%foo16 = fadd double %foo13, %foo15
%foo17 = fptrunc double %foo16 to float
- %foo18 = insertelement <4 x float> undef, float %foo17, i32 0
+ %foo18 = insertelement <4 x float> poison, float %foo17, i32 0
%foo19 = insertelement <4 x float> %foo18, float %foo17, i32 1
%foo20 = insertelement <4 x float> %foo19, float %foo17, i32 2
%foo21 = insertelement <4 x float> %foo20, float %foo17, i32 3
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
index 1e49500a243e1..119d3611e1007 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
@@ -9,15 +9,15 @@ define amdgpu_kernel void @test_overwrite(i64 %val, i1 %cond) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA]], i64 68, i32 0
+; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 68, i32 0
; CHECK-NEXT: [[TMP2]] = insertelement <3 x i64> [[TMP1]], i64 32, i32 0
; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 68
; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
+; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
; CHECK-NEXT: ret void
;
entry:
@@ -64,15 +64,15 @@ define amdgpu_kernel void @test_no_overwrite(i64 %val, i1 %cond) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP1:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
-; CHECK-NEXT: [[TMP1]] = insertelement <3 x i64> [[PROMOTEALLOCA]], i64 32, i32 1
+; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP1:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
+; CHECK-NEXT: [[TMP1]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 32, i32 1
; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 32
; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP1]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 1
+; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP1]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 1
; CHECK-NEXT: ret void
;
entry:
More information about the llvm-commits mailing list