[llvm] Reland: [GlobalISel] prevent G_UNMERGE_VALUES for vectors with different elements (PR #144661)

Wed Jun 18 02:30:28 PDT 2025

https://github.com/ro-i created https://github.com/llvm/llvm-project/pull/144661

This commit prevents building a G_UNMERGE_VALUES instruction with different source and destination vector elements in `LegalizationArtifactCombiner::ArtifactValueFinder::tryCombineMergeLike()`, e.g.:
`%1:_(<2 x s8>), %2:_(<2 x s8>) = G_UNMERGE_VALUES %0:_(<2 x s16>)`

This LLVM defect was identified via the AMD Fuzzing project.

Note: this originally was #133335, which I had to revert because of changes that have been made to main during the lifetime of that PR. They led to two test failures which did not show as merge conflicts but still made a rebase necessary. You can see the changes needed after rebasing the PR in the second commit of this PR.

>From a295d0013fefbe5f395877febf33708a34eabd50 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Thu, 27 Mar 2025 17:54:38 -0500
Subject: [PATCH 1/2] [GlobalISel] prevent G_UNMERGE_VALUES for vectors with
 different elements

This commit prevents building a G_UNMERGE_VALUES instruction with
different source and destination vector elements in
LegalizationArtifactCombiner::ArtifactValueFinder::tryCombineMergeLike(),
e.g.:
`%1:_(<2 x s8>), %2:_(<2 x s8>) = G_UNMERGE_VALUES %0:_(<2 x s16>)`

This LLVM defect was identified via the AMD Fuzzing project.
---
 .../GlobalISel/LegalizationArtifactCombiner.h |  5 +-
 .../AMDGPU/GlobalISel/insertelement.ll        | 55 +++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 22f6a5fde546a..8f560c42082f9 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -997,6 +997,7 @@ class LegalizationArtifactCombiner {
 
       // Recognize UnmergeSrc that can be unmerged to DstTy directly.
       // Types have to be either both vector or both non-vector types.
+      // In case of vector types, the scalar elements need to match.
       // Merge-like opcodes are combined one at the time. First one creates new
       // unmerge, following should use the same unmerge (builder performs CSE).
       //
@@ -1005,7 +1006,9 @@ class LegalizationArtifactCombiner {
       // %AnotherDst:_(DstTy) = G_merge_like_opcode %2:_(EltTy), %3
       //
       // %Dst:_(DstTy), %AnotherDst = G_UNMERGE_VALUES %UnmergeSrc
-      if ((DstTy.isVector() == UnmergeSrcTy.isVector()) &&
+      if (((!DstTy.isVector() && !UnmergeSrcTy.isVector()) ||
+           (DstTy.isVector() && UnmergeSrcTy.isVector() &&
+            DstTy.getScalarType() == UnmergeSrcTy.getScalarType())) &&
           (Elt0UnmergeIdx % NumMIElts == 0) &&
           getCoverTy(UnmergeSrcTy, DstTy) == UnmergeSrcTy) {
         if (!isSequenceFromUnmerge(MI, 0, Unmerge, Elt0UnmergeIdx, NumMIElts,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 8134eb3ca2afc..132a89478c5fd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -6506,3 +6506,58 @@ entry:
   %insert = insertelement <5 x double> %vec, double %val, i32 %idx
   ret <5 x double> %insert
 }
+
+; Found by fuzzer, reduced with llvm-reduce.
+define amdgpu_kernel void @insert_very_small_from_very_large(<32 x i16> %L3, ptr %ptr) {
+; GPRIDX-LABEL: insert_very_small_from_very_large:
+; GPRIDX:       ; %bb.0: ; %bb
+; GPRIDX-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x0
+; GPRIDX-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x40
+; GPRIDX-NEXT:    s_waitcnt lgkmcnt(0)
+; GPRIDX-NEXT:    s_lshr_b32 s2, s12, 1
+; GPRIDX-NEXT:    s_and_b32 s2, s2, 1
+; GPRIDX-NEXT:    s_lshl_b32 s2, s2, 1
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT:    flat_store_byte v[0:1], v2
+; GPRIDX-NEXT:    s_endpgm
+;
+; GFX10-LABEL: insert_very_small_from_very_large:
+; GFX10:       ; %bb.0: ; %bb
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x40
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_lshr_b32 s2, s12, 1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s2
+; GFX10-NEXT:    flat_store_byte v[0:1], v2
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: insert_very_small_from_very_large:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b512 s[8:23], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x40
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshr_b32 s2, s8, 1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_and_b32 s2, s2, 1
+; GFX11-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    flat_store_b8 v[0:1], v2
+; GFX11-NEXT:    s_endpgm
+bb:
+  %a = bitcast <32 x i16> %L3 to i512
+  %b = trunc i512 %a to i8
+  %c = trunc i8 %b to i2
+  %d = bitcast i2 %c to <2 x i1>
+  %insert = insertelement <2 x i1> %d, i1 false, i32 0
+  store <2 x i1> %insert, ptr %ptr, align 1
+  ret void
+}

>From d7198c4bc1942647397202a9a99d875932a65c5f Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Wed, 18 Jun 2025 04:03:54 -0500
Subject: [PATCH 2/2] fix after rebase

---
 .../AMDGPU/GlobalISel/insertelement.ll        | 57 ++++++++-----------
 ...ffer-fat-pointers-contents-legalization.ll | 10 ++--
 2 files changed, 28 insertions(+), 39 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 132a89478c5fd..51d0b225b2a27 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -6508,50 +6508,39 @@ entry:
 }
 
 ; Found by fuzzer, reduced with llvm-reduce.
-define amdgpu_kernel void @insert_very_small_from_very_large(<32 x i16> %L3, ptr %ptr) {
+define void @insert_very_small_from_very_large(<32 x i16> %L3, ptr %ptr) {
 ; GPRIDX-LABEL: insert_very_small_from_very_large:
 ; GPRIDX:       ; %bb.0: ; %bb
-; GPRIDX-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x0
-; GPRIDX-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x40
-; GPRIDX-NEXT:    s_waitcnt lgkmcnt(0)
-; GPRIDX-NEXT:    s_lshr_b32 s2, s12, 1
-; GPRIDX-NEXT:    s_and_b32 s2, s2, 1
-; GPRIDX-NEXT:    s_lshl_b32 s2, s2, 1
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT:    flat_store_byte v[0:1], v2
-; GPRIDX-NEXT:    s_endpgm
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GPRIDX-NEXT:    v_and_b32_e32 v0, 1, v0
+; GPRIDX-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; GPRIDX-NEXT:    v_and_b32_e32 v0, 3, v0
+; GPRIDX-NEXT:    flat_store_byte v[16:17], v0
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: insert_very_small_from_very_large:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx16 s[12:27], s[8:9], 0x0
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x40
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX10-NEXT:    flat_store_byte v[16:17], v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s2, s12, 1
-; GFX10-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-NEXT:    s_and_b32 s2, s2, 1
-; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    s_lshl_b32 s2, s2, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-NEXT:    flat_store_byte v[0:1], v2
-; GFX10-NEXT:    s_endpgm
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: insert_very_small_from_very_large:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b512 s[8:23], s[4:5], 0x0
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x40
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b16 v0.l, 1, v0.l
+; GFX11-NEXT:    v_and_b16 v0.l, v0.l, 1
+; GFX11-NEXT:    v_lshlrev_b16 v0.l, 1, v0.l
+; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT:    flat_store_b8 v[16:17], v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s2, s8, 1
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_and_b32 s2, s2, 1
-; GFX11-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-NEXT:    s_lshl_b32 s2, s2, 1
-; GFX11-NEXT:    v_mov_b32_e32 v2, s2
-; GFX11-NEXT:    flat_store_b8 v[0:1], v2
-; GFX11-NEXT:    s_endpgm
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %a = bitcast <32 x i16> %L3 to i512
   %b = trunc i512 %a to i8
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index fdc1dd6cce8e1..53b2542cf9a7e 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -2166,14 +2166,14 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v6i8:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <6 x i8>, ptr addrspace(7) %p
@@ -3630,10 +3630,10 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load volatile <6 x i8>, ptr addrspace(7) %p