[llvm] [GISel][AMDGPU] Expand ShuffleVector (PR #124527)
Alan Li via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 27 09:24:38 PST 2025
https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/124527
>From fb991ad2f5c56fa2029f935a34646e3e76c88110 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 27 Jan 2025 18:26:19 +0800
Subject: [PATCH] [GISel][AMDGPU] Expand ShuffleVector
This patch dismantles G_SHUFFLE_VECTOR before lowering.
The original lowering would emit extract vector element ops.
By using unmerged values and avoiding the constant indices required by G_EXTRACT_VECTOR_ELT,
the build_vector combine can find more opportunities to fold.
Only enabled on AMDGPU.
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 4 +
.../include/llvm/Target/GlobalISel/Combine.td | 7 +
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 40 +++++
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 3 +-
.../prelegalizer-combiner-shuffle.mir | 137 ++++++++++++++++++
.../AMDGPU/GlobalISel/shufflevector.ll | 18 +++
...ffer-fat-pointers-contents-legalization.ll | 27 +---
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 6 +-
.../llvm.amdgcn.raw.tbuffer.store.d16.ll | 2 -
.../llvm.amdgcn.struct.tbuffer.store.d16.ll | 3 +-
llvm/test/CodeGen/AMDGPU/mad-mix.ll | 16 +-
11 files changed, 222 insertions(+), 41 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 9b78342c8fc39..f51f0495d8256 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -264,6 +264,10 @@ class CombinerHelper {
void applyCombineShuffleConcat(MachineInstr &MI,
SmallVector<Register> &Ops) const;
+ /// Replace \p MI with a build_vector.
+ bool matchCombineShuffleToBuildVector(MachineInstr &MI) const;
+ void applyCombineShuffleToBuildVector(MachineInstr &MI) const;
+
/// Try to combine G_SHUFFLE_VECTOR into G_CONCAT_VECTORS.
/// Returns true if MI changed.
///
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 3590ab221ad44..9ce6d0f24af62 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1560,6 +1560,13 @@ def combine_shuffle_concat : GICombineRule<
[{ return Helper.matchCombineShuffleConcat(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyCombineShuffleConcat(*${root}, ${matchinfo}); }])>;
+// Combines shuffles of vector into build_vector
+def combine_shuffle_vector_to_build_vector : GICombineRule<
+ (defs root:$root),
+ (match (G_SHUFFLE_VECTOR $dst, $src1, $src2, $mask):$root,
+ [{ return Helper.matchCombineShuffleToBuildVector(*${root}); }]),
+ (apply [{ Helper.applyCombineShuffleToBuildVector(*${root}); }])>;
+
def insert_vector_element_idx_undef : GICombineRule<
(defs root:$root),
(match (G_IMPLICIT_DEF $idx),
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 0dfbb91f2ac54..184b42b671ee3 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -384,6 +385,45 @@ void CombinerHelper::applyCombineConcatVectors(
MI.eraseFromParent();
}
+bool CombinerHelper::matchCombineShuffleToBuildVector(MachineInstr &MI) const {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
+ "Invalid instruction");
+ auto &Shuffle = cast<GShuffleVector>(MI);
+
+ Register SrcVec1 = Shuffle.getSrc1Reg();
+ Register SrcVec2 = Shuffle.getSrc2Reg();
+
+ LLT SrcVec1Type = MRI.getType(SrcVec1);
+ LLT SrcVec2Type = MRI.getType(SrcVec2);
+ return SrcVec1Type.isVector() && SrcVec2Type.isVector();
+}
+
+void CombinerHelper::applyCombineShuffleToBuildVector(MachineInstr &MI) const {
+ auto &Shuffle = cast<GShuffleVector>(MI);
+
+ Register SrcVec1 = Shuffle.getSrc1Reg();
+ Register SrcVec2 = Shuffle.getSrc2Reg();
+ LLT EltTy = MRI.getType(SrcVec1).getElementType();
+ int Width = MRI.getType(SrcVec1).getNumElements();
+
+ auto Unmerge1 = Builder.buildUnmerge(EltTy, SrcVec1);
+ auto Unmerge2 = Builder.buildUnmerge(EltTy, SrcVec2);
+
+ SmallVector<Register> Extracts;
+ // Select only applicable elements from unmerged values.
+ for (int Val : Shuffle.getMask()) {
+ if (Val == -1)
+ Extracts.push_back(Builder.buildUndef(EltTy).getReg(0));
+ else if (Val < Width)
+ Extracts.push_back(Unmerge1.getReg(Val));
+ else
+ Extracts.push_back(Unmerge2.getReg(Val - Width));
+ }
+
+ Builder.buildBuildVector(MI.getOperand(0).getReg(), Extracts);
+ MI.eraseFromParent();
+}
+
bool CombinerHelper::matchCombineShuffleConcat(
MachineInstr &MI, SmallVector<Register> &Ops) const {
ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index da47aaf8a3b5c..25e8aaf41511f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -163,7 +163,8 @@ def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>;
def AMDGPUPreLegalizerCombiner: GICombiner<
"AMDGPUPreLegalizerCombinerImpl",
- [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16, foldable_fneg]> {
+ [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,
+ foldable_fneg, combine_shuffle_vector_to_build_vector]> {
let CombineAllMethodName = "tryCombineAllImpl";
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir
new file mode 100644
index 0000000000000..bba608cceee19
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-shuffle.mir
@@ -0,0 +1,137 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: shuffle_vector_to_extract
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: shuffle_vector_to_extract
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[UV7]](s16)
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s16>), [[COPY1]](p3) :: (store (<4 x s16>), addrspace 3)
+ ; CHECK-NEXT: SI_RETURN
+ %0:_(p3) = COPY $vgpr0
+ %1:_(p3) = COPY $vgpr1
+ %12:_(<8 x s16>) = G_IMPLICIT_DEF
+ %10:_(<8 x s16>) = G_LOAD %0(p3) :: (load (<8 x s16>), align 8, addrspace 3)
+ %11:_(<4 x s16>) = G_SHUFFLE_VECTOR %10(<8 x s16>), %12, shufflemask(4, 5, 6, 7)
+ G_STORE %11(<4 x s16>), %1(p3) :: (store (<4 x s16>), addrspace 3)
+ SI_RETURN
+...
+
+---
+name: shuffle_vector_to_extract2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: shuffle_vector_to_extract2
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[UV3]](s16), [[UV4]](s16)
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s16>), [[COPY1]](p3) :: (store (<2 x s16>), addrspace 3)
+ ; CHECK-NEXT: SI_RETURN
+ %0:_(p3) = COPY $vgpr0
+ %1:_(p3) = COPY $vgpr1
+ %12:_(<8 x s16>) = G_IMPLICIT_DEF
+ %10:_(<8 x s16>) = G_LOAD %0(p3) :: (load (<8 x s16>), align 8, addrspace 3)
+ %11:_(<2 x s16>) = G_SHUFFLE_VECTOR %10(<8 x s16>), %12, shufflemask(3, 4)
+ G_STORE %11(<2 x s16>), %1(p3) :: (store (<2 x s16>), addrspace 3)
+ SI_RETURN
+
+...
+
+---
+name: shuffle_vector_to_extract_odd_elements
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: shuffle_vector_to_extract_odd_elements
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16)
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<3 x s16>), [[COPY1]](p3) :: (store (<3 x s16>), align 8, addrspace 3)
+ ; CHECK-NEXT: SI_RETURN
+ %0:_(p3) = COPY $vgpr0
+ %1:_(p3) = COPY $vgpr1
+ %12:_(<8 x s16>) = G_IMPLICIT_DEF
+ %10:_(<8 x s16>) = G_LOAD %0(p3) :: (load (<8 x s16>), align 8, addrspace 3)
+ %11:_(<3 x s16>) = G_SHUFFLE_VECTOR %10(<8 x s16>), %12, shufflemask(0, 1, 2)
+ G_STORE %11(<3 x s16>), %1(p3) :: (store (<3 x s16>), addrspace 3)
+ SI_RETURN
+...
+
+
+---
+name: shuffle_vector_to_extract_minus_1_no_conversion
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: shuffle_vector_to_extract_minus_1_no_conversion
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV4]](s16), [[UV5]](s16), [[DEF]](s16), [[UV7]](s16)
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s16>), [[COPY1]](p3) :: (store (<4 x s16>), addrspace 3)
+ ; CHECK-NEXT: SI_RETURN
+ %0:_(p3) = COPY $vgpr0
+ %1:_(p3) = COPY $vgpr1
+ %12:_(<8 x s16>) = G_IMPLICIT_DEF
+ %10:_(<8 x s16>) = G_LOAD %0(p3) :: (load (<8 x s16>), align 8, addrspace 3)
+ %11:_(<4 x s16>) = G_SHUFFLE_VECTOR %10(<8 x s16>), %12, shufflemask(4, 5, -1, 7)
+ G_STORE %11(<4 x s16>), %1(p3) :: (store (<4 x s16>), addrspace 3)
+ SI_RETURN
+...
+
+---
+name: shuffle_vector_to_extract_across_vectors_no_conversion
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: shuffle_vector_to_extract_across_vectors_no_conversion
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[LOAD]](<8 x s16>)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV6]](s16), [[UV7]](s16), [[DEF]](s16), [[DEF]](s16)
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s16>), [[COPY1]](p3) :: (store (<4 x s16>), addrspace 3)
+ ; CHECK-NEXT: SI_RETURN
+ %0:_(p3) = COPY $vgpr0
+ %1:_(p3) = COPY $vgpr1
+ %12:_(<8 x s16>) = G_IMPLICIT_DEF
+ %10:_(<8 x s16>) = G_LOAD %0(p3) :: (load (<8 x s16>), align 8, addrspace 3)
+ %11:_(<4 x s16>) = G_SHUFFLE_VECTOR %10(<8 x s16>), %12, shufflemask(6, 7, 8, 9)
+ G_STORE %11(<4 x s16>), %1(p3) :: (store (<4 x s16>), addrspace 3)
+ SI_RETURN
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll
new file mode 100644
index 0000000000000..09274c4d3626b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mtriple=amdgcn-amd-hmcsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX942 %s
+
+define void @shuffle_to_extract(ptr addrspace(3) %in, ptr addrspace(3) %out) {
+; GFX942-LABEL: shuffle_to_extract:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: ds_read2_b64 v[2:5], v0 offset1:1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: ds_write_b64 v1, v[4:5]
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %val = load <8 x half>, ptr addrspace(3) %in, align 8
+ %res = shufflevector <8 x half> %val, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ store <4 x half> %res, ptr addrspace(3) %out, align 8
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 405058b24dcc2..fdc1dd6cce8e1 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -1736,10 +1736,6 @@ define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0
; GISEL-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8
-; GISEL-NEXT: s_mov_b32 s4, 0xffff
-; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_bfi_b32 v0, s4, v0, v0
-; GISEL-NEXT: v_bfi_b32 v1, s4, v1, v1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1820,11 +1816,6 @@ define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0
; GISEL-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:12
-; GISEL-NEXT: s_mov_b32 s4, 0xffff
-; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_bfi_b32 v0, s4, v0, v0
-; GISEL-NEXT: v_bfi_b32 v1, s4, v1, v1
-; GISEL-NEXT: v_bfi_b32 v2, s4, v2, v2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1867,12 +1858,6 @@ define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:16
-; GISEL-NEXT: s_mov_b32 s4, 0xffff
-; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_bfi_b32 v0, s4, v0, v0
-; GISEL-NEXT: v_bfi_b32 v1, s4, v1, v1
-; GISEL-NEXT: v_bfi_b32 v2, s4, v2, v2
-; GISEL-NEXT: v_bfi_b32 v3, s4, v3, v3
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2181,14 +2166,14 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
; GISEL-LABEL: load_v6i8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4
+; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0
; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load <6 x i8>, ptr addrspace(7) %p
@@ -3644,11 +3629,11 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc
; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GISEL-NEXT: s_setpc_b64 s[30:31]
%p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
%ret = load volatile <6 x i8>, ptr addrspace(7) %p
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index c0c0d3ded117d..c7f02b162dfee 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -9049,13 +9049,13 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GFX8-GISEL-NEXT: v_mad_u16 v6, v4, v5, v6
; GFX8-GISEL-NEXT: v_mad_u16 v2, v0, v1, v2
; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-GISEL-NEXT: v_or_b32_e32 v2, v2, v6
-; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX8-GISEL-NEXT: v_mad_u16 v0, v0, v1, v3
-; GFX8-GISEL-NEXT: v_mad_u16 v1, v4, v5, v6
+; GFX8-GISEL-NEXT: v_mad_u16 v1, v4, v5, v7
+; GFX8-GISEL-NEXT: v_or_b32_e32 v2, v2, v6
; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v0, v1
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
index 63b139bb25e77..0c46ccda17640 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
@@ -184,8 +184,6 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s6, s6, s6
-; GFX12-PACKED-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
index 17ebb1a835462..2d5c95156c6f2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
@@ -204,10 +204,9 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
; GFX12-PACKED-GISEL-NEXT: s_load_b96 s[8:10], s[4:5], 0x10
; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s8, s8, s8
-; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s10
; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s10
; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
; GFX12-PACKED-GISEL-NEXT: s_endpgm
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index 4c2a16c17b38a..1720bf984ca09 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -440,21 +440,13 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1,
; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
-; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; GISEL-CI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GISEL-CI-NEXT: v_or_b32_e32 v1, v1, v4
-; GISEL-CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v6, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v5
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v3
; GISEL-CI-NEXT: v_mad_f32 v0, v4, v0, v1
-; GISEL-CI-NEXT: v_mac_f32_e32 v1, v5, v2
+; GISEL-CI-NEXT: v_mac_f32_e32 v1, v6, v2
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.shuf = shufflevector <2 x half> %src0, <2 x half> undef, <2 x i32> <i32 1, i32 0>
%src1.shuf = shufflevector <2 x half> %src1, <2 x half> undef, <2 x i32> <i32 0, i32 1>
More information about the llvm-commits
mailing list