[llvm] [GlobalISel] Fix buildCopyFromRegs for split vectors (PR #77448)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 9 03:48:49 PST 2024
https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/77448
Fixes #77055
From 6409bf5c6d454568a79d149e383ba6f857f47fb2 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 9 Jan 2024 12:47:47 +0100
Subject: [PATCH] [GlobalISel] Fix buildCopyFromRegs for split vectors
Fixes #77055
---
llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 34 ++++++-
llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll | 96 ++++++++++++++++++++
2 files changed, 127 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 6858e030c2c75e..1e3c5d5d8007b1 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -478,9 +478,37 @@ static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
} else {
// Vector was split, and elements promoted to a wider type.
// FIXME: Should handle floating point promotions.
- LLT BVType = LLT::fixed_vector(LLTy.getNumElements(), PartLLT);
- auto BV = B.buildBuildVector(BVType, Regs);
- B.buildTrunc(OrigRegs[0], BV);
+ unsigned NumElts = LLTy.getNumElements();
+ LLT BVType = LLT::fixed_vector(NumElts, PartLLT);
+
+ Register BuildVec;
+ if (NumElts == Regs.size())
+ BuildVec = B.buildBuildVector(BVType, Regs).getReg(0);
+ else {
+ SmallVector<Register, 0> BVRegs;
+ BVRegs.reserve(NumElts);
+
+ // Vector elements are packed in the inputs.
+ // e.g. we have a <4 x s16> but 2 x s32 in regs.
+ assert(NumElts > Regs.size());
+ LLT SrcEltTy = MRI.getType(Regs[0]);
+ LLT OriginalEltTy = MRI.getType(OrigRegs[0]).getElementType();
+
+ // Input registers contain packed elements.
+ // Determine how many elements per reg.
+ assert((SrcEltTy.getSizeInBits() % OriginalEltTy.getSizeInBits()) == 0);
+ unsigned EltPerReg =
+ (SrcEltTy.getSizeInBits() / OriginalEltTy.getSizeInBits());
+
+ for (Register R : Regs) {
+ auto Unmerge = B.buildUnmerge(OriginalEltTy, R);
+ for (unsigned K = 0; K < EltPerReg; ++K)
+ BVRegs.push_back(B.buildAnyExt(PartLLT, Unmerge.getReg(K)).getReg(0));
+ }
+ assert(BVRegs.size() == NumElts);
+ BuildVec = B.buildBuildVector(BVType, BVRegs).getReg(0);
+ }
+ B.buildTrunc(OrigRegs[0], BuildVec);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
new file mode 100644
index 00000000000000..3037b84b25775a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bf16.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -global-isel -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
+; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
+
+; TODO: expand testcases - currently only contains cases that were known to crash.
+
+; assert in IRTranslator, #77055
+define <4 x bfloat> @v4bf16(<4 x bfloat> %arg0) {
+; GCN-LABEL: v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v3, v1, v0
+; GCN-NEXT: v_or_b32_e32 v2, v4, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: v_mov_b32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_mov_b32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v3, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %res = shufflevector <4 x bfloat> %arg0, <4 x bfloat> zeroinitializer, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
+ ret <4 x bfloat> %res
+}
More information about the llvm-commits mailing list