[llvm] [AMDGPU] - Add s_bitreplicate intrinsic (PR #69209)
Jessica Del via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 27 01:59:14 PDT 2023
https://github.com/OutOfCache updated https://github.com/llvm/llvm-project/pull/69209
>From 290af6eed41bc49e21e482252b0a86f6d5fc7c34 Mon Sep 17 00:00:00 2001
From: Jessica Del <Jessica.Del at amd.com>
Date: Mon, 16 Oct 2023 11:26:38 +0200
Subject: [PATCH 1/7] [AMDGPU] - Generate s_bitreplicate_b64_b32
Add intrinsic for s_bitreplicate. Lower to S_BITREPLICATE_B64_B32
machine instruction in both GISel and Selection DAG.
Support VGPR arguments by inserting a `v_readfirstlane`.
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 5 +++
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 3 ++
.../AMDGPU/AMDGPUInstructionSelector.cpp | 14 ++++++
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 1 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 +++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 ++++
.../AMDGPU/llvm.amdgcn.bitreplicate.ll | 45 +++++++++++++++++++
7 files changed, 82 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4f42462f655e260..66aa7862893d049 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1927,6 +1927,11 @@ def int_amdgcn_inverse_ballot :
Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
[IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_bitreplicate :
+ Intrinsic<[llvm_i64_ty], [llvm_i32_ty],
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
[data_ty],
[
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b5ceaaa14b4fd5e..ff7b6724def9b1d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2568,6 +2568,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
llvm_unreachable("Unsupported size for inverse ballot mask.");
}
break;
+ case Intrinsic::amdgcn_s_bitreplicate:
+ Opcode = AMDGPU::S_BITREPLICATE_B64_B32;
+ break;
default:
SelectCode(N);
return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 31d72fb8cadd8a6..1b2f809f102d308 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1067,6 +1067,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return selectBallot(I);
case Intrinsic::amdgcn_inverse_ballot:
return selectInverseBallot(I);
+ case Intrinsic::amdgcn_s_bitreplicate:
+ return selectBitReplicate(I);
case Intrinsic::amdgcn_reloc_constant:
return selectRelocConstant(I);
case Intrinsic::amdgcn_groupstaticsize:
@@ -1470,6 +1472,18 @@ bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
return true;
}
+bool AMDGPUInstructionSelector::selectBitReplicate(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register MaskReg = I.getOperand(2).getReg();
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BITREPLICATE_B64_B32), DstReg)
+ .addReg(MaskReg);
+ I.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 93e45fcd8682f07..2bea2a5fe0804d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -114,6 +114,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectIntrinsicCmp(MachineInstr &MI) const;
bool selectBallot(MachineInstr &I) const;
bool selectInverseBallot(MachineInstr &I) const;
+ bool selectBitReplicate(MachineInstr &I) const;
bool selectRelocConstant(MachineInstr &I) const;
bool selectGroupStaticSize(MachineInstr &I) const;
bool selectReturnAddress(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5b056bd9e5dba2c..bbc0bc58ade3ada 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2994,6 +2994,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyMappingBFE(B, OpdMapper, false);
return;
case Intrinsic::amdgcn_inverse_ballot:
+ case Intrinsic::amdgcn_s_bitreplicate:
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 2); // Mask
return;
@@ -4544,6 +4545,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
break;
}
+ case Intrinsic::amdgcn_s_bitreplicate:
+ Register MaskReg = MI.getOperand(2).getReg();
+ unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
+ OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
}
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 51397cbb791469d..295e878341b438e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6297,6 +6297,14 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
return CreatedBB;
}
+ // Legalize S_BITREPLICATE
+ if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32) {
+ MachineOperand &Src = MI.getOperand(1);
+ if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
+ Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
+ return CreatedBB;
+ }
+
// Legalize MIMG and MUBUF/MTBUF for shaders.
//
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
new file mode 100644
index 000000000000000..027c9ef5e7cc349
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+declare i64 @llvm.amdgcn.s.bitreplicate(i32)
+
+define i64 @test_s_bitreplicate_constant() {
+; GFX11-LABEL: test_s_bitreplicate_constant:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], 0x85fe3a92
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 u0x85FE3A92)
+ ret i64 %br
+}
+
+define amdgpu_cs void @test_s_bitreplicate_sgpr(i32 inreg %mask, ptr addrspace(1) %out) {
+; GFX11-LABEL: test_s_bitreplicate_sgpr:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+ %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
+ store i64 %br, ptr addrspace(1) %out
+ ret void
+}
+
+define i64 @test_s_bitreplicate_vgpr(i32 %mask) {
+; GFX11-LABEL: test_s_bitreplicate_vgpr:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_bitreplicate_b64_b32 s[0:1], s0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
+ ret i64 %br
+}
>From 077ccfa79f313627aa94882aaf9e088464ab2d72 Mon Sep 17 00:00:00 2001
From: Jessica Del <Jessica.Del at amd.com>
Date: Tue, 24 Oct 2023 08:47:58 +0200
Subject: [PATCH 2/7] [AMDGPU] - Add pattern for s_bitreplicate intrinsic
Add a pattern instead of manually selecting
in GISel and SDAG.
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 3 ---
.../Target/AMDGPU/AMDGPUInstructionSelector.cpp | 14 --------------
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 1 -
llvm/lib/Target/AMDGPU/SOPInstructions.td | 5 +++++
4 files changed, 5 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index ff7b6724def9b1d..b5ceaaa14b4fd5e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2568,9 +2568,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
llvm_unreachable("Unsupported size for inverse ballot mask.");
}
break;
- case Intrinsic::amdgcn_s_bitreplicate:
- Opcode = AMDGPU::S_BITREPLICATE_B64_B32;
- break;
default:
SelectCode(N);
return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 1b2f809f102d308..31d72fb8cadd8a6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1067,8 +1067,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return selectBallot(I);
case Intrinsic::amdgcn_inverse_ballot:
return selectInverseBallot(I);
- case Intrinsic::amdgcn_s_bitreplicate:
- return selectBitReplicate(I);
case Intrinsic::amdgcn_reloc_constant:
return selectRelocConstant(I);
case Intrinsic::amdgcn_groupstaticsize:
@@ -1472,18 +1470,6 @@ bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
return true;
}
-bool AMDGPUInstructionSelector::selectBitReplicate(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- const DebugLoc &DL = I.getDebugLoc();
- const Register DstReg = I.getOperand(0).getReg();
- const Register MaskReg = I.getOperand(2).getReg();
-
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BITREPLICATE_B64_B32), DstReg)
- .addReg(MaskReg);
- I.eraseFromParent();
- return true;
-}
-
bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 2bea2a5fe0804d0..93e45fcd8682f07 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -114,7 +114,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectIntrinsicCmp(MachineInstr &MI) const;
bool selectBallot(MachineInstr &I) const;
bool selectInverseBallot(MachineInstr &I) const;
- bool selectBitReplicate(MachineInstr &I) const;
bool selectRelocConstant(MachineInstr &I) const;
bool selectGroupStaticSize(MachineInstr &I) const;
bool selectReturnAddress(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index f3309049e7a7fd9..9b4f23236876a93 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -363,6 +363,11 @@ let SubtargetPredicate = isGFX9Plus in {
let isReMaterializable = 1 in
def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">;
+
+ def : GCNPat <
+ (int_amdgcn_s_bitreplicate i32:$src0),
+ (S_BITREPLICATE_B64_B32 SSrc_b32:$src0)
+ >;
} // End SubtargetPredicate = isGFX9Plus
let SubtargetPredicate = isGFX10Plus in {
>From f04552b3c15ece7a72341c72797ad77f638cfe18 Mon Sep 17 00:00:00 2001
From: Jessica Del <Jessica.Del at amd.com>
Date: Tue, 24 Oct 2023 09:26:15 +0200
Subject: [PATCH 3/7] fixup! [AMDGPU] - Add pattern for s_bitreplicate
intrinsic
---
llvm/lib/Target/AMDGPU/SOPInstructions.td | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 9b4f23236876a93..ee5150eb47f7d68 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -362,12 +362,8 @@ let SubtargetPredicate = isGFX9Plus in {
} // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]
let isReMaterializable = 1 in
- def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">;
-
- def : GCNPat <
- (int_amdgcn_s_bitreplicate i32:$src0),
- (S_BITREPLICATE_B64_B32 SSrc_b32:$src0)
- >;
+ def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32",
+ [(set i64:$sdst, (int_amdgcn_s_bitreplicate i32:$src0))]>;
} // End SubtargetPredicate = isGFX9Plus
let SubtargetPredicate = isGFX10Plus in {
>From 98db73fe356c462108bdff6a661ee82be1f205ca Mon Sep 17 00:00:00 2001
From: Jessica Del <Jessica.Del at amd.com>
Date: Wed, 25 Oct 2023 09:29:14 +0200
Subject: [PATCH 4/7] fixup! [AMDGPU] - Generate s_bitreplicate_b64_b32
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 66aa7862893d049..0bbf0f28489430d 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1927,7 +1927,9 @@ def int_amdgcn_inverse_ballot :
Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
[IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
-
+// Since there is no good VALU equivalent of this instruction, we mark it as convergent.
+// This prevents tranformations from uniform input to divergent input.
+// If the input is in a VGPR, we insert a v_readfirstlane.
def int_amdgcn_s_bitreplicate :
Intrinsic<[llvm_i64_ty], [llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
>From bf7c1c5de4dab38f47ec3616f6c3d919fab3528f Mon Sep 17 00:00:00 2001
From: Jessica Del <Jessica.Del at amd.com>
Date: Wed, 25 Oct 2023 14:08:09 +0200
Subject: [PATCH 5/7] fixup! [AMDGPU] - Generate s_bitreplicate_b64_b32
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 0bbf0f28489430d..5c527cb9268858e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1928,7 +1928,8 @@ def int_amdgcn_inverse_ballot :
[IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// Since there is no good VALU equivalent of this instruction, we mark it as convergent.
-// This prevents tranformations from uniform input to divergent input.
+// This prevents tranformations from uniform input to divergent input, therefore
+// eliminating the need for a waterfall loop.
// If the input is in a VGPR, we insert a v_readfirstlane.
def int_amdgcn_s_bitreplicate :
Intrinsic<[llvm_i64_ty], [llvm_i32_ty],
>From 5bb2c3a8145fc49903828fc62e07e165a067ffb8 Mon Sep 17 00:00:00 2001
From: Jessica Del <Jessica.Del at amd.com>
Date: Wed, 25 Oct 2023 16:47:55 +0200
Subject: [PATCH 6/7] fixup! [AMDGPU] - Generate s_bitreplicate_b64_b32
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 5c527cb9268858e..c0d4f7ab094cb7e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1928,7 +1928,7 @@ def int_amdgcn_inverse_ballot :
[IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// Since there is no good VALU equivalent of this instruction, we mark it as convergent.
-// This prevents tranformations from uniform input to divergent input, therefore
+// This prevents transformations from uniform input to divergent input, therefore
// eliminating the need for a waterfall loop.
// If the input is in a VGPR, we insert a v_readfirstlane.
def int_amdgcn_s_bitreplicate :
>From 0550f6bf26e7d92b479b88d7c81121156148949c Mon Sep 17 00:00:00 2001
From: Jessica Del <Jessica.Del at amd.com>
Date: Fri, 27 Oct 2023 10:58:46 +0200
Subject: [PATCH 7/7] fixup! [AMDGPU] - Add pattern for s_bitreplicate
intrinsic
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index c0d4f7ab094cb7e..0aaa2627451f61f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1927,10 +1927,8 @@ def int_amdgcn_inverse_ballot :
Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
[IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
-// Since there is no good VALU equivalent of this instruction, we mark it as convergent.
-// This prevents transformations from uniform input to divergent input, therefore
-// eliminating the need for a waterfall loop.
-// If the input is in a VGPR, we insert a v_readfirstlane.
+// Lowers to S_BITREPLICATE_B64_B32.
+// The argument must be uniform; otherwise, the result is undefined.
def int_amdgcn_s_bitreplicate :
Intrinsic<[llvm_i64_ty], [llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
More information about the llvm-commits mailing list