[llvm] 5a50352 - AMDGPU/GlobalISel: Implement expansion for rsq.clamp
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 6 07:30:52 PDT 2020
Author: Matt Arsenault
Date: 2020-08-06T10:23:25-04:00
New Revision: 5a503521e7b757bda70325f4c01bdbc0f4e3128e
URL: https://github.com/llvm/llvm-project/commit/5a503521e7b757bda70325f4c01bdbc0f4e3128e
DIFF: https://github.com/llvm/llvm-project/commit/5a503521e7b757bda70325f4c01bdbc0f4e3128e.diff
LOG: AMDGPU/GlobalISel: Implement expansion for rsq.clamp
Not sure why we handle this removed instruction on newer subtargets
for this one and no others, but maintain compatibility with the DAG.
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.rsq.clamp.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 457cb615216d..fb5bdf8aed51 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3160,6 +3160,55 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
return true;
}
+// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
+// FIXME: Why do we handle this one but not other removed instructions?
+//
+// Reciprocal square root. The clamp prevents infinite results, clamping
+// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
+// +-max_float.
+bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return true;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(2).getReg();
+ auto Flags = MI.getFlags();
+
+ LLT Ty = MRI.getType(Dst);
+
+ const fltSemantics *FltSemantics;
+ if (Ty == LLT::scalar(32))
+ FltSemantics = &APFloat::IEEEsingle();
+ else if (Ty == LLT::scalar(64))
+ FltSemantics = &APFloat::IEEEdouble();
+ else
+ return false;
+
+ auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
+ .addUse(Src)
+ .setMIFlags(Flags);
+
+ // We don't need to concern ourselves with the snan handling difference, since
+ // the rsq quieted (or not) so use the one which will directly select.
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ const bool UseIEEE = MFI->getMode().IEEE;
+
+ auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
+ auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
+ B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
+
+ auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
+
+ if (UseIEEE)
+ B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
+ else
+ B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -4393,6 +4442,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeTrapIntrinsic(MI, MRI, B);
case Intrinsic::debugtrap:
return legalizeDebugTrapIntrinsic(MI, MRI, B);
+ case Intrinsic::amdgcn_rsq_clamp:
+ return legalizeRsqClampIntrinsic(MI, MRI, B);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 99191487f90d..3e3e1f1b1015 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -128,6 +128,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.rsq.clamp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.rsq.clamp.mir
new file mode 100644
index 000000000000..890a2245f80b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.rsq.clamp.mir
@@ -0,0 +1,63 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s
+
+---
+name: test_rsq_clamp_flags_ieee_on_f32
+tracksRegLiveness: true
+machineFunctionInfo:
+ mode:
+ ieee: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; SI-LABEL: name: test_rsq_clamp_flags_ieee_on_f32
+ ; SI: liveins: $vgpr0
+ ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; SI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), [[COPY]](s32)
+ ; SI: $vgpr0 = COPY [[INT]](s32)
+ ; VI-LABEL: name: test_rsq_clamp_flags_ieee_on_f32
+ ; VI: liveins: $vgpr0
+ ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; VI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+ ; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x47EFFFFFE0000000
+ ; VI: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMINNUM_IEEE [[INT]], [[C]]
+ ; VI: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC7EFFFFFE0000000
+ ; VI: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMAXNUM_IEEE [[FMINNUM_IEEE]], [[C1]]
+ ; VI: $vgpr0 = COPY [[FMAXNUM_IEEE]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0
+ $vgpr0 = COPY %1
+...
+
+---
+name: test_rsq_clamp_flags_ieee_off_f32
+tracksRegLiveness: true
+machineFunctionInfo:
+ mode:
+ ieee: false
+
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; SI-LABEL: name: test_rsq_clamp_flags_ieee_off_f32
+ ; SI: liveins: $vgpr0
+ ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; SI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), [[COPY]](s32)
+ ; SI: $vgpr0 = COPY [[INT]](s32)
+ ; VI-LABEL: name: test_rsq_clamp_flags_ieee_off_f32
+ ; VI: liveins: $vgpr0
+ ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; VI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+ ; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x47EFFFFFE0000000
+ ; VI: [[FMINNUM:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMINNUM [[INT]], [[C]]
+ ; VI: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC7EFFFFFE0000000
+ ; VI: [[FMAXNUM:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMAXNUM [[FMINNUM]], [[C1]]
+ ; VI: $vgpr0 = COPY [[FMAXNUM]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0
+ $vgpr0 = COPY %1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
new file mode 100644
index 000000000000..bd570df3d83b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+
+define float @v_rsq_clamp_f32(float %src) #0 {
+; SI-LABEL: v_rsq_clamp_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f32_e32 v0, v0
+; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
+; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
+ ret float %rsq_clamp
+}
+
+define float @v_rsq_clamp_fabs_f32(float %src) #0 {
+; SI-LABEL: v_rsq_clamp_fabs_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f32_e64 v0, |v0|
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_fabs_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f32_e64 v0, |v0|
+; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
+; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %fabs.src = call float @llvm.fabs.f32(float %src)
+ %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %fabs.src)
+ ret float %rsq_clamp
+}
+
+define double @v_rsq_clamp_f64(double %src) #0 {
+; SI-LABEL: v_rsq_clamp_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], v[0:1]
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
+; VI-NEXT: s_mov_b32 s4, -1
+; VI-NEXT: s_mov_b32 s5, 0x7fefffff
+; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_mov_b32 s5, 0xffefffff
+; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
+ ret double %rsq_clamp
+}
+
+define double @v_rsq_clamp_fabs_f64(double %src) #0 {
+; SI-LABEL: v_rsq_clamp_fabs_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f64_e64 v[0:1], |v[0:1]|
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_fabs_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f64_e64 v[0:1], |v[0:1]|
+; VI-NEXT: s_mov_b32 s4, -1
+; VI-NEXT: s_mov_b32 s5, 0x7fefffff
+; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_mov_b32 s5, 0xffefffff
+; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %fabs.src = call double @llvm.fabs.f64(double %src)
+ %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %fabs.src)
+ ret double %rsq_clamp
+}
+
+define float @v_rsq_clamp_undef_f32() #0 {
+; SI-LABEL: v_rsq_clamp_undef_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f32_e32 v0, s4
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_undef_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f32_e32 v0, s4
+; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
+; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
+ ret float %rsq_clamp
+}
+
+define double @v_rsq_clamp_undef_f64() #0 {
+; SI-LABEL: v_rsq_clamp_undef_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], s[4:5]
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_undef_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f64_e32 v[0:1], s[4:5]
+; VI-NEXT: s_mov_b32 s4, -1
+; VI-NEXT: s_mov_b32 s5, 0x7fefffff
+; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_mov_b32 s5, 0xffefffff
+; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef)
+ ret double %rsq_clamp
+}
+
+define float @v_rsq_clamp_f32_non_ieee(float %src) #2 {
+; SI-LABEL: v_rsq_clamp_f32_non_ieee:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_f32_non_ieee:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f32_e32 v0, v0
+; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
+; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
+ ret float %rsq_clamp
+}
+
+define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
+; SI-LABEL: v_rsq_clamp_f64_non_ieee:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], v[0:1]
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_rsq_clamp_f64_non_ieee:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
+; VI-NEXT: s_mov_b32 s4, -1
+; VI-NEXT: s_mov_b32 s5, 0x7fefffff
+; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_mov_b32 s5, 0xffefffff
+; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
+ ret double %rsq_clamp
+}
+
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.amdgcn.rsq.clamp.f32(float) #1
+declare double @llvm.fabs.f64(double) #1
+declare double @llvm.amdgcn.rsq.clamp.f64(double) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "amdgpu-ieee"="false" }
More information about the llvm-commits
mailing list