[llvm] 35ef4c9 - [AMDGPU][GlobalISel] Legalize G_ABS
Mirko Brkusanin via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 4 05:50:32 PDT 2021
Author: Mirko Brkusanin
Date: 2021-06-04T14:46:43+02:00
New Revision: 35ef4c940bea1e2b803f17f13a625b2126c62b82
URL: https://github.com/llvm/llvm-project/commit/35ef4c940bea1e2b803f17f13a625b2126c62b82
DIFF: https://github.com/llvm/llvm-project/commit/35ef4c940bea1e2b803f17f13a625b2126c62b82.diff
LOG: [AMDGPU][GlobalISel] Legalize G_ABS
Legalize and select G_ABS so that we can use the llvm.abs intrinsic.
Differential Revision: https://reviews.llvm.org/D102391
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
Modified:
llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/SOPInstructions.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index f8ebe322cf69d..7bdd073b70cc7 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -397,6 +397,8 @@ class LegalizerHelper {
LegalizeResult lowerSMULH_UMULH(MachineInstr &MI);
LegalizeResult lowerSelect(MachineInstr &MI);
LegalizeResult lowerDIVREM(MachineInstr &MI);
+ LegalizeResult lowerAbsToAddXor(MachineInstr &MI);
+ LegalizeResult lowerAbsToMaxNeg(MachineInstr &MI);
};
/// Helper function that creates a libcall to the given \p Name using the given
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 4240f7fe6223e..ba8050bb9d787 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2015,6 +2015,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
Observer.changedInstr(MI);
return Legalized;
+ case TargetOpcode::G_ABS:
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
+ widenScalarDst(MI, WideTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+
case TargetOpcode::G_ADD:
case TargetOpcode::G_AND:
case TargetOpcode::G_MUL:
@@ -3200,22 +3207,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
case G_SSHLSAT:
case G_USHLSAT:
return lowerShlSat(MI);
- case G_ABS: {
- // Expand %res = G_ABS %a into:
- // %v1 = G_ASHR %a, scalar_size-1
- // %v2 = G_ADD %a, %v1
- // %res = G_XOR %v2, %v1
- LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
- Register OpReg = MI.getOperand(1).getReg();
- auto ShiftAmt =
- MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
- auto Shift =
- MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
- auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
- MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
- MI.eraseFromParent();
- return Legalized;
- }
+ case G_ABS:
+ return lowerAbsToAddXor(MI);
case G_SELECT:
return lowerSelect(MI);
case G_SDIVREM:
@@ -4160,6 +4153,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
case G_SMAX:
case G_UMIN:
case G_UMAX:
+ case G_ABS:
case G_FMINNUM:
case G_FMAXNUM:
case G_FMINNUM_IEEE:
@@ -7010,3 +7004,35 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
MI.eraseFromParent();
return Legalized;
}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
+ // Shift/add/xor lowering. Expand %res = G_ABS %a into:
+ // %v1 = G_ASHR %a, scalar_size-1
+ // %v2 = G_ADD %a, %v1
+ // %res = G_XOR %v2, %v1
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ Register OpReg = MI.getOperand(1).getReg();
+ auto ShiftAmt = // scalar_size-1, built as a constant of the destination type.
+ MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
+ auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt); // %v1
+ auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift); // %v2
+ MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift); // %res, written to MI's def.
+ MI.eraseFromParent(); // The G_ABS is replaced by the sequence built above.
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
+ // Negate/max lowering, i.e. %res = smax(%a, 0 - %a). Expand %res = G_ABS %a into:
+ // %v1 = G_CONSTANT 0
+ // %v2 = G_SUB %v1, %a
+ // %res = G_SMAX %a, %v2
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(SrcReg); // All new values use the source operand's type.
+ auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0); // %v1
+ auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0); // %v2 = 0 - %a
+ MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub); // %res, written to MI's def.
+ MI.eraseFromParent(); // The G_ABS is replaced by the sequence built above.
+ return Legalized;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index fc9e9a103f28c..ad6196e8a7b38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -966,7 +966,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
if (ST.hasVOP3PInsts()) {
- getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32, S16, V2S16})
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.clampMaxNumElements(0, S16, 2)
@@ -975,7 +975,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.lower();
} else {
- getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32, S16})
.widenScalarToNextPow2(0)
.minScalar(0, S16)
@@ -994,7 +994,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.lower();
- getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
+ getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32})
.minScalar(0, S32)
.widenScalarToNextPow2(0)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 770a9f764188c..4d2f4844b0f7a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2324,6 +2324,24 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MI.eraseFromParent();
return;
}
+ case AMDGPU::G_ABS: {
+ Register SrcReg = MI.getOperand(1).getReg();
+ const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
+
+ // There is no VALU abs instruction so we need to replace it with a sub and
+ // max combination.
+ if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
+ MachineFunction *MF = MI.getParent()->getParent();
+ ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
+ MachineIRBuilder B(MI, Apply);
+ LegalizerHelper Helper(*MF, Apply, B);
+
+ if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
+ llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
+ return;
+ }
+ LLVM_FALLTHROUGH;
+ }
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
case AMDGPU::G_MUL:
@@ -3508,6 +3526,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_SMAX:
case AMDGPU::G_UMIN:
case AMDGPU::G_UMAX:
+ case AMDGPU::G_ABS:
case AMDGPU::G_SHUFFLE_VECTOR:
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 2319e5ace041d..175e9043b5b20 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -294,7 +294,9 @@ def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">;
} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9
let Defs = [SCC] in {
-def S_ABS_I32 : SOP1_32 <"s_abs_i32">;
+def S_ABS_I32 : SOP1_32 <"s_abs_i32",
+ [(set i32:$sdst, (abs i32:$src0))]
+ >;
} // End Defs = [SCC]
let SubtargetPredicate = HasVGPRIndexMode in {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
new file mode 100644
index 0000000000000..cca8a9ee86fde
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -0,0 +1,154 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs -o - < %s | FileCheck %s --check-prefixes=GFX,GFX6
+; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs -o - < %s | FileCheck %s --check-prefixes=GFX,GFX8
+
+declare i16 @llvm.abs.i16(i16, i1)
+declare i32 @llvm.abs.i32(i32, i1)
+declare i64 @llvm.abs.i64(i64, i1)
+declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
+
+define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
+; GFX-LABEL: abs_sgpr_i16:
+; GFX: ; %bb.0:
+; GFX-NEXT: s_sext_i32_i16 s0, s0
+; GFX-NEXT: s_abs_i32 s0, s0
+; GFX-NEXT: ; return to shader part epilog
+ %res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
+ ret i16 %res
+}
+
+define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
+; GFX-LABEL: abs_sgpr_i32:
+; GFX: ; %bb.0:
+; GFX-NEXT: s_abs_i32 s0, s0
+; GFX-NEXT: ; return to shader part epilog
+ %res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
+ ret i32 %res
+}
+
+define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
+; GFX-LABEL: abs_sgpr_i64:
+; GFX: ; %bb.0:
+; GFX-NEXT: s_ashr_i32 s2, s1, 31
+; GFX-NEXT: s_add_u32 s0, s0, s2
+; GFX-NEXT: s_cselect_b32 s4, 1, 0
+; GFX-NEXT: s_and_b32 s4, s4, 1
+; GFX-NEXT: s_cmp_lg_u32 s4, 0
+; GFX-NEXT: s_mov_b32 s3, s2
+; GFX-NEXT: s_addc_u32 s1, s1, s2
+; GFX-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX-NEXT: ; return to shader part epilog
+ %res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
+ ret i64 %res
+}
+
+define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
+; GFX-LABEL: abs_sgpr_v4i32:
+; GFX: ; %bb.0:
+; GFX-NEXT: s_abs_i32 s0, s0
+; GFX-NEXT: s_abs_i32 s1, s1
+; GFX-NEXT: s_abs_i32 s2, s2
+; GFX-NEXT: s_abs_i32 s3, s3
+; GFX-NEXT: ; return to shader part epilog
+ %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
+ ret <4 x i32> %res
+}
+
+define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) {
+; GFX6-LABEL: abs_vgpr_i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_vgpr_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+ %res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
+ ret i16 %res
+}
+
+define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) {
+; GFX6-LABEL: abs_vgpr_i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_vgpr_i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
+; GFX8-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+ %res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
+ ret i32 %res
+}
+
+define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
+; GFX6-LABEL: abs_vgpr_i64:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_vgpr_i64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_xor_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: ; return to shader part epilog
+ %res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
+ ret i64 %res
+}
+
+define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
+; GFX6-LABEL: abs_vgpr_v4i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: v_readfirstlane_b32 s2, v2
+; GFX6-NEXT: v_readfirstlane_b32 s3, v3
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_vgpr_v4i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0
+; GFX8-NEXT: v_max_i32_e32 v0, v0, v4
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1
+; GFX8-NEXT: v_max_i32_e32 v1, v1, v4
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v2
+; GFX8-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v3
+; GFX8-NEXT: v_max_i32_e32 v3, v3, v4
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v3
+; GFX8-NEXT: ; return to shader part epilog
+ %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
+ ret <4 x i32> %res
+}
More information about the llvm-commits
mailing list