[llvm] r270305 - AMDGPU: Fix high bits after division optimization
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri May 20 18:53:35 PDT 2016
Author: arsenm
Date: Fri May 20 20:53:33 2016
New Revision: 270305
URL: http://llvm.org/viewvc/llvm-project?rev=270305&view=rev
Log:
AMDGPU: Fix high bits after division optimization
This is essentially doing a 24-bit signed division with FP.
We need to truncate to the N bit result.
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/trunk/test/CodeGen/AMDGPU/sdiv.ll
llvm/trunk/test/CodeGen/AMDGPU/sdivrem24.ll
llvm/trunk/test/CodeGen/AMDGPU/udiv.ll
llvm/trunk/test/CodeGen/AMDGPU/udivrem.ll
llvm/trunk/test/CodeGen/AMDGPU/udivrem24.ll
llvm/trunk/test/CodeGen/AMDGPU/udivrem64.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=270305&r1=270304&r2=270305&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Fri May 20 20:53:33 2016
@@ -1175,7 +1175,7 @@ SDValue AMDGPUTargetLowering::SplitVecto
// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
-// float is enough to accurately represent up to a 24-bit integer.
+// float is enough to accurately represent up to a 24-bit signed integer.
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
bool Sign) const {
SDLoc DL(Op);
@@ -1185,10 +1185,22 @@ SDValue AMDGPUTargetLowering::LowerDIVRE
MVT IntVT = MVT::i32;
MVT FltVT = MVT::f32;
- ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
- ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+ unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
+ if (LHSSignBits < 9)
+ return SDValue();
+
+ unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
+ if (RHSSignBits < 9)
+ return SDValue();
unsigned BitSize = VT.getSizeInBits();
+ unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
+ unsigned DivBits = BitSize - SignBits;
+ if (Sign)
+ ++DivBits;
+
+ ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+ ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
SDValue jq = DAG.getConstant(1, DL, IntVT);
@@ -1252,6 +1264,18 @@ SDValue AMDGPUTargetLowering::LowerDIVRE
SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
+ // Truncate to number of bits this divide really is.
+ if (Sign) {
+ SDValue InRegSize
+ = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
+ Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
+ Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
+ } else {
+ SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
+ Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
+ Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
+ }
+
return DAG.getMergeValues({ Div, Rem }, DL);
}
@@ -1344,19 +1368,14 @@ SDValue AMDGPUTargetLowering::LowerUDIVR
return DAG.getMergeValues(Results, DL);
}
- SDValue Num = Op.getOperand(0);
- SDValue Den = Op.getOperand(1);
-
if (VT == MVT::i32) {
- if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) &&
- DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) {
- // TODO: We technically could do this for i64, but shouldn't that just be
- // handled by something generally reducing 64-bit division on 32-bit
- // values to 32-bit?
- return LowerDIVREM24(Op, DAG, false);
- }
+ if (SDValue Res = LowerDIVREM24(Op, DAG, false))
+ return Res;
}
+ SDValue Num = Op.getOperand(0);
+ SDValue Den = Op.getOperand(1);
+
// RCP = URECIP(Den) = 2^32 / Den + e
// e is rounding error.
SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
@@ -1464,11 +1483,11 @@ SDValue AMDGPUTargetLowering::LowerSDIVR
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue NegOne = DAG.getConstant(-1, DL, VT);
- if (VT == MVT::i32 &&
- DAG.ComputeNumSignBits(LHS) > 8 &&
- DAG.ComputeNumSignBits(RHS) > 8) {
- return LowerDIVREM24(Op, DAG, true);
+ if (VT == MVT::i32) {
+ if (SDValue Res = LowerDIVREM24(Op, DAG, true))
+ return Res;
}
+
if (VT == MVT::i64 &&
DAG.ComputeNumSignBits(LHS) > 32 &&
DAG.ComputeNumSignBits(RHS) > 32) {
Modified: llvm/trunk/test/CodeGen/AMDGPU/sdiv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sdiv.ll?rev=270305&r1=270304&r2=270305&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sdiv.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sdiv.ll Fri May 20 20:53:33 2016
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -82,6 +82,60 @@ define void @sdiv_v4i32_4(<4 x i32> addr
ret void
}
+; FUNC-LABEL: {{^}}v_sdiv_i8:
+; SI: v_rcp_f32
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 8
+; SI: buffer_store_dword [[BFE]]
+define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+ %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+ %num = load i8, i8 addrspace(1) * %in
+ %den = load i8, i8 addrspace(1) * %den_ptr
+ %result = sdiv i8 %num, %den
+ %result.ext = sext i8 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sdiv_i23:
+; SI: v_rcp_f32
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 23
+; SI: buffer_store_dword [[BFE]]
+define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+ %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
+ %num = load i23, i23 addrspace(1) * %in
+ %den = load i23, i23 addrspace(1) * %den_ptr
+ %result = sdiv i23 %num, %den
+ %result.ext = sext i23 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sdiv_i24:
+; SI: v_rcp_f32
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 24
+; SI: buffer_store_dword [[BFE]]
+define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+ %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
+ %num = load i24, i24 addrspace(1) * %in
+ %den = load i24, i24 addrspace(1) * %den_ptr
+ %result = sdiv i24 %num, %den
+ %result.ext = sext i24 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sdiv_i25:
+; SI-NOT: v_rcp_f32
+define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
+ %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1
+ %num = load i25, i25 addrspace(1) * %in
+ %den = load i25, i25 addrspace(1) * %den_ptr
+ %result = sdiv i25 %num, %den
+ %result.ext = sext i25 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
; Tests for 64-bit divide bypass.
; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
; %result = sdiv i64 %a, %b
Modified: llvm/trunk/test/CodeGen/AMDGPU/sdivrem24.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sdivrem24.ll?rev=270305&r1=270304&r2=270305&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sdivrem24.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sdivrem24.ll Fri May 20 20:53:33 2016
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -181,13 +181,13 @@ define void @srem24_i32(i32 addrspace(1)
ret void
}
-; FUNC-LABEL: {{^}}srem25_i32:
+; FUNC-LABEL: {{^}}no_srem25_i32:
; SI-NOT: v_cvt_f32_i32
; SI-NOT: v_rcp_f32
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -200,40 +200,138 @@ define void @srem25_i32(i32 addrspace(1)
ret void
}
-; FUNC-LABEL: {{^}}test_no_srem24_i32_1:
+; FUNC-LABEL: {{^}}no_sdiv25_i24_i25_i32:
; SI-NOT: v_cvt_f32_i32
; SI-NOT: v_rcp_f32
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
%num.i24.0 = shl i32 %num, 8
- %den.i24.0 = shl i32 %den, 7
+ %den.i25.0 = shl i32 %den, 7
%num.i24 = ashr i32 %num.i24.0, 8
- %den.i24 = ashr i32 %den.i24.0, 7
- %result = srem i32 %num.i24, %den.i24
+ %den.i25 = ashr i32 %den.i25.0, 7
+ %result = sdiv i32 %num.i24, %den.i25
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
-; FUNC-LABEL: {{^}}test_no_srem24_i32_2:
+; FUNC-LABEL: {{^}}no_sdiv25_i25_i24_i32:
; SI-NOT: v_cvt_f32_i32
; SI-NOT: v_rcp_f32
; EG-NOT: INT_TO_FLT
; EG-NOT: RECIP_IEEE
-define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
- %num.i24.0 = shl i32 %num, 7
+ %num.i25.0 = shl i32 %num, 7
%den.i24.0 = shl i32 %den, 8
- %num.i24 = ashr i32 %num.i24.0, 7
+ %num.i25 = ashr i32 %num.i25.0, 7
%den.i24 = ashr i32 %den.i24.0, 8
- %result = srem i32 %num.i24, %den.i24
+ %result = sdiv i32 %num.i25, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}no_srem25_i24_i25_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i25.0 = shl i32 %den, 7
+ %num.i24 = ashr i32 %num.i24.0, 8
+ %den.i25 = ashr i32 %den.i25.0, 7
+ %result = srem i32 %num.i24, %den.i25
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}no_srem25_i25_i24_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i25.0 = shl i32 %num, 7
+ %den.i24.0 = shl i32 %den, 8
+ %num.i25 = ashr i32 %num.i25.0, 7
+ %den.i24 = ashr i32 %den.i24.0, 8
+ %result = srem i32 %num.i25, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i24_i11_i32:
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 24
+
+; EG: INT_TO_FLT
+; EG: RECIP_IEEE
+define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i11.0 = shl i32 %den, 21
+ %num.i24 = ashr i32 %num.i24.0, 8
+ %den.i11 = ashr i32 %den.i11.0, 21
+ %result = srem i32 %num.i24, %den.i11
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i11_i24_i32:
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 24
+
+; EG: INT_TO_FLT
+; EG: RECIP_IEEE
+define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i11.0 = shl i32 %num, 21
+ %den.i24.0 = shl i32 %den, 8
+ %num.i11 = ashr i32 %num.i11.0, 21
+ %den.i24 = ashr i32 %den.i24.0, 8
+ %result = srem i32 %num.i11, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i17_i12_i32:
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 17
+
+; EG: INT_TO_FLT
+; EG: RECIP_IEEE
+define void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i17.0 = shl i32 %num, 15
+ %den.i12.0 = shl i32 %den, 20
+ %num.i17 = ashr i32 %num.i17.0, 15
+ %den.i12 = ashr i32 %den.i12.0, 20
+ %result = sdiv i32 %num.i17, %den.i12
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
Modified: llvm/trunk/test/CodeGen/AMDGPU/udiv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/udiv.ll?rev=270305&r1=270304&r2=270305&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/udiv.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/udiv.ll Fri May 20 20:53:33 2016
@@ -91,3 +91,57 @@ define void @udiv_i32_div_k_odd(i32 addr
store i32 %result, i32 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: {{^}}v_udiv_i8:
+; SI: v_rcp_f32
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}}
+; SI: buffer_store_dword [[TRUNC]]
+define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+ %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+ %num = load i8, i8 addrspace(1) * %in
+ %den = load i8, i8 addrspace(1) * %den_ptr
+ %result = udiv i8 %num, %den
+ %result.ext = zext i8 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_udiv_i16:
+; SI: v_rcp_f32
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, v{{[0-9]+}}
+; SI: buffer_store_dword [[TRUNC]]
+define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+ %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
+ %num = load i16, i16 addrspace(1) * %in
+ %den = load i16, i16 addrspace(1) * %den_ptr
+ %result = udiv i16 %num, %den
+ %result.ext = zext i16 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_udiv_i23:
+; SI: v_rcp_f32
+; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; SI: buffer_store_dword [[TRUNC]]
+define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+ %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
+ %num = load i23, i23 addrspace(1) * %in
+ %den = load i23, i23 addrspace(1) * %den_ptr
+ %result = udiv i23 %num, %den
+ %result.ext = zext i23 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_udiv_i24:
+; SI-NOT: v_rcp_f32
+define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+ %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
+ %num = load i24, i24 addrspace(1) * %in
+ %den = load i24, i24 addrspace(1) * %den_ptr
+ %result = udiv i24 %num, %den
+ %result.ext = zext i24 %result to i32
+ store i32 %result.ext, i32 addrspace(1)* %out
+ ret void
+}
Modified: llvm/trunk/test/CodeGen/AMDGPU/udivrem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/udivrem.ll?rev=270305&r1=270304&r2=270305&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/udivrem.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/udivrem.ll Fri May 20 20:53:33 2016
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
Modified: llvm/trunk/test/CodeGen/AMDGPU/udivrem24.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/udivrem24.ll?rev=270305&r1=270304&r2=270305&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/udivrem24.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/udivrem24.ll Fri May 20 20:53:33 2016
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}udiv24_i8:
@@ -40,7 +40,7 @@ define void @udiv24_i16(i16 addrspace(1)
ret void
}
-; FUNC-LABEL: {{^}}udiv24_i32:
+; FUNC-LABEL: {{^}}udiv23_i32:
; SI: v_cvt_f32_u32
; SI-DAG: v_cvt_f32_u32
; SI-DAG: v_rcp_f32
@@ -50,6 +50,23 @@ define void @udiv24_i16(i16 addrspace(1)
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
+define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i23.0 = shl i32 %num, 9
+ %den.i23.0 = shl i32 %den, 9
+ %num.i23 = lshr i32 %num.i23.0, 9
+ %den.i23 = lshr i32 %den.i23.0, 9
+ %result = udiv i32 %num.i23, %den.i23
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}udiv24_i32:
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
@@ -63,6 +80,40 @@ define void @udiv24_i32(i32 addrspace(1)
ret void
}
+; FUNC-LABEL: {{^}}no_udiv24_u23_u24_i32:
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
+define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i23.0 = shl i32 %num, 9
+ %den.i24.0 = shl i32 %den, 8
+ %num.i23 = lshr i32 %num.i23.0, 9
+ %den.i24 = lshr i32 %den.i24.0, 8
+ %result = udiv i32 %num.i23, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}no_udiv24_u24_u23_i32:
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
+define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i23.0 = shl i32 %den, 9
+ %num.i24 = lshr i32 %num.i24.0, 8
+ %den.i23 = lshr i32 %den.i23.0, 9
+ %result = udiv i32 %num.i24, %den.i23
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
; FUNC-LABEL: {{^}}udiv25_i32:
; RCP_IFLAG is for URECIP in the full 32b alg
; SI: v_rcp_iflag
@@ -74,11 +125,11 @@ define void @udiv25_i32(i32 addrspace(1)
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
%den = load i32, i32 addrspace(1) * %den_ptr, align 4
- %num.i24.0 = shl i32 %num, 7
- %den.i24.0 = shl i32 %den, 7
- %num.i24 = lshr i32 %num.i24.0, 7
- %den.i24 = lshr i32 %den.i24.0, 7
- %result = udiv i32 %num.i24, %den.i24
+ %num.i25.0 = shl i32 %num, 7
+ %den.i25.0 = shl i32 %den, 7
+ %num.i25 = lshr i32 %num.i25.0, 7
+ %den.i25 = lshr i32 %den.i25.0, 7
+ %result = udiv i32 %num.i25, %den.i25
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
@@ -162,15 +213,8 @@ define void @urem24_i16(i16 addrspace(1)
}
; FUNC-LABEL: {{^}}urem24_i32:
-; SI: v_cvt_f32_u32
-; SI: v_cvt_f32_u32
-; SI: v_rcp_f32
-; SI: v_cvt_u32_f32
-
-; EG: UINT_TO_FLT
-; EG-DAG: UINT_TO_FLT
-; EG-DAG: RECIP_IEEE
-; EG: FLT_TO_UINT
+; SI-NOT: v_rcp_f32
+; EG-NOT: RECIP_IEEE
define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%num = load i32, i32 addrspace(1) * %in, align 4
@@ -243,3 +287,41 @@ define void @test_no_urem24_i32_2(i32 ad
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}
+
+; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32:
+; SI-DAG: v_rcp_f32
+; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
+; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
+
+; EG: RECIP_IEEE
+define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i16.0 = shl i32 %num, 16
+ %den.i23.0 = shl i32 %den, 9
+ %num.i16 = lshr i32 %num.i16.0, 16
+ %den.i23 = lshr i32 %den.i23.0, 9
+ %result = udiv i32 %num.i16, %den.i23
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32:
+; SI-DAG: v_rcp_f32
+; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}}
+; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
+
+; EG: RECIP_IEEE
+define void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i23.0 = shl i32 %num, 9
+ %den.i16.0 = shl i32 %den, 16
+ %num.i23 = lshr i32 %num.i23.0, 9
+ %den.i16 = lshr i32 %den.i16.0, 16
+ %result = udiv i32 %num.i23, %den.i16
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
Modified: llvm/trunk/test/CodeGen/AMDGPU/udivrem64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/udivrem64.ll?rev=270305&r1=270304&r2=270305&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/udivrem64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/udivrem64.ll Fri May 20 20:53:33 2016
@@ -1,4 +1,4 @@
-;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
@@ -184,7 +184,7 @@ define void @test_urem3264(i64 addrspace
ret void
}
-;FUNC-LABEL: {{^}}test_udiv2464:
+;FUNC-LABEL: {{^}}test_udiv2364:
;EG: UINT_TO_FLT
;EG: UINT_TO_FLT
;EG: FLT_TO_UINT
@@ -195,15 +195,15 @@ define void @test_urem3264(i64 addrspace
;VI-NOT: v_lshrrev_b64
;GCN: v_mad_f32
;GCN: s_endpgm
-define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
- %1 = lshr i64 %x, 40
- %2 = lshr i64 %y, 40
+define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = lshr i64 %x, 41
+ %2 = lshr i64 %y, 41
%result = udiv i64 %1, %2
store i64 %result, i64 addrspace(1)* %out
ret void
}
-;FUNC-LABEL: {{^}}test_urem2464:
+;FUNC-LABEL: {{^}}test_urem2364:
;EG: UINT_TO_FLT
;EG: UINT_TO_FLT
;EG: FLT_TO_UINT
@@ -214,9 +214,9 @@ define void @test_udiv2464(i64 addrspace
;VI-NOT: v_lshrrev_b64
;GCN: v_mad_f32
;GCN: s_endpgm
-define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
- %1 = lshr i64 %x, 40
- %2 = lshr i64 %y, 40
+define void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = lshr i64 %x, 41
+ %2 = lshr i64 %y, 41
%result = urem i64 %1, %2
store i64 %result, i64 addrspace(1)* %out
ret void
More information about the llvm-commits
mailing list