[llvm] r296401 - AMDGPU: Use v_med3_{f16|i16|u16}
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 27 14:40:40 PST 2017
Author: arsenm
Date: Mon Feb 27 16:40:39 2017
New Revision: 296401
URL: http://llvm.org/viewvc/llvm-project?rev=296401&view=rev
Log:
AMDGPU: Use v_med3_{f16|i16|u16}
Added:
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td
llvm/trunk/test/CodeGen/AMDGPU/fmed3.ll
llvm/trunk/test/CodeGen/AMDGPU/smed3.ll
llvm/trunk/test/CodeGen/AMDGPU/umed3.ll
llvm/trunk/test/MC/AMDGPU/vop3-gfx9.s
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td?rev=296401&r1=296400&r2=296401&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td Mon Feb 27 16:40:39 2017
@@ -664,9 +664,10 @@ class ROTRPattern <Instruction BIT_ALIGN
class IntMed3Pat<Instruction med3Inst,
SDPatternOperator max,
SDPatternOperator max_oneuse,
- SDPatternOperator min_oneuse> : Pat<
- (max (min_oneuse i32:$src0, i32:$src1),
- (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
+ SDPatternOperator min_oneuse,
+ ValueType vt = i32> : Pat<
+ (max (min_oneuse vt:$src0, vt:$src1),
+ (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
(med3Inst $src0, $src1, $src2)
>;
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h?rev=296401&r1=296400&r2=296401&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h Mon Feb 27 16:40:39 2017
@@ -276,6 +276,10 @@ public:
return (getGeneration() >= EVERGREEN);
}
+ bool hasMed3_16() const {
+ return getGeneration() >= GFX9;
+ }
+
bool hasCARRY() const {
return (getGeneration() >= EVERGREEN);
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=296401&r1=296400&r2=296401&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Mon Feb 27 16:40:39 2017
@@ -4069,8 +4069,9 @@ static unsigned minMaxOpcToMin3Max3Opc(u
}
}
-static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
- SDValue Op0, SDValue Op1, bool Signed) {
+SDValue SITargetLowering::performIntMed3ImmCombine(
+ SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1, bool Signed) const {
ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
if (!K1)
return SDValue();
@@ -4088,23 +4089,22 @@ static SDValue performIntMed3ImmCombine(
}
EVT VT = K0->getValueType(0);
+ unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
+ if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
+ return DAG.getNode(Med3Opc, SL, VT,
+ Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+ }
+ // If there isn't a 16-bit med3 operation, convert to 32-bit.
MVT NVT = MVT::i32;
unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- SDValue Tmp1, Tmp2, Tmp3;
- Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
- Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
- Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
-
- if (VT == MVT::i16) {
- Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
- Tmp1, Tmp2, Tmp3);
-
- return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
- } else
- return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
- Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+ SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+ SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+ SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
+
+ SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
@@ -4141,9 +4141,8 @@ SDValue SITargetLowering::performFPMed3I
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
}
- // No med3 for f16, but clamp is possible.
- // TODO: gfx9 has med3 f16
- if (VT == MVT::f16 || VT == MVT::f64)
+ // med3 for f16 is only available on gfx9+.
+ if (VT == MVT::f64 || (VT == MVT::f16 && !Subtarget->hasMed3_16()))
return SDValue();
// This isn't safe with signaling NaNs because in IEEE mode, min/max on a
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=296401&r1=296400&r2=296401&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Mon Feb 27 16:40:39 2017
@@ -86,6 +86,8 @@ class SITargetLowering final : public AM
SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
SDValue Op0, SDValue Op1) const;
+ SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1, bool Signed) const;
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td?rev=296401&r1=296400&r2=296401&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td Mon Feb 27 16:40:39 2017
@@ -1321,7 +1321,7 @@ def VOP_F16_F16_I16 : VOPProfile <[f16,
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
-def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
+def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=296401&r1=296400&r2=296401&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Mon Feb 27 16:40:39 2017
@@ -1216,6 +1216,14 @@ def : Pat <
// Miscellaneous Optimization Patterns
//============================================================================//
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// TODO: Also do for 64-bit.
+def : Pat<
+ (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+ (S_SUB_I32 $src0, NegSubInlineConst32:$src1)
+>;
+
def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
@@ -1235,14 +1243,11 @@ class FPMed3Pat<ValueType vt,
def : FPMed3Pat<f32, V_MED3_F32>;
-
-// Undo sub x, c -> add x, -c canonicalization since c is more likely
-// an inline immediate than -c.
-// TODO: Also do for 64-bit.
-def : Pat<
- (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
- (S_SUB_I32 $src0, NegSubInlineConst32:$src1)
->;
+let Predicates = [isGFX9] in {
+def : FPMed3Pat<f16, V_MED3_F16>;
+def : IntMed3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>;
+def : IntMed3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>;
+} // End Predicates = [isGFX9]
//============================================================================//
// Assembler aliases
Modified: llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td?rev=296401&r1=296400&r2=296401&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td Mon Feb 27 16:40:39 2017
@@ -258,8 +258,8 @@ def V_MAD_I16 : VOP3Inst <"v_mad_i16", V
let Predicates = [isVI] in {
-multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
- Instruction inst, SDPatternOperator op3> {
+multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
+ Instruction inst, SDPatternOperator op3> {
def : Pat<
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
(inst i16:$src0, i16:$src1, i16:$src2)
@@ -278,8 +278,8 @@ def : Pat<
>;
}
-defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>;
-defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
} // End Predicates = [isVI]
@@ -291,6 +291,10 @@ def V_ADD3_U32 : VOP3Inst <"v_add3_u32",
def V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+
+def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>;
+def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>;
+def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>;
}
@@ -487,3 +491,7 @@ defm V_LSHL_OR_B32 : VOP3_Real_vi <0x200
defm V_AND_OR_B32 : VOP3_Real_vi <0x201>;
defm V_OR3_B32 : VOP3_Real_vi <0x202>;
defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>;
+
+defm V_MED3_F16 : VOP3_Real_vi <0x1fa>;
+defm V_MED3_I16 : VOP3_Real_vi <0x1fb>;
+defm V_MED3_U16 : VOP3_Real_vi <0x1fc>;
Modified: llvm/trunk/test/CodeGen/AMDGPU/fmed3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmed3.ll?rev=296401&r1=296400&r2=296401&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmed3.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmed3.ll Mon Feb 27 16:40:39 2017
@@ -1,5 +1,10 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+
; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}}
@@ -688,8 +693,8 @@ define void @v_test_global_nnans_med3_f3
; ---------------------------------------------------------------------
; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use0:
-; GCN: v_min_f32
-; GCN: v_max_f32
+; GCN-DAG: v_min_f32
+; GCN-DAG: v_max_f32
; GCN: v_min_f32
; GCN: v_max_f32
define void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
@@ -884,12 +889,86 @@ define void @v_test_global_nnans_min_max
ret void
}
+; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f16:
+; SI: v_cvt_f32_f16
+; SI: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; SI: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+; SI: v_cvt_f16_f32
+
+; VI: v_add_f16_e32 v{{[0-9]+}}, 1.0
+; VI: v_max_f16_e32 v{{[0-9]+}}, 2.0
+; VI: v_min_f16_e32 v{{[0-9]+}}, 4.0
+
+; GFX9: v_add_f16_e32 v{{[0-9]+}}, 1.0
+; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
+define void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+ %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
+ %a = load half, half addrspace(1)* %gep0
+ %a.add = fadd nnan half %a, 1.0
+ %max = call half @llvm.maxnum.f16(half %a.add, half 2.0)
+ %med = call half @llvm.minnum.f16(half %max, half 4.0)
+
+ store half %med, half addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_nnan_inputs_med3_f16_pat0:
+; GCN: {{buffer_|flat_}}load_ushort [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_ushort [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_ushort [[C:v[0-9]+]]
+
+; SI: v_cvt_f32_f16
+; SI: v_cvt_f32_f16
+; SI: v_add_f32_e32
+; SI: v_add_f32_e32
+; SI: v_add_f32_e32
+; SI: v_med3_f32
+; SI: v_cvt_f16_f32_e32
+
+
+; GFX89-DAG: v_add_f16_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]]
+; GFX89-DAG: v_add_f16_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
+; GFX89-DAG: v_add_f16_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
+
+; VI-DAG: v_min_f16
+; VI-DAG: v_max_f16
+; VI: v_min_f16
+; VI: v_max_f16
+
+; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
+define void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr half, half addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr half, half addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
+ %a = load volatile half, half addrspace(1)* %gep0
+ %b = load volatile half, half addrspace(1)* %gep1
+ %c = load volatile half, half addrspace(1)* %gep2
+
+ %a.nnan = fadd nnan half %a, 1.0
+ %b.nnan = fadd nnan half %b, 2.0
+ %c.nnan = fadd nnan half %c, 4.0
+
+ %tmp0 = call half @llvm.minnum.f16(half %a.nnan, half %b.nnan)
+ %tmp1 = call half @llvm.maxnum.f16(half %a.nnan, half %b.nnan)
+ %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %c.nnan)
+ %med3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2)
+ store half %med3, half addrspace(1)* %outgep
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.minnum.f32(float, float) #0
declare float @llvm.maxnum.f32(float, float) #0
declare double @llvm.minnum.f64(double, double) #0
declare double @llvm.maxnum.f64(double, double) #0
+declare half @llvm.fabs.f16(half) #0
+declare half @llvm.minnum.f16(half, half) #0
+declare half @llvm.maxnum.f16(half, half) #0
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
Added: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll?rev=296401&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll Mon Feb 27 16:40:39 2017
@@ -0,0 +1,39 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_fmed3_f16:
+; GCN: v_med3_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
+ %src0.f16 = trunc i32 %src0.arg to i16
+ %src0 = bitcast i16 %src0.f16 to half
+ %src1.f16 = trunc i32 %src1.arg to i16
+ %src1 = bitcast i16 %src1.f16 to half
+ %src2.f16 = trunc i32 %src2.arg to i16
+ %src2 = bitcast i16 %src2.f16 to half
+ %mad = call half @llvm.amdgcn.fmed3.f16(half %src0, half %src1, half %src2)
+ store half %mad, half addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fmed3_srcmods_f16:
+; GCN: v_med3_f16 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}|
+define void @test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
+ %src0.f16 = trunc i32 %src0.arg to i16
+ %src0 = bitcast i16 %src0.f16 to half
+ %src1.f16 = trunc i32 %src1.arg to i16
+ %src1 = bitcast i16 %src1.f16 to half
+ %src2.f16 = trunc i32 %src2.arg to i16
+ %src2 = bitcast i16 %src2.f16 to half
+ %src0.fneg = fsub half -0.0, %src0
+ %src1.fabs = call half @llvm.fabs.f16(half %src1)
+ %src2.fabs = call half @llvm.fabs.f16(half %src2)
+ %src2.fneg.fabs = fsub half -0.0, %src2.fabs
+ %mad = call half @llvm.amdgcn.fmed3.f16(half %src0.fneg, half %src1.fabs, half %src2.fneg.fabs)
+ store half %mad, half addrspace(1)* %out
+ ret void
+}
+
+declare half @llvm.amdgcn.fmed3.f16(half, half, half) #0
+declare half @llvm.fabs.f16(half) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
Modified: llvm/trunk/test/CodeGen/AMDGPU/smed3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/smed3.ll?rev=296401&r1=296400&r2=296401&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/smed3.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/smed3.ll Mon Feb 27 16:40:39 2017
@@ -1,12 +1,13 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32:
; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -25,7 +26,7 @@ define void @v_test_smed3_r_i_i_i32(i32
; GCN: v_max_i32
; GCN: v_min_i32
define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -45,7 +46,7 @@ define void @v_test_smed3_multi_use_r_i_
; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -64,7 +65,7 @@ define void @v_test_smed3_r_i_i_constant
; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -83,7 +84,7 @@ define void @v_test_smed3_r_i_i_sign_mis
; GCN: v_cmp_lt_i64
; GCN: v_cmp_gt_i64
define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
%a = load i64, i64 addrspace(1)* %gep0
@@ -99,9 +100,10 @@ define void @v_test_smed3_r_i_i_i64(i64
}
; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16:
-; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
%a = load i16, i16 addrspace(1)* %gep0
@@ -362,6 +364,7 @@ bb:
ret void
}
+; FIXME: Should keep scalar or not promote
; GCN-LABEL: {{^}}s_test_smed3_i16_pat_0:
; GCN: s_sext_i32_i16
; GCN: s_sext_i32_i16
@@ -444,6 +447,35 @@ bb:
ret void
}
+; GCN-LABEL: {{^}}v_test_smed3_i16_pat_0:
+; SI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; FIXME: VI not matching med3
+; VI: v_min_i16
+; VI: v_max_i16
+; VI: v_min_i16
+; VI: v_max_i16
+
+; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+bb:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
+ %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
+ %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
+ %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+ %x = load i16, i16 addrspace(1)* %gep0
+ %y = load i16, i16 addrspace(1)* %gep1
+ %z = load i16, i16 addrspace(1)* %gep2
+
+ %tmp0 = call i16 @smin16(i16 %x, i16 %y)
+ %tmp1 = call i16 @smax16(i16 %x, i16 %y)
+ %tmp2 = call i16 @smin16(i16 %tmp1, i16 %z)
+ %tmp3 = call i16 @smax16(i16 %tmp0, i16 %tmp2)
+ store i16 %tmp3, i16 addrspace(1)* %out.gep
+ ret void
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind readnone alwaysinline }
Modified: llvm/trunk/test/CodeGen/AMDGPU/umed3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/umed3.ll?rev=296401&r1=296400&r2=296401&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/umed3.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/umed3.ll Mon Feb 27 16:40:39 2017
@@ -1,12 +1,13 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32:
; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -25,7 +26,7 @@ define void @v_test_umed3_r_i_i_i32(i32
; GCN: v_max_u32
; GCN: v_min_u32
define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -45,7 +46,7 @@ define void @v_test_umed3_multi_use_r_i_
; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -64,7 +65,7 @@ define void @v_test_umed3_r_i_i_constant
; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@@ -83,7 +84,7 @@ define void @v_test_umed3_r_i_i_sign_mis
; GCN: v_cmp_lt_u64
; GCN: v_cmp_gt_u64
define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
%a = load i64, i64 addrspace(1)* %gep0
@@ -99,9 +100,10 @@ define void @v_test_umed3_r_i_i_i64(i64
}
; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16:
-; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
- %tid = call i32 @llvm.r600.read.tidig.x()
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
%a = load i16, i16 addrspace(1)* %gep0
@@ -479,6 +481,35 @@ bb:
ret void
}
+; GCN-LABEL: {{^}}v_test_umed3_i16_pat_0:
+; SI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; FIXME: VI not matching med3
+; VI: v_min_u16
+; VI: v_max_u16
+; VI: v_min_u16
+; VI: v_max_u16
+
+; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+bb:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
+ %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
+ %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
+ %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+ %x = load i16, i16 addrspace(1)* %gep0
+ %y = load i16, i16 addrspace(1)* %gep1
+ %z = load i16, i16 addrspace(1)* %gep2
+
+ %tmp0 = call i16 @umin16(i16 %x, i16 %y)
+ %tmp1 = call i16 @umax16(i16 %x, i16 %y)
+ %tmp2 = call i16 @umin16(i16 %tmp1, i16 %z)
+ %tmp3 = call i16 @umax16(i16 %tmp0, i16 %tmp2)
+ store i16 %tmp3, i16 addrspace(1)* %out.gep
+ ret void
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind readnone alwaysinline }
Modified: llvm/trunk/test/MC/AMDGPU/vop3-gfx9.s
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/MC/AMDGPU/vop3-gfx9.s?rev=296401&r1=296400&r2=296401&view=diff
==============================================================================
--- llvm/trunk/test/MC/AMDGPU/vop3-gfx9.s (original)
+++ llvm/trunk/test/MC/AMDGPU/vop3-gfx9.s Mon Feb 27 16:40:39 2017
@@ -30,3 +30,15 @@ v_or3_b32 v1, v2, v3, v4
v_pack_b32_f16 v1, v2, v3
// GFX9: v_pack_b32_f16 v1, v2, v3 ; encoding: [0x01,0x00,0xa0,0xd2,0x02,0x07,0x02,0x00]
// NOVI: :1: error: instruction not supported on this GPU
+
+v_med3_f16 v1, v2, v3, v4
+// GFX9: v_med3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfa,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_med3_i16 v1, v2, v3, v4
+// GFX9: v_med3_i16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfb,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_med3_u16 v1, v2, v3, v4
+// GFX9: v_med3_u16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfc,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
More information about the llvm-commits
mailing list