[llvm] r225306 - R600/SI: Add basic DAG combines for fp_class
Matt Arsenault
Matthew.Arsenault at amd.com
Tue Jan 6 15:00:39 PST 2015
Author: arsenm
Date: Tue Jan 6 17:00:39 2015
New Revision: 225306
URL: http://llvm.org/viewvc/llvm-project?rev=225306&view=rev
Log:
R600/SI: Add basic DAG combines for fp_class
Modified:
llvm/trunk/lib/Target/R600/SIISelLowering.cpp
llvm/trunk/lib/Target/R600/SIISelLowering.h
llvm/trunk/test/CodeGen/R600/llvm.AMDGPU.class.ll
Modified: llvm/trunk/lib/Target/R600/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIISelLowering.cpp?rev=225306&r1=225305&r2=225306&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/R600/SIISelLowering.cpp Tue Jan 6 17:00:39 2015
@@ -218,7 +218,7 @@ SITargetLowering::SITargetLowering(Targe
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
-
+ setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::UINT_TO_FP);
// All memory operations. Some folding on the pointer operand is done to help
@@ -1302,6 +1302,49 @@ SDValue SITargetLowering::performSHLPtrC
return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
}
+SDValue SITargetLowering::performOrCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
+ if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
+ RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
+ SDValue Src = LHS.getOperand(0);
+ if (Src != RHS.getOperand(0))
+ return SDValue();
+
+ const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+ if (!CLHS || !CRHS)
+ return SDValue();
+
+ // Only 10 bits are used.
+ static const uint32_t MaxMask = 0x3ff;
+
+ uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
+ return DAG.getNode(AMDGPUISD::FP_CLASS, SDLoc(N), MVT::i1,
+ Src, DAG.getConstant(NewMask, MVT::i32));
+ }
+
+ return SDValue();
+}
+
+SDValue SITargetLowering::performClassCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Mask = N->getOperand(1);
+
+ // fp_class x, 0 -> false
+ if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
+ if (CMask->isNullValue())
+ return DAG.getConstant(0, MVT::i1);
+ }
+
+ return SDValue();
+}
+
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
switch (Opc) {
case ISD::FMAXNUM:
@@ -1531,6 +1574,10 @@ SDValue SITargetLowering::PerformDAGComb
}
break;
}
+ case ISD::OR:
+ return performOrCombine(N, DCI);
+ case AMDGPUISD::FP_CLASS:
+ return performClassCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
Modified: llvm/trunk/lib/Target/R600/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIISelLowering.h?rev=225306&r1=225305&r2=225306&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/R600/SIISelLowering.h Tue Jan 6 17:00:39 2015
@@ -58,6 +58,8 @@ class SITargetLowering : public AMDGPUTa
SDValue performSHLPtrCombine(SDNode *N,
unsigned AS,
DAGCombinerInfo &DCI) const;
+ SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
Modified: llvm/trunk/test/CodeGen/R600/llvm.AMDGPU.class.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/R600/llvm.AMDGPU.class.ll?rev=225306&r1=225305&r2=225306&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/R600/llvm.AMDGPU.class.ll (original)
+++ llvm/trunk/test/CodeGen/R600/llvm.AMDGPU.class.ll Tue Jan 6 17:00:39 2015
@@ -331,5 +331,167 @@ define void @test_class_lit_constant_dyn
ret void
}
+; SI-LABEL: {{^}}test_fold_or_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+ %a = load float addrspace(1)* %gep.in
+
+ %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+ %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1
+ %or = or i1 %class0, %class1
+
+ %sext = sext i1 %or to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or3_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+ %a = load float addrspace(1)* %gep.in
+
+ %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+ %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
+ %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+ %or.0 = or i1 %class0, %class1
+ %or.1 = or i1 %or.0, %class2
+
+ %sext = sext i1 %or.1 to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
+; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+ %a = load float addrspace(1)* %gep.in
+
+ %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+ %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
+ %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+ %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
+ %class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1
+ %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1
+ %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1
+ %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1
+ %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1
+ %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1
+ %or.0 = or i1 %class0, %class1
+ %or.1 = or i1 %or.0, %class2
+ %or.2 = or i1 %or.1, %class3
+ %or.3 = or i1 %or.2, %class4
+ %or.4 = or i1 %or.3, %class5
+ %or.5 = or i1 %or.4, %class6
+ %or.6 = or i1 %or.5, %class7
+ %or.7 = or i1 %or.6, %class8
+ %or.8 = or i1 %or.7, %class9
+ %sext = sext i1 %or.8 to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_class_f32_1:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+ %a = load float addrspace(1)* %gep.in
+
+ %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+ %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
+ %or = or i1 %class0, %class1
+
+ %sext = sext i1 %or to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_class_f32_2:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+ %a = load float addrspace(1)* %gep.in
+
+ %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
+ %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
+ %or = or i1 %class0, %class1
+
+ %sext = sext i1 %or to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_no_fold_or_class_f32_0:
+; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}}
+; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
+; SI: s_or_b64
+; SI: s_endpgm
+define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32 addrspace(1)* %out, i32 %tid
+ %a = load float addrspace(1)* %gep.in
+
+ %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+ %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1
+ %or = or i1 %class0, %class1
+
+ %sext = sext i1 %or to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_0_f32:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
+ %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_0_f64:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
+ %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
More information about the llvm-commits
mailing list