[PATCH 1/1] R600: Implement zero undef variants of ctlz/cttz

Jan Vesely jan.vesely at rutgers.edu
Wed Jun 18 07:26:51 PDT 2014


v2: use ffbh/l if available
v3: Rebase on top of Matt's SI patches

Signed-off-by: Jan Vesely <jan.vesely at rutgers.edu>
---

Hi Matt,

This is the rebased version. It's nicer than the original series,
and it does not include the 64-bit version.
There's one thing that confuses me though. The generated code for vector
ops looks like this:
          FFBL_INT              * T1.W, T0.W,  
          FFBL_INT              * T1.Z, T0.Z,  
          FFBL_INT              * T1.Y, T0.Y,  
          FFBL_INT                T1.X, T0.X,  
          LSHR                  * T0.X, KC0[2].Y, literal.x,

The manual says that the FFBx family are vector-unit-only instructions, so
shouldn't there be more than one per instruction group?

regards,
Jan

 lib/Target/R600/AMDGPUISelLowering.cpp   |  6 ++++++
 lib/Target/R600/AMDGPUSubtarget.h        |  8 ++++++++
 lib/Target/R600/EvergreenInstructions.td |  3 +++
 test/CodeGen/R600/ctlz_zero_undef.ll     | 13 +++++++++++++
 test/CodeGen/R600/cttz_zero_undef.ll     | 13 +++++++++++++
 5 files changed, 43 insertions(+)

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 4d95723..99aceeb 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -263,6 +263,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
 
+  if (!Subtarget->hasFFBH())
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+
+  if (!Subtarget->hasFFBL())
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+
   static const MVT::SimpleValueType VectorIntTypes[] = {
     MVT::v2i32, MVT::v4i32
   };
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 9c78f35..63f8c00 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -105,6 +105,14 @@ public:
             hasCaymanISA());
   }
 
+  bool hasFFBL() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasFFBH() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
   bool IsIRStructurizerEnabled() const;
   bool isIfCvtEnabled() const;
   unsigned getWavefrontSize() const;
diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td
index dcb7e98..484e522 100644
--- a/lib/Target/R600/EvergreenInstructions.td
+++ b/lib/Target/R600/EvergreenInstructions.td
@@ -328,6 +328,9 @@ defm CUBE_eg : CUBE_Common<0xC0>;
 
 def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
 
+def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>;
+def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>;
+
 let hasSideEffects = 1 in {
   def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
 }
diff --git a/test/CodeGen/R600/ctlz_zero_undef.ll b/test/CodeGen/R600/ctlz_zero_undef.ll
index 15b5188..1340ef9 100644
--- a/test/CodeGen/R600/ctlz_zero_undef.ll
+++ b/test/CodeGen/R600/ctlz_zero_undef.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
@@ -10,6 +11,8 @@ declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
 ; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; SI: BUFFER_STORE_DWORD [[VRESULT]],
 ; SI: S_ENDPGM
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
@@ -21,6 +24,8 @@ define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou
 ; SI: V_FFBH_U32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; SI: BUFFER_STORE_DWORD [[RESULT]],
 ; SI: S_ENDPGM
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32 addrspace(1)* %valptr, align 4
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
@@ -34,6 +39,9 @@ define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace
 ; SI: V_FFBH_U32_e32
 ; SI: BUFFER_STORE_DWORDX2
 ; SI: S_ENDPGM
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32> addrspace(1)* %valptr, align 8
   %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
@@ -49,6 +57,11 @@ define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x
 ; SI: V_FFBH_U32_e32
 ; SI: BUFFER_STORE_DWORDX4
 ; SI: S_ENDPGM
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32> addrspace(1)* %valptr, align 16
   %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
diff --git a/test/CodeGen/R600/cttz_zero_undef.ll b/test/CodeGen/R600/cttz_zero_undef.ll
index cf44f8e..9c4a355 100644
--- a/test/CodeGen/R600/cttz_zero_undef.ll
+++ b/test/CodeGen/R600/cttz_zero_undef.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
 declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
@@ -10,6 +11,8 @@ declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
 ; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; SI: BUFFER_STORE_DWORD [[VRESULT]],
 ; SI: S_ENDPGM
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %cttz, i32 addrspace(1)* %out, align 4
@@ -21,6 +24,8 @@ define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou
 ; SI: V_FFBL_B32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; SI: BUFFER_STORE_DWORD [[RESULT]],
 ; SI: S_ENDPGM
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32 addrspace(1)* %valptr, align 4
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
@@ -34,6 +39,9 @@ define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace
 ; SI: V_FFBL_B32_e32
 ; SI: BUFFER_STORE_DWORDX2
 ; SI: S_ENDPGM
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32> addrspace(1)* %valptr, align 8
   %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
@@ -49,6 +57,11 @@ define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x
 ; SI: V_FFBL_B32_e32
 ; SI: BUFFER_STORE_DWORDX4
 ; SI: S_ENDPGM
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32> addrspace(1)* %valptr, align 16
   %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
-- 
1.9.3




More information about the llvm-commits mailing list