[PATCH] R600/SI: Prefer selecting more e64 instruction forms.
Tom Stellard
tom at stellard.net
Mon Sep 15 10:16:03 PDT 2014
On Mon, Sep 08, 2014 at 03:30:41PM +0000, Matt Arsenault wrote:
> Add some more tests to make sure better operand
> choices are still made. Leave alone the cases that
> seem to have no reason to ever be e64.
>
LGTM.
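
One note for anyone reading along (nothing that needs to change in the patch):
the gain from preferring the e64 (VOP3) encodings is that they can take an SGPR
in the second source operand, which the e32 (VOP2) encodings cannot. A minimal
sketch of the kind of case this helps, modeled on the new sgpr_trunc_i32_to_i1
test in the diff below (the function name here is illustrative only, not part
of the patch):

  ; %a is a kernel argument, so it lives in an SGPR. With the old e32 pattern
  ; the value first had to be copied into a VGPR; with the e64 pattern the i1
  ; trunc can select directly to
  ;   V_AND_B32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}}
  ; followed by the V_CMP_EQ_I32 that produces the i1.
  define void @sgpr_and_example(i32 addrspace(1)* %out, i32 %a) {
    %trunc = trunc i32 %a to i1
    %result = select i1 %trunc, i32 1, i32 0
    store i32 %result, i32 addrspace(1)* %out, align 4
    ret void
  }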
> http://reviews.llvm.org/D5246
>
> Files:
> lib/Target/R600/SIInstructions.td
> test/CodeGen/R600/and.ll
> test/CodeGen/R600/bfi_int.ll
> test/CodeGen/R600/ctpop.ll
> test/CodeGen/R600/llvm.AMDGPU.rsq.ll
> test/CodeGen/R600/rsq.ll
> test/CodeGen/R600/trunc.ll
> Index: lib/Target/R600/SIInstructions.td
> ===================================================================
> --- lib/Target/R600/SIInstructions.td
> +++ lib/Target/R600/SIInstructions.td
> @@ -1905,9 +1905,9 @@
> (EXTRACT_SUBREG i64:$src1, sub1)), sub1)
> >;
>
> -def : BinOp64Pat <and, V_AND_B32_e32>;
> -def : BinOp64Pat <or, V_OR_B32_e32>;
> -def : BinOp64Pat <xor, V_XOR_B32_e32>;
> +def : BinOp64Pat <and, V_AND_B32_e64>;
> +def : BinOp64Pat <or, V_OR_B32_e64>;
> +def : BinOp64Pat <xor, V_XOR_B32_e64>;
>
> class SextInReg <ValueType vt, int ShiftAmt> : Pat <
> (sext_inreg i32:$src0, vt),
> @@ -1919,7 +1919,7 @@
>
> def : Pat <
> (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
> - (V_BCNT_U32_B32_e32 $popcnt, $val)
> + (V_BCNT_U32_B32_e64 $popcnt, $val)
> >;
>
> def : Pat <
> @@ -1939,7 +1939,7 @@
>
> def : Pat <
> (addc i32:$src0, i32:$src1),
> - (V_ADD_I32_e32 $src0, $src1)
> + (V_ADD_I32_e64 $src0, $src1)
> >;
>
> /********** ======================= **********/
> @@ -2961,13 +2961,13 @@
>
> def : Pat <
> (i1 (trunc i32:$a)),
> - (V_CMP_EQ_I32_e64 (V_AND_B32_e32 (i32 1), $a), 1)
> + (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1)
> >;
>
> //============================================================================//
> // Miscellaneous Optimization Patterns
> //============================================================================//
>
> -def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e32>;
> +def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
>
> } // End isSI predicate
> Index: test/CodeGen/R600/and.ll
> ===================================================================
> --- test/CodeGen/R600/and.ll
> +++ test/CodeGen/R600/and.ll
> @@ -129,11 +129,30 @@
> }
>
> ; FUNC-LABEL: @v_and_constant_i64
> -; SI: V_AND_B32
> -; SI: V_AND_B32
> +; SI: V_AND_B32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
> +; SI: V_AND_B32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
> define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
> %a = load i64 addrspace(1)* %aptr, align 8
> %and = and i64 %a, 1234567
> store i64 %and, i64 addrspace(1)* %out, align 8
> ret void
> }
> +
> +; FIXME: Replace and 0 with mov 0
> +; FUNC-LABEL: @v_and_inline_imm_i64
> +; SI: V_AND_B32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
> +; SI: V_AND_B32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
> +define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
> + %a = load i64 addrspace(1)* %aptr, align 8
> + %and = and i64 %a, 64
> + store i64 %and, i64 addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FUNC-LABEL: @s_and_inline_imm_i64
> +; SI: S_AND_B64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64
> +define void @s_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
> + %and = and i64 %a, 64
> + store i64 %and, i64 addrspace(1)* %out, align 8
> + ret void
> +}
> Index: test/CodeGen/R600/bfi_int.ll
> ===================================================================
> --- test/CodeGen/R600/bfi_int.ll
> +++ test/CodeGen/R600/bfi_int.ll
> @@ -38,8 +38,8 @@
> ; R600-CHECK: @bfi_sha256_ma
> ; R600-CHECK: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W
> ; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W
> -; SI-CHECK: V_XOR_B32_e32 [[DST:v[0-9]+]], {{[sv][0-9]+, v[0-9]+}}
> -; SI-CHECK: V_BFI_B32 {{v[0-9]+}}, [[DST]], {{[sv][0-9]+, [sv][0-9]+}}
> +; SI-CHECK: V_XOR_B32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
> +; SI-CHECK: V_BFI_B32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
>
> define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
> entry:
> Index: test/CodeGen/R600/ctpop.ll
> ===================================================================
> --- test/CodeGen/R600/ctpop.ll
> +++ test/CodeGen/R600/ctpop.ll
> @@ -42,8 +42,7 @@
> ; SI: BUFFER_LOAD_DWORD [[VAL1:v[0-9]+]],
> ; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
> ; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]]
> -; SI-NOT: ADD
> -; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
> +; SI-NEXT: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
> ; SI: BUFFER_STORE_DWORD [[RESULT]],
> ; SI: S_ENDPGM
>
> @@ -59,6 +58,20 @@
> ret void
> }
>
> +; FUNC-LABEL: @v_ctpop_add_sgpr_i32
> +; SI: BUFFER_LOAD_DWORD [[VAL0:v[0-9]+]],
> +; SI-NEXT: S_WAITCNT
> +; SI-NEXT: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
> +; SI-NEXT: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
> + %val0 = load i32 addrspace(1)* %in0, align 4
> + %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
> + %add = add i32 %ctpop0, %sval
> + store i32 %add, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> ; FUNC-LABEL: @v_ctpop_v2i32:
> ; SI: V_BCNT_U32_B32_e32
> ; SI: V_BCNT_U32_B32_e32
> Index: test/CodeGen/R600/llvm.AMDGPU.rsq.ll
> ===================================================================
> --- test/CodeGen/R600/llvm.AMDGPU.rsq.ll
> +++ test/CodeGen/R600/llvm.AMDGPU.rsq.ll
> @@ -4,10 +4,29 @@
> declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone
>
> ; FUNC-LABEL: @rsq_f32
> -; SI: V_RSQ_F32_e32
> +; SI: V_RSQ_F32_e32 {{v[0-9]+}}, {{s[0-9]+}}
> ; EG: RECIPSQRT_IEEE
> define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind {
> %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone
> store float %rsq, float addrspace(1)* %out, align 4
> ret void
> }
> +
> +; TODO: Really these should be constant folded
> +; FUNC-LABEL: @rsq_f32_constant_4.0
> +; SI: V_RSQ_F32_e32 {{v[0-9]+}}, 4.0
> +; EG: RECIPSQRT_IEEE
> +define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind {
> + %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone
> + store float %rsq, float addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: @rsq_f32_constant_100.0
> +; SI: V_RSQ_F32_e32 {{v[0-9]+}}, 0x42c80000
> +; EG: RECIPSQRT_IEEE
> +define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind {
> + %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone
> + store float %rsq, float addrspace(1)* %out, align 4
> + ret void
> +}
> Index: test/CodeGen/R600/rsq.ll
> ===================================================================
> --- test/CodeGen/R600/rsq.ll
> +++ test/CodeGen/R600/rsq.ll
> @@ -26,3 +26,13 @@
> store double %div, double addrspace(1)* %out, align 4
> ret void
> }
> +
> +; SI-LABEL: @rsq_f32_sgpr
> +; SI: V_RSQ_F32_e32 {{v[0-9]+}}, {{s[0-9]+}}
> +; SI: S_ENDPGM
> +define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind {
> + %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
> + %div = fdiv float 1.0, %sqrt
> + store float %div, float addrspace(1)* %out, align 4
> + ret void
> +}
> Index: test/CodeGen/R600/trunc.ll
> ===================================================================
> --- test/CodeGen/R600/trunc.ll
> +++ test/CodeGen/R600/trunc.ll
> @@ -46,9 +46,20 @@
> }
>
> ; SI-LABEL: @trunc_i32_to_i1:
> -; SI: V_AND_B32
> +; SI: V_AND_B32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
> ; SI: V_CMP_EQ_I32
> -define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
> +define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
> + %a = load i32 addrspace(1)* %ptr, align 4
> + %trunc = trunc i32 %a to i1
> + %result = select i1 %trunc, i32 1, i32 0
> + store i32 %result, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; SI-LABEL: @sgpr_trunc_i32_to_i1:
> +; SI: V_AND_B32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}}
> +; SI: V_CMP_EQ_I32
> +define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
> %trunc = trunc i32 %a to i1
> %result = select i1 %trunc, i32 1, i32 0
> store i32 %result, i32 addrspace(1)* %out, align 4