[PATCH] R600/SI: Prefer selecting more e64 instruction forms.
Tom Stellard
tom at stellard.net
Mon Sep 15 10:16:03 PDT 2014
On Mon, Sep 08, 2014 at 03:30:41PM +0000, Matt Arsenault wrote:
> Add some more tests to make sure better operand
> choices are still made. Leave alone the cases that
> seem to have no reason to ever be e64.
>
LGTM.
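
One note for anyone reading along (nothing that needs to change in the patch):
the gain from preferring the e64 (VOP3) encodings is that they can take an SGPR
in the second source operand, which the e32 (VOP2) encodings cannot. A minimal
sketch of the kind of case this helps, modeled on the new sgpr_trunc_i32_to_i1
test in the diff below (the function name here is illustrative only, not part
of the patch):

  ; %a is a kernel argument, so it lives in an SGPR. With the old e32 pattern
  ; the value first had to be copied into a VGPR; with the e64 pattern the i1
  ; trunc can select directly to
  ;   V_AND_B32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}}
  ; followed by the V_CMP_EQ_I32 that produces the i1.
  define void @sgpr_and_example(i32 addrspace(1)* %out, i32 %a) {
    %trunc = trunc i32 %a to i1
    %result = select i1 %trunc, i32 1, i32 0
    store i32 %result, i32 addrspace(1)* %out, align 4
    ret void
  }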
> http://reviews.llvm.org/D5246
>
> Files:
> lib/Target/R600/SIInstructions.td
> test/CodeGen/R600/and.ll
> test/CodeGen/R600/bfi_int.ll
> test/CodeGen/R600/ctpop.ll
> test/CodeGen/R600/llvm.AMDGPU.rsq.ll
> test/CodeGen/R600/rsq.ll
> test/CodeGen/R600/trunc.ll
> Index: lib/Target/R600/SIInstructions.td
> ===================================================================
> --- lib/Target/R600/SIInstructions.td
> +++ lib/Target/R600/SIInstructions.td
> @@ -1905,9 +1905,9 @@
> (EXTRACT_SUBREG i64:$src1, sub1)), sub1)
> >;
>
> -def : BinOp64Pat <and, V_AND_B32_e32>;
> -def : BinOp64Pat <or, V_OR_B32_e32>;
> -def : BinOp64Pat <xor, V_XOR_B32_e32>;
> +def : BinOp64Pat <and, V_AND_B32_e64>;
> +def : BinOp64Pat <or, V_OR_B32_e64>;
> +def : BinOp64Pat <xor, V_XOR_B32_e64>;
>
> class SextInReg <ValueType vt, int ShiftAmt> : Pat <
> (sext_inreg i32:$src0, vt),
> @@ -1919,7 +1919,7 @@
>
> def : Pat <
> (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
> - (V_BCNT_U32_B32_e32 $popcnt, $val)
> + (V_BCNT_U32_B32_e64 $popcnt, $val)
> >;
>
> def : Pat <
> @@ -1939,7 +1939,7 @@
>
> def : Pat <
> (addc i32:$src0, i32:$src1),
> - (V_ADD_I32_e32 $src0, $src1)
> + (V_ADD_I32_e64 $src0, $src1)
> >;
>
> /********** ======================= **********/
> @@ -2961,13 +2961,13 @@
>
> def : Pat <
> (i1 (trunc i32:$a)),
> - (V_CMP_EQ_I32_e64 (V_AND_B32_e32 (i32 1), $a), 1)
> + (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1)
> >;
>
> //============================================================================//
> // Miscellaneous Optimization Patterns
> //============================================================================//
>
> -def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e32>;
> +def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
>
> } // End isSI predicate
> Index: test/CodeGen/R600/and.ll
> ===================================================================
> --- test/CodeGen/R600/and.ll
> +++ test/CodeGen/R600/and.ll
> @@ -129,11 +129,30 @@
> }
>
> ; FUNC-LABEL: @v_and_constant_i64
> -; SI: V_AND_B32
> -; SI: V_AND_B32
> +; SI: V_AND_B32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
> +; SI: V_AND_B32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
> define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
> %a = load i64 addrspace(1)* %aptr, align 8
> %and = and i64 %a, 1234567
> store i64 %and, i64 addrspace(1)* %out, align 8
> ret void
> }
> +
> +; FIXME: Replace and 0 with mov 0
> +; FUNC-LABEL: @v_and_inline_imm_i64
> +; SI: V_AND_B32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
> +; SI: V_AND_B32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
> +define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
> + %a = load i64 addrspace(1)* %aptr, align 8
> + %and = and i64 %a, 64
> + store i64 %and, i64 addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; FUNC-LABEL: @s_and_inline_imm_i64
> +; SI: S_AND_B64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64
> +define void @s_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
> + %and = and i64 %a, 64
> + store i64 %and, i64 addrspace(1)* %out, align 8
> + ret void
> +}
> Index: test/CodeGen/R600/bfi_int.ll
> ===================================================================
> --- test/CodeGen/R600/bfi_int.ll
> +++ test/CodeGen/R600/bfi_int.ll
> @@ -38,8 +38,8 @@
> ; R600-CHECK: @bfi_sha256_ma
> ; R600-CHECK: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W
> ; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W
> -; SI-CHECK: V_XOR_B32_e32 [[DST:v[0-9]+]], {{[sv][0-9]+, v[0-9]+}}
> -; SI-CHECK: V_BFI_B32 {{v[0-9]+}}, [[DST]], {{[sv][0-9]+, [sv][0-9]+}}
> +; SI-CHECK: V_XOR_B32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
> +; SI-CHECK: V_BFI_B32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
>
> define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
> entry:
> Index: test/CodeGen/R600/ctpop.ll
> ===================================================================
> --- test/CodeGen/R600/ctpop.ll
> +++ test/CodeGen/R600/ctpop.ll
> @@ -42,8 +42,7 @@
> ; SI: BUFFER_LOAD_DWORD [[VAL1:v[0-9]+]],
> ; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
> ; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]]
> -; SI-NOT: ADD
> -; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
> +; SI-NEXT: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
> ; SI: BUFFER_STORE_DWORD [[RESULT]],
> ; SI: S_ENDPGM
>
> @@ -59,6 +58,20 @@
> ret void
> }
>
> +; FUNC-LABEL: @v_ctpop_add_sgpr_i32
> +; SI: BUFFER_LOAD_DWORD [[VAL0:v[0-9]+]],
> +; SI-NEXT: S_WAITCNT
> +; SI-NEXT: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
> +; SI-NEXT: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
> + %val0 = load i32 addrspace(1)* %in0, align 4
> + %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
> + %add = add i32 %ctpop0, %sval
> + store i32 %add, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> ; FUNC-LABEL: @v_ctpop_v2i32:
> ; SI: V_BCNT_U32_B32_e32
> ; SI: V_BCNT_U32_B32_e32
> Index: test/CodeGen/R600/llvm.AMDGPU.rsq.ll
> ===================================================================
> --- test/CodeGen/R600/llvm.AMDGPU.rsq.ll
> +++ test/CodeGen/R600/llvm.AMDGPU.rsq.ll
> @@ -4,10 +4,29 @@
> declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone
>
> ; FUNC-LABEL: @rsq_f32
> -; SI: V_RSQ_F32_e32
> +; SI: V_RSQ_F32_e32 {{v[0-9]+}}, {{s[0-9]+}}
> ; EG: RECIPSQRT_IEEE
> define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind {
> %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone
> store float %rsq, float addrspace(1)* %out, align 4
> ret void
> }
> +
> +; TODO: Really these should be constant folded
> +; FUNC-LABEL: @rsq_f32_constant_4.0
> +; SI: V_RSQ_F32_e32 {{v[0-9]+}}, 4.0
> +; EG: RECIPSQRT_IEEE
> +define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind {
> + %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone
> + store float %rsq, float addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; FUNC-LABEL: @rsq_f32_constant_100.0
> +; SI: V_RSQ_F32_e32 {{v[0-9]+}}, 0x42c80000
> +; EG: RECIPSQRT_IEEE
> +define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind {
> + %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone
> + store float %rsq, float addrspace(1)* %out, align 4
> + ret void
> +}
> Index: test/CodeGen/R600/rsq.ll
> ===================================================================
> --- test/CodeGen/R600/rsq.ll
> +++ test/CodeGen/R600/rsq.ll
> @@ -26,3 +26,13 @@
> store double %div, double addrspace(1)* %out, align 4
> ret void
> }
> +
> +; SI-LABEL: @rsq_f32_sgpr
> +; SI: V_RSQ_F32_e32 {{v[0-9]+}}, {{s[0-9]+}}
> +; SI: S_ENDPGM
> +define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind {
> + %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
> + %div = fdiv float 1.0, %sqrt
> + store float %div, float addrspace(1)* %out, align 4
> + ret void
> +}
> Index: test/CodeGen/R600/trunc.ll
> ===================================================================
> --- test/CodeGen/R600/trunc.ll
> +++ test/CodeGen/R600/trunc.ll
> @@ -46,9 +46,20 @@
> }
>
> ; SI-LABEL: @trunc_i32_to_i1:
> -; SI: V_AND_B32
> +; SI: V_AND_B32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
> ; SI: V_CMP_EQ_I32
> -define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
> +define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
> + %a = load i32 addrspace(1)* %ptr, align 4
> + %trunc = trunc i32 %a to i1
> + %result = select i1 %trunc, i32 1, i32 0
> + store i32 %result, i32 addrspace(1)* %out, align 4
> + ret void
> +}
> +
> +; SI-LABEL: @sgpr_trunc_i32_to_i1:
> +; SI: V_AND_B32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}}
> +; SI: V_CMP_EQ_I32
> +define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
> %trunc = trunc i32 %a to i1
> %result = select i1 %trunc, i32 1, i32 0
> store i32 %result, i32 addrspace(1)* %out, align 4