[PATCH] R600/SI: Allow commuting some 3 op instructions

Tue Oct 21 09:26:07 PDT 2014

> On Oct 21, 2014, at 6:38 AM, Tom Stellard <tom at stellard.net> wrote:
> 
> On Sat, Oct 18, 2014 at 01:57:11AM +0000, Matt Arsenault wrote:
>> e.g. v_mad_f32 a, b, c -> v_mad_f32 b, a, c
>> 
>> This simplifies matching v_madmk_f32.
>> 
>> This looks somewhat surprising, but it appears to be
>> OK to do this. We can commute src0 and src1 in all
>> of these instructions, and that's all that appears
>> to matter.
>> 
> 
> Are there any advantages to being able to commute VOP3 instructions?
> 
> -Tom

Mostly just canonicalization to simplify other checks. For example, I have another patch that forms v_madmk_f32, which relies on this to avoid checking for an immediate in the 2nd operand.

> 
>> http://reviews.llvm.org/D5854
>> 
>> Files:
>>  lib/Target/R600/SIInstructions.td
>>  test/CodeGen/R600/fma.ll
>>  test/CodeGen/R600/fmuladd.ll
>>  test/CodeGen/R600/llvm.AMDGPU.umad24.ll
>>  test/CodeGen/R600/use-sgpr-multiple-times.ll
> 
>> Index: lib/Target/R600/SIInstructions.td
>> ===================================================================
>> --- lib/Target/R600/SIInstructions.td
>> +++ lib/Target/R600/SIInstructions.td
>> @@ -1374,12 +1374,12 @@
>> >;
>> } // End isCommutable = 1
>> 
>> +let isCommutable = 1 in {
>> +
>> defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "V_MAC_LEGACY_F32",
>>   VOP_F32_F32_F32
>> >;
>> 
>> -let isCommutable = 1 in {
>> -
>> defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7>, "V_MUL_LEGACY_F32",
>>   VOP_F32_F32_F32, int_AMDGPU_mul
>> >;
>> @@ -1388,7 +1388,6 @@
>>   VOP_F32_F32_F32, fmul
>> >;
>> 
>> -
>> defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9>, "V_MUL_I32_I24",
>>   VOP_I32_I32_I32, AMDGPUmul_i24
>> >;
>> @@ -1449,9 +1448,18 @@
>> 
>> defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "V_BFM_B32",
>>   VOP_I32_I32_I32, AMDGPUbfm>;
>> +
>> +let isCommutable = 1 in {
>> defm V_MAC_F32 : VOP2Inst <vop2<0x1f>, "V_MAC_F32", VOP_F32_F32_F32>;
>> +} // End isCommutable = 1
>> +
>> defm V_MADMK_F32 : VOP2Inst <vop2<0x20>, "V_MADMK_F32", VOP_F32_F32_F32>;
>> +
>> +let isCommutable = 1 in {
>> defm V_MADAK_F32 : VOP2Inst <vop2<0x21>, "V_MADAK_F32", VOP_F32_F32_F32>;
>> +} // End isCommutable = 1
>> +
>> +
>> defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "V_BCNT_U32_B32", VOP_I32_I32_I32>;
>> defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "V_MBCNT_LO_U32_B32",
>>   VOP_I32_I32_I32
>> @@ -1503,18 +1511,22 @@
>> // VOP3 Instructions
>> //===----------------------------------------------------------------------===//
>> 
>> +let isCommutable = 1 in {
>> defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140>, "V_MAD_LEGACY_F32",
>>   VOP_F32_F32_F32_F32
>> >;
>> +
>> defm V_MAD_F32 : VOP3Inst <vop3<0x141>, "V_MAD_F32",
>>   VOP_F32_F32_F32_F32, fmad
>> >;
>> +
>> defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142>, "V_MAD_I32_I24",
>>   VOP_I32_I32_I32_I32, AMDGPUmad_i24
>> >;
>> defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143>, "V_MAD_U32_U24",
>>   VOP_I32_I32_I32_I32, AMDGPUmad_u24
>> >;
>> +} // End isCommutable = 1
>> 
>> defm V_CUBEID_F32 : VOP3Inst <vop3<0x144>, "V_CUBEID_F32",
>>   VOP_F32_F32_F32_F32
>> @@ -1537,12 +1549,16 @@
>> defm V_BFI_B32 : VOP3Inst <vop3<0x14a>, "V_BFI_B32",
>>   VOP_I32_I32_I32_I32, AMDGPUbfi
>> >;
>> +
>> +let isCommutable = 1 in {
>> defm V_FMA_F32 : VOP3Inst <vop3<0x14b>, "V_FMA_F32",
>>   VOP_F32_F32_F32_F32, fma
>> >;
>> defm V_FMA_F64 : VOP3Inst <vop3<0x14c>, "V_FMA_F64",
>>   VOP_F64_F64_F64_F64, fma
>> >;
>> +} // End isCommutable = 1
>> +
>> //def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
>> defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e>, "V_ALIGNBIT_B32",
>>   VOP_I32_I32_I32_I32
>> @@ -1628,12 +1644,15 @@
>> // Double precision division pre-scale.
>> defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e>, "V_DIV_SCALE_F64", []>;
>> 
>> +let isCommutable = 1 in {
>> defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f>, "V_DIV_FMAS_F32",
>>   VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
>> >;
>> defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170>, "V_DIV_FMAS_F64",
>>   VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
>> >;
>> +} // End isCommutable = 1
>> +
>> //def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
>> //def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
>> //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
>> @@ -2848,6 +2867,8 @@
>> defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "V_MQSAD_U32_U8",
>>   VOP_I32_I32_I32
>> >;
>> +
>> +let isCommutable = 1 in {
>> defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "V_MAD_U64_U32",
>>   VOP_I64_I32_I32_I64
>> >;
>> @@ -2856,6 +2877,7 @@
>> defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "V_MAD_I64_I32",
>>   VOP_I64_I32_I32_I64
>> >;
>> +} // End isCommutable = 1
>> 
>> // Remaining instructions:
>> // FLAT_*
>> Index: test/CodeGen/R600/fma.ll
>> ===================================================================
>> --- test/CodeGen/R600/fma.ll
>> +++ test/CodeGen/R600/fma.ll
>> @@ -5,19 +5,21 @@
>> declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
>> declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
>> 
>> -; FUNC-LABEL: {{^}}fma_f32:
>> +declare i32 @llvm.r600.read.tidig.x() nounwind readnone
>> +
>> +; FUNC-LABEL: {{^}}fma_f32
>> ; SI: V_FMA_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
>> 
>> ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}},
>> ; EG: FMA {{\*? *}}[[RES]]
>> define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
>>                      float addrspace(1)* %in2, float addrspace(1)* %in3) {
>> -   %r0 = load float addrspace(1)* %in1
>> -   %r1 = load float addrspace(1)* %in2
>> -   %r2 = load float addrspace(1)* %in3
>> -   %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
>> -   store float %r3, float addrspace(1)* %out
>> -   ret void
>> +  %r0 = load float addrspace(1)* %in1
>> +  %r1 = load float addrspace(1)* %in2
>> +  %r2 = load float addrspace(1)* %in3
>> +  %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
>> +  store float %r3, float addrspace(1)* %out
>> +  ret void
>> }
>> 
>> ; FUNC-LABEL: {{^}}fma_v2f32:
>> @@ -29,12 +31,12 @@
>> ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
>> define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
>>                        <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
>> -   %r0 = load <2 x float> addrspace(1)* %in1
>> -   %r1 = load <2 x float> addrspace(1)* %in2
>> -   %r2 = load <2 x float> addrspace(1)* %in3
>> -   %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
>> -   store <2 x float> %r3, <2 x float> addrspace(1)* %out
>> -   ret void
>> +  %r0 = load <2 x float> addrspace(1)* %in1
>> +  %r1 = load <2 x float> addrspace(1)* %in2
>> +  %r2 = load <2 x float> addrspace(1)* %in3
>> +  %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
>> +  store <2 x float> %r3, <2 x float> addrspace(1)* %out
>> +  ret void
>> }
>> 
>> ; FUNC-LABEL: {{^}}fma_v4f32:
>> @@ -50,10 +52,41 @@
>> ; EG-DAG: FMA {{\*? *}}[[RES]].W
>> define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
>>                        <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
>> -   %r0 = load <4 x float> addrspace(1)* %in1
>> -   %r1 = load <4 x float> addrspace(1)* %in2
>> -   %r2 = load <4 x float> addrspace(1)* %in3
>> -   %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
>> -   store <4 x float> %r3, <4 x float> addrspace(1)* %out
>> -   ret void
>> +  %r0 = load <4 x float> addrspace(1)* %in1
>> +  %r1 = load <4 x float> addrspace(1)* %in2
>> +  %r2 = load <4 x float> addrspace(1)* %in3
>> +  %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
>> +  store <4 x float> %r3, <4 x float> addrspace(1)* %out
>> +  ret void
>> +}
>> +
>> +; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
>> +; SI: V_FMA_F32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}}
>> +define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
>> +  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
>> +  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
>> +  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
>> +  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
>> +
>> +  %a = load float addrspace(1)* %in.a.gep, align 4
>> +  %b = load float addrspace(1)* %in.b.gep, align 4
>> +
>> +  %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b)
>> +  store float %fma, float addrspace(1)* %out.gep, align 4
>> +  ret void
>> +}
>> +
>> +; FUNC-LABEL: @fma_commute_mul_s_f32
>> +define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
>> +  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
>> +  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
>> +  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
>> +  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
>> +
>> +  %a = load float addrspace(1)* %in.a.gep, align 4
>> +  %c = load float addrspace(1)* %in.b.gep, align 4
>> +
>> +  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
>> +  store float %fma, float addrspace(1)* %out.gep, align 4
>> +  ret void
>> }
>> Index: test/CodeGen/R600/fmuladd.ll
>> ===================================================================
>> --- test/CodeGen/R600/fmuladd.ll
>> +++ test/CodeGen/R600/fmuladd.ll
>> @@ -116,7 +116,7 @@
>> ; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
>> ; CHECK-DAG: BUFFER_LOAD_DWORD [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
>> ; CHECK-DAG: BUFFER_LOAD_DWORD [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
>> -; CHECK: V_MAD_F32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
>> +; CHECK: V_MAD_F32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
>> ; CHECK: BUFFER_STORE_DWORD [[RESULT]]
>> define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
>>   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
>> @@ -158,7 +158,7 @@
>> ; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
>> ; CHECK-DAG: BUFFER_LOAD_DWORD [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
>> ; CHECK-DAG: BUFFER_LOAD_DWORD [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
>> -; CHECK: V_MAD_F32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
>> +; CHECK: V_MAD_F32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
>> ; CHECK: BUFFER_STORE_DWORD [[RESULT]]
>> define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
>>   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
>> Index: test/CodeGen/R600/llvm.AMDGPU.umad24.ll
>> ===================================================================
>> --- test/CodeGen/R600/llvm.AMDGPU.umad24.ll
>> +++ test/CodeGen/R600/llvm.AMDGPU.umad24.ll
>> @@ -5,6 +5,7 @@
>> ; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
>> 
>> declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone
>> +declare i32 @llvm.r600.read.tidig.x() nounwind readnone
>> 
>> ; FUNC-LABEL: {{^}}test_umad24:
>> ; SI: V_MAD_U32_U24
>> @@ -17,3 +18,21 @@
>>   ret void
>> }
>> 
>> +; FUNC-LABEL: {{^}}commute_umad24:
>> +; SI-DAG: BUFFER_LOAD_DWORD [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
>> +; SI-DAG: BUFFER_LOAD_DWORD [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
>> +; SI: V_MAD_U32_U24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]]
>> +; SI: BUFFER_STORE_DWORD [[RESULT]]
>> +define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
>> +  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
>> +  %out.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
>> +  %src0.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
>> +  %src2.gep = getelementptr i32 addrspace(1)* %src0.gep, i32 1
>> +
>> +  %src0 = load i32 addrspace(1)* %src0.gep, align 4
>> +  %src2 = load i32 addrspace(1)* %src2.gep, align 4
>> +  %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone
>> +  store i32 %mad, i32 addrspace(1)* %out.gep, align 4
>> +  ret void
>> +}
>> +
>> Index: test/CodeGen/R600/use-sgpr-multiple-times.ll
>> ===================================================================
>> --- test/CodeGen/R600/use-sgpr-multiple-times.ll
>> +++ test/CodeGen/R600/use-sgpr-multiple-times.ll
>> @@ -73,7 +73,7 @@
>> 
>> ; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
>> ; SI: S_LOAD_DWORD [[SGPR:s[0-9]+]]
>> -; SI: V_FMA_F32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
>> +; SI: V_FMA_F32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
>> ; SI: BUFFER_STORE_DWORD [[RESULT]]
>> define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
>>   %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
> 
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits