[llvm] r303291 - AMDGPU: Try to use op_sel when selecting packed instructions
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed May 17 13:00:00 PDT 2017
Author: arsenm
Date: Wed May 17 15:00:00 2017
New Revision: 303291
URL: http://llvm.org/viewvc/llvm-project?rev=303291&view=rev
Log:
AMDGPU: Try to use op_sel when selecting packed instructions
Avoids instructions to pack a vector when the source is really
a scalar being broadcast.
Also be smarter and look for per-component fneg.
Doesn't yet handle scalar from upper half of register
or other swizzles.
Added:
llvm/trunk/test/CodeGen/AMDGPU/packed-op-sel.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/trunk/test/CodeGen/AMDGPU/immv216.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp?rev=303291&r1=303290&r2=303291&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp Wed May 17 15:00:00 2017
@@ -1707,10 +1707,38 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods
// FIXME: Look for on separate components
if (Src.getOpcode() == ISD::FNEG) {
- Mods |= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
Src = Src.getOperand(0);
}
+ if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned VecMods = Mods;
+
+ SDValue Lo = Src.getOperand(0);
+ SDValue Hi = Src.getOperand(1);
+
+ if (Lo.getOpcode() == ISD::FNEG) {
+ Lo = Lo.getOperand(0);
+ Mods ^= SISrcMods::NEG;
+ }
+
+ if (Hi.getOpcode() == ISD::FNEG) {
+ Hi = Hi.getOperand(0);
+ Mods ^= SISrcMods::NEG_HI;
+ }
+
+ if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
+ // Really a scalar input. Just select from the low half of the register to
+ // avoid packing.
+
+ Src = Lo;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+ }
+
+ Mods = VecMods;
+ }
+
// Packed instructions do not have abs modifiers.
// FIXME: Handle abs/neg of individual components.
Modified: llvm/trunk/test/CodeGen/AMDGPU/immv216.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/immv216.ll?rev=303291&r1=303290&r2=303291&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/immv216.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/immv216.ll Wed May 17 15:00:00 2017
@@ -288,9 +288,9 @@ define amdgpu_kernel void @commute_add_i
}
; GCN-LABEL: {{^}}commute_add_literal_v2f16:
-; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x64006400
-; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]]
+; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]] op_sel_hi:[0,1]{{$}}
; GFX9: buffer_store_dword [[REG]]
; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
Added: llvm/trunk/test/CodeGen/AMDGPU/packed-op-sel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/packed-op-sel.ll?rev=303291&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/packed-op-sel.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/packed-op-sel.ll Wed May 17 15:00:00 2017
@@ -0,0 +1,266 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
+
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
+ %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Apply fneg to broadcasted vector
+; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
+ %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
+ %neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %scalar0.broadcast
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Apply fneg before broadcast
+; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
+ %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Apply fneg before and after broadcast, and should cancel out.
+; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_neg_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
+ %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
+ %neg.neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %neg.scalar0.broadcast
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.scalar0.broadcast)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Add scalar, but negate low component
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
+ %neg.scalar0.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %scalar0, i32 1
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.scalar0)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Add scalar, but negate high component
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
+ %scalar0.neg.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %neg.scalar0, i32 1
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.neg.scalar0)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Apply fneg before broadcast with bitcast
+; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_xor_b32_e32 [[NEG_SCALAR0:v[0-9]+]], 0x8000, [[SCALAR0]]
+; GCN-NEXT: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[NEG_SCALAR0]] op_sel_hi:[1,0]{{$}}
+define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.bc = bitcast half %neg.scalar0 to i16
+
+ %neg.scalar0.vec = insertelement <2 x i16> undef, i16 %neg.scalar0.bc, i32 0
+ %neg.scalar0.broadcast = shufflevector <2 x i16> %neg.scalar0.vec, <2 x i16> undef, <2 x i32> zeroinitializer
+
+ %result = add <2 x i16> %vec0, %neg.scalar0.broadcast
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo_neg_scalar_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]
+
+; FIXME: Remove and
+; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
+; GCN: v_xor_b32_e32 [[SCALAR1]], 0x8000, [[SCALAR1]]
+; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+ %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
+
+ %neg.scalar1 = fsub half -0.0, %scalar1
+ %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
+ %vec2 = insertelement <2 x half> %vec.ins0, half %neg.scalar1, i32 1
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]
+
+; FIXME: Remove and
+; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
+; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+ %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
+
+ %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
+ %vec2 = insertelement <2 x half> %vec.ins0, half %scalar1, i32 1
+ %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
More information about the llvm-commits
mailing list