[all-commits] [llvm/llvm-project] ca907b: [AMDGPU] Insert waitcnt after returning from call

Wed Sep 23 03:19:00 PDT 2020

  Branch: refs/heads/master
  Home:   https://github.com/llvm/llvm-project
  Commit: ca907bfb57d8ad3ec3bcc2cff2abab7b1b933af6
      https://github.com/llvm/llvm-project/commit/ca907bfb57d8ad3ec3bcc2cff2abab7b1b933af6
  Author: Sebastian Neubauer <sebastian.neubauer at amd.com>
  Date:   2020-09-23 (Wed, 23 Sep 2020)

  Changed paths:
    M llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    M llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
    M llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
    M llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
    M llvm/test/CodeGen/AMDGPU/call-argument-types.ll
    M llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
    M llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
    M llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
    M llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
    M llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
    M llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
    M llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    M llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
    M llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
    M llvm/test/CodeGen/AMDGPU/function-args.ll
    M llvm/test/CodeGen/AMDGPU/function-returns.ll
    M llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
    M llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
    M llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
    M llvm/test/CodeGen/AMDGPU/hsa-func.ll
    M llvm/test/CodeGen/AMDGPU/imm16.ll
    M llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll
    M llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
    M llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
    M llvm/test/CodeGen/AMDGPU/load-hi16.ll
    M llvm/test/CodeGen/AMDGPU/load-lo16.ll
    M llvm/test/CodeGen/AMDGPU/load-local.128.ll
    M llvm/test/CodeGen/AMDGPU/load-local.96.ll
    M llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
    M llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
    M llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
    M llvm/test/CodeGen/AMDGPU/memory_clause.ll
    M llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
    M llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
    M llvm/test/CodeGen/AMDGPU/nested-calls.ll
    M llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
    M llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
    M llvm/test/CodeGen/AMDGPU/offset-split-global.ll
    M llvm/test/CodeGen/AMDGPU/ret_jump.ll
    M llvm/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir
    M llvm/test/CodeGen/AMDGPU/shl.ll
    M llvm/test/CodeGen/AMDGPU/sibling-call.ll
    M llvm/test/CodeGen/AMDGPU/smrd.ll
    M llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
    M llvm/test/CodeGen/AMDGPU/stack-realign.ll
    M llvm/test/CodeGen/AMDGPU/store-hi16.ll
    M llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
    M llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
    M llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
    M llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
    M llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll
    M llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
    M llvm/test/CodeGen/AMDGPU/wave32.ll
    M llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected
    M llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected

  Log Message:
  -----------
  [AMDGPU] Insert waitcnt after returning from call

When memory operations are outstanding on function calls, either the
caller or the callee can insert a waitcnt to ensure that all reads are
finished.
Calls need some time to be executed, so if the callee inserts the
waitcnt, filling the instruction buffer and waiting for memory will be
interleaved, hiding some latency. This comes at the cost of having a
waitcnt inside functions that may not be needed as no memory operations
are outstanding.

For function calls, this is already implemented. The same principal
applies to returns: If the caller inserts a waitcnt after the call, the
callee does not have to wait and the return and memory operation can be
run in parallel.

This commit implements waiting in the caller after returning from a
function call.

Differential Revision: https://reviews.llvm.org/D87674