[all-commits] [llvm/llvm-project] 3277c7: [AMDGPU] Skip VGPR deallocation for waveslot limit...

Stanislav Mekhanoshin via All-commits all-commits at lists.llvm.org
Mon Oct 21 09:40:14 PDT 2024


  Branch: refs/heads/main
  Home:   https://github.com/llvm/llvm-project
  Commit: 3277c7cd28154e33637a168acb26cea7ac1f7fff
      https://github.com/llvm/llvm-project/commit/3277c7cd28154e33637a168acb26cea7ac1f7fff
  Author: Stanislav Mekhanoshin <rampitec at users.noreply.github.com>
  Date:   2024-10-21 (Mon, 21 Oct 2024)

  Changed paths:
    M llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    M llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
    M llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
    M llvm/test/CodeGen/AMDGPU/add.ll
    M llvm/test/CodeGen/AMDGPU/add.v2i16.ll
    M llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
    M llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
    M llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    M llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
    M llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
    M llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
    M llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
    M llvm/test/CodeGen/AMDGPU/bitreverse.ll
    M llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
    M llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
    M llvm/test/CodeGen/AMDGPU/bswap.ll
    M llvm/test/CodeGen/AMDGPU/build_vector.ll
    M llvm/test/CodeGen/AMDGPU/calling-conventions.ll
    M llvm/test/CodeGen/AMDGPU/carryout-selection.ll
    M llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
    M llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
    M llvm/test/CodeGen/AMDGPU/clamp.ll
    M llvm/test/CodeGen/AMDGPU/cluster_stores.ll
    M llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
    M llvm/test/CodeGen/AMDGPU/ctlz.ll
    M llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    M llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
    M llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
    M llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
    M llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
    M llvm/test/CodeGen/AMDGPU/fabs.f16.ll
    M llvm/test/CodeGen/AMDGPU/fadd.f16.ll
    M llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
    M llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
    M llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
    M llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
    M llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
    M llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
    M llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
    M llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
    M llvm/test/CodeGen/AMDGPU/fdiv.ll
    M llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll
    M llvm/test/CodeGen/AMDGPU/flat-scratch.ll
    M llvm/test/CodeGen/AMDGPU/fma-combine.ll
    M llvm/test/CodeGen/AMDGPU/fmax3.ll
    M llvm/test/CodeGen/AMDGPU/fmaximum.ll
    M llvm/test/CodeGen/AMDGPU/fmed3.ll
    M llvm/test/CodeGen/AMDGPU/fmin3.ll
    M llvm/test/CodeGen/AMDGPU/fminimum.ll
    M llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
    M llvm/test/CodeGen/AMDGPU/fmul.f16.ll
    M llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
    M llvm/test/CodeGen/AMDGPU/fnearbyint.ll
    M llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
    M llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
    M llvm/test/CodeGen/AMDGPU/fneg.f16.ll
    M llvm/test/CodeGen/AMDGPU/fneg.ll
    M llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
    M llvm/test/CodeGen/AMDGPU/fp-classify.ll
    M llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
    M llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
    M llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll
    M llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
    M llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
    M llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
    M llvm/test/CodeGen/AMDGPU/fpext.f16.ll
    M llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
    M llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
    M llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
    M llvm/test/CodeGen/AMDGPU/fptrunc.ll
    M llvm/test/CodeGen/AMDGPU/frem.ll
    M llvm/test/CodeGen/AMDGPU/fshl.ll
    M llvm/test/CodeGen/AMDGPU/fshr.ll
    M llvm/test/CodeGen/AMDGPU/fsub.f16.ll
    M llvm/test/CodeGen/AMDGPU/gfx12_scalar_subword_loads.ll
    M llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
    M llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
    M llvm/test/CodeGen/AMDGPU/global-saddr-store.ll
    M llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
    M llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
    M llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
    M llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
    M llvm/test/CodeGen/AMDGPU/half.ll
    M llvm/test/CodeGen/AMDGPU/idiv-licm.ll
    M llvm/test/CodeGen/AMDGPU/idot4s.ll
    M llvm/test/CodeGen/AMDGPU/idot4u.ll
    M llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
    M llvm/test/CodeGen/AMDGPU/imm16.ll
    M llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
    M llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
    M llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
    M llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
    M llvm/test/CodeGen/AMDGPU/llvm.log.ll
    M llvm/test/CodeGen/AMDGPU/llvm.log10.ll
    M llvm/test/CodeGen/AMDGPU/llvm.log2.ll
    M llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
    M llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.round.ll
    M llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
    M llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
    M llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
    M llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
    M llvm/test/CodeGen/AMDGPU/load-constant-f32.ll
    M llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
    M llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
    M llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
    M llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
    M llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
    M llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
    M llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
    M llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
    M llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
    M llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
    M llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
    M llvm/test/CodeGen/AMDGPU/mad.u16.ll
    M llvm/test/CodeGen/AMDGPU/mad_64_32.ll
    M llvm/test/CodeGen/AMDGPU/madak.ll
    M llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
    M llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
    M llvm/test/CodeGen/AMDGPU/min.ll
    M llvm/test/CodeGen/AMDGPU/minimummaximum.ll
    M llvm/test/CodeGen/AMDGPU/minmax.ll
    M llvm/test/CodeGen/AMDGPU/mul.ll
    M llvm/test/CodeGen/AMDGPU/offset-split-global.ll
    M llvm/test/CodeGen/AMDGPU/omod.ll
    M llvm/test/CodeGen/AMDGPU/release-vgprs-dbg-loc.mir
    M llvm/test/CodeGen/AMDGPU/release-vgprs.mir
    M llvm/test/CodeGen/AMDGPU/rotl.ll
    M llvm/test/CodeGen/AMDGPU/rotr.ll
    M llvm/test/CodeGen/AMDGPU/saddo.ll
    M llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
    M llvm/test/CodeGen/AMDGPU/select.f16.ll
    M llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
    M llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
    M llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
    M llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
    M llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
    M llvm/test/CodeGen/AMDGPU/sub.ll
    M llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
    M llvm/test/CodeGen/AMDGPU/trap-abis.ll
    M llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
    M llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
    M llvm/test/CodeGen/AMDGPU/v_cndmask.ll
    M llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
    M llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
    M llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
    M llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
    M llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
    M llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
    M llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
    M llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll
    M llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll
    M llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll
    M llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll
    M llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
    M llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
    M llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll
    M llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll
    M llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll
    M llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
    M llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
    M llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll

  Log Message:
  -----------
  [AMDGPU] Skip VGPR deallocation for waveslot limited kernels (#112765)

MSG_DEALLOC_VGPRS slows down very small waveslot limited kernels. It's
been identified this message is only really needed for VGPR limited
kernels. A kernel becomes VGPR limited if a total number of VGPRs per
SIMD / number of used VGPRs is more than a number of wave slots.



To unsubscribe from these emails, change your notification settings at https://github.com/llvm/llvm-project/settings/notifications


More information about the All-commits mailing list