[llvm] [WIP][AMDGPU][CopyPhysReg] Expand the COPY instructions using the encoded liveness mask. (PR #151124)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 29 03:49:24 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-regalloc
Author: Vikash Gupta (vg0204)
<details>
<summary>Changes</summary>
We will now use the liveness encoded into the COPY instruction during VirtRegRewriter (#151123) to expand only the defined parts of the use register. This lets us stop adding implicit and implicit-def operands just to satisfy the MachineVerifier, avoiding unnecessary false dependencies among the registers. This is in reference to [SWDEV-498533](https://ontrack-internal.amd.com/browse/SWDEV-498533).
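For illustration, here is a minimal, self-contained C++ sketch of the decode side of this scheme. It is not the LLVM implementation; the helper name `isSubRegLive` and the sample mask are made up for this example. The encoding it assumes, two lane-mask bits per 32-bit subregister with 0 meaning "fully live", matches the `TestMaskVal`/`ShiftVal` logic in the patch below.

```cpp
#include <cstdint>
#include <cstdio>

// Returns true if the 32-bit subregister at position Idx must be copied
// under the encoded liveness mask. A mask of 0 encodes a fully live source.
static bool isSubRegLive(uint64_t Mask, unsigned Idx) {
  if (Mask == 0)
    return true;
  const uint64_t TestMaskVal = 0x3; // Two lane-mask bits per 32-bit subreg.
  return (Mask & (TestMaskVal << (2 * Idx))) != 0;
}

int main() {
  // Hypothetical mask for a 128-bit register where only sub0 and sub3 are
  // live: bits 0-1 (sub0) and bits 6-7 (sub3) are set.
  const uint64_t Mask = 0xC3;
  for (unsigned Idx = 0; Idx < 4; ++Idx)
    printf(isSubRegLive(Mask, Idx) ? "emit s_mov_b32 for sub%u\n"
                                   : "skip dead sub%u\n",
           Idx);
  return 0;
}
```

Only sub0 and sub3 get a move here; the dead middle subregisters are skipped instead of being kept alive through implicit operands.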
---
Patch is 2.74 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151124.diff
273 Files Affected:
- (modified) llvm/include/llvm/Target/Target.td (+1-1)
- (modified) llvm/lib/CodeGen/VirtRegMap.cpp (+88-1)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+82-66)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll (+80-78)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll (+116-112)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/bitcast_38_i16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll (+81-74)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll (+38-34)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll (+517-527)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll (+26-20)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll (+26-24)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll (+30-30)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll (+43-39)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll (+4-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll (+12-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+43-43)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+22-21)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+43-43)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll (+24-27)
- (modified) llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir (+498-530)
- (modified) llvm/test/CodeGen/AMDGPU/add.ll (+21-17)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-reuse-writes.mir (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-csr.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/always-uniform.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/and.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+127-119)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+51-51)
- (modified) llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/bitreverse.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+162-155)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+142-141)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+142-141)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-rsrc-ptr-ops.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/build_vector.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+257-257)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/carryout-selection.ll (+31-27)
- (modified) llvm/test/CodeGen/AMDGPU/cluster_stores.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/collapse-endcf.ll (+4-1)
- (modified) llvm/test/CodeGen/AMDGPU/copy-overlap-sgpr-kill.mir (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir (+2-3)
- (modified) llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir (+254-239)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+26-26)
- (modified) llvm/test/CodeGen/AMDGPU/ctpop64.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/cttz.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll (+34-34)
- (modified) llvm/test/CodeGen/AMDGPU/dag-divergence.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+41-28)
- (modified) llvm/test/CodeGen/AMDGPU/ds_read2.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/ds_write2.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll (+11-15)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.bf16.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.f16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.ll (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+214-214)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll (+66-66)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+108-108)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll (+99-99)
- (modified) llvm/test/CodeGen/AMDGPU/fmed3.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll (+29-29)
- (modified) llvm/test/CodeGen/AMDGPU/fnearbyint.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fneg.bf16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/fneg.f16.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fneg.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/fp-classify.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/fshl.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/fshr.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+38-38)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+30-30)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+30-30)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll (+56-56)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll (+45-45)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+70-70)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+70-70)
- (modified) llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/half.ll (+123-121)
- (modified) llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+84-82)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (+45-43)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/itofp.i128.ll (+18-29)
- (modified) llvm/test/CodeGen/AMDGPU/kernel-args.ll (+53-52)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll (+12-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll (+64-64)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll (+51-51)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll (+103-95)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll (+24-23)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll (+4-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll (+52-48)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll (+72-72)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll (+43-29)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll (+44-24)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll (+22-22)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll (+68-52)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll (+20-28)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll (+78-36)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll (+114-114)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (+22-22)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-f64.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+239-243)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+302-302)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+166-161)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i64.ll (+38-37)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+318-318)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-f32.ll (+29-29)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+220-220)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+161-162)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+217-217)
- (modified) llvm/test/CodeGen/AMDGPU/load-select-ptr.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/mad_64_32.ll (+5-3)
- (modified) llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/memmove-var-size.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/memory_clause.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/mfma-loop.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/min.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/or.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/permute.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.mir (+1)
- (modified) llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll (+15-8)
- (modified) llvm/test/CodeGen/AMDGPU/rotl.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/rotr.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/sad.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/saddo.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir (+112-112)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll (+8-7)
- (modified) llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+66-66)
- (modified) llvm/test/CodeGen/AMDGPU/srem.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/store-local.128.ll (+9-10)
- (modified) llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/sub.ll (+7-6)
- (modified) llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll (+2-7)
- (modified) llvm/test/CodeGen/AMDGPU/swdev380865.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/trap-abis.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/trunc-store.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/trunc.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/uaddo.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/udiv.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/udivrem.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll (+3-5)
- (modified) llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll (-3)
- (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/usubo.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/v_cndmask.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll (+2-3)
- (modified) llvm/test/CodeGen/AMDGPU/wqm.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/xor.ll (+7-7)
``````````diff
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index 4c83f8a580aa0..1f125c2cf87de 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -1323,7 +1323,7 @@ def REG_SEQUENCE : StandardPseudoInstruction {
}
def COPY : StandardPseudoInstruction {
let OutOperandList = (outs unknown:$dst);
- let InOperandList = (ins unknown:$src);
+ let InOperandList = (ins unknown:$src, variable_ops);
let AsmString = "";
let hasSideEffects = false;
let isAsCheapAsAMove = true;
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index 99ba893d6f096..227c0ae813934 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -213,6 +213,8 @@ class VirtRegRewriter {
void rewrite();
void addMBBLiveIns();
bool readsUndefSubreg(const MachineOperand &MO) const;
+ uint64_t calcLiveRegUnitMask(const MachineOperand &MO,
+ MCRegister PhysReg) const;
void addLiveInsForSubRanges(const LiveInterval &LI, MCRegister PhysReg) const;
void handleIdentityCopy(MachineInstr &MI);
void expandCopyBundle(MachineInstr &MI) const;
@@ -474,6 +476,77 @@ bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const {
return true;
}
+// Return the LaneBitmask, as a uint64_t, of the register units of PhysReg
+// (the physical register assigned to MO) that are live at MO's parent MI.
+// Return 0u if MO is undef or fully live.
+uint64_t VirtRegRewriter::calcLiveRegUnitMask(const MachineOperand &MO,
+ MCRegister PhysReg) const {
+ Register Reg = MO.getReg();
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ const MachineInstr &MI = *MO.getParent();
+ SlotIndex MIIndex = LIS->getInstructionIndex(MI);
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask UseMask = SubRegIdx
+ ? TRI->getSubRegIndexLaneMask(SubRegIdx)
+ : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
+ : LaneBitmask::getNone());
+
+ LaneBitmask LiveRegUnitMask;
+ DenseSet<unsigned> LiveRegUnits;
+
+ // dbgs() << "\n********** " << printReg(Reg, TRI) << "[ " <<
+ // printReg(PhysReg, TRI) << " ]" << " **********\n";
+
+ if (MO.isUndef())
+ return 0u;
+
+ assert(LI.liveAt(MIIndex) &&
+ "Reads of completely dead register should be marked undef already");
+
+ if (LI.hasSubRanges()) {
+ for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ unsigned Unit = (*Units).first;
+ LaneBitmask Mask = (*Units).second;
+ for (const LiveInterval::SubRange &S : LI.subranges()) {
+ if ((S.LaneMask & UseMask & Mask).any() && S.liveAt(MIIndex)) {
+ LiveRegUnits.insert(Unit);
+ }
+ }
+ }
+ } else {
+ for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ unsigned Unit = (*Units).first;
+ const LiveRange &UnitRange = LIS->getRegUnit(Unit);
+ LaneBitmask Mask = (*Units).second;
+
+ if (UnitRange.liveAt(MIIndex) && (UseMask & Mask).any())
+ LiveRegUnits.insert(Unit);
+ }
+ }
+
+ // Narrow to the exact subregister and recompute UseMask from its register
+ // class.
+ if (SubRegIdx != 0) {
+ PhysReg = TRI->getSubReg(PhysReg, SubRegIdx);
+ UseMask = (TRI->getMinimalPhysRegClass(PhysReg))->getLaneMask();
+ }
+
+ for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ unsigned Unit = (*Units).first;
+ LaneBitmask Mask = (*Units).second;
+ if (LiveRegUnits.count(Unit)) {
+ // dbgs() << "LIVE DEF UNIT : " << printRegUnit(Unit, TRI) << '\n';
+ LiveRegUnitMask |= Mask;
+ }
+ }
+
+ // dbgs() << "UseMask : " << PrintLaneMask(UseMask) << '\n';
+ // dbgs() << "LiveRegUnitMask : " << PrintLaneMask(LiveRegUnitMask) << '\n';
+ if (UseMask == LiveRegUnitMask)
+ return 0u;
+
+ return LiveRegUnitMask.getAsInteger();
+}
+
void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) {
if (!MI.isIdentityCopy())
return;
@@ -495,7 +568,11 @@ void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) {
// give us additional liveness information: The target (super-)register
// must not be valid before this point. Replace the COPY with a KILL
// instruction to maintain this information.
- if (MI.getOperand(1).isUndef() || MI.getNumOperands() > 2) {
+
+ // A COPY with exactly 3 operands whose third operand is the mask immediate
+ // carries no additional liveness information, so treat it like a plain COPY.
+ if (MI.getOperand(1).isUndef() || MI.getNumOperands() > 3 ||
+ (MI.getNumOperands() == 3 && !MI.getOperand(2).isImm())) {
MI.setDesc(TII->get(TargetOpcode::KILL));
LLVM_DEBUG(dbgs() << " replace by: " << MI);
return;
@@ -641,11 +718,14 @@ void VirtRegRewriter::rewrite() {
SmallVector<Register, 8> SuperDeads;
SmallVector<Register, 8> SuperDefs;
SmallVector<Register, 8> SuperKills;
+ uint64_t Mask;
for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
MBBI != MBBE; ++MBBI) {
LLVM_DEBUG(MBBI->print(dbgs(), Indexes));
for (MachineInstr &MI : llvm::make_early_inc_range(MBBI->instrs())) {
+ // Reset the mask for each MI.
+ Mask = 0u;
for (MachineOperand &MO : MI.operands()) {
// Make sure MRI knows about registers clobbered by regmasks.
if (MO.isRegMask())
@@ -663,6 +743,9 @@ void VirtRegRewriter::rewrite() {
RewriteRegs.insert(PhysReg);
assert(!MRI->isReserved(PhysReg) && "Reserved register assignment");
+ if (MO.isUse() && MI.isCopy())
+ Mask = calcLiveRegUnitMask(MO, PhysReg);
+
// Preserve semantics of sub-register operands.
unsigned SubReg = MO.getSubReg();
if (SubReg != 0) {
@@ -739,6 +822,10 @@ void VirtRegRewriter::rewrite() {
MO.setIsRenamable(true);
}
+ // Add the LaneBitmask as an immediate operand.
+ if (MI.isCopy() && Mask)
+ MI.addOperand(*MF, MachineOperand::CreateImm(Mask));
+
// Add any missing super-register kills after rewriting the whole
// instruction.
while (!SuperKills.empty())
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 571f3efd68260..29c6d18a65308 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -691,16 +691,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
I->clearRegisterKills(DefOp.getReg(), &RI);
}
- MachineInstrBuilder Builder =
- BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
- .add(DefOp);
- if (ImpDefSuperReg)
- Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
-
- if (ImpUseSuperReg) {
- Builder.addReg(ImpUseSuperReg,
- getKillRegState(KillSrc) | RegState::Implicit);
- }
+ BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
+ .add(DefOp);
return;
}
@@ -744,27 +736,26 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
.addReg(SrcReg, getKillRegState(KillSrc));
- if (ImpUseSuperReg) {
- UseBuilder.addReg(ImpUseSuperReg,
- getKillRegState(KillSrc) | RegState::Implicit);
- }
- MachineInstrBuilder DefBuilder
- = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
- .addReg(Tmp, RegState::Kill);
-
- if (ImpDefSuperReg)
- DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
+ BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
+ .addReg(Tmp, RegState::Kill);
}
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, const DebugLoc &DL,
MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
- const TargetRegisterClass *RC, bool Forward) {
+ const TargetRegisterClass *RC, bool Forward,
+ uint64_t LiveRegUnitMaskVal) {
const SIRegisterInfo &RI = TII.getRegisterInfo();
ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
MachineBasicBlock::iterator I = MI;
- MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
+ bool isSrcRegFullLive = LiveRegUnitMaskVal == 0;
+
+ uint64_t TestMaskVal = 0x0000000000000003;
+ uint8_t ShiftVal = 2;
+
+ if (!Forward)
+ TestMaskVal = TestMaskVal << (ShiftVal * (BaseIndices.size() - 1));
for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
int16_t SubIdx = BaseIndices[Idx];
@@ -772,41 +763,47 @@ static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
unsigned Opcode = AMDGPU::S_MOV_B32;
+ bool IsFirstSubreg = Idx == 0;
+
+ if (!IsFirstSubreg) {
+ TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
+ }
+
+ // Check for liveness of current subregister using TestMaskVal.
+ if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t(0))
+ continue;
// Is SGPR aligned? If so try to combine with next.
bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
- if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
+ bool isSrc64Live = true;
+
+ if (!isSrcRegFullLive)
+ isSrc64Live = Forward
+ ? ((LiveRegUnitMaskVal & (TestMaskVal << ShiftVal)) !=
+ uint64_t(0))
+ : ((LiveRegUnitMaskVal & (TestMaskVal >> ShiftVal)) !=
+ uint64_t(0));
+
+ if (isSrc64Live && AlignedDest && AlignedSrc &&
+ (Idx + 1 < BaseIndices.size())) {
// Can use SGPR64 copy
unsigned Channel = RI.getChannelFromSubReg(SubIdx);
SubIdx = RI.getSubRegFromChannel(Channel, 2);
DestSubReg = RI.getSubReg(DestReg, SubIdx);
SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
+ TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
Opcode = AMDGPU::S_MOV_B64;
Idx++;
}
- LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
- .addReg(SrcSubReg)
- .addReg(SrcReg, RegState::Implicit);
-
- if (!FirstMI)
- FirstMI = LastMI;
+ BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
+ .addReg(SrcSubReg, getKillRegState(KillSrc));
if (!Forward)
I--;
}
-
- assert(FirstMI && LastMI);
- if (!Forward)
- std::swap(FirstMI, LastMI);
-
- FirstMI->addOperand(
- MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
-
- if (KillSrc)
- LastMI->addRegisterKilled(SrcReg, &RI);
}
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
@@ -819,6 +816,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
+ uint64_t LiveRegUnitMaskVal = 0;
+ if (MI->getNumOperands() > 2 && MI->getOperand(2).isImm()) {
+ LiveRegUnitMaskVal = MI->getOperand(2).getImm();
+ }
+
+ bool isSrcRegFullLive = LiveRegUnitMaskVal == 0;
+
// The rest of copyPhysReg assumes Src and Dst size are the same size.
// TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
// we remove Fix16BitCopies and this code block?
@@ -1052,16 +1056,15 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (ST.hasPkMovB32()) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
- .addImm(SISrcMods::OP_SEL_1)
- .addReg(SrcReg)
- .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
- .addReg(SrcReg)
- .addImm(0) // op_sel_lo
- .addImm(0) // op_sel_hi
- .addImm(0) // neg_lo
- .addImm(0) // neg_hi
- .addImm(0) // clamp
- .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
+ .addImm(SISrcMods::OP_SEL_1)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0); // clamp
return;
}
}
@@ -1074,12 +1077,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
- Forward);
+ Forward, LiveRegUnitMaskVal);
return;
}
unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+ uint64_t TestMaskVal = 0x0000000000000003;
+ uint8_t ShiftVal = 2;
if (RI.isAGPRClass(RC)) {
if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
@@ -1094,12 +1099,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
(RI.isProperlyAlignedRC(*RC) &&
(SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
// TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
+ // TODO: In case of partial liveness, could do a mix of 64-bit and 32-bit
+ // moves; see the expandSGPRCopy function for reference.
if (ST.hasMovB64()) {
Opcode = AMDGPU::V_MOV_B64_e32;
EltSize = 8;
+ TestMaskVal = 0x000000000000000F;
+ ShiftVal = 4;
} else if (ST.hasPkMovB32()) {
Opcode = AMDGPU::V_PK_MOV_B32;
EltSize = 8;
+ TestMaskVal = 0x000000000000000F;
+ ShiftVal = 4;
}
}
@@ -1114,6 +1125,10 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
+ // TestMaskVal scans from the low subregister upward; when copying
+ // backward, start it at the highest subregister instead.
+ if (!Forward)
+ TestMaskVal = TestMaskVal << (ShiftVal * (SubIndices.size() - 1));
+
// If there is an overlap, we can't kill the super-register on the last
// instruction, since it will also kill the components made live by this def.
const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
@@ -1130,7 +1145,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
bool IsFirstSubreg = Idx == 0;
- bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
+ bool UseKill = CanKillSuperReg;
+
+ if (!IsFirstSubreg) {
+ TestMaskVal = Forward ? TestMaskVal << ShiftVal : TestMaskVal >> ShiftVal;
+ }
+
+ if (!isSrcRegFullLive && (LiveRegUnitMaskVal & TestMaskVal) == uint64_t(0))
+ continue;
if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
@@ -1141,24 +1163,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineInstrBuilder MIB =
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
.addImm(SISrcMods::OP_SEL_1)
- .addReg(SrcSubReg)
+ .addReg(SrcSubReg, getKillRegState(UseKill))
.addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
- .addReg(SrcSubReg)
- .addImm(0) // op_sel_lo
- .addImm(0) // op_sel_hi
- .addImm(0) // neg_lo
- .addImm(0) // neg_hi
- .addImm(0) // clamp
- .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
- if (IsFirstSubreg)
- MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
+ .addReg(SrcSubReg, getKillRegState(UseKill))
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0); // clamp
} else {
MachineInstrBuilder Builder =
- BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
- if (IsFirstSubreg)
- Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
-
- Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
+ BuildMI(MBB, MI, DL, get(Opcode), DestSubReg)
+ .addReg(SrcSubReg, getKillRegState(UseKill));
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index 38374d1689366..3cab3fb763523 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -676,8 +676,8 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
; GFX7-LABEL: s_saddo_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_add_u32 s4, s0, s2
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_addc_u32 s5, s1, s3
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX7-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
@@ -693,8 +693,8 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
; GFX8-LABEL: s_saddo_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s4, s0, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_addc_u32 s5, s1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
@@ -710,8 +710,8 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
; GFX9-LABEL: s_saddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s4, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_addc_u32 s5, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
index 28ed88f4cf8fb..31606dea7d335 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
@@ -96,8 +96,8 @@ define amdgpu_cs void @atomic_add_and_format(<4 x i32> inreg %arg) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
@@ -192,8 +192,8 @@ define amdgpu_cs void @atomic_sub_and_format(<4 x i32> inreg %arg) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
@@ -294,8 +294,8 @@ define amdgpu_cs void @atomic_xor_and_format(<4 x i32> inreg %arg) {
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_xor_b32_e32 v4, s4, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
@@ -392,8 +392,8 @@ define amdgpu_cs void @atomic_ptr_add_and_format(ptr addrspace(8) inreg %arg) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
@@ -492,8 +492,8 @@ define amdgpu_cs void @atomic_ptr_sub_and_format(ptr addrspace(8) inreg %arg) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s4, v1
;...
[truncated]
``````````
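To complement the truncated diff above, here is a toy model of the encode side (`calcLiveRegUnitMask` in the VirtRegMap.cpp hunk). It is a sketch under the same assumed two-bits-per-32-bit-subregister encoding; `calcLiveMask` and the plain `bool` liveness array stand in for LLVM's LaneBitmask and LiveIntervals subranges.

```cpp
#include <cstdint>
#include <cstdio>

// NumSubRegs 32-bit subregisters, two lane bits each; Live[I] says whether
// subregister I is live at the COPY. Returns the mask to encode as the
// immediate operand, or 0 when the use is fully live (the common case,
// which adds no operand at all).
static uint64_t calcLiveMask(const bool *Live, unsigned NumSubRegs,
                             uint64_t UseMask) {
  uint64_t LiveMask = 0;
  for (unsigned I = 0; I < NumSubRegs; ++I)
    if (Live[I])
      LiveMask |= uint64_t(0x3) << (2 * I);
  return LiveMask == UseMask ? 0 : LiveMask;
}

int main() {
  bool Live[4] = {true, false, false, true}; // sub1 and sub2 are dead.
  uint64_t UseMask = 0xFF;                   // All four subregisters used.
  printf("encoded mask = 0x%llx\n",
         (unsigned long long)calcLiveMask(Live, 4, UseMask));
  return 0;
}
```

This prints `encoded mask = 0xc3`, the same mask the decode sketch earlier consumes; a fully live array would print `0x0` and the COPY would stay a plain two-operand instruction.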
</details>
https://github.com/llvm/llvm-project/pull/151124
More information about the llvm-commits mailing list