[llvm] [CodeGen][AMDGPU] Insert IMPLICIT_DEF for undef subreg operands in REG_SEQUENCE and preserve them during register coalescing (PR #189153)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 28 02:47:08 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-hexagon
@llvm/pr-subscribers-backend-aarch64
Author: adeshcom14
<details>
<summary>Changes</summary>
Currently, when TwoAddressInstructionPass lowers a REG_SEQUENCE, undef subreg operands are skipped whenever LiveIntervals is available, or whenever a scan of the downstream uses finds no use of the corresponding lanes. This can leave undefined sublanes of a tuple register without any definition point. When the Register Coalescer later merges intervals involving these registers, it encounters sublane masks with no corresponding definition, causing a crash.
Now, eliminateRegSequence always emits explicit IMPLICIT_DEF instructions for undef subreg operands, ensuring every sublane has a proper definition.
Additionally, the Register Coalescer unconditionally marked all IMPLICIT_DEF values as erasable, causing subreg IMPLICIT_DEFs that define undef sublanes of a tuple to be erased during coalescing. This left the merged live interval with sublane masks that had no definition.
Now subreg IMPLICIT_DEFs are no longer marked as erasable. When such an IMPLICIT_DEF conflicts with a value whose lanes do not overlap the lanes it writes, the conflict is resolved with CR_Replace instead of CR_Erase, preserving the sublane definition.
The regression test coalescer-subreg-implicit-def.mir exercises both passes on a case that previously crashed.
Fixes [LCOMPILER-101]
---
Patch is 11.29 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/189153.diff
429 Files Affected:
- (modified) llvm/lib/CodeGen/RegisterCoalescer.cpp (+20-2)
- (modified) llvm/lib/CodeGen/TwoAddressInstructionPass.cpp (+16-16)
- (modified) llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll (+1)
- (modified) llvm/test/CodeGen/AArch64/arm64-dup.ll (+1)
- (modified) llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll (+33-27)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll (-19)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/bug_shuffle_vector_to_scalar.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll (+26-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll (+9)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fptrunc.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fsub.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll (+72-24)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fract.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll (+30)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll (+4-56)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll (+22)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll (+31-25)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mad.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll (+62-46)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+66-32)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+14-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll (+1371-1315)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+742-706)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll (+28-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll (+96-82)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/smul.ll (+8-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll (+1791-1720)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+14-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f32.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/twoaddr-extract-dyn-v7f64.mir (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll (+11)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll (+744-708)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll (+11)
- (modified) llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll (+8-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll (+24)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+6674-4452)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+116)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll (+390-76)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll (+488-70)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll (+276-108)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+1688-852)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll (+853-169)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+5015-1192)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll (+24)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll (+727-195)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll (+1761-134)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll (+1509-131)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+2332-1308)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+5422-142)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+4630-142)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll (+56)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+3958-142)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+3286-142)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+2614-142)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+2106-322)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+1174-174)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+1131-642)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+9-1)
- (modified) llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/anyext.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+114-60)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+105-21)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll (+12)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+174-63)
- (modified) llvm/test/CodeGen/AMDGPU/bfi_int.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/bitop3.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/br_cc.f16.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation.ll (+7-3)
- (modified) llvm/test/CodeGen/AMDGPU/bswap.ll (+16-7)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll (+7-5)
- (modified) llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll (+142-28)
- (modified) llvm/test/CodeGen/AMDGPU/bypass-div.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+376-227)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+74-24)
- (modified) llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (+9)
- (modified) llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.ll (+15)
- (added) llvm/test/CodeGen/AMDGPU/coalescer-subreg-implicit-def.mir (+126)
- (modified) llvm/test/CodeGen/AMDGPU/collapse-endcf.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll (+11)
- (modified) llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+17-5)
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll (+9)
- (modified) llvm/test/CodeGen/AMDGPU/dead-lane.mir (+6-5)
- (modified) llvm/test/CodeGen/AMDGPU/dead-machine-elim-after-dead-lane.ll (+5-3)
- (modified) llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll (+63-45)
- (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+6-1)
- (modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (+141-123)
- (modified) llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir (+1)
- (modified) llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll (+144)
- (modified) llvm/test/CodeGen/AMDGPU/extract-subvector.ll (+144)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll (+10-6)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll (+39-22)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll (+24-16)
- (modified) llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll (+66)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/fcmp.f16.ll (+58-28)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+113-29)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+81-19)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-vgpr16-to-spgr32.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll (+20)
- (modified) llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll (+98-42)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll (+18)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll (+120-36)
- (modified) llvm/test/CodeGen/AMDGPU/fma.f16.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/fmaximum.ll (+10)
- (modified) llvm/test/CodeGen/AMDGPU/fmaximum3.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/fmaxnum.ll (+18-4)
- (modified) llvm/test/CodeGen/AMDGPU/fmed3.ll (+14-9)
- (modified) llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/fminimum.ll (+10)
- (modified) llvm/test/CodeGen/AMDGPU/fminimum3.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/fminnum.ll (+18-4)
- (modified) llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll (+50)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/fneg.bf16.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll (+12)
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll (+12)
- (modified) llvm/test/CodeGen/AMDGPU/fpext-free.ll (+32-7)
- (modified) llvm/test/CodeGen/AMDGPU/fpext.f16.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/fptoi.i128.ll (+21-16)
- (modified) llvm/test/CodeGen/AMDGPU/fptosi-sat-scalar.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/fptosi-sat-vector.ll (+56-8)
- (modified) llvm/test/CodeGen/AMDGPU/fptoui-sat-scalar.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/fptoui-sat-vector.ll (+77-29)
- (modified) llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll (+7)
- (modified) llvm/test/CodeGen/AMDGPU/freeze.ll (+136-33)
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+109-12)
- (modified) llvm/test/CodeGen/AMDGPU/fshr.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/function-args-inreg.ll (+134-65)
- (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+588-83)
- (modified) llvm/test/CodeGen/AMDGPU/function-returns.ll (+162)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+313-130)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+10)
- (modified) llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+218-72)
- (modified) llvm/test/CodeGen/AMDGPU/gws_agpr.ll (+10)
- (modified) llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll (+152-149)
- (modified) llvm/test/CodeGen/AMDGPU/idiv-licm.ll (+12)
- (modified) llvm/test/CodeGen/AMDGPU/idot4s.ll (+13-7)
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+115-23)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+705-530)
- (modified) llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/itofp.i128.ll (+135-109)
- (modified) llvm/test/CodeGen/AMDGPU/kernel-args.ll (+7-2)
- (modified) llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll (+7-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll (+67-27)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll (+11-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll (+122-52)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll (+30-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll (+24)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll (+315-112)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll (+115-44)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll (+115-44)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll (+4-10)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll (+204-92)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.waitcnt.out.order.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll (+254-250)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll (+9)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.frexp.ll (+36)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll (+9)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+625-776)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+625-776)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+9)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.mulo.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.round.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll (+36-12)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+1806-1278)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+653-513)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+757-622)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+168-77)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+734-645)
- (modified) llvm/test/CodeGen/AMDGPU/load-local-i16.ll (+255-68)
- (modified) llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/loop_break.ll (+96-84)
- (modified) llvm/test/CodeGen/AMDGPU/lrint.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/lround.ll (+9-4)
- (modified) llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll (+70-14)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll (+82-20)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix.ll (+1019-289)
- (modified) llvm/test/CodeGen/AMDGPU/mad_64_32.ll (+70-47)
- (modified) llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll (+32-11)
- (modified) llvm/test/CodeGen/AMDGPU/madak.ll (+6-2)
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.ll (+189-68)
- (modified) llvm/test/CodeGen/AMDGPU/memmove-var-size.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/memset-pattern.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/min.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/minimummaximum.ll (+35-12)
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.ll (+186-68)
- (modified) llvm/test/CodeGen/AMDGPU/minmax.ll (+16)
- (modified) llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll (+7-6)
- (modified) llvm/test/CodeGen/AMDGPU/mul_int24.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/omod.ll (+58)
- (modified) llvm/test/CodeGen/AMDGPU/pack.v2f16.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/pack.v2i16.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+66-32)
- (modified) llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+10-5)
- (modified) llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll (+18-7)
- (modified) llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/r600-export-fix.ll (+29-29)
- (modified) llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll (+145-74)
- (modified) llvm/test/CodeGen/AMDGPU/repeated-divisor.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll (+11)
- (modified) llvm/test/CodeGen/AMDGPU/roundeven.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/s_cmp_0.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll (+80-63)
- (modified) llvm/test/CodeGen/AMDGPU/scratch-simple.ll (+669-466)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+9-2)
- (modified) llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/shl.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll (+54-22)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll (+101-34)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll (+96-22)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll (+242-60)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll (+54-22)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll (+101-34)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll (+96-22)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll (+242-60)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll (+90-30)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll (+162-48)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll (+162-48)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll (+294-162)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll (+90-30)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll (+162-48)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll (+162-48)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll (+54-22)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll (+101-34)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll (+96-22)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll (+242-60)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll (+27)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll (+36)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v4bf16.ll (+51)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll (+27)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll (+36)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v4f16.ll (+51)
``````````diff
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 586c27b7e3baf..bdfa25fb08b1b 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -2876,7 +2876,10 @@ JoinVals::ConflictResolution JoinVals::analyzeValue(unsigned ValNo,
//
// Clearing the valid lanes is deferred until it is sure this can be
// erased.
- V.ErasableImplicitDef = true;
+ // IMPLICIT_DEF can also be used to initialize the undef sub-parts
+ // of a tuple. We want to retain those IMPLICIT_DEFs.
+ if (DefMI->getOperand(0).getSubReg() == 0)
+ V.ErasableImplicitDef = true;
}
}
}
@@ -2975,8 +2978,23 @@ JoinVals::ConflictResolution JoinVals::analyzeValue(unsigned ValNo,
return CR_Replace;
// Check for simple erasable conflicts.
- if (DefMI->isImplicitDef())
+ if (DefMI->isImplicitDef()) {
+ // A subreg IMPLICIT_DEF that initializes an undef sublane of a tuple
+ // must not be erased if the other value has no overlapping lanes. Erasing
+ // it would leave the merged interval with sublane masks that have no
+ // definition.
+ if (DefMI->getOperand(0).getSubReg() && !SubRangeJoin &&
+ TrackSubRegLiveness && !OtherV.ErasableImplicitDef) {
+ LaneBitmask OtherLanes = Other.SubIdx
+ ? TRI->getSubRegIndexLaneMask(Other.SubIdx)
+ : LaneBitmask::getAll();
+ if ((OtherLanes & V.WriteLanes).none()) {
+ V.ValidLanes &= ~V.WriteLanes;
+ return CR_Replace;
+ }
+ }
return CR_Erase;
+ }
// Include the non-conflict where DefMI is a coalescable copy that kills
// OtherVNI. We still want the copy erased and value numbers merged.
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index ace5c6e49596e..0d26efb760442 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -2038,16 +2038,6 @@ void TwoAddressInstructionImpl::eliminateRegSequence(
}
}
- // If there are no live intervals information, we scan the use list once
- // in order to find which subregisters are used.
- LaneBitmask UsedLanes = LaneBitmask::getNone();
- if (!LIS) {
- for (MachineOperand &Use : MRI->use_nodbg_operands(DstReg)) {
- if (unsigned SubReg = Use.getSubReg())
- UsedLanes |= TRI->getSubRegIndexLaneMask(SubReg);
- }
- }
-
LaneBitmask UndefLanes = LaneBitmask::getNone();
bool DefEmitted = false;
for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) {
@@ -2055,14 +2045,24 @@ void TwoAddressInstructionImpl::eliminateRegSequence(
Register SrcReg = UseMO.getReg();
unsigned SubIdx = MI.getOperand(i+1).getImm();
// Nothing needs to be inserted for undef operands.
- // Unless there are no live intervals, and they are used at a later
- // instruction as operand.
+ // Insert IMPLICIT_DEF for undef operands with the corresponding
+ // sub-register.
if (UseMO.isUndef()) {
- LaneBitmask LaneMask = TRI->getSubRegIndexLaneMask(SubIdx);
- if (LIS || (UsedLanes & LaneMask).none()) {
- UndefLanes |= LaneMask;
- continue;
+ UndefLanes |= TRI->getSubRegIndexLaneMask(SubIdx);
+ // Insert IMPLICIT_DEF on dst register with the sub-register index.
+ MachineInstr *DefMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF))
+ .addReg(DstReg, RegState::Define, SubIdx);
+ // The first def needs an undef flag because there is no live register
+ // before it.
+ if (!DefEmitted) {
+ DefMI->getOperand(0).setIsUndef(true);
+ // Return an iterator pointing to the first inserted instr.
+ MBBI = DefMI;
+ DefEmitted = true;
}
+ LLVM_DEBUG(dbgs() << "Inserted: " << *DefMI);
+ continue;
}
// Defer any kill flag to the last operand using SrcReg. Otherwise, we
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index ea55c198a70f1..8a662bf17d89b 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -207,6 +207,7 @@ define void @matrix_mul_double_shuffle(i32 %N, ptr nocapture %C, ptr nocapture r
; CHECK-GI-NEXT: fmov s2, w9
; CHECK-GI-NEXT: mov w9, w0
; CHECK-GI-NEXT: add w0, w0, #8
+; CHECK-GI-NEXT: // implicit-def: $q3
; CHECK-GI-NEXT: lsl x9, x9, #2
; CHECK-GI-NEXT: tbl v2.16b, { v2.16b, v3.16b }, v1.16b
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 49fb6c98e223f..d93aa7a59e646 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -513,6 +513,7 @@ define void @disguised_dup(<4 x float> %x, ptr %p1, ptr %p2) {
; CHECK-GI-NEXT: adrp x8, .LCPI38_1
; CHECK-GI-NEXT: // kill: def $q0 killed $q0 def $q0_q1
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI38_1]
+; CHECK-GI-NEXT: // implicit-def: $q1
; CHECK-GI-NEXT: adrp x8, .LCPI38_0
; CHECK-GI-NEXT: tbl.16b v0, { v0, v1 }, v2
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI38_0]
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
index 062f5de38c45b..ce48d94e2124f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
@@ -395,6 +395,7 @@ define <4 x i32> @test_sabd_knownbits_vec4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI31_0]
; CHECK-GI-NEXT: movi v3.2d, #0x0000ff000000ff
; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: // implicit-def: $q1
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-GI-NEXT: ret
@@ -426,6 +427,7 @@ define <4 x i32> @knownbits_sabd_and_mask(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI32_0]
; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: // implicit-def: $q1
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
@@ -453,6 +455,7 @@ define <4 x i32> @knownbits_sabd_and_or_mask(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-GI-NEXT: orr v0.16b, v0.16b, v3.16b
; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b
; CHECK-GI-NEXT: uabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: // implicit-def: $q1
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
@@ -490,6 +493,7 @@ define <4 x i32> @knownbits_sabd_and_xor_mask(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-GI-NEXT: eor v0.16b, v0.16b, v3.16b
; CHECK-GI-NEXT: eor v1.16b, v1.16b, v3.16b
; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: // implicit-def: $q1
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
@@ -518,6 +522,7 @@ define <4 x i32> @knownbits_sabd_and_shl_mask(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-GI-NEXT: shl v0.4s, v0.4s, #17
; CHECK-GI-NEXT: shl v1.4s, v1.4s, #17
; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: // implicit-def: $q1
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
%1 = and <4 x i32> %a0, <i32 -65536, i32 -7, i32 -7, i32 -65536>
@@ -554,6 +559,7 @@ define <4 x i32> @knownbits_sabd_and_mul_mask(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-GI-NEXT: mul v1.4s, v1.4s, v2.4s
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_0]
; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: // implicit-def: $q1
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT: ret
%1 = and <4 x i32> %a0, <i32 -65536, i32 -7, i32 -7, i32 -65536>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
index 0a7edc13d2fad..e7dd1a8a80328 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
@@ -86,6 +86,7 @@ define i16 @v_add_i16(i16 %a, i16 %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-NEXT: ; implicit-def: $vgpr0_hi16
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_add_i16:
@@ -96,6 +97,7 @@ define i16 @v_add_i16(i16 %a, i16 %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX12-NEXT: ; implicit-def: $vgpr0_hi16
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = add i16 %a, %b
ret i16 %c
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 6fe6b526a7afe..ecc0de6542656 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1839,21 +1839,23 @@ define i65 @v_ashr_i65_33(i65 %value) {
define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) {
; GCN-LABEL: s_ashr_i65:
; GCN: ; %bb.0:
-; GCN-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
-; GCN-NEXT: s_sub_i32 s10, s3, 64
-; GCN-NEXT: s_sub_i32 s8, 64, s3
-; GCN-NEXT: s_cmp_lt_u32 s3, 64
+; GCN-NEXT: s_mov_b32 s4, s3
+; GCN-NEXT: ; implicit-def: $sgpr3
+; GCN-NEXT: s_sub_i32 s10, s4, 64
+; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GCN-NEXT: s_sub_i32 s8, 64, s4
+; GCN-NEXT: s_cmp_lt_u32 s4, 64
; GCN-NEXT: s_cselect_b32 s11, 1, 0
-; GCN-NEXT: s_cmp_eq_u32 s3, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, 0
; GCN-NEXT: s_cselect_b32 s12, 1, 0
-; GCN-NEXT: s_ashr_i64 s[6:7], s[4:5], s3
-; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3
-; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GCN-NEXT: s_ashr_i32 s7, s5, 31
-; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], s10
+; GCN-NEXT: s_ashr_i64 s[6:7], s[2:3], s4
+; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
+; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GCN-NEXT: s_ashr_i32 s7, s3, 31
+; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], s10
; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s12, 0
; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s11, 0
@@ -1862,25 +1864,27 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) {
;
; GFX10PLUS-LABEL: s_ashr_i65:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3
-; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64
+; GFX10PLUS-NEXT: s_mov_b32 s4, s3
+; GFX10PLUS-NEXT: ; implicit-def: $sgpr3
+; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT: s_sub_i32 s10, s4, 64
+; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s4
+; GFX10PLUS-NEXT: s_cmp_lt_u32 s4, 64
; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
-; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u32 s4, 0
; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3
-; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2
-; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[4:5], s3
+; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s5
+; GFX10PLUS-NEXT: s_ashr_i64 s[4:5], s[2:3], s4
; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX10PLUS-NEXT: s_ashr_i32 s3, s5, 31
-; GFX10PLUS-NEXT: s_ashr_i64 s[4:5], s[4:5], s10
+; GFX10PLUS-NEXT: s_ashr_i32 s5, s3, 31
+; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], s10
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, s3
+; GFX10PLUS-NEXT: s_cselect_b32 s2, s4, s5
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i65 %value, %amount
ret i65 %result
@@ -1889,8 +1893,9 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) {
define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_ashr_i65_33:
; GCN: ; %bb.0:
-; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GCN-NEXT: ; implicit-def: $sgpr3
; GCN-NEXT: s_lshr_b32 s4, s1, 1
+; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
; GCN-NEXT: s_or_b32 s0, s0, s4
; GCN-NEXT: s_ashr_i32 s2, s3, 1
@@ -1898,8 +1903,9 @@ define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) {
;
; GFX10PLUS-LABEL: s_ashr_i65_33:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT: ; implicit-def: $sgpr3
; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 1
+; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[2:3], 31
; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
index 32df44cb2f84f..1309ca33b9769 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -402,12 +402,6 @@ define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) {
; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_bswap_i16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0xc0c0001
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
%bswap = call i16 @llvm.bswap.i16(i16 %src)
ret i16 %bswap
}
@@ -434,12 +428,6 @@ define i16 @v_bswap_i16(i16 %src) {
; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001
; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_bswap_i16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
-; GFX10-NEXT: s_setpc_b64 s[30:31]
%bswap = call i16 @llvm.bswap.i16(i16 %src)
ret i16 %bswap
}
@@ -545,13 +533,6 @@ define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_bswap_i16_sext_to_i32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
-; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX10-NEXT: s_setpc_b64 s[30:31]
%bswap = call i16 @llvm.bswap.i16(i16 %src)
%zext = sext i16 %bswap to i32
ret i32 %zext
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bug_shuffle_vector_to_scalar.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bug_shuffle_vector_to_scalar.ll
index 645b239015821..0e8d29e0ba7df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bug_shuffle_vector_to_scalar.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bug_shuffle_vector_to_scalar.ll
@@ -13,9 +13,11 @@ define amdgpu_gs <4 x float> @_amdgpu_gs_main() {
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s2, s0
; CHECK-NEXT: s_mov_b32 s3, s0
-; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: ; implicit-def: $vgpr2
+; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v2, s[0:3], 0 idxen
+; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_mov_b32_e32 v0, v1
; CHECK-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 92fd4466cf8a5..4629708b5c73a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -80,6 +80,7 @@ define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: ; implicit-def: $sgpr4_sgpr5
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 206011adf0213..b77b3858d5f27 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -2658,6 +2658,7 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v_bitcast(<14 x float> inreg %userD
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: v_readfirstlane_b32 s1, v1
+; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: dyn_extract_v7f64_s_v_bitcast:
@@ -2686,6 +2687,7 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v_bitcast(<14 x float> inreg %userD
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
+; GFX10-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
@@ -2715,6 +2717,7 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v_bitcast(<14 x float> inreg %userD
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
+; GFX11-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: ; return to shader part epilog
@@ -2784,6 +2787,7 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: v_readfirstlane_b32 s1, v1
+; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: dyn_extract_v7f64_s_v:
@@ -2812,6 +2816,7 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
+; GFX10-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
@@ -2841,6 +2846,7 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
+; GFX11-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: ; return to shader part epilog
@@ -2929,29 +2935,32 @@ define amdgpu_ps double @dyn_extract_v7f64_v_s(<7 x double> %vec, i32 inreg %sel
; GPRIDX-LABEL: dyn_extract_v7f64_v_s:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_lshl_b32 s0, s2, 1
+; GPRIDX-NEXT: ; implicit-def: $vgpr14_vgpr15
; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
-; GPRIDX-NEXT: v_mov_b32_e32 v14, v0
+; GPRIDX-NEXT: v_mov_b32_e32 v16, v0
; GPRIDX-NEXT: v_mov_b32_e32 v0, v1
; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: v_readfirstlane...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/189153
More information about the llvm-commits
mailing list