[llvm] b8127cc - [AMDGPU][True16][CodeGen] fix v_mov_b16_t16 index in folding pass (#161764)

via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 3 14:34:45 PDT 2025


Author: Brox Chen
Date: 2025-10-03T17:34:42-04:00
New Revision: b8127cc8d0f5cb48c106199a059442e2719e8656

URL: https://github.com/llvm/llvm-project/commit/b8127cc8d0f5cb48c106199a059442e2719e8656
DIFF: https://github.com/llvm/llvm-project/commit/b8127cc8d0f5cb48c106199a059442e2719e8656.diff

LOG: [AMDGPU][True16][CodeGen] fix v_mov_b16_t16 index in folding pass (#161764)

With true16 mode enabled, v_mov_b16_t16 is added as a new foldable copy
instruction, but its src operand sits at a different index than in the other
foldable copies.

Use the correct src operand index for v_mov_b16_t16.
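
For context (editor's note, not part of the commit message): in MI form the
t16 mov is laid out as (vdst, src0_modifiers, src0, ...), as the new MIR test
below shows, so its source is operand 2, whereas V_MOV_B32_e32 is
(vdst, src0) with the source at operand 1. A minimal sketch of how the same
index could also be recovered via the named-operand lookup that already
exists in the AMDGPU backend, assuming it sits in SIInstrInfo.cpp where
AMDGPU::getNamedOperandIdx and AMDGPU::OpName are in scope (the helper name
is made up; the patch below uses an explicit switch instead):

    static int foldableCopySrcIdxByName(unsigned Opcode) {
      // COPY and some MOV pseudos carry no named src0 operand; fall back to
      // index 1, where every pre-true16 foldable copy keeps its source.
      int Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
      return Idx >= 0 ? Idx : 1;
    }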

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/test/CodeGen/AMDGPU/true16-fold.mir

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 82789bc4968c5..90c828ba8dfab 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -932,7 +932,9 @@ static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
   for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
        SubDef && TII.isFoldableCopy(*SubDef);
        SubDef = MRI.getVRegDef(Sub->getReg())) {
-    MachineOperand &SrcOp = SubDef->getOperand(1);
+    unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);
+    MachineOperand &SrcOp = SubDef->getOperand(SrcIdx);
+
     if (SrcOp.isImm())
       return &SrcOp;
     if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index cda8069936af2..46757cf5fe90c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3433,6 +3433,32 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
   }
 }
 
+unsigned SIInstrInfo::getFoldableCopySrcIdx(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::V_MOV_B16_t16_e32:
+  case AMDGPU::V_MOV_B16_t16_e64:
+    return 2;
+  case AMDGPU::V_MOV_B32_e32:
+  case AMDGPU::V_MOV_B32_e64:
+  case AMDGPU::V_MOV_B64_PSEUDO:
+  case AMDGPU::V_MOV_B64_e32:
+  case AMDGPU::V_MOV_B64_e64:
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::S_MOV_B64_IMM_PSEUDO:
+  case AMDGPU::COPY:
+  case AMDGPU::WWM_COPY:
+  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
+  case AMDGPU::V_ACCVGPR_READ_B32_e64:
+  case AMDGPU::V_ACCVGPR_MOV_B32:
+  case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
+  case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
+    return 1;
+  default:
+    llvm_unreachable("MI is not a foldable copy");
+  }
+}
+
 static constexpr AMDGPU::OpName ModifierOpNames[] = {
     AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
     AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
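
Editor's sketch (hypothetical, not taken from the patch): how a caller is
expected to pair the existing isFoldableCopy() with the new
getFoldableCopySrcIdx() rather than hard-coding getOperand(1). The helper
name getCopiedValue is invented for illustration and assumes SIInstrInfo.h
and the MachineInstr API are available:

    static const MachineOperand *getCopiedValue(const MachineInstr &MI) {
      if (!SIInstrInfo::isFoldableCopy(MI))
        return nullptr; // not a copy the folder understands
      // Returns 2 for V_MOV_B16_t16_* (src0 follows src0_modifiers) and 1
      // for every other foldable copy, per the switch added above.
      return &MI.getOperand(SIInstrInfo::getFoldableCopySrcIdx(MI));
    }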

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index a21089f8e0fcc..cc59acf1ebd94 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -417,6 +417,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                                   const MachineInstr &MIb) const override;
 
   static bool isFoldableCopy(const MachineInstr &MI);
+  static unsigned getFoldableCopySrcIdx(const MachineInstr &MI);
 
   void removeModOperands(MachineInstr &MI) const;
 

diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
index 93cc12f152cca..9484417e63c98 100644
--- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
@@ -57,6 +57,7 @@ body:             |
     %4:vgpr_16 = COPY %3:sgpr_lo16
     %5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec
     S_ENDPGM 0, implicit %5
+...
 
 ---
 name:            fold_16bit_madmix_clamp
@@ -207,3 +208,27 @@ body:             |
     $vgpr0 = COPY %4
     S_ENDPGM 0, implicit $vgpr0
 ...
+
+---
+name:            fold_imm16_across_reg_sequence
+tracksRegLiveness: true
+registers:
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-LABEL: name: fold_imm16_across_reg_sequence
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec
+    ; CHECK-NEXT: [[V_MOV_B16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B16_t16_e64_]], %subreg.lo16, [[V_MOV_B16_t16_e64_1]], %subreg.hi16
+    ; CHECK-NEXT: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F32_e64 0, -1, 0, -1, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F32_e64_]]
+    ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec
+    %1:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec
+    %2:vgpr_32 = REG_SEQUENCE %0, %subreg.lo16, %1, %subreg.hi16
+    %3:vgpr_32 = nofpexcept V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec
+    $vgpr0 = COPY %3
+    S_ENDPGM 0, implicit $vgpr0
+...
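
A note on the new test's CHECK lines (editor's addition): the two 16-bit
moves of -1 feeding the REG_SEQUENCE fold into the single 32-bit inline
immediate -1 on V_MAX_F32_e64, since packing 0xFFFF into both halves gives
0xFFFFFFFF. A small standalone C++ check of that packing arithmetic,
illustrative only and not LLVM code:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint16_t Lo = 0xFFFFu; // lo16 half, the first V_MOV_B16's immediate (-1)
      uint16_t Hi = 0xFFFFu; // hi16 half, the second V_MOV_B16's immediate (-1)
      uint32_t Packed = (uint32_t(Hi) << 16) | Lo;
      // Matches the -1 source operands FileCheck expects on V_MAX_F32_e64.
      assert(static_cast<int32_t>(Packed) == -1);
      return 0;
    }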


        

