[llvm] c8c5dc7 - GlobalIsel: Fix fma combine when one of the operands comes from unmerge
Petar Avramovic via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 12 08:47:59 PST 2022
Author: Petar Avramovic
Date: 2022-01-12T17:47:25+01:00
New Revision: c8c5dc766b259a64daf8e43045bed4e01455078c
URL: https://github.com/llvm/llvm-project/commit/c8c5dc766b259a64daf8e43045bed4e01455078c
DIFF: https://github.com/llvm/llvm-project/commit/c8c5dc766b259a64daf8e43045bed4e01455078c.diff
LOG: GlobalIsel: Fix fma combine when one of the operands comes from unmerge
The FMA combines assume that MRI.getVRegDef(Reg)->getOperand(0).getReg() == Reg,
which does not hold when Reg is defined by an instruction with multiple defs,
e.g. G_UNMERGE_VALUES.
The fix is to keep both the register and its defining instruction in a
DefinitionAndSourceRegister and use whichever one is needed.
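To illustrate the failure mode (a hedged sketch, not taken from the patch; the
register names %lo/%hi/%vec, HiReg and the surrounding instructions are
hypothetical): for a multi-def instruction such as G_UNMERGE_VALUES, def
operand 0 is the first result, which need not be the register the combine
started from, so indexing operand 0 substitutes the wrong value. Keeping the
original register next to its defining MachineInstr avoids that:

    // Hypothetical MIR; the fadd uses %hi, the *second* def of the unmerge:
    //   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
    //   %res:_(s32) = G_FADD %mul, %hi

    // Before the fix: walk back to the def and take its first def operand.
    MachineInstr *Def = MRI.getVRegDef(HiReg);  // the G_UNMERGE_VALUES
    Register Z = Def->getOperand(0).getReg();   // %lo -- the wrong element

    // After the fix: keep the register alongside its defining instruction.
    DefinitionAndSourceRegister RHS = {MRI.getVRegDef(HiReg), HiReg};
    Register Z2 = RHS.Reg;                      // %hi -- the register actually used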
Differential Revision: https://reviews.llvm.org/D117032
Added:
Modified:
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 30b9196af59f8..9ba8cf0cd7c25 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4851,37 +4851,39 @@ bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA(
if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
return false;
- MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
- MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
unsigned PreferredFusedOpcode =
HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
- if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) &&
- isContractableFMul(*RHS, AllowFusionGlobally)) {
- if (hasMoreUses(*LHS, *RHS, MRI))
+ if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+ if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
std::swap(LHS, RHS);
}
// fold (fadd (fmul x, y), z) -> (fma x, y, z)
- if (isContractableFMul(*LHS, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg()))) {
+ if (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
- {LHS->getOperand(1).getReg(), LHS->getOperand(2).getReg(),
- RHS->getOperand(0).getReg()});
+ {LHS.MI->getOperand(1).getReg(),
+ LHS.MI->getOperand(2).getReg(), RHS.Reg});
};
return true;
}
// fold (fadd x, (fmul y, z)) -> (fma y, z, x)
- if (isContractableFMul(*RHS, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg()))) {
+ if (isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
- {RHS->getOperand(1).getReg(), RHS->getOperand(2).getReg(),
- LHS->getOperand(0).getReg()});
+ {RHS.MI->getOperand(1).getReg(),
+ RHS.MI->getOperand(2).getReg(), LHS.Reg});
};
return true;
}
@@ -4898,8 +4900,10 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA(
return false;
const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering();
- MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
- MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
LLT DstType = MRI.getType(MI.getOperand(0).getReg());
unsigned PreferredFusedOpcode =
@@ -4907,42 +4911,38 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA(
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
- if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) &&
- isContractableFMul(*RHS, AllowFusionGlobally)) {
- if (hasMoreUses(*LHS, *RHS, MRI))
+ if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+ if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
std::swap(LHS, RHS);
}
// fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
MachineInstr *FpExtSrc;
- if (mi_match(LHS->getOperand(0).getReg(), MRI,
- m_GFPExt(m_MInstr(FpExtSrc))) &&
+ if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) &&
isContractableFMul(*FpExtSrc, AllowFusionGlobally) &&
TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
MRI.getType(FpExtSrc->getOperand(1).getReg()))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg());
auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg());
- B.buildInstr(
- PreferredFusedOpcode, {MI.getOperand(0).getReg()},
- {FpExtX.getReg(0), FpExtY.getReg(0), RHS->getOperand(0).getReg()});
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {FpExtX.getReg(0), FpExtY.getReg(0), RHS.Reg});
};
return true;
}
// fold (fadd z, (fpext (fmul x, y))) -> (fma (fpext x), (fpext y), z)
// Note: Commutes FADD operands.
- if (mi_match(RHS->getOperand(0).getReg(), MRI,
- m_GFPExt(m_MInstr(FpExtSrc))) &&
+ if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) &&
isContractableFMul(*FpExtSrc, AllowFusionGlobally) &&
TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
MRI.getType(FpExtSrc->getOperand(1).getReg()))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg());
auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg());
- B.buildInstr(
- PreferredFusedOpcode, {MI.getOperand(0).getReg()},
- {FpExtX.getReg(0), FpExtY.getReg(0), LHS->getOperand(0).getReg()});
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {FpExtX.getReg(0), FpExtY.getReg(0), LHS.Reg});
};
return true;
}
@@ -4958,8 +4958,10 @@ bool CombinerHelper::matchCombineFAddFMAFMulToFMadOrFMA(
if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive, true))
return false;
- MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
- MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
unsigned PreferredFusedOpcode =
@@ -4967,31 +4969,31 @@ bool CombinerHelper::matchCombineFAddFMAFMulToFMadOrFMA(
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
- if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) &&
- isContractableFMul(*RHS, AllowFusionGlobally)) {
- if (hasMoreUses(*LHS, *RHS, MRI))
+ if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+ if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
std::swap(LHS, RHS);
}
MachineInstr *FMA = nullptr;
Register Z;
// fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))
- if (LHS->getOpcode() == PreferredFusedOpcode &&
- (MRI.getVRegDef(LHS->getOperand(3).getReg())->getOpcode() ==
+ if (LHS.MI->getOpcode() == PreferredFusedOpcode &&
+ (MRI.getVRegDef(LHS.MI->getOperand(3).getReg())->getOpcode() ==
TargetOpcode::G_FMUL) &&
- MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg()) &&
- MRI.hasOneNonDBGUse(LHS->getOperand(3).getReg())) {
- FMA = LHS;
- Z = RHS->getOperand(0).getReg();
+ MRI.hasOneNonDBGUse(LHS.MI->getOperand(0).getReg()) &&
+ MRI.hasOneNonDBGUse(LHS.MI->getOperand(3).getReg())) {
+ FMA = LHS.MI;
+ Z = RHS.Reg;
}
// fold (fadd z, (fma x, y, (fmul u, v))) -> (fma x, y, (fma u, v, z))
- else if (RHS->getOpcode() == PreferredFusedOpcode &&
- (MRI.getVRegDef(RHS->getOperand(3).getReg())->getOpcode() ==
+ else if (RHS.MI->getOpcode() == PreferredFusedOpcode &&
+ (MRI.getVRegDef(RHS.MI->getOperand(3).getReg())->getOpcode() ==
TargetOpcode::G_FMUL) &&
- MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg()) &&
- MRI.hasOneNonDBGUse(RHS->getOperand(3).getReg())) {
- Z = LHS->getOperand(0).getReg();
- FMA = RHS;
+ MRI.hasOneNonDBGUse(RHS.MI->getOperand(0).getReg()) &&
+ MRI.hasOneNonDBGUse(RHS.MI->getOperand(3).getReg())) {
+ Z = LHS.Reg;
+ FMA = RHS.MI;
}
if (FMA) {
@@ -5026,17 +5028,19 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering();
LLT DstType = MRI.getType(MI.getOperand(0).getReg());
- MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
- MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
unsigned PreferredFusedOpcode =
HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
- if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) &&
- isContractableFMul(*RHS, AllowFusionGlobally)) {
- if (hasMoreUses(*LHS, *RHS, MRI))
+ if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+ if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
std::swap(LHS, RHS);
}
@@ -5055,16 +5059,17 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
MachineInstr *FMulMI, *FMAMI;
// fold (fadd (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y, (fma (fpext u), (fpext v), z))
- if (LHS->getOpcode() == PreferredFusedOpcode &&
- mi_match(LHS->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) &&
+ if (LHS.MI->getOpcode() == PreferredFusedOpcode &&
+ mi_match(LHS.MI->getOperand(3).getReg(), MRI,
+ m_GFPExt(m_MInstr(FMulMI))) &&
isContractableFMul(*FMulMI, AllowFusionGlobally) &&
TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
MRI.getType(FMulMI->getOperand(0).getReg()))) {
MatchInfo = [=](MachineIRBuilder &B) {
buildMatchInfo(FMulMI->getOperand(1).getReg(),
- FMulMI->getOperand(2).getReg(),
- RHS->getOperand(0).getReg(), LHS->getOperand(1).getReg(),
- LHS->getOperand(2).getReg(), B);
+ FMulMI->getOperand(2).getReg(), RHS.Reg,
+ LHS.MI->getOperand(1).getReg(),
+ LHS.MI->getOperand(2).getReg(), B);
};
return true;
}
@@ -5074,7 +5079,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
- if (mi_match(LHS->getOperand(0).getReg(), MRI, m_GFPExt(m_MInstr(FMAMI))) &&
+ if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FMAMI))) &&
FMAMI->getOpcode() == PreferredFusedOpcode) {
MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg());
if (isContractableFMul(*FMulMI, AllowFusionGlobally) &&
@@ -5086,8 +5091,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
X = B.buildFPExt(DstType, X).getReg(0);
Y = B.buildFPExt(DstType, Y).getReg(0);
buildMatchInfo(FMulMI->getOperand(1).getReg(),
- FMulMI->getOperand(2).getReg(),
- RHS->getOperand(0).getReg(), X, Y, B);
+ FMulMI->getOperand(2).getReg(), RHS.Reg, X, Y, B);
};
return true;
@@ -5096,16 +5100,17 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
// fold (fadd z, (fma x, y, (fpext (fmul u, v)))
// -> (fma x, y, (fma (fpext u), (fpext v), z))
- if (RHS->getOpcode() == PreferredFusedOpcode &&
- mi_match(RHS->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) &&
+ if (RHS.MI->getOpcode() == PreferredFusedOpcode &&
+ mi_match(RHS.MI->getOperand(3).getReg(), MRI,
+ m_GFPExt(m_MInstr(FMulMI))) &&
isContractableFMul(*FMulMI, AllowFusionGlobally) &&
TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
MRI.getType(FMulMI->getOperand(0).getReg()))) {
MatchInfo = [=](MachineIRBuilder &B) {
buildMatchInfo(FMulMI->getOperand(1).getReg(),
- FMulMI->getOperand(2).getReg(),
- LHS->getOperand(0).getReg(), RHS->getOperand(1).getReg(),
- RHS->getOperand(2).getReg(), B);
+ FMulMI->getOperand(2).getReg(), LHS.Reg,
+ RHS.MI->getOperand(1).getReg(),
+ RHS.MI->getOperand(2).getReg(), B);
};
return true;
}
@@ -5115,7 +5120,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
- if (mi_match(RHS->getOperand(0).getReg(), MRI, m_GFPExt(m_MInstr(FMAMI))) &&
+ if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FMAMI))) &&
FMAMI->getOpcode() == PreferredFusedOpcode) {
MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg());
if (isContractableFMul(*FMulMI, AllowFusionGlobally) &&
@@ -5127,8 +5132,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
X = B.buildFPExt(DstType, X).getReg(0);
Y = B.buildFPExt(DstType, Y).getReg(0);
buildMatchInfo(FMulMI->getOperand(1).getReg(),
- FMulMI->getOperand(2).getReg(),
- LHS->getOperand(0).getReg(), X, Y, B);
+ FMulMI->getOperand(2).getReg(), LHS.Reg, X, Y, B);
};
return true;
}
@@ -5145,16 +5149,18 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA(
if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
return false;
- MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
- MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
int FirstMulHasFewerUses = true;
- if (isContractableFMul(*LHS, AllowFusionGlobally) &&
- isContractableFMul(*RHS, AllowFusionGlobally) &&
- hasMoreUses(*LHS, *RHS, MRI))
+ if (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
+ hasMoreUses(*LHS.MI, *RHS.MI, MRI))
FirstMulHasFewerUses = false;
unsigned PreferredFusedOpcode =
@@ -5162,24 +5168,24 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA(
// fold (fsub (fmul x, y), z) -> (fma x, y, -z)
if (FirstMulHasFewerUses &&
- (isContractableFMul(*LHS, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg())))) {
+ (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg)))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
- Register NegZ = B.buildFNeg(DstTy, RHS->getOperand(0).getReg()).getReg(0);
- B.buildInstr(
- PreferredFusedOpcode, {MI.getOperand(0).getReg()},
- {LHS->getOperand(1).getReg(), LHS->getOperand(2).getReg(), NegZ});
+ Register NegZ = B.buildFNeg(DstTy, RHS.Reg).getReg(0);
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {LHS.MI->getOperand(1).getReg(),
+ LHS.MI->getOperand(2).getReg(), NegZ});
};
return true;
}
// fold (fsub x, (fmul y, z)) -> (fma -y, z, x)
- else if ((isContractableFMul(*RHS, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg())))) {
+ else if ((isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg)))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
- Register NegY = B.buildFNeg(DstTy, RHS->getOperand(1).getReg()).getReg(0);
- B.buildInstr(
- PreferredFusedOpcode, {MI.getOperand(0).getReg()},
- {NegY, RHS->getOperand(2).getReg(), LHS->getOperand(0).getReg()});
+ Register NegY =
+ B.buildFNeg(DstTy, RHS.MI->getOperand(1).getReg()).getReg(0);
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {NegY, RHS.MI->getOperand(2).getReg(), LHS.Reg});
};
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
index 70ee30df2fdfd..52665fa9c4463 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir
@@ -218,9 +218,8 @@ body: |
; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; GFX9-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-CONTRACT-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+ ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
; GFX9-DENORM-LABEL: name: test_add_mul_multiple_defs_z
; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -240,9 +239,8 @@ body: |
; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; GFX9-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-UNSAFE-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+ ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
; GFX10-LABEL: name: test_add_mul_multiple_defs_z
; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -262,9 +260,8 @@ body: |
; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; GFX10-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-CONTRACT-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+ ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
; GFX10-DENORM-LABEL: name: test_add_mul_multiple_defs_z
; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -284,9 +281,8 @@ body: |
; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; GFX10-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-UNSAFE-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+ ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
@@ -325,9 +321,8 @@ body: |
; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; GFX9-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-CONTRACT-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
- ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
- ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+ ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
; GFX9-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z
; GFX9-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -347,9 +342,8 @@ body: |
; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; GFX9-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX9-UNSAFE-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
- ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
- ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+ ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
; GFX10-LABEL: name: test_add_mul_rhs_multiple_defs_z
; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -369,9 +363,8 @@ body: |
; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; GFX10-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-CONTRACT-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
- ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
- ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+ ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32)
; GFX10-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z
; GFX10-DENORM: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -391,9 +384,8 @@ body: |
; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; GFX10-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-UNSAFE-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD]](<2 x s32>)
- ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[BITCAST]](s64)
- ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[TRUNC]]
+ ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
+ ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]]
; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
index feca3bd647ce1..785efb6c4375d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -149,11 +149,10 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, <2 x float> addrs
; GFX9-DENORM-LABEL: test_add_mul_multiple_defs_z:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DENORM-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-DENORM-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GFX9-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v4, v5
+; GFX9-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v3
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
@@ -188,11 +187,10 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, <2 x float> addrs
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-DENORM-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-DENORM-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-DENORM-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GFX10-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: v_mac_f32_e32 v0, v4, v5
+; GFX10-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v3
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
@@ -233,11 +231,10 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, <2 x float> a
; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DENORM-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-DENORM-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GFX9-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v4, v5
+; GFX9-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v3
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
@@ -272,11 +269,10 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, <2 x float> a
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-DENORM-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-DENORM-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-DENORM-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GFX10-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: v_mac_f32_e32 v0, v4, v5
+; GFX10-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v3
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
index 4b79912694c73..9f6db82da0c4e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir
@@ -14,9 +14,8 @@ body: |
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
- ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
- ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el0
+ ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el1
; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
@@ -42,9 +41,8 @@ body: |
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
- ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
- ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el0
+ ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], %el1
; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
@@ -72,11 +70,10 @@ body: |
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
- ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+ ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
- ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el0
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1
; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s16) = G_TRUNC %0(s32)
@@ -107,11 +104,10 @@ body: |
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
- ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+ ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
- ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el0
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1
; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s16) = G_TRUNC %0(s32)
@@ -139,9 +135,8 @@ body: |
; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr4_vgpr5
; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
- ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
- ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el0
+ ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el1
; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]]
; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
%0:_(s32) = COPY $vgpr0
@@ -170,9 +165,8 @@ body: |
; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr4_vgpr5
; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
- ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
- ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el0
+ ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY2]], [[COPY3]], %el1
; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]]
; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
%0:_(s32) = COPY $vgpr0
@@ -202,15 +196,14 @@ body: |
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
- ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+ ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr5
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
- ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el0
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1
; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]]
; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
%0:_(s32) = COPY $vgpr0
@@ -245,8 +238,7 @@ body: |
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
- ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+ ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr5
@@ -255,7 +247,7 @@ body: |
; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
- ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT2]], [[FPEXT3]], %el0
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT2]], [[FPEXT3]], %el1
; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], [[FMA]]
; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
%0:_(s32) = COPY $vgpr0
@@ -289,8 +281,7 @@ body: |
; GFX10-LABEL: name: test_f16_f32_add_fma_ext_mul_rhs
; GFX10: %ptr:_(p1) = COPY $vgpr0_vgpr1
; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
- ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+ ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr2
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3
; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
@@ -299,7 +290,7 @@ body: |
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
- ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el0
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], %el1
; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FMA]]
; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
%ptr:_(p1) = COPY $vgpr0_vgpr1
@@ -330,8 +321,7 @@ body: |
; GFX10-LABEL: name: test_f16_f32_add_ext_fma_mul_rhs
; GFX10: %ptr:_(p1) = COPY $vgpr0_vgpr1
; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
- ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+ ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr2
; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3
@@ -344,7 +334,7 @@ body: |
; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
- ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT2]], [[FPEXT3]], %el0
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FPEXT2]], [[FPEXT3]], %el1
; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FPEXT]], [[FPEXT1]], [[FMA]]
; GFX10-NEXT: $vgpr0 = COPY [[FMA1]](s32)
%ptr:_(p1) = COPY $vgpr0_vgpr1
@@ -380,9 +370,8 @@ body: |
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr0_vgpr1
; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
- ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
- ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG %el0
+ ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
+ ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG %el1
; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[FNEG]]
; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
@@ -409,10 +398,9 @@ body: |
; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX10-NEXT: %ptr:_(p1) = COPY $vgpr2_vgpr3
; GFX10-NEXT: %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST %vec(<2 x s32>)
- ; GFX10-NEXT: %el0:_(s32) = G_TRUNC [[BITCAST]](s64)
+ ; GFX10-NEXT: %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
- ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[COPY1]], %el0
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[COPY1]], %el1
; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1