[llvm] [VectorCombine] Preserves the maximal legal FPMathFlags during foldShuffleToIdentity (PR #94295)

Mon Jun 3 17:10:52 PDT 2024

https://github.com/mustartt created https://github.com/llvm/llvm-project/pull/94295

`VectorCombine::foldShuffleToIdentity` does not preserve fast-math flags when folding the shuffle, leading to unexpected vectorized results and missed optimizations with FMA instructions.

We can conservatively take the maximal legal set of fast-math flags (that is, the intersection of the flags on the instructions being folded together) whenever we fold shuffles to identity, which enables further optimizations in the backend.

From 5ced37af58d5715ca715c74bd55e6e1d7711dcf8 Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Mon, 3 Jun 2024 15:46:28 -0400
Subject: [PATCH] Preserves the maximal legal FPMathFlags during
 foldShuffleToIdentity

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 70 +++++++++++++++----
 .../AArch64/shuffletoidentity.ll              | 47 ++++++++++++-
 2 files changed, 102 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 7ecfe5218ef67..8eb23e1112bcd 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/Transforms/Vectorize/VectorCombine.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
@@ -26,6 +27,8 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -1736,23 +1739,64 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
     Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
                                    Ty, IdentityLeafs, SplatLeafs, Builder);
   }
+
+  FastMathFlags FMF;
+  FMF.setFast();
+  for_each(Item, [&FMF, Lane = FrontLane](const InstLane &E) {
+    if (E.second != Lane)
+      return;
+    auto *I = cast<Instruction>(E.first);
+    if (isa<FPMathOperator>(I))
+      FMF &= I->getFastMathFlags();
+  });
+
   Builder.SetInsertPoint(I);
   Type *DstTy =
       FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
-  if (auto *BI = dyn_cast<BinaryOperator>(I))
-    return Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(), Ops[0],
-                               Ops[1]);
-  if (auto *CI = dyn_cast<CmpInst>(I))
-    return Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
-  if (auto *SI = dyn_cast<SelectInst>(I))
-    return Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
-  if (auto *CI = dyn_cast<CastInst>(I))
-    return Builder.CreateCast((Instruction::CastOps)CI->getOpcode(), Ops[0],
-                              DstTy);
-  if (II)
-    return Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
+  if (auto *BI = dyn_cast<BinaryOperator>(I)) {
+    auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
+                                      Ops[0], Ops[1]);
+    if (auto *Inst = dyn_cast<Instruction>(Value); isa<FPMathOperator>(Inst)) {
+      Inst->setFastMathFlags(FMF);
+    }
+    return Value;
+  }
+  if (auto *CI = dyn_cast<CmpInst>(I)) {
+    auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
+    if (auto *Inst = dyn_cast<Instruction>(Value); isa<FPMathOperator>(Inst)) {
+      Inst->setFastMathFlags(FMF);
+    }
+    return Value;
+  }
+  if (auto *SI = dyn_cast<SelectInst>(I)) {
+    auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
+    if (auto *Inst = dyn_cast<Instruction>(Value); isa<FPMathOperator>(Inst)) {
+      Inst->setFastMathFlags(FMF);
+    }
+    return Value;
+  }
+  if (auto *CI = dyn_cast<CastInst>(I)) {
+    auto *Value = Builder.CreateCast((Instruction::CastOps)CI->getOpcode(),
+                                     Ops[0], DstTy);
+    if (auto *Inst = dyn_cast<Instruction>(Value); isa<FPMathOperator>(Inst)) {
+      Inst->setFastMathFlags(FMF);
+    }
+    return Value;
+  }
+  if (II) {
+    auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
+    if (auto *Inst = dyn_cast<Instruction>(Value); isa<FPMathOperator>(Inst)) {
+      Inst->setFastMathFlags(FMF);
+    }
+    return Value;
+  }
   assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
-  return Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
+  auto *Value =
+      Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
+  if (auto *Inst = dyn_cast<Instruction>(Value); isa<FPMathOperator>(Inst)) {
+    Inst->setFastMathFlags(FMF);
+  }
+  return Value;
 }
 
 // Starting from a shuffle, look up through operands tracking the shuffled index
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index c2e9be5688967..3e54099fc0970 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -828,10 +828,10 @@ define void @v8f64interleave(i64 %0, ptr %1, ptr %x, double %z) {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[Z:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP1:%.*]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <16 x double> [[WIDE_VEC]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <16 x double> [[WIDE_VEC]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], i64 [[TMP0:%.*]]
 ; CHECK-NEXT:    [[WIDE_VEC34:%.*]] = load <16 x double>, ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd <16 x double> [[WIDE_VEC34]], [[TMP3]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x double> [[WIDE_VEC34]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[TMP0]], 7
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 -56
@@ -937,5 +937,48 @@ define <4 x float> @fadd_mismatched_types(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %extshuf
 }
 
+define void @maximal_legal_fpmath(ptr %addr1, ptr %addr2, ptr %result, float %val) {
+; CHECK-LABEL: define void @maximal_legal_fpmath(
+; CHECK-SAME: ptr [[ADDR1:%.*]], ptr [[ADDR2:%.*]], ptr [[RESULT:%.*]], float [[VAL:%.*]]) {
+; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[VAL]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[SPLATINSERT]], <4 x float> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC1:%.*]] = load <16 x float>, ptr [[ADDR1]], align 4
+; CHECK-NEXT:    [[VEC2:%.*]] = load <16 x float>, ptr [[ADDR2]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul contract <16 x float> [[TMP1]], [[VEC2]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd reassoc contract <16 x float> [[VEC1]], [[TMP2]]
+; CHECK-NEXT:    store <16 x float> [[INTERLEAVED_VEC]], ptr [[RESULT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %splatinsert = insertelement <4 x float> poison, float %val, i64 0
+  %incoming.vec = shufflevector <4 x float> %splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+
+  %vec1 = load <16 x float>, ptr %addr1, align 4
+  %strided.vec1 = shufflevector <16 x float> %vec1, <16 x float> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %strided.vec2 = shufflevector <16 x float> %vec1, <16 x float> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %strided.vec3 = shufflevector <16 x float> %vec1, <16 x float> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %strided.vec4 = shufflevector <16 x float> %vec1, <16 x float> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+
+  %vec2 = load <16 x float>, ptr %addr2, align 4
+  %strided.vec6 = shufflevector <16 x float> %vec2, <16 x float> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %strided.vec7 = shufflevector <16 x float> %vec2, <16 x float> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %strided.vec8 = shufflevector <16 x float> %vec2, <16 x float> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %strided.vec9 = shufflevector <16 x float> %vec2, <16 x float> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+
+  %1 = fmul fast <4 x float> %incoming.vec, %strided.vec6
+  %2 = fadd fast <4 x float> %strided.vec1, %1
+  %3 = fmul contract <4 x float> %incoming.vec, %strided.vec7
+  %4 = fadd fast <4 x float> %strided.vec2, %3
+  %5 = fmul contract reassoc <4 x float> %incoming.vec, %strided.vec8
+  %6 = fadd fast <4 x float> %strided.vec3, %5
+  %7 = fmul contract reassoc <4 x float> %incoming.vec, %strided.vec9
+  %8 = fadd contract reassoc <4 x float> %strided.vec4, %7
+
+  %9 = shufflevector <4 x float> %2, <4 x float> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %10 = shufflevector <4 x float> %6, <4 x float> %8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x float> %9, <8 x float> %10, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x float> %interleaved.vec, ptr %result, align 4
+
+  ret void
+}
 
 declare void @use(<4 x i8>)