[llvm] b5b61cc - [VectorCombine] Preserves the maximal legal FPMathFlags during foldShuffleToIdentity (#94295)

Wed Jun 5 08:35:42 PDT 2024

Author: Henry Jiang
Date: 2024-06-05T11:35:37-04:00
New Revision: b5b61cce96580552a796f21fdcbee3080fd04a9e

URL: https://github.com/llvm/llvm-project/commit/b5b61cce96580552a796f21fdcbee3080fd04a9e
DIFF: https://github.com/llvm/llvm-project/commit/b5b61cce96580552a796f21fdcbee3080fd04a9e.diff

LOG: [VectorCombine] Preserves the maximal legal FPMathFlags during foldShuffleToIdentity (#94295)

The `VectorCombine::foldShuffleToIdentity` does not preserve fast math
flags when folding the shuffle, leading to unexpected vectorized result
and missed optimizations with FMA instructions.

We can conservatively take the maximal legal set of fast math flags
whenever we fold shuffles to identity to enable further optimizations in
the backend.

---------

Co-authored-by: Henry Jiang <henry.jiang1 at ibm.com>

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 7ecfe5218ef67..e608c7fb60468 100644

--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/Transforms/Vectorize/VectorCombine.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
@@ -1736,23 +1737,47 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
     Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
                                    Ty, IdentityLeafs, SplatLeafs, Builder);
   }
+
+  SmallVector<Value *, 8> ValueList;
+  for (const auto &Lane : Item)
+    if (Lane.first)
+      ValueList.push_back(Lane.first);
+
   Builder.SetInsertPoint(I);
   Type *DstTy =
       FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
-  if (auto *BI = dyn_cast<BinaryOperator>(I))
-    return Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(), Ops[0],
-                               Ops[1]);
-  if (auto *CI = dyn_cast<CmpInst>(I))
-    return Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
-  if (auto *SI = dyn_cast<SelectInst>(I))
-    return Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
-  if (auto *CI = dyn_cast<CastInst>(I))
-    return Builder.CreateCast((Instruction::CastOps)CI->getOpcode(), Ops[0],
-                              DstTy);
-  if (II)
-    return Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
+  if (auto *BI = dyn_cast<BinaryOperator>(I)) {
+    auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
+                                      Ops[0], Ops[1]);
+    propagateIRFlags(Value, ValueList);
+    return Value;
+  }
+  if (auto *CI = dyn_cast<CmpInst>(I)) {
+    auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
+    propagateIRFlags(Value, ValueList);
+    return Value;
+  }
+  if (auto *SI = dyn_cast<SelectInst>(I)) {
+    auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
+    propagateIRFlags(Value, ValueList);
+    return Value;
+  }
+  if (auto *CI = dyn_cast<CastInst>(I)) {
+    auto *Value = Builder.CreateCast((Instruction::CastOps)CI->getOpcode(),
+                                     Ops[0], DstTy);
+    propagateIRFlags(Value, ValueList);
+    return Value;
+  }
+  if (II) {
+    auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
+    propagateIRFlags(Value, ValueList);
+    return Value;
+  }
   assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
-  return Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
+  auto *Value =
+      Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
+  propagateIRFlags(Value, ValueList);
+  return Value;
 }
 
 // Starting from a shuffle, look up through operands tracking the shuffled index

diff  --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index c2e9be5688967..3e54099fc0970 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -828,10 +828,10 @@ define void @v8f64interleave(i64 %0, ptr %1, ptr %x, double %z) {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[Z:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP1:%.*]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <16 x double> [[WIDE_VEC]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <16 x double> [[WIDE_VEC]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], i64 [[TMP0:%.*]]
 ; CHECK-NEXT:    [[WIDE_VEC34:%.*]] = load <16 x double>, ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd <16 x double> [[WIDE_VEC34]], [[TMP3]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x double> [[WIDE_VEC34]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint i64 [[TMP0]], 7
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 -56
@@ -937,5 +937,48 @@ define <4 x float> @fadd_mismatched_types(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %extshuf
 }
 
+define void @maximal_legal_fpmath(ptr %addr1, ptr %addr2, ptr %result, float %val) {
+; CHECK-LABEL: define void @maximal_legal_fpmath(
+; CHECK-SAME: ptr [[ADDR1:%.*]], ptr [[ADDR2:%.*]], ptr [[RESULT:%.*]], float [[VAL:%.*]]) {
+; CHECK-NEXT:    [[SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[VAL]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[SPLATINSERT]], <4 x float> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC1:%.*]] = load <16 x float>, ptr [[ADDR1]], align 4
+; CHECK-NEXT:    [[VEC2:%.*]] = load <16 x float>, ptr [[ADDR2]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul contract <16 x float> [[TMP1]], [[VEC2]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd reassoc contract <16 x float> [[VEC1]], [[TMP2]]
+; CHECK-NEXT:    store <16 x float> [[INTERLEAVED_VEC]], ptr [[RESULT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %splatinsert = insertelement <4 x float> poison, float %val, i64 0
+  %incoming.vec = shufflevector <4 x float> %splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
+
+  %vec1 = load <16 x float>, ptr %addr1, align 4
+  %strided.vec1 = shufflevector <16 x float> %vec1, <16 x float> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %strided.vec2 = shufflevector <16 x float> %vec1, <16 x float> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %strided.vec3 = shufflevector <16 x float> %vec1, <16 x float> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %strided.vec4 = shufflevector <16 x float> %vec1, <16 x float> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+
+  %vec2 = load <16 x float>, ptr %addr2, align 4
+  %strided.vec6 = shufflevector <16 x float> %vec2, <16 x float> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %strided.vec7 = shufflevector <16 x float> %vec2, <16 x float> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %strided.vec8 = shufflevector <16 x float> %vec2, <16 x float> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %strided.vec9 = shufflevector <16 x float> %vec2, <16 x float> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+
+  %1 = fmul fast <4 x float> %incoming.vec, %strided.vec6
+  %2 = fadd fast <4 x float> %strided.vec1, %1
+  %3 = fmul contract <4 x float> %incoming.vec, %strided.vec7
+  %4 = fadd fast <4 x float> %strided.vec2, %3
+  %5 = fmul contract reassoc <4 x float> %incoming.vec, %strided.vec8
+  %6 = fadd fast <4 x float> %strided.vec3, %5
+  %7 = fmul contract reassoc <4 x float> %incoming.vec, %strided.vec9
+  %8 = fadd contract reassoc <4 x float> %strided.vec4, %7
+
+  %9 = shufflevector <4 x float> %2, <4 x float> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %10 = shufflevector <4 x float> %6, <4 x float> %8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x float> %9, <8 x float> %10, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x float> %interleaved.vec, ptr %result, align 4
+
+  ret void
+}
 
 declare void @use(<4 x i8>)