[llvm] [SLP] getVectorCallCosts - don't provide scalar argument data for vector IntrinsicCostAttributes (PR #124254)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 24 02:44:09 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Simon Pilgrim (RKSimon)
getVectorCallCosts determines the cost of a vector intrinsic based on an existing scalar intrinsic call - but we were including the scalar call's argument data in the IntrinsicCostAttributes, which meant that not only was the cost calculation not purely type-based, it was also making incorrect assumptions about constant values etc.
This also exposed an issue where x86 relied on fallback calculations for funnel shift costs - that is fine when we have argument data, since it improves the accuracy for uniform shift amounts etc., but it meant that type-only costs defaulted to Cost=2 for all custom-lowered funnel shifts, which was far too cheap. The new fallback instead sums the costs of the operations in the generic expansion, as sketched below.
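For intuition, here is a rough scalar reference of the funnel-shift expansion whose per-operation costs the new type-only fallback sums (And, Sub, Shl, LShr, Or, ICmp, Select). This is an illustrative sketch, not the actual DAG lowering; the `fshl32` name and the extra re-mask on the right shift (needed to keep the C++ well-defined) are mine:

```cpp
#include <cstdint>

// fshl(Hi, Lo, Amt) for i32: shift the 64-bit concatenation Hi:Lo left by
// Amt % 32 and return the high 32 bits. One line per op the patch costs.
uint32_t fshl32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  uint32_t Masked = Amt & 31;        // And: reduce the amount modulo 32
  uint32_t Inv = 32 - Masked;        // Sub: complementary shift amount
  uint32_t Upper = Hi << Masked;     // Shl: high part shifted into place
  uint32_t Lower = Lo >> (Inv & 31); // LShr: low part (re-masked for C++)
  uint32_t Fsh = Upper | Lower;      // Or: combine the two halves
  return Masked == 0 ? Hi : Fsh;     // ICmp + Select: guard the zero amount
}
```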
This is the reverse of #124129, where we weren't including argument data when we could.
Fixes #63980
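For context, a fragment (not a standalone program) contrasting the two IntrinsicCostAttributes constructions at play in the SLPVectorizer hunk below; `isTypeBasedOnly()` simply reports whether argument data was supplied:

```cpp
// Argument-aware (old behaviour): targets may inspect the scalar call's
// operands, e.g. spot a constant or uniform shift amount.
SmallVector<const Value *> Arguments(CI->args());
IntrinsicCostAttributes WithArgs(ID, VecTy, Arguments, ArgTys, FMF,
                                 dyn_cast<IntrinsicInst>(CI));
// WithArgs.isTypeBasedOnly() == false

// Type-only (new behaviour): no argument data, so targets must cost the
// general case - this is what triggers the new X86 fshl/fshr fallback.
IntrinsicCostAttributes TypeOnly(ID, VecTy, ArgTys, FMF);
// TypeOnly.isTypeBasedOnly() == true
```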
---
Patch is 87.25 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124254.diff
4 Files Affected:
- (modified) llvm/lib/Target/X86/X86TargetTransformInfo.cpp (+18)
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+1-3)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll (+195-250)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll (+195-250)
``````````diff
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 34ba46f5e6cfd5..d3c923a76d074c 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4719,6 +4719,24 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
if (auto KindCost = Entry->Cost[CostKind])
return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
+
+ // Without arg data, we need to compute the expanded costs of custom lowered
+ // intrinsics to prevent use of the (very low) default costs.
+ if (ICA.isTypeBasedOnly() &&
+ (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
+ Type *CondTy = RetTy->getWithNewBitWidth(1);
+ InstructionCost Cost = 0;
+ Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
+ Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
+ Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
+ Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
+ Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
+ Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
+ CmpInst::ICMP_EQ, CostKind);
+ Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+ CmpInst::ICMP_EQ, CostKind);
+ return Cost;
+ }
}
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c98d872fb6467f..a6674100654db7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9031,9 +9031,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
FastMathFlags FMF;
if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
FMF = FPCI->getFastMathFlags();
- SmallVector<const Value *> Arguments(CI->args());
- IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
- dyn_cast<IntrinsicInst>(CI));
+ IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
auto IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
index 153191b1eea084..3b526c4537243e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE4
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
@a64 = common global [8 x i64] zeroinitializer, align 64
@b64 = common global [8 x i64] zeroinitializer, align 64
@@ -240,16 +240,46 @@ define void @fshl_v16i32() {
; SSE-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 15), align 4
; SSE-NEXT: ret void
;
-; AVX-LABEL: @fshl_v16i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4
-; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
-; AVX-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4
-; AVX-NEXT: ret void
+; AVX1-LABEL: @fshl_v16i32(
+; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
+; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
+; AVX1-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
+; AVX1-NEXT: store <4 x i32> [[TMP3]], ptr @d32, align 4
+; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
+; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
+; AVX1-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
+; AVX1-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 4), align 4
+; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
+; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
+; AVX1-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
+; AVX1-NEXT: store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4
+; AVX1-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
+; AVX1-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
+; AVX1-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; AVX1-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 12), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @fshl_v16i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
+; AVX2-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
+; AVX2-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4
+; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
+; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
+; AVX2-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
+; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4
+; AVX2-NEXT: ret void
+;
+; AVX256-LABEL: @fshl_v16i32(
+; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
+; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
+; AVX256-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
+; AVX256-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4
+; AVX256-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
+; AVX256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
+; AVX256-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
+; AVX256-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4
+; AVX256-NEXT: ret void
;
; AVX512-LABEL: @fshl_v16i32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
@@ -333,155 +363,136 @@ define void @fshl_v16i32() {
}
define void @fshl_v32i16() {
-; SSE2-LABEL: @fshl_v32i16(
-; SSE2-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2
-; SSE2-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2
-; SSE2-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2
-; SSE2-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2
-; SSE2-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2
-; SSE2-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2
-; SSE2-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2
-; SSE2-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2
-; SSE2-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SSE2-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2
-; SSE2-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
-; SSE2-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
-; SSE2-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
-; SSE2-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
-; SSE2-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
-; SSE2-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
-; SSE2-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SSE2-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
-; SSE2-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
-; SSE2-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
-; SSE2-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
-; SSE2-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
-; SSE2-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
-; SSE2-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
-; SSE2-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SSE2-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
-; SSE2-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
-; SSE2-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
-; SSE2-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
-; SSE2-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
-; SSE2-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
-; SSE2-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
-; SSE2-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2
-; SSE2-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2
-; SSE2-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2
-; SSE2-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2
-; SSE2-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2
-; SSE2-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2
-; SSE2-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2
-; SSE2-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2
-; SSE2-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SSE2-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2
-; SSE2-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
-; SSE2-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
-; SSE2-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
-; SSE2-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
-; SSE2-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
-; SSE2-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
-; SSE2-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SSE2-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
-; SSE2-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
-; SSE2-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
-; SSE2-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
-; SSE2-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
-; SSE2-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
-; SSE2-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
-; SSE2-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SSE2-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
-; SSE2-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
-; SSE2-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
-; SSE2-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
-; SSE2-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
-; SSE2-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
-; SSE2-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
-; SSE2-NEXT: [[R0:%.*]] = call i16 @llvm.fshl.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]])
-; SSE2-NEXT: [[R1:%.*]] = call i16 @llvm.fshl.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]])
-; SSE2-NEXT: [[R2:%.*]] = call i16 @llvm.fshl.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]])
-; SSE2-NEXT: [[R3:%.*]] = call i16 @llvm.fshl.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]])
-; SSE2-NEXT: [[R4:%.*]] = call i16 @llvm.fshl.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]])
-; SSE2-NEXT: [[R5:%.*]] = call i16 @llvm.fshl.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]])
-; SSE2-NEXT: [[R6:%.*]] = call i16 @llvm.fshl.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]])
-; SSE2-NEXT: [[R7:%.*]] = call i16 @llvm.fshl.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]])
-; SSE2-NEXT: [[R8:%.*]] = call i16 @llvm.fshl.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]])
-; SSE2-NEXT: [[R9:%.*]] = call i16 @llvm.fshl.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]])
-; SSE2-NEXT: [[R10:%.*]] = call i16 @llvm.fshl.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]])
-; SSE2-NEXT: [[R11:%.*]] = call i16 @llvm.fshl.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]])
-; SSE2-NEXT: [[R12:%.*]] = call i16 @llvm.fshl.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]])
-; SSE2-NEXT: [[R13:%.*]] = call i16 @llvm.fshl.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]])
-; SSE2-NEXT: [[R14:%.*]] = call i16 @llvm.fshl.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]])
-; SSE2-NEXT: [[R15:%.*]] = call i16 @llvm.fshl.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]])
-; SSE2-NEXT: [[R16:%.*]] = call i16 @llvm.fshl.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]])
-; SSE2-NEXT: [[R17:%.*]] = call i16 @llvm.fshl.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]])
-; SSE2-NEXT: [[R18:%.*]] = call i16 @llvm.fshl.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]])
-; SSE2-NEXT: [[R19:%.*]] = call i16 @llvm.fshl.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]])
-; SSE2-NEXT: [[R20:%.*]] = call i16 @llvm.fshl.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]])
-; SSE2-NEXT: [[R21:%.*]] = call i16 @llvm.fshl.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]])
-; SSE2-NEXT: [[R22:%.*]] = call i16 @llvm.fshl.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]])
-; SSE2-NEXT: [[R23:%.*]] = call i16 @llvm.fshl.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]])
-; SSE2-NEXT: [[R24:%.*]] = call i16 @llvm.fshl.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]])
-; SSE2-NEXT: [[R25:%.*]] = call i16 @llvm.fshl.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]])
-; SSE2-NEXT: [[R26:%.*]] = call i16 @llvm.fshl.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]])
-; SSE2-NEXT: [[R27:%.*]] = call i16 @llvm.fshl.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]])
-; SSE2-NEXT: [[R28:%.*]] = call i16 @llvm.fshl.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]])
-; SSE2-NEXT: [[R29:%.*]] = call i16 @llvm.fshl.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]])
-; SSE2-NEXT: [[R30:%.*]] = call i16 @llvm.fshl.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]])
-; SSE2-NEXT: [[R31:%.*]] = call i16 @llvm.fshl.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]])
-; SSE2-NEXT: store i16 [[R0]], ptr @d16, align 2
-; SSE2-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2
-; SSE2-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2
-; SSE2-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2
-; SSE2-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2
-; SSE2-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2
-; SSE2-NEXT: store i16 [[R6]], ptr getelementptr inboun...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/124254