[llvm] [AMDGPU][CodeGenPrepare] Narrow 64 bit math to 32 bit if profitable (PR #130577)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 23 18:52:38 PDT 2025
https://github.com/Shoreshen updated https://github.com/llvm/llvm-project/pull/130577
>From fc7a5090fe0e6dfcc12894a57df8f43be6a0317e Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Mon, 10 Mar 2025 18:47:45 +0800
Subject: [PATCH 01/14] Narrow 64 bit math to 32 bit if profitable
---
.../AggressiveInstCombine.cpp | 44 +++++++++++++++++++
1 file changed, 44 insertions(+)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 6b0f568864fd5..73bd75f37cc71 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1224,6 +1224,49 @@ static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
return false;
}
+static bool tryNarrowMathIfNoOverflow(Instruction &I,
+ TargetTransformInfo &TTI) {
+ unsigned opc = I.getOpcode();
+ if (opc != Instruction::Add && opc != Instruction::Sub &&
+ opc != Instruction::Mul) {
+ return false;
+ }
+ LLVMContext &ctx = I.getContext();
+ Type *i64type = Type::getInt64Ty(ctx);
+ Type *i32type = Type::getInt32Ty(ctx);
+
+ if (I.getType() != i64type || !TTI.isTruncateFree(i64type, i32type)) {
+ return false;
+ }
+ InstructionCost costOp64 =
+ TTI.getArithmeticInstrCost(opc, i64type, TTI::TCK_RecipThroughput);
+ InstructionCost costOp32 =
+ TTI.getArithmeticInstrCost(opc, i32type, TTI::TCK_RecipThroughput);
+ InstructionCost costZext64 = TTI.getCastInstrCost(
+ Instruction::ZExt, i64type, i32type, TTI.getCastContextHint(&I),
+ TTI::TCK_RecipThroughput);
+ if ((costOp64 - costOp32) <= costZext64) {
+ return false;
+ }
+ uint64_t AndConst0, AndConst1;
+ Value *X;
+ if ((match(I.getOperand(0), m_And(m_Value(X), m_ConstantInt(AndConst0))) ||
+ match(I.getOperand(0), m_And(m_ConstantInt(AndConst0), m_Value(X)))) &&
+ AndConst0 <= 2147483647 &&
+ (match(I.getOperand(1), m_And(m_Value(X), m_ConstantInt(AndConst1))) ||
+ match(I.getOperand(1), m_And(m_ConstantInt(AndConst1), m_Value(X)))) &&
+ AndConst1 <= 2147483647) {
+ IRBuilder<> Builder(&I);
+ Value *trun0 = Builder.CreateTrunc(I.getOperand(0), i32type);
+ Value *trun1 = Builder.CreateTrunc(I.getOperand(1), i32type);
+ Value *arith32 = Builder.CreateAdd(trun0, trun1);
+ Value *zext64 = Builder.CreateZExt(arith32, i64type);
+ I.replaceAllUsesWith(zext64);
+ I.eraseFromParent();
+ }
+ return false;
+}
+
/// This is the entry point for folds that could be implemented in regular
/// InstCombine, but they are separated because they are not expected to
/// occur frequently and/or have more than a constant-length pattern match.
@@ -1256,6 +1299,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
// needs to be called at the end of this sequence, otherwise we may
// introduce bugs.
MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange);
+ MadeChange |= tryNarrowMathIfNoOverflow(I, TTI);
}
}
>From 0fe9dbc148420023d709ba467d064cc59d22c72a Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Mon, 10 Mar 2025 21:23:10 +0800
Subject: [PATCH 02/14] add tests
---
.../narrow_math_for_and.ll | 97 +++++++++++++++++++
1 file changed, 97 insertions(+)
create mode 100644 llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll
diff --git a/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll b/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll
new file mode 100644
index 0000000000000..43e90f77e32f2
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=aggressive-instcombine < %s | FileCheck %s
+
+
+define i64 @narrow_add(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: define i64 @narrow_add(
+; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483647
+; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483647
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[ZEXT0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ZEXT1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+ %zext0 = and i64 %a, 2147483647
+ %zext1 = and i64 %b, 2147483647
+ %add = add i64 %zext0, %zext1
+ ret i64 %add
+}
+
+define i64 @narrow_mul(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: define i64 @narrow_mul(
+; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483647
+; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483647
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[ZEXT0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ZEXT1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+ %zext0 = and i64 %a, 2147483647
+ %zext1 = and i64 %b, 2147483647
+ %mul = mul i64 %zext0, %zext1
+ ret i64 %mul
+}
+
+define i64 @narrow_sub(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: define i64 @narrow_sub(
+; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483647
+; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483647
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[ZEXT0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ZEXT1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+ %zext0 = and i64 %a, 2147483647
+ %zext1 = and i64 %b, 2147483647
+ %sub = sub i64 %zext0, %zext1
+ ret i64 %sub
+}
+
+
+define i64 @no_narrow_add(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: define i64 @no_narrow_add(
+; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483648
+; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483648
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ZEXT0]], [[ZEXT1]]
+; CHECK-NEXT: ret i64 [[ADD]]
+;
+ %zext0 = and i64 %a, 2147483648
+ %zext1 = and i64 %b, 2147483648
+ %add = add i64 %zext0, %zext1
+ ret i64 %add
+}
+
+define i64 @no_narrow_mul(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: define i64 @no_narrow_mul(
+; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483648
+; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483648
+; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ZEXT0]], [[ZEXT1]]
+; CHECK-NEXT: ret i64 [[MUL]]
+;
+ %zext0 = and i64 %a, 2147483648
+ %zext1 = and i64 %b, 2147483648
+ %mul = mul i64 %zext0, %zext1
+ ret i64 %mul
+}
+
+define i64 @no_narrow_sub(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: define i64 @no_narrow_sub(
+; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483648
+; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483648
+; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[ZEXT0]], [[ZEXT1]]
+; CHECK-NEXT: ret i64 [[SUB]]
+;
+ %zext0 = and i64 %a, 2147483648
+ %zext1 = and i64 %b, 2147483648
+ %sub = sub i64 %zext0, %zext1
+ ret i64 %sub
+}
>From 9df0718d3a454b4d3e2930d12be3583069fedb7a Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Mon, 10 Mar 2025 23:57:09 +0800
Subject: [PATCH 03/14] fix mul, remove sub
---
.../AggressiveInstCombine.cpp | 28 ++++++++++++++++---
llvm/test/lit.cfg.py | 2 +-
2 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 73bd75f37cc71..56e97c4d64952 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1224,11 +1224,32 @@ static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
return false;
}
+static bool isSaveToNarrow(unsigned opc, uint64_t num1, uint64_t num2) {
+ if (num1 > 0xffffffff || num2 > 0xffffffff) {
+ // if `num > 0xffffffff`, then `%and = and i64 %a, num` may or may not have
+ // higher 32bit set, which may cause the truncate to lose information
+ return false;
+ }
+ switch (opc) {
+ // If `%and = and i64 %a, num` where num <= 0xffffffff, then `%and` must be
+ // positive.
+ // Since add and mul are both increasing functions on the positive integer domain and
+ // `%ai <= numi`, then if `(num1 op num2) <= 0xffffffff` we have `%a1 + %a2 <=
+ // 0xffffffff`
+ case Instruction::Add:
+ return (num1 + num2) <= 0xffffffff;
+ case Instruction::Mul:
+ return (num1 * num2) <= 0xffffffff;
+ break;
+ }
+
+ return false;
+}
+
static bool tryNarrowMathIfNoOverflow(Instruction &I,
TargetTransformInfo &TTI) {
unsigned opc = I.getOpcode();
- if (opc != Instruction::Add && opc != Instruction::Sub &&
- opc != Instruction::Mul) {
+ if (opc != Instruction::Add && opc != Instruction::Mul) {
return false;
}
LLVMContext &ctx = I.getContext();
@@ -1252,10 +1273,9 @@ static bool tryNarrowMathIfNoOverflow(Instruction &I,
Value *X;
if ((match(I.getOperand(0), m_And(m_Value(X), m_ConstantInt(AndConst0))) ||
match(I.getOperand(0), m_And(m_ConstantInt(AndConst0), m_Value(X)))) &&
- AndConst0 <= 2147483647 &&
(match(I.getOperand(1), m_And(m_Value(X), m_ConstantInt(AndConst1))) ||
match(I.getOperand(1), m_And(m_ConstantInt(AndConst1), m_Value(X)))) &&
- AndConst1 <= 2147483647) {
+ isSaveToNarrow(opc, AndConst0, AndConst1)) {
IRBuilder<> Builder(&I);
Value *trun0 = Builder.CreateTrunc(I.getOperand(0), i32type);
Value *trun1 = Builder.CreateTrunc(I.getOperand(1), i32type);
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index aad7a088551b2..50921879cd1f2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("ascii")
+ readobj_out = readobj_cmd.stdout.read().decode("utf-8")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From a5084d29e809d09bf7c88629b28836a531e15004 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Mon, 10 Mar 2025 23:57:53 +0800
Subject: [PATCH 04/14] fix lit.cfg.py
---
llvm/test/lit.cfg.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 50921879cd1f2..aad7a088551b2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("utf-8")
+ readobj_out = readobj_cmd.stdout.read().decode("ascii")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From 2e2d190bb817b757778360b831832c34d1b2bfa0 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 11 Mar 2025 01:36:36 +0800
Subject: [PATCH 05/14] fix test
---
.../narrow_math_for_and.ll | 66 ++++++++++++-------
llvm/test/lit.cfg.py | 2 +-
2 files changed, 42 insertions(+), 26 deletions(-)
diff --git a/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll b/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll
index 43e90f77e32f2..cdee5c20733ef 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll
@@ -19,11 +19,28 @@ define i64 @narrow_add(i64 noundef %a, i64 noundef %b) {
ret i64 %add
}
+define i64 @narrow_add_1(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: define i64 @narrow_add_1(
+; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483647
+; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483648
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[ZEXT0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ZEXT1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+ %zext0 = and i64 %a, 2147483647
+ %zext1 = and i64 %b, 2147483648
+ %add = add i64 %zext0, %zext1
+ ret i64 %add
+}
+
define i64 @narrow_mul(i64 noundef %a, i64 noundef %b) {
; CHECK-LABEL: define i64 @narrow_mul(
; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483647
-; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483647
+; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 0
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[ZEXT0]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ZEXT1]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
@@ -31,16 +48,16 @@ define i64 @narrow_mul(i64 noundef %a, i64 noundef %b) {
; CHECK-NEXT: ret i64 [[TMP4]]
;
%zext0 = and i64 %a, 2147483647
- %zext1 = and i64 %b, 2147483647
+ %zext1 = and i64 %b, 0
%mul = mul i64 %zext0, %zext1
ret i64 %mul
}
-define i64 @narrow_sub(i64 noundef %a, i64 noundef %b) {
-; CHECK-LABEL: define i64 @narrow_sub(
+define i64 @narrow_mul_1(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: define i64 @narrow_mul_1(
; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483647
-; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483647
+; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[ZEXT0]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ZEXT1]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
@@ -48,12 +65,11 @@ define i64 @narrow_sub(i64 noundef %a, i64 noundef %b) {
; CHECK-NEXT: ret i64 [[TMP4]]
;
%zext0 = and i64 %a, 2147483647
- %zext1 = and i64 %b, 2147483647
- %sub = sub i64 %zext0, %zext1
- ret i64 %sub
+ %zext1 = and i64 %b, 2
+ %mul = mul i64 %zext0, %zext1
+ ret i64 %mul
}
-
define i64 @no_narrow_add(i64 noundef %a, i64 noundef %b) {
; CHECK-LABEL: define i64 @no_narrow_add(
; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
@@ -68,30 +84,30 @@ define i64 @no_narrow_add(i64 noundef %a, i64 noundef %b) {
ret i64 %add
}
+define i64 @no_narrow_add_1(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: define i64 @no_narrow_add_1(
+; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 4294967295
+; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 1
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ZEXT0]], [[ZEXT1]]
+; CHECK-NEXT: ret i64 [[ADD]]
+;
+ %zext0 = and i64 %a, 4294967295
+ %zext1 = and i64 %b, 1
+ %add = add i64 %zext0, %zext1
+ ret i64 %add
+}
+
define i64 @no_narrow_mul(i64 noundef %a, i64 noundef %b) {
; CHECK-LABEL: define i64 @no_narrow_mul(
; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483648
-; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483648
+; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2
; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ZEXT0]], [[ZEXT1]]
; CHECK-NEXT: ret i64 [[MUL]]
;
%zext0 = and i64 %a, 2147483648
- %zext1 = and i64 %b, 2147483648
+ %zext1 = and i64 %b, 2
%mul = mul i64 %zext0, %zext1
ret i64 %mul
}
-
-define i64 @no_narrow_sub(i64 noundef %a, i64 noundef %b) {
-; CHECK-LABEL: define i64 @no_narrow_sub(
-; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483648
-; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483648
-; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[ZEXT0]], [[ZEXT1]]
-; CHECK-NEXT: ret i64 [[SUB]]
-;
- %zext0 = and i64 %a, 2147483648
- %zext1 = and i64 %b, 2147483648
- %sub = sub i64 %zext0, %zext1
- ret i64 %sub
-}
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index aad7a088551b2..50921879cd1f2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("ascii")
+ readobj_out = readobj_cmd.stdout.read().decode("utf-8")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From 2063614767f690acd22c54c9706ad9a2e5d20099 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 11 Mar 2025 10:22:20 +0800
Subject: [PATCH 06/14] fix variable name
---
.../AggressiveInstCombine.cpp | 30 +++++++++----------
llvm/test/lit.cfg.py | 2 +-
2 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 56e97c4d64952..b1f13956a2940 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1253,20 +1253,20 @@ static bool tryNarrowMathIfNoOverflow(Instruction &I,
return false;
}
LLVMContext &ctx = I.getContext();
- Type *i64type = Type::getInt64Ty(ctx);
- Type *i32type = Type::getInt32Ty(ctx);
+ Type *I64Type = Type::getInt64Ty(ctx);
+ Type *I32Type = Type::getInt32Ty(ctx);
- if (I.getType() != i64type || !TTI.isTruncateFree(i64type, i32type)) {
+ if (I.getType() != I64Type || !TTI.isTruncateFree(I64Type, I32Type)) {
return false;
}
- InstructionCost costOp64 =
- TTI.getArithmeticInstrCost(opc, i64type, TTI::TCK_RecipThroughput);
- InstructionCost costOp32 =
- TTI.getArithmeticInstrCost(opc, i32type, TTI::TCK_RecipThroughput);
- InstructionCost costZext64 = TTI.getCastInstrCost(
- Instruction::ZExt, i64type, i32type, TTI.getCastContextHint(&I),
+ InstructionCost CostOp64 =
+ TTI.getArithmeticInstrCost(opc, I64Type, TTI::TCK_RecipThroughput);
+ InstructionCost CostOp32 =
+ TTI.getArithmeticInstrCost(opc, I32Type, TTI::TCK_RecipThroughput);
+ InstructionCost CostZext64 = TTI.getCastInstrCost(
+ Instruction::ZExt, I64Type, I32Type, TTI.getCastContextHint(&I),
TTI::TCK_RecipThroughput);
- if ((costOp64 - costOp32) <= costZext64) {
+ if ((CostOp64 - CostOp32) <= CostZext64) {
return false;
}
uint64_t AndConst0, AndConst1;
@@ -1277,11 +1277,11 @@ static bool tryNarrowMathIfNoOverflow(Instruction &I,
match(I.getOperand(1), m_And(m_ConstantInt(AndConst1), m_Value(X)))) &&
isSaveToNarrow(opc, AndConst0, AndConst1)) {
IRBuilder<> Builder(&I);
- Value *trun0 = Builder.CreateTrunc(I.getOperand(0), i32type);
- Value *trun1 = Builder.CreateTrunc(I.getOperand(1), i32type);
- Value *arith32 = Builder.CreateAdd(trun0, trun1);
- Value *zext64 = Builder.CreateZExt(arith32, i64type);
- I.replaceAllUsesWith(zext64);
+ Value *Trun0 = Builder.CreateTrunc(I.getOperand(0), I32Type);
+ Value *Trun1 = Builder.CreateTrunc(I.getOperand(1), I32Type);
+ Value *Arith32 = Builder.CreateAdd(Trun0, Trun1);
+ Value *Zext64 = Builder.CreateZExt(Arith32, I64Type);
+ I.replaceAllUsesWith(Zext64);
I.eraseFromParent();
}
return false;
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 50921879cd1f2..aad7a088551b2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("utf-8")
+ readobj_out = readobj_cmd.stdout.read().decode("ascii")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From 0ac2f9ed50b36f15abfe168bf8a65eeb76530d4c Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 11 Mar 2025 17:37:27 +0800
Subject: [PATCH 07/14] fix comments
---
.../AggressiveInstCombine.cpp | 95 ++++++++--------
.../narrow_math_for_and.ll | 105 +++++++++++++++++-
2 files changed, 148 insertions(+), 52 deletions(-)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index b1f13956a2940..5277318071be9 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1224,67 +1224,66 @@ static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
return false;
}
-static bool isSaveToNarrow(unsigned opc, uint64_t num1, uint64_t num2) {
- if (num1 > 0xffffffff || num2 > 0xffffffff) {
- // if `num > 0xffffffff`, then `%and = and i64 %a, num` may or may not have
- // higher 32bit set, which may cause the truncate to lose information
+static bool tryNarrowMathIfNoOverflow(Instruction &I, TargetTransformInfo &TTI,
+ const DataLayout &DL) {
+ unsigned opc = I.getOpcode();
+ Type *OldType = I.getType();
+ if (opc != Instruction::Add && opc != Instruction::Mul &&
+ !OldType->isIntOrIntVectorTy()) {
return false;
}
+ unsigned OrigBit = OldType->getScalarSizeInBits();
+ unsigned MaxBitsNeed = OrigBit;
switch (opc) {
- // If `%and = and i64 %a, num` where num <= 0xffffffff, then `%and` must be
- // positive.
- // Since add and mul are both increasing functions on the positive integer domain and
- // `%ai <= numi`, then if `(num1 op num2) <= 0xffffffff` we have `%a1 + %a2 <=
- // 0xffffffff`
case Instruction::Add:
- return (num1 + num2) <= 0xffffffff;
+ MaxBitsNeed = KnownBits::add(computeKnownBits(I.getOperand(0), DL),
+ computeKnownBits(I.getOperand(1), DL))
+ .countMaxActiveBits();
+ break;
case Instruction::Mul:
- return (num1 * num2) <= 0xffffffff;
+ MaxBitsNeed = KnownBits::mul(computeKnownBits(I.getOperand(0), DL),
+ computeKnownBits(I.getOperand(1), DL))
+ .countMaxActiveBits();
+ break;
+ default:
break;
}
- return false;
-}
+ MaxBitsNeed = std::max<unsigned>(bit_ceil(MaxBitsNeed), 8);
-static bool tryNarrowMathIfNoOverflow(Instruction &I,
- TargetTransformInfo &TTI) {
- unsigned opc = I.getOpcode();
- if (opc != Instruction::Add && opc != Instruction::Mul) {
+ if (OrigBit <= MaxBitsNeed) {
return false;
}
- LLVMContext &ctx = I.getContext();
- Type *I64Type = Type::getInt64Ty(ctx);
- Type *I32Type = Type::getInt32Ty(ctx);
- if (I.getType() != I64Type || !TTI.isTruncateFree(I64Type, I32Type)) {
- return false;
- }
- InstructionCost CostOp64 =
- TTI.getArithmeticInstrCost(opc, I64Type, TTI::TCK_RecipThroughput);
- InstructionCost CostOp32 =
- TTI.getArithmeticInstrCost(opc, I32Type, TTI::TCK_RecipThroughput);
- InstructionCost CostZext64 = TTI.getCastInstrCost(
- Instruction::ZExt, I64Type, I32Type, TTI.getCastContextHint(&I),
- TTI::TCK_RecipThroughput);
- if ((CostOp64 - CostOp32) <= CostZext64) {
+ Type *NewType = I.getType()->getWithNewBitWidth(MaxBitsNeed);
+
+ // Old cost
+ InstructionCost OldCost =
+ TTI.getArithmeticInstrCost(opc, OldType, TTI::TCK_RecipThroughput);
+ // New cost of new op
+ InstructionCost NewCost =
+ TTI.getArithmeticInstrCost(opc, NewType, TTI::TCK_RecipThroughput);
+ // New cost of narrowing 2 operands (use trunc)
+ NewCost += TTI.getCastInstrCost(Instruction::Trunc, NewType, OldType,
+ TTI.getCastContextHint(&I),
+ TTI::TCK_RecipThroughput) *
+ 2;
+ // New cost of zext narrowed result to original type
+ NewCost += TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
+ TTI.getCastContextHint(&I),
+ TTI::TCK_RecipThroughput);
+ if (NewCost >= OldCost) {
return false;
}
- uint64_t AndConst0, AndConst1;
- Value *X;
- if ((match(I.getOperand(0), m_And(m_Value(X), m_ConstantInt(AndConst0))) ||
- match(I.getOperand(0), m_And(m_ConstantInt(AndConst0), m_Value(X)))) &&
- (match(I.getOperand(1), m_And(m_Value(X), m_ConstantInt(AndConst1))) ||
- match(I.getOperand(1), m_And(m_ConstantInt(AndConst1), m_Value(X)))) &&
- isSaveToNarrow(opc, AndConst0, AndConst1)) {
- IRBuilder<> Builder(&I);
- Value *Trun0 = Builder.CreateTrunc(I.getOperand(0), I32Type);
- Value *Trun1 = Builder.CreateTrunc(I.getOperand(1), I32Type);
- Value *Arith32 = Builder.CreateAdd(Trun0, Trun1);
- Value *Zext64 = Builder.CreateZExt(Arith32, I64Type);
- I.replaceAllUsesWith(Zext64);
- I.eraseFromParent();
- }
- return false;
+ IRBuilder<> Builder(&I);
+ Value *Trun0 = Builder.CreateTrunc(I.getOperand(0), NewType);
+ Value *Trun1 = Builder.CreateTrunc(I.getOperand(1), NewType);
+ Value *Arith = Builder.CreateBinOp((Instruction::BinaryOps)opc, Trun0, Trun1);
+
+ Value *Zext = Builder.CreateZExt(Arith, OldType);
+ I.replaceAllUsesWith(Zext);
+ I.eraseFromParent();
+ return true;
}
/// This is the entry point for folds that could be implemented in regular
@@ -1319,7 +1318,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
// needs to be called at the end of this sequence, otherwise we may
// introduce bugs.
MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange);
- MadeChange |= tryNarrowMathIfNoOverflow(I, TTI);
+ MadeChange |= tryNarrowMathIfNoOverflow(I, TTI, DL);
}
}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll b/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll
index cdee5c20733ef..38df58356559e 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=aggressive-instcombine < %s | FileCheck %s
+; REQUIRES: amdgpu-registered-target
define i64 @narrow_add(i64 noundef %a, i64 noundef %b) {
; CHECK-LABEL: define i64 @narrow_add(
@@ -36,19 +37,53 @@ define i64 @narrow_add_1(i64 noundef %a, i64 noundef %b) {
ret i64 %add
}
+define <2 x i64> @narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: define <2 x i64> @narrow_add_vec(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and <2 x i64> [[A]], <i64 2147483647, i64 30>
+; CHECK-NEXT: [[ZEXT1:%.*]] = and <2 x i64> [[B]], <i64 2147483647, i64 2147483646>
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i64> [[ZEXT0]] to <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i64> [[ZEXT1]] to <2 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> [[TMP4]]
+;
+ %zext0 = and <2 x i64> %a, <i64 2147483647, i64 30>
+ %zext1 = and <2 x i64> %b, <i64 2147483647, i64 2147483646>
+ %add = add <2 x i64> %zext0, %zext1
+ ret <2 x i64> %add
+}
+
+define <2 x i32> @narrow_add_vec_1(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: define <2 x i32> @narrow_add_vec_1(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and <2 x i32> [[A]], <i32 16384, i32 16383>
+; CHECK-NEXT: [[ZEXT1:%.*]] = and <2 x i32> [[B]], <i32 16384, i32 16385>
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[ZEXT0]] to <2 x i16>
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i32> [[ZEXT1]] to <2 x i16>
+; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32>
+; CHECK-NEXT: ret <2 x i32> [[TMP4]]
+;
+ %zext0 = and <2 x i32> %a, <i32 16384, i32 16383>
+ %zext1 = and <2 x i32> %b, <i32 16384, i32 16385>
+ %add = add <2 x i32> %zext0, %zext1
+ ret <2 x i32> %add
+}
+
define i64 @narrow_mul(i64 noundef %a, i64 noundef %b) {
; CHECK-LABEL: define i64 @narrow_mul(
; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483647
-; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 0
+; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[ZEXT0]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ZEXT1]] to i32
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
; CHECK-NEXT: ret i64 [[TMP4]]
;
%zext0 = and i64 %a, 2147483647
- %zext1 = and i64 %b, 0
+ %zext1 = and i64 %b, 2
%mul = mul i64 %zext0, %zext1
ret i64 %mul
}
@@ -60,7 +95,7 @@ define i64 @narrow_mul_1(i64 noundef %a, i64 noundef %b) {
; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[ZEXT0]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ZEXT1]] to i32
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
; CHECK-NEXT: ret i64 [[TMP4]]
;
@@ -70,6 +105,40 @@ define i64 @narrow_mul_1(i64 noundef %a, i64 noundef %b) {
ret i64 %mul
}
+define <2 x i64> @narrow_mul_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: define <2 x i64> @narrow_mul_vec(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and <2 x i64> [[A]], <i64 47483647, i64 50>
+; CHECK-NEXT: [[ZEXT1:%.*]] = and <2 x i64> [[B]], <i64 80, i64 20>
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i64> [[ZEXT0]] to <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i64> [[ZEXT1]] to <2 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: ret <2 x i64> [[TMP4]]
+;
+ %zext0 = and <2 x i64> %a, <i64 47483647, i64 50>
+ %zext1 = and <2 x i64> %b, <i64 80, i64 20>
+ %mul = mul <2 x i64> %zext0, %zext1
+ ret <2 x i64> %mul
+}
+
+define <2 x i32> @narrow_add_mul_1(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: define <2 x i32> @narrow_add_mul_1(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and <2 x i32> [[A]], splat (i32 16384)
+; CHECK-NEXT: [[ZEXT1:%.*]] = and <2 x i32> [[B]], <i32 3, i32 2>
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[ZEXT0]] to <2 x i16>
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i32> [[ZEXT1]] to <2 x i16>
+; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32>
+; CHECK-NEXT: ret <2 x i32> [[TMP4]]
+;
+ %zext0 = and <2 x i32> %a, <i32 16384, i32 16384>
+ %zext1 = and <2 x i32> %b, <i32 3, i32 2>
+ %mul = mul <2 x i32> %zext0, %zext1
+ ret <2 x i32> %mul
+}
+
define i64 @no_narrow_add(i64 noundef %a, i64 noundef %b) {
; CHECK-LABEL: define i64 @no_narrow_add(
; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
@@ -98,6 +167,20 @@ define i64 @no_narrow_add_1(i64 noundef %a, i64 noundef %b) {
ret i64 %add
}
+define <2 x i64> @no_narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: define <2 x i64> @no_narrow_add_vec(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and <2 x i64> [[A]], <i64 2147483648, i64 30>
+; CHECK-NEXT: [[ZEXT1:%.*]] = and <2 x i64> [[B]], <i64 2147483648, i64 2147483646>
+; CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[ZEXT0]], [[ZEXT1]]
+; CHECK-NEXT: ret <2 x i64> [[ADD]]
+;
+ %zext0 = and <2 x i64> %a, <i64 2147483648, i64 30>
+ %zext1 = and <2 x i64> %b, <i64 2147483648, i64 2147483646>
+ %add = add <2 x i64> %zext0, %zext1
+ ret <2 x i64> %add
+}
+
define i64 @no_narrow_mul(i64 noundef %a, i64 noundef %b) {
; CHECK-LABEL: define i64 @no_narrow_mul(
; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
@@ -111,3 +194,17 @@ define i64 @no_narrow_mul(i64 noundef %a, i64 noundef %b) {
%mul = mul i64 %zext0, %zext1
ret i64 %mul
}
+
+define <2 x i64> @no_narrow_mul_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: define <2 x i64> @no_narrow_mul_vec(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT0:%.*]] = and <2 x i64> [[A]], <i64 32768, i64 50>
+; CHECK-NEXT: [[ZEXT1:%.*]] = and <2 x i64> [[B]], <i64 131072, i64 20>
+; CHECK-NEXT: [[MUL:%.*]] = mul <2 x i64> [[ZEXT0]], [[ZEXT1]]
+; CHECK-NEXT: ret <2 x i64> [[MUL]]
+;
+ %zext0 = and <2 x i64> %a, <i64 32768, i64 50>
+ %zext1 = and <2 x i64> %b, <i64 131072, i64 20>
+ %mul = mul <2 x i64> %zext0, %zext1
+ ret <2 x i64> %mul
+}
>From f7d076945352c3e937eb787e59e1f85f1891f5fb Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Tue, 11 Mar 2025 18:38:01 +0800
Subject: [PATCH 08/14] fix comments
---
.../AggressiveInstCombine.cpp | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 5277318071be9..7c9a901f18cb9 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1226,15 +1226,15 @@ static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
static bool tryNarrowMathIfNoOverflow(Instruction &I, TargetTransformInfo &TTI,
const DataLayout &DL) {
- unsigned opc = I.getOpcode();
+ unsigned Opc = I.getOpcode();
Type *OldType = I.getType();
- if (opc != Instruction::Add && opc != Instruction::Mul &&
- !OldType->isIntOrIntVectorTy()) {
+
+ if (Opc != Instruction::Add && Opc != Instruction::Mul)
return false;
- }
+
unsigned OrigBit = OldType->getScalarSizeInBits();
unsigned MaxBitsNeed = OrigBit;
- switch (opc) {
+ switch (Opc) {
case Instruction::Add:
MaxBitsNeed = KnownBits::add(computeKnownBits(I.getOperand(0), DL),
computeKnownBits(I.getOperand(1), DL))
@@ -1251,18 +1251,17 @@ static bool tryNarrowMathIfNoOverflow(Instruction &I, TargetTransformInfo &TTI,
MaxBitsNeed = std::max<unsigned>(bit_ceil(MaxBitsNeed), 8);
- if (OrigBit <= MaxBitsNeed) {
+ if (OrigBit <= MaxBitsNeed)
return false;
- }
Type *NewType = I.getType()->getWithNewBitWidth(MaxBitsNeed);
// Old cost
InstructionCost OldCost =
- TTI.getArithmeticInstrCost(opc, OldType, TTI::TCK_RecipThroughput);
+ TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput);
// New cost of new op
InstructionCost NewCost =
- TTI.getArithmeticInstrCost(opc, NewType, TTI::TCK_RecipThroughput);
+ TTI.getArithmeticInstrCost(Opc, NewType, TTI::TCK_RecipThroughput);
// New cost of narrowing 2 operands (use trunc)
NewCost += TTI.getCastInstrCost(Instruction::Trunc, NewType, OldType,
TTI.getCastContextHint(&I),
@@ -1278,7 +1277,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction &I, TargetTransformInfo &TTI,
IRBuilder<> Builder(&I);
Value *Trun0 = Builder.CreateTrunc(I.getOperand(0), NewType);
Value *Trun1 = Builder.CreateTrunc(I.getOperand(1), NewType);
- Value *Arith = Builder.CreateBinOp((Instruction::BinaryOps)opc, Trun0, Trun1);
+ Value *Arith = Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trun0, Trun1);
Value *Zext = Builder.CreateZExt(Arith, OldType);
I.replaceAllUsesWith(Zext);
>From 68ef90b8217e176e8c0889ed3c0d43712ca25f19 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Wed, 12 Mar 2025 16:27:40 +0800
Subject: [PATCH 09/14] move from aggressive-instcombine to codegenprepare
---
llvm/lib/CodeGen/CodeGenPrepare.cpp | 65 +++++
.../AggressiveInstCombine.cpp | 62 -----
.../atomic_optimizations_global_pointer.ll | 52 ++--
.../CodeGen/AMDGPU/narrow_math_for_and.ll | 231 ++++++++++++++++++
.../AMDGPU/promote-constOffset-to-imm.ll | 90 ++++---
llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 9 +-
llvm/test/CodeGen/X86/pmulh.ll | 62 +++--
.../CodeGen/X86/scheduler-backtracking.ll | 20 --
llvm/test/CodeGen/X86/shrink_vmul.ll | 28 +--
.../narrow_math_for_and.ll | 210 ----------------
10 files changed, 421 insertions(+), 408 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
delete mode 100644 llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index d5fbd4c380746..796ae2512baf4 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8594,6 +8594,68 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
return false;
}
+static bool tryNarrowMathIfNoOverflow(Instruction *I,
+ const TargetTransformInfo *TTI,
+ const DataLayout &DL) {
+ unsigned Opc = I->getOpcode();
+ Type *OldType = I->getType();
+
+ if (Opc != Instruction::Add && Opc != Instruction::Mul)
+ return false;
+
+ unsigned OrigBit = OldType->getScalarSizeInBits();
+ unsigned MaxBitsNeed = OrigBit;
+ switch (Opc) {
+ case Instruction::Add:
+ MaxBitsNeed = KnownBits::add(computeKnownBits(I->getOperand(0), DL),
+ computeKnownBits(I->getOperand(1), DL))
+ .countMaxActiveBits();
+ break;
+ case Instruction::Mul:
+ MaxBitsNeed = KnownBits::mul(computeKnownBits(I->getOperand(0), DL),
+ computeKnownBits(I->getOperand(1), DL))
+ .countMaxActiveBits();
+ break;
+ default:
+ break;
+ }
+
+ MaxBitsNeed = std::max<unsigned>(bit_ceil(MaxBitsNeed), 8);
+
+ if (OrigBit <= MaxBitsNeed)
+ return false;
+
+ Type *NewType = I->getType()->getWithNewBitWidth(MaxBitsNeed);
+
+ // Old cost
+ InstructionCost OldCost =
+ TTI->getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput);
+ // New cost of new op
+ InstructionCost NewCost =
+ TTI->getArithmeticInstrCost(Opc, NewType, TTI::TCK_RecipThroughput);
+ // New cost of narrowing 2 operands (use trunc)
+ NewCost += TTI->getCastInstrCost(Instruction::Trunc, NewType, OldType,
+ TTI->getCastContextHint(I),
+ TTI::TCK_RecipThroughput) *
+ 2;
+ // New cost of zext narrowed result to original type
+ NewCost += TTI->getCastInstrCost(Instruction::ZExt, OldType, NewType,
+ TTI->getCastContextHint(I),
+ TTI::TCK_RecipThroughput);
+ if (NewCost >= OldCost) {
+ return false;
+ }
+ IRBuilder<> Builder(I);
+ Value *Trun0 = Builder.CreateTrunc(I->getOperand(0), NewType);
+ Value *Trun1 = Builder.CreateTrunc(I->getOperand(1), NewType);
+ Value *Arith = Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trun0, Trun1);
+
+ Value *Zext = Builder.CreateZExt(Arith, OldType);
+ I->replaceAllUsesWith(Zext);
+ I->eraseFromParent();
+ return true;
+}
+
bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
bool AnyChange = false;
AnyChange = fixupDbgVariableRecordsOnInst(*I);
@@ -8775,6 +8837,9 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
return optimizeExtractElementInst(cast<ExtractElementInst>(I));
case Instruction::Br:
return optimizeBranch(cast<BranchInst>(I), *TLI, FreshBBs, IsHugeFunc);
+ case Instruction::Add:
+ case Instruction::Mul:
+ return tryNarrowMathIfNoOverflow(I, TTI, *DL);
}
return AnyChange;
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7c9a901f18cb9..6b0f568864fd5 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1224,67 +1224,6 @@ static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
return false;
}
-static bool tryNarrowMathIfNoOverflow(Instruction &I, TargetTransformInfo &TTI,
- const DataLayout &DL) {
- unsigned Opc = I.getOpcode();
- Type *OldType = I.getType();
-
- if (Opc != Instruction::Add && Opc != Instruction::Mul)
- return false;
-
- unsigned OrigBit = OldType->getScalarSizeInBits();
- unsigned MaxBitsNeed = OrigBit;
- switch (Opc) {
- case Instruction::Add:
- MaxBitsNeed = KnownBits::add(computeKnownBits(I.getOperand(0), DL),
- computeKnownBits(I.getOperand(1), DL))
- .countMaxActiveBits();
- break;
- case Instruction::Mul:
- MaxBitsNeed = KnownBits::mul(computeKnownBits(I.getOperand(0), DL),
- computeKnownBits(I.getOperand(1), DL))
- .countMaxActiveBits();
- break;
- default:
- break;
- }
-
- MaxBitsNeed = std::max<unsigned>(bit_ceil(MaxBitsNeed), 8);
-
- if (OrigBit <= MaxBitsNeed)
- return false;
-
- Type *NewType = I.getType()->getWithNewBitWidth(MaxBitsNeed);
-
- // Old cost
- InstructionCost OldCost =
- TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput);
- // New cost of new op
- InstructionCost NewCost =
- TTI.getArithmeticInstrCost(Opc, NewType, TTI::TCK_RecipThroughput);
- // New cost of narrowing 2 operands (use trunc)
- NewCost += TTI.getCastInstrCost(Instruction::Trunc, NewType, OldType,
- TTI.getCastContextHint(&I),
- TTI::TCK_RecipThroughput) *
- 2;
- // New cost of zext narrowed result to original type
- NewCost += TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
- TTI.getCastContextHint(&I),
- TTI::TCK_RecipThroughput);
- if (NewCost >= OldCost) {
- return false;
- }
- IRBuilder<> Builder(&I);
- Value *Trun0 = Builder.CreateTrunc(I.getOperand(0), NewType);
- Value *Trun1 = Builder.CreateTrunc(I.getOperand(1), NewType);
- Value *Arith = Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trun0, Trun1);
-
- Value *Zext = Builder.CreateZExt(Arith, OldType);
- I.replaceAllUsesWith(Zext);
- I.eraseFromParent();
- return true;
-}
-
/// This is the entry point for folds that could be implemented in regular
/// InstCombine, but they are separated because they are not expected to
/// occur frequently and/or have more than a constant-length pattern match.
@@ -1317,7 +1256,6 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
// needs to be called at the end of this sequence, otherwise we may make
// bugs.
MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange);
- MadeChange |= tryNarrowMathIfNoOverflow(I, TTI, DL);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 3737cc414c58f..b859253d5c5da 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1837,22 +1837,22 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: s_mov_b32 s9, 0
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB3_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-NEXT: v_mov_b32_e32 v1, 0
+; GFX1264-NEXT: s_wait_alu 0xfffe
+; GFX1264-NEXT: s_mul_i32 s6, s6, 5
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
-; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v0, s6
-; GFX1264-NEXT: v_mov_b32_e32 v1, s7
+; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s8, s2
; GFX1264-NEXT: s_mov_b32 s9, s3
@@ -1874,20 +1874,19 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-LABEL: add_i64_constant:
; GFX1232: ; %bb.0: ; %entry
; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-NEXT: s_mov_b32 s7, exec_lo
-; GFX1232-NEXT: s_mov_b32 s5, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0
; GFX1232-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232-NEXT: s_mov_b32 s4, exec_lo
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB3_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7
+; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
+; GFX1232-NEXT: s_mul_i32 s5, s5, 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
; GFX1232-NEXT: s_mov_b32 s9, s3
@@ -1895,8 +1894,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB3_2:
-; GFX1232-NEXT: s_wait_alu 0xfffe
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
@@ -5406,22 +5404,22 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: s_mov_b32 s9, 0
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB9_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-NEXT: v_mov_b32_e32 v1, 0
+; GFX1264-NEXT: s_wait_alu 0xfffe
+; GFX1264-NEXT: s_mul_i32 s6, s6, 5
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
-; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v0, s6
-; GFX1264-NEXT: v_mov_b32_e32 v1, s7
+; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s8, s2
; GFX1264-NEXT: s_mov_b32 s9, s3
@@ -5446,20 +5444,19 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-LABEL: sub_i64_constant:
; GFX1232: ; %bb.0: ; %entry
; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-NEXT: s_mov_b32 s7, exec_lo
-; GFX1232-NEXT: s_mov_b32 s5, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0
; GFX1232-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232-NEXT: s_mov_b32 s4, exec_lo
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB9_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7
+; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
+; GFX1232-NEXT: s_mul_i32 s5, s5, 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
; GFX1232-NEXT: s_mov_b32 s9, s3
@@ -5467,8 +5464,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB9_2:
-; GFX1232-NEXT: s_wait_alu 0xfffe
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2
diff --git a/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
new file mode 100644
index 0000000000000..b0ef9e03663b4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
@@ -0,0 +1,231 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
+
+; REQUIRES: amdgpu-registered-target
+
+define i64 @narrow_add(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: narrow_add:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %zext0 = and i64 %a, 2147483647
+ %zext1 = and i64 %b, 2147483647
+ %add = add i64 %zext0, %zext1
+ ret i64 %add
+}
+
+define i64 @narrow_add_1(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: narrow_add_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %zext0 = and i64 %a, 2147483647
+ %zext1 = and i64 %b, 2147483648
+ %add = add i64 %zext0, %zext1
+ ret i64 %add
+}
+
+define <2 x i64> @narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: narrow_add_vec:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 30, v2
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; CHECK-NEXT: v_and_b32_e32 v2, 0x7fffffff, v4
+; CHECK-NEXT: v_and_b32_e32 v3, 0x7ffffffe, v6
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; CHECK-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v2, v1, v3
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %zext0 = and <2 x i64> %a, <i64 2147483647, i64 30>
+ %zext1 = and <2 x i64> %b, <i64 2147483647, i64 2147483646>
+ %add = add <2 x i64> %zext0, %zext1
+ ret <2 x i64> %add
+}
+
+define <2 x i32> @narrow_add_vec_1(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: narrow_add_vec_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 0x3fff, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0x4000, v0
+; CHECK-NEXT: v_and_b32_e32 v3, 0x4001, v3
+; CHECK-NEXT: v_and_b32_e32 v2, 0x4000, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; CHECK-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_pk_add_u16 v1, v0, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0xc000, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %zext0 = and <2 x i32> %a, <i32 16384, i32 16383>
+ %zext1 = and <2 x i32> %b, <i32 16384, i32 16385>
+ %add = add <2 x i32> %zext0, %zext1
+ ret <2 x i32> %add
+}
+
+define i64 @narrow_mul(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: narrow_mul:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 2, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %zext0 = and i64 %a, 2147483647
+ %zext1 = and i64 %b, 2
+ %mul = mul i64 %zext0, %zext1
+ ret i64 %mul
+}
+
+define i64 @narrow_mul_1(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: narrow_mul_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 0xf73594, v0
+; CHECK-NEXT: v_and_b32_e32 v2, 0x100, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_mul_u32_u24_e32 v0, v1, v2
+; CHECK-NEXT: v_mul_hi_u32_u24_e32 v1, v1, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %zext0 = and i64 %a, 16201108
+ %zext1 = and i64 %b, 256
+ %mul = mul i64 %zext0, %zext1
+ ret i64 %mul
+}
+
+define <2 x i64> @narrow_mul_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: narrow_mul_vec:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x2d48aff, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 0x50, v4
+; CHECK-NEXT: v_and_b32_e32 v2, 50, v2
+; CHECK-NEXT: v_and_b32_e32 v3, 20, v6
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; CHECK-NEXT: v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_mul_u32_u24_e32 v2, v2, v3
+; CHECK-NEXT: v_mov_b32_e32 v3, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %zext0 = and <2 x i64> %a, <i64 47483647, i64 50>
+ %zext1 = and <2 x i64> %b, <i64 80, i64 20>
+ %mul = mul <2 x i64> %zext0, %zext1
+ ret <2 x i64> %mul
+}
+
+define <2 x i32> @narrow_add_mul_1(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: narrow_add_mul_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 0x4000, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0x4000, v0
+; CHECK-NEXT: v_and_b32_e32 v2, 3, v2
+; CHECK-NEXT: v_and_b32_e32 v3, 2, v3
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; CHECK-NEXT: v_mul_u32_u24_e32 v1, v1, v3
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %zext0 = and <2 x i32> %a, <i32 16384, i32 16384>
+ %zext1 = and <2 x i32> %b, <i32 3, i32 2>
+ %mul = mul <2 x i32> %zext0, %zext1
+ ret <2 x i32> %mul
+}
+
+define i64 @no_narrow_add(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: no_narrow_add:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 0x80000000, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_add_co_u32 v0, s0, v0, v1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %zext0 = and i64 %a, 2147483648
+ %zext1 = and i64 %b, 2147483648
+ %add = add i64 %zext0, %zext1
+ ret i64 %add
+}
+
+define i64 @no_narrow_add_1(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: no_narrow_add_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 1, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_add_co_u32 v0, s0, v0, v1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %zext0 = and i64 %a, 4294967295
+ %zext1 = and i64 %b, 1
+ %add = add i64 %zext0, %zext1
+ ret i64 %add
+}
+
+define <2 x i64> @no_narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: no_narrow_add_vec:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 0x80000000, v4
+; CHECK-NEXT: v_and_b32_e32 v2, 30, v2
+; CHECK-NEXT: v_and_b32_e32 v3, 0x7ffffffe, v6
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_add_co_u32 v0, s0, v0, v1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_add_co_u32 v2, s0, v2, v3
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %zext0 = and <2 x i64> %a, <i64 2147483648, i64 30>
+ %zext1 = and <2 x i64> %b, <i64 2147483648, i64 2147483646>
+ %add = add <2 x i64> %zext0, %zext1
+ ret <2 x i64> %add
+}
+
+define i64 @no_narrow_mul(i64 noundef %a, i64 noundef %b) {
+; CHECK-LABEL: no_narrow_mul:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 2, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %zext0 = and i64 %a, 2147483648
+ %zext1 = and i64 %b, 2
+ %mul = mul i64 %zext0, %zext1
+ ret i64 %mul
+}
+
+define <2 x i64> @no_narrow_mul_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: no_narrow_mul_vec:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 0x8000, v0
+; CHECK-NEXT: v_and_b32_e32 v3, 0x20000, v4
+; CHECK-NEXT: v_and_b32_e32 v4, 50, v2
+; CHECK-NEXT: v_and_b32_e32 v5, 20, v6
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; CHECK-NEXT: v_mul_u32_u24_e32 v0, v1, v3
+; CHECK-NEXT: v_mul_hi_u32_u24_e32 v1, v1, v3
+; CHECK-NEXT: v_mul_u32_u24_e32 v2, v4, v5
+; CHECK-NEXT: v_mul_hi_u32_u24_e32 v3, v4, v5
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %zext0 = and <2 x i64> %a, <i64 32768, i64 50>
+ %zext1 = and <2 x i64> %b, <i64 131072, i64 20>
+ %mul = mul <2 x i64> %zext0, %zext1
+ ret <2 x i64> %mul
+}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index ee89bf406c2a3..cab90831e5a6f 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -365,11 +365,9 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX8-NEXT: v_mov_b32_e32 v1, 3
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_or_b32_e32 v0, v12, v0
+; GFX8-NEXT: v_or_b32_e32 v0, 0x5000, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s35
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: s_movk_i32 s0, 0x5000
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v10, 0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v11, 0
@@ -487,15 +485,14 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0
-; GFX900-NEXT: v_lshlrev_b32_e32 v0, 17, v0
-; GFX900-NEXT: v_and_b32_e32 v6, 0xfe000000, v0
-; GFX900-NEXT: v_lshl_or_b32 v0, v1, 3, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 17, v0
+; GFX900-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
+; GFX900-NEXT: v_mov_b32_e32 v1, 3
+; GFX900-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX900-NEXT: s_movk_i32 s0, 0x5000
+; GFX900-NEXT: v_or3_b32 v0, v6, v0, s0
; GFX900-NEXT: v_mov_b32_e32 v1, s35
; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0
-; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX900-NEXT: s_movk_i32 s0, 0x5000
-; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT: v_mov_b32_e32 v5, 0
@@ -604,17 +601,16 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 17, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, 3
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 17, v0
+; GFX10-NEXT: s_movk_i32 s1, 0x7f
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_and_b32_e32 v6, 0xfe000000, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-NEXT: s_movk_i32 s1, 0x7f
-; GFX10-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 3, v6
-; GFX10-NEXT: v_add_co_u32 v0, s0, v0, s34
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s35, s0
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_or3_b32 v0, v6, v0, 0x5000
+; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0
; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB1_2 Depth 2
@@ -716,16 +712,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: s_mov_b32 s32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xff, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 17, v0
-; GFX90A-NEXT: v_and_b32_e32 v0, 0xfe000000, v0
-; GFX90A-NEXT: v_lshl_or_b32 v1, v1, 3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, s35
-; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 17, v0
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xfe000000, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, 3
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX90A-NEXT: s_movk_i32 s0, 0x5000
-; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1
-; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-NEXT: v_or3_b32 v0, v2, v0, s0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s35
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0
; GFX90A-NEXT: s_movk_i32 s3, 0x7f
; GFX90A-NEXT: s_movk_i32 s0, 0xd000
@@ -734,7 +729,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB1_2 Depth 2
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_mov_b32 s4, 0
; GFX90A-NEXT: .LBB1_2: ; %for.body
; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1
@@ -765,34 +760,34 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: s_addk_i32 s4, 0x2000
; GFX90A-NEXT: s_cmp_gt_u32 s4, 0x3fffff
; GFX90A-NEXT: s_waitcnt vmcnt(8)
-; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v12, v4
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v12, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(7)
-; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v18, v1
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v18, v3
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v19, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(5)
-; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v14, v1
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v14, v3
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v15, v4, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v16, v1
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v16, v3
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v17, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(4)
-; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v24, v1
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v24, v3
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v25, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(3)
-; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v26, v1
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v26, v3
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v27, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(2)
-; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v28, v1
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v28, v3
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v29, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v20, v1
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v20, v3
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v21, v4, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v8, v1
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v10, v1
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v1
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v3
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2
; GFX90A-NEXT: ; %bb.3: ; %while.cond.loopexit
@@ -805,7 +800,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: s_branch .LBB1_1
; GFX90A-NEXT: .LBB1_5: ; %while.end
; GFX90A-NEXT: v_mov_b32_e32 v1, s35
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v2
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
; GFX90A-NEXT: s_endpgm
@@ -824,15 +819,14 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 17, v0
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xff, v0
; GFX11-NEXT: s_movk_i32 s1, 0x7f
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 3, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v0, s0, v0, s34
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s35, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_or3_b32 v0, v6, v0, 0x5000
+; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0
; GFX11-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB1_2 Depth 2
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 4290590e99711..4eb7761bfbddd 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -508,17 +508,16 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %
; SI-LABEL: widen_i1_zext_to_i64_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[0:1], 0x0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s2, s2, 1
-; SI-NEXT: s_add_u32 s4, s2, 0x3e7
-; SI-NEXT: s_addc_u32 s5, 0, 0
-; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_and_b32 s2, s2, 0xff
+; SI-NEXT: s_addk_i32 s2, 0x3e7
+; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index 300da68d9a3b3..acdc457edccee 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -65,8 +65,12 @@ define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512-LABEL: and_mulhuw_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%a1 = and <4 x i64> %a, <i64 65535, i64 65535, i64 65535, i64 65535>
@@ -1985,31 +1989,44 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) {
define <8 x i64> @zext_mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: zext_mulhuw_v8i16_lshr_i64:
; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pmulhuw %xmm1, %xmm3
+; SSE2-NEXT: pmullw %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT: psrlq $16, %xmm0
+; SSE2-NEXT: psrlq $16, %xmm1
+; SSE2-NEXT: psrlq $16, %xmm2
+; SSE2-NEXT: psrlq $16, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: zext_mulhuw_v8i16_lshr_i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pmulhuw %xmm1, %xmm0
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE41-NEXT: pmulld %xmm0, %xmm3
+; SSE41-NEXT: pmulld %xmm2, %xmm1
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero
+; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE41-NEXT: psrlq $16, %xmm2
+; SSE41-NEXT: psrlq $16, %xmm3
+; SSE41-NEXT: psrlq $16, %xmm0
+; SSE41-NEXT: psrlq $16, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: zext_mulhuw_v8i16_lshr_i64:
@@ -2022,8 +2039,11 @@ define <8 x i64> @zext_mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
;
; AVX512-LABEL: zext_mulhuw_v8i16_lshr_i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm0
; AVX512-NEXT: retq
%a1 = zext <8 x i16> %a to <8 x i64>
%b1 = zext <8 x i16> %b to <8 x i64>
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index 426587a84ce17..30b85d3da918c 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -644,10 +644,6 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
; ILP-NEXT: sete %cl
; ILP-NEXT: cmpq %rdi, %rsi
; ILP-NEXT: sbbq $0, %rcx
-; ILP-NEXT: movl $0, %ecx
-; ILP-NEXT: sbbq %rcx, %rcx
-; ILP-NEXT: movl $0, %ecx
-; ILP-NEXT: sbbq %rcx, %rcx
; ILP-NEXT: adcq $1, %rax
; ILP-NEXT: retq
;
@@ -659,10 +655,6 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
; HYBRID-NEXT: sete %cl
; HYBRID-NEXT: cmpq %rdi, %rsi
; HYBRID-NEXT: sbbq $0, %rcx
-; HYBRID-NEXT: movl $0, %ecx
-; HYBRID-NEXT: sbbq %rcx, %rcx
-; HYBRID-NEXT: movl $0, %ecx
-; HYBRID-NEXT: sbbq %rcx, %rcx
; HYBRID-NEXT: adcq $1, %rax
; HYBRID-NEXT: retq
;
@@ -674,10 +666,6 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
; BURR-NEXT: sete %cl
; BURR-NEXT: cmpq %rdi, %rsi
; BURR-NEXT: sbbq $0, %rcx
-; BURR-NEXT: movl $0, %ecx
-; BURR-NEXT: sbbq %rcx, %rcx
-; BURR-NEXT: movl $0, %ecx
-; BURR-NEXT: sbbq %rcx, %rcx
; BURR-NEXT: adcq $1, %rax
; BURR-NEXT: retq
;
@@ -689,10 +677,6 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
; SRC-NEXT: xorl %eax, %eax
; SRC-NEXT: cmpq %rdi, %rsi
; SRC-NEXT: sbbq $0, %rcx
-; SRC-NEXT: movl $0, %ecx
-; SRC-NEXT: sbbq %rcx, %rcx
-; SRC-NEXT: movl $0, %ecx
-; SRC-NEXT: sbbq %rcx, %rcx
; SRC-NEXT: adcq $1, %rax
; SRC-NEXT: retq
;
@@ -704,10 +688,6 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
; LIN-NEXT: sete %cl
; LIN-NEXT: cmpq %rdi, %rsi
; LIN-NEXT: sbbq $0, %rcx
-; LIN-NEXT: movl $0, %ecx
-; LIN-NEXT: sbbq %rcx, %rcx
-; LIN-NEXT: movl $0, %ecx
-; LIN-NEXT: sbbq %rcx, %rcx
; LIN-NEXT: adcq $1, %rax
; LIN-NEXT: retq
%r = zext i64 %b to i256
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index e53eed4587797..16b629c06a849 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -27,11 +27,11 @@ define void @mul_2xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm1
; X86-SSE-NEXT: pxor %xmm2, %xmm2
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: movq %xmm1, (%esi,%eax,4)
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT: pmullw %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm0, (%esi,%eax,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
@@ -61,11 +61,11 @@ define void @mul_2xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64
; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm1
; X64-SSE-NEXT: pxor %xmm2, %xmm2
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-SSE-NEXT: pmullw %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8:
@@ -1364,8 +1364,8 @@ define void @mul_2xi8_varconst1(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,255,u,u,u,u,u,u]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,255,0,u,u,u,u]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
@@ -1388,8 +1388,8 @@ define void @mul_2xi8_varconst1(ptr nocapture readonly %a, i64 %index) {
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,255,u,u,u,u,u,u]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,255,0,u,u,u,u]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
@@ -1490,10 +1490,10 @@ define void @mul_2xi8_varconst3(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,256,u,u,u,u,u,u]
; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,256,0,u,u,u,u]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
@@ -1514,10 +1514,10 @@ define void @mul_2xi8_varconst3(ptr nocapture readonly %a, i64 %index) {
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,256,u,u,u,u,u,u]
; X64-SSE-NEXT: pxor %xmm1, %xmm1
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,256,0,u,u,u,u]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
diff --git a/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll b/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll
deleted file mode 100644
index 38df58356559e..0000000000000
--- a/llvm/test/Transforms/AggressiveInstCombine/narrow_math_for_and.ll
+++ /dev/null
@@ -1,210 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=aggressive-instcombine < %s | FileCheck %s
-
-; REQUIRES: amdgpu-registered-target
-
-define i64 @narrow_add(i64 noundef %a, i64 noundef %b) {
-; CHECK-LABEL: define i64 @narrow_add(
-; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483647
-; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483647
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[ZEXT0]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ZEXT1]] to i32
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; CHECK-NEXT: ret i64 [[TMP4]]
-;
- %zext0 = and i64 %a, 2147483647
- %zext1 = and i64 %b, 2147483647
- %add = add i64 %zext0, %zext1
- ret i64 %add
-}
-
-define i64 @narrow_add_1(i64 noundef %a, i64 noundef %b) {
-; CHECK-LABEL: define i64 @narrow_add_1(
-; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483647
-; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483648
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[ZEXT0]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ZEXT1]] to i32
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; CHECK-NEXT: ret i64 [[TMP4]]
-;
- %zext0 = and i64 %a, 2147483647
- %zext1 = and i64 %b, 2147483648
- %add = add i64 %zext0, %zext1
- ret i64 %add
-}
-
-define <2 x i64> @narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
-; CHECK-LABEL: define <2 x i64> @narrow_add_vec(
-; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and <2 x i64> [[A]], <i64 2147483647, i64 30>
-; CHECK-NEXT: [[ZEXT1:%.*]] = and <2 x i64> [[B]], <i64 2147483647, i64 2147483646>
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i64> [[ZEXT0]] to <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i64> [[ZEXT1]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
-; CHECK-NEXT: ret <2 x i64> [[TMP4]]
-;
- %zext0 = and <2 x i64> %a, <i64 2147483647, i64 30>
- %zext1 = and <2 x i64> %b, <i64 2147483647, i64 2147483646>
- %add = add <2 x i64> %zext0, %zext1
- ret <2 x i64> %add
-}
-
-define <2 x i32> @narrow_add_vec_1(<2 x i32> %a, <2 x i32> %b) #0 {
-; CHECK-LABEL: define <2 x i32> @narrow_add_vec_1(
-; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and <2 x i32> [[A]], <i32 16384, i32 16383>
-; CHECK-NEXT: [[ZEXT1:%.*]] = and <2 x i32> [[B]], <i32 16384, i32 16385>
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[ZEXT0]] to <2 x i16>
-; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i32> [[ZEXT1]] to <2 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32>
-; CHECK-NEXT: ret <2 x i32> [[TMP4]]
-;
- %zext0 = and <2 x i32> %a, <i32 16384, i32 16383>
- %zext1 = and <2 x i32> %b, <i32 16384, i32 16385>
- %add = add <2 x i32> %zext0, %zext1
- ret <2 x i32> %add
-}
-
-define i64 @narrow_mul(i64 noundef %a, i64 noundef %b) {
-; CHECK-LABEL: define i64 @narrow_mul(
-; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483647
-; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[ZEXT0]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ZEXT1]] to i32
-; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; CHECK-NEXT: ret i64 [[TMP4]]
-;
- %zext0 = and i64 %a, 2147483647
- %zext1 = and i64 %b, 2
- %mul = mul i64 %zext0, %zext1
- ret i64 %mul
-}
-
-define i64 @narrow_mul_1(i64 noundef %a, i64 noundef %b) {
-; CHECK-LABEL: define i64 @narrow_mul_1(
-; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483647
-; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[ZEXT0]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ZEXT1]] to i32
-; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; CHECK-NEXT: ret i64 [[TMP4]]
-;
- %zext0 = and i64 %a, 2147483647
- %zext1 = and i64 %b, 2
- %mul = mul i64 %zext0, %zext1
- ret i64 %mul
-}
-
-define <2 x i64> @narrow_mul_vec(<2 x i64> %a, <2 x i64> %b) #0 {
-; CHECK-LABEL: define <2 x i64> @narrow_mul_vec(
-; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and <2 x i64> [[A]], <i64 47483647, i64 50>
-; CHECK-NEXT: [[ZEXT1:%.*]] = and <2 x i64> [[B]], <i64 80, i64 20>
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i64> [[ZEXT0]] to <2 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i64> [[ZEXT1]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
-; CHECK-NEXT: ret <2 x i64> [[TMP4]]
-;
- %zext0 = and <2 x i64> %a, <i64 47483647, i64 50>
- %zext1 = and <2 x i64> %b, <i64 80, i64 20>
- %mul = mul <2 x i64> %zext0, %zext1
- ret <2 x i64> %mul
-}
-
-define <2 x i32> @narrow_add_mul_1(<2 x i32> %a, <2 x i32> %b) #0 {
-; CHECK-LABEL: define <2 x i32> @narrow_add_mul_1(
-; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and <2 x i32> [[A]], splat (i32 16384)
-; CHECK-NEXT: [[ZEXT1:%.*]] = and <2 x i32> [[B]], <i32 3, i32 2>
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[ZEXT0]] to <2 x i16>
-; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i32> [[ZEXT1]] to <2 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i16> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32>
-; CHECK-NEXT: ret <2 x i32> [[TMP4]]
-;
- %zext0 = and <2 x i32> %a, <i32 16384, i32 16384>
- %zext1 = and <2 x i32> %b, <i32 3, i32 2>
- %mul = mul <2 x i32> %zext0, %zext1
- ret <2 x i32> %mul
-}
-
-define i64 @no_narrow_add(i64 noundef %a, i64 noundef %b) {
-; CHECK-LABEL: define i64 @no_narrow_add(
-; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483648
-; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2147483648
-; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ZEXT0]], [[ZEXT1]]
-; CHECK-NEXT: ret i64 [[ADD]]
-;
- %zext0 = and i64 %a, 2147483648
- %zext1 = and i64 %b, 2147483648
- %add = add i64 %zext0, %zext1
- ret i64 %add
-}
-
-define i64 @no_narrow_add_1(i64 noundef %a, i64 noundef %b) {
-; CHECK-LABEL: define i64 @no_narrow_add_1(
-; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 4294967295
-; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 1
-; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ZEXT0]], [[ZEXT1]]
-; CHECK-NEXT: ret i64 [[ADD]]
-;
- %zext0 = and i64 %a, 4294967295
- %zext1 = and i64 %b, 1
- %add = add i64 %zext0, %zext1
- ret i64 %add
-}
-
-define <2 x i64> @no_narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
-; CHECK-LABEL: define <2 x i64> @no_narrow_add_vec(
-; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and <2 x i64> [[A]], <i64 2147483648, i64 30>
-; CHECK-NEXT: [[ZEXT1:%.*]] = and <2 x i64> [[B]], <i64 2147483648, i64 2147483646>
-; CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[ZEXT0]], [[ZEXT1]]
-; CHECK-NEXT: ret <2 x i64> [[ADD]]
-;
- %zext0 = and <2 x i64> %a, <i64 2147483648, i64 30>
- %zext1 = and <2 x i64> %b, <i64 2147483648, i64 2147483646>
- %add = add <2 x i64> %zext0, %zext1
- ret <2 x i64> %add
-}
-
-define i64 @no_narrow_mul(i64 noundef %a, i64 noundef %b) {
-; CHECK-LABEL: define i64 @no_narrow_mul(
-; CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and i64 [[A]], 2147483648
-; CHECK-NEXT: [[ZEXT1:%.*]] = and i64 [[B]], 2
-; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ZEXT0]], [[ZEXT1]]
-; CHECK-NEXT: ret i64 [[MUL]]
-;
- %zext0 = and i64 %a, 2147483648
- %zext1 = and i64 %b, 2
- %mul = mul i64 %zext0, %zext1
- ret i64 %mul
-}
-
-define <2 x i64> @no_narrow_mul_vec(<2 x i64> %a, <2 x i64> %b) #0 {
-; CHECK-LABEL: define <2 x i64> @no_narrow_mul_vec(
-; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ZEXT0:%.*]] = and <2 x i64> [[A]], <i64 32768, i64 50>
-; CHECK-NEXT: [[ZEXT1:%.*]] = and <2 x i64> [[B]], <i64 131072, i64 20>
-; CHECK-NEXT: [[MUL:%.*]] = mul <2 x i64> [[ZEXT0]], [[ZEXT1]]
-; CHECK-NEXT: ret <2 x i64> [[MUL]]
-;
- %zext0 = and <2 x i64> %a, <i64 32768, i64 50>
- %zext1 = and <2 x i64> %b, <i64 131072, i64 20>
- %mul = mul <2 x i64> %zext0, %zext1
- ret <2 x i64> %mul
-}
>From f4fb6d0929efac3ef8db7a24bff97810cfb6b328 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Thu, 13 Mar 2025 12:39:13 +0800
Subject: [PATCH 10/14] move to amdgpu-codegenprepare
---
llvm/lib/CodeGen/CodeGenPrepare.cpp | 65 --------------
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 83 +++++++++++++++++
.../AMDGPU/amdgpu-codegenprepare-mul24.ll | 5 +-
.../CodeGen/AMDGPU/narrow_math_for_and.ll | 24 ++---
.../AMDGPU/promote-constOffset-to-imm.ll | 90 ++++++++++---------
llvm/test/CodeGen/X86/pmulh.ll | 62 +++++--------
.../CodeGen/X86/scheduler-backtracking.ll | 20 +++++
llvm/test/CodeGen/X86/shrink_vmul.ll | 28 +++---
8 files changed, 202 insertions(+), 175 deletions(-)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 796ae2512baf4..d5fbd4c380746 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8594,68 +8594,6 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
return false;
}
-static bool tryNarrowMathIfNoOverflow(Instruction *I,
- const TargetTransformInfo *TTI,
- const DataLayout &DL) {
- unsigned Opc = I->getOpcode();
- Type *OldType = I->getType();
-
- if (Opc != Instruction::Add && Opc != Instruction::Mul)
- return false;
-
- unsigned OrigBit = OldType->getScalarSizeInBits();
- unsigned MaxBitsNeed = OrigBit;
- switch (Opc) {
- case Instruction::Add:
- MaxBitsNeed = KnownBits::add(computeKnownBits(I->getOperand(0), DL),
- computeKnownBits(I->getOperand(1), DL))
- .countMaxActiveBits();
- break;
- case Instruction::Mul:
- MaxBitsNeed = KnownBits::mul(computeKnownBits(I->getOperand(0), DL),
- computeKnownBits(I->getOperand(1), DL))
- .countMaxActiveBits();
- break;
- default:
- break;
- }
-
- MaxBitsNeed = std::max<unsigned>(bit_ceil(MaxBitsNeed), 8);
-
- if (OrigBit <= MaxBitsNeed)
- return false;
-
- Type *NewType = I->getType()->getWithNewBitWidth(MaxBitsNeed);
-
- // Old cost
- InstructionCost OldCost =
- TTI->getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput);
- // New cost of new op
- InstructionCost NewCost =
- TTI->getArithmeticInstrCost(Opc, NewType, TTI::TCK_RecipThroughput);
- // New cost of narrowing 2 operands (use trunc)
- NewCost += TTI->getCastInstrCost(Instruction::Trunc, NewType, OldType,
- TTI->getCastContextHint(I),
- TTI::TCK_RecipThroughput) *
- 2;
- // New cost of zext narrowed result to original type
- NewCost += TTI->getCastInstrCost(Instruction::ZExt, OldType, NewType,
- TTI->getCastContextHint(I),
- TTI::TCK_RecipThroughput);
- if (NewCost >= OldCost) {
- return false;
- }
- IRBuilder<> Builder(I);
- Value *Trun0 = Builder.CreateTrunc(I->getOperand(0), NewType);
- Value *Trun1 = Builder.CreateTrunc(I->getOperand(1), NewType);
- Value *Arith = Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trun0, Trun1);
-
- Value *Zext = Builder.CreateZExt(Arith, OldType);
- I->replaceAllUsesWith(Zext);
- I->eraseFromParent();
- return true;
-}
-
bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
bool AnyChange = false;
AnyChange = fixupDbgVariableRecordsOnInst(*I);
@@ -8837,9 +8775,6 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
return optimizeExtractElementInst(cast<ExtractElementInst>(I));
case Instruction::Br:
return optimizeBranch(cast<BranchInst>(I), *TLI, FreshBBs, IsHugeFunc);
- case Instruction::Add:
- case Instruction::Mul:
- return tryNarrowMathIfNoOverflow(I, TTI, *DL);
}
return AnyChange;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index fdba8835cbf0a..0294baa3e1fb6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1560,6 +1560,86 @@ void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
llvm_unreachable("not a division");
}
+Type *findSmallestLegalBits(Instruction *I, int OrigBit, int MaxBitsNeeded,
+ const TargetLowering *TLI, const DataLayout &DL) {
+ if (MaxBitsNeeded >= OrigBit) {
+ return nullptr;
+ }
+ Type *NewType = I->getType()->getWithNewBitWidth(MaxBitsNeeded);
+ while (OrigBit > MaxBitsNeeded) {
+ if (TLI->isOperationLegalOrCustom(
+ TLI->InstructionOpcodeToISD(I->getOpcode()),
+ TLI->getValueType(DL, NewType, true))) {
+ return NewType;
+ }
+ MaxBitsNeeded *= 2;
+ NewType = I->getType()->getWithNewBitWidth(MaxBitsNeeded);
+ }
+ return nullptr;
+}
+
+static bool tryNarrowMathIfNoOverflow(Instruction *I, const TargetLowering *TLI,
+ const TargetTransformInfo &TTI,
+ const DataLayout &DL) {
+ unsigned Opc = I->getOpcode();
+ Type *OldType = I->getType();
+
+ if (Opc != Instruction::Add && Opc != Instruction::Mul)
+ return false;
+
+ unsigned OrigBit = OldType->getScalarSizeInBits();
+ unsigned MaxBitsNeeded = OrigBit;
+ switch (Opc) {
+ case Instruction::Add:
+ MaxBitsNeeded = KnownBits::add(computeKnownBits(I->getOperand(0), DL),
+ computeKnownBits(I->getOperand(1), DL))
+ .countMaxActiveBits();
+ break;
+ case Instruction::Mul:
+ MaxBitsNeeded = KnownBits::mul(computeKnownBits(I->getOperand(0), DL),
+ computeKnownBits(I->getOperand(1), DL))
+ .countMaxActiveBits();
+ break;
+ default:
+ break;
+ }
+
+ MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
+ Type *NewType = findSmallestLegalBits(I, OrigBit, MaxBitsNeeded, TLI, DL);
+
+ if (!NewType)
+ return false;
+
+ // Old cost
+ InstructionCost OldCost =
+ TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput);
+ // New cost of new op
+ InstructionCost NewCost =
+ TTI.getArithmeticInstrCost(Opc, NewType, TTI::TCK_RecipThroughput);
+ // New cost of narrowing 2 operands (use trunc)
+ NewCost += TTI.getCastInstrCost(Instruction::Trunc, NewType, OldType,
+ TTI.getCastContextHint(I),
+ TTI::TCK_RecipThroughput) *
+ 2;
+ // New cost of zext narrowed result to original type
+ NewCost +=
+ TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
+ TTI.getCastContextHint(I), TTI::TCK_RecipThroughput);
+ if (NewCost >= OldCost)
+ return false;
+
+ IRBuilder<> Builder(I);
+ Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
+ Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
+ Value *Arith =
+ Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);
+
+ Value *Zext = Builder.CreateZExt(Arith, OldType);
+ I->replaceAllUsesWith(Zext);
+ I->eraseFromParent();
+ return true;
+}
+
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
if (foldBinOpIntoSelect(I))
return true;
@@ -1644,6 +1724,9 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
}
}
+ Changed = tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(),
+ TM.getTargetTransformInfo(F), DL);
+
return Changed;
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
index 296b817bc8f75..d7c35a8b007c6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
@@ -414,7 +414,10 @@ define i64 @umul24_i64_2(i64 %lhs, i64 %rhs) {
; DISABLED-LABEL: @umul24_i64_2(
; DISABLED-NEXT: [[LHS24:%.*]] = and i64 [[LHS:%.*]], 65535
; DISABLED-NEXT: [[RHS24:%.*]] = and i64 [[RHS:%.*]], 65535
-; DISABLED-NEXT: [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; DISABLED-NEXT: [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; DISABLED-NEXT: [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; DISABLED-NEXT: [[TMP3:%.*]] = mul i32 [[TMP1]], [[TMP2]]
+; DISABLED-NEXT: [[MUL:%.*]] = zext i32 [[TMP3]] to i64
; DISABLED-NEXT: ret i64 [[MUL]]
;
%lhs24 = and i64 %lhs, 65535
diff --git a/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
index b0ef9e03663b4..fbc27be4e71d6 100644
--- a/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
+++ b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
@@ -2,8 +2,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
-; REQUIRES: amdgpu-registered-target
-
define i64 @narrow_add(i64 noundef %a, i64 noundef %b) {
; CHECK-LABEL: narrow_add:
; CHECK: ; %bb.0:
@@ -36,14 +34,16 @@ define <2 x i64> @narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
; CHECK-LABEL: narrow_add_vec:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v1, 30, v2
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; CHECK-NEXT: v_and_b32_e32 v2, 0x7fffffff, v4
+; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v4
+; CHECK-NEXT: v_and_b32_e32 v2, 30, v2
; CHECK-NEXT: v_and_b32_e32 v3, 0x7ffffffe, v6
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; CHECK-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v2, v1, v3
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_add_co_u32 v0, s0, v0, v1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_add_co_u32 v2, s0, v2, v3
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%zext0 = and <2 x i64> %a, <i64 2147483647, i64 30>
%zext1 = and <2 x i64> %b, <i64 2147483647, i64 2147483646>
@@ -110,13 +110,13 @@ define <2 x i64> @narrow_mul_vec(<2 x i64> %a, <2 x i64> %b) #0 {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v0, 0x2d48aff, v0
; CHECK-NEXT: v_and_b32_e32 v1, 0x50, v4
-; CHECK-NEXT: v_and_b32_e32 v2, 50, v2
-; CHECK-NEXT: v_and_b32_e32 v3, 20, v6
+; CHECK-NEXT: v_and_b32_e32 v3, 50, v2
+; CHECK-NEXT: v_and_b32_e32 v4, 20, v6
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; CHECK-NEXT: v_mul_lo_u32 v0, v0, v1
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_mul_u32_u24_e32 v2, v2, v3
-; CHECK-NEXT: v_mov_b32_e32 v3, 0
+; CHECK-NEXT: v_mul_u32_u24_e32 v2, v3, v4
+; CHECK-NEXT: v_mul_hi_u32_u24_e32 v3, v3, v4
; CHECK-NEXT: s_setpc_b64 s[30:31]
%zext0 = and <2 x i64> %a, <i64 47483647, i64 50>
%zext1 = and <2 x i64> %b, <i64 80, i64 20>
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index cab90831e5a6f..ee89bf406c2a3 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -365,9 +365,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX8-NEXT: v_mov_b32_e32 v1, 3
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_or_b32_e32 v0, v12, v0
-; GFX8-NEXT: v_or_b32_e32 v0, 0x5000, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s35
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: s_movk_i32 s0, 0x5000
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v10, 0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v11, 0
@@ -485,14 +487,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX900-NEXT: v_lshlrev_b32_e32 v1, 17, v0
-; GFX900-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
-; GFX900-NEXT: v_mov_b32_e32 v1, 3
-; GFX900-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX900-NEXT: s_movk_i32 s0, 0x5000
-; GFX900-NEXT: v_or3_b32 v0, v6, v0, s0
+; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 17, v0
+; GFX900-NEXT: v_and_b32_e32 v6, 0xfe000000, v0
+; GFX900-NEXT: v_lshl_or_b32 v0, v1, 3, v6
; GFX900-NEXT: v_mov_b32_e32 v1, s35
; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0
+; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX900-NEXT: s_movk_i32 s0, 0x5000
+; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX900-NEXT: v_mov_b32_e32 v5, 0
@@ -601,16 +604,17 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GFX10-NEXT: v_mov_b32_e32 v1, 3
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 17, v0
-; GFX10-NEXT: s_movk_i32 s1, 0x7f
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT: v_and_b32_e32 v6, 0xfe000000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 17, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
-; GFX10-NEXT: v_or3_b32 v0, v6, v0, 0x5000
-; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0
+; GFX10-NEXT: s_movk_i32 s1, 0x7f
+; GFX10-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
+; GFX10-NEXT: v_lshl_or_b32 v0, v0, 3, v6
+; GFX10-NEXT: v_add_co_u32 v0, s0, v0, s34
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s35, s0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB1_2 Depth 2
@@ -712,15 +716,16 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: s_mov_b32 s32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 17, v0
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xfe000000, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, 3
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xff, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 17, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, 0xfe000000, v0
+; GFX90A-NEXT: v_lshl_or_b32 v1, v1, 3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, s35
+; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc
; GFX90A-NEXT: s_movk_i32 s0, 0x5000
-; GFX90A-NEXT: v_or3_b32 v0, v2, v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s35
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0
; GFX90A-NEXT: s_movk_i32 s3, 0x7f
; GFX90A-NEXT: s_movk_i32 s0, 0xd000
@@ -729,7 +734,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB1_2 Depth 2
-; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_mov_b32 s4, 0
; GFX90A-NEXT: .LBB1_2: ; %for.body
; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1
@@ -760,34 +765,34 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: s_addk_i32 s4, 0x2000
; GFX90A-NEXT: s_cmp_gt_u32 s4, 0x3fffff
; GFX90A-NEXT: s_waitcnt vmcnt(8)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v12, v4
+; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v12, v4
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(7)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v18, v3
+; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v18, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v19, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(5)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v14, v3
+; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v14, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v15, v4, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v16, v3
+; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v16, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v17, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(4)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v24, v3
+; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v24, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v25, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(3)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v26, v3
+; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v26, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v27, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(2)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v28, v3
+; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v28, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v29, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(1)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v20, v3
+; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v20, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v21, v4, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3
+; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v8, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3
+; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v10, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v4, vcc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v3
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2
; GFX90A-NEXT: ; %bb.3: ; %while.cond.loopexit
@@ -800,7 +805,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX90A-NEXT: s_branch .LBB1_1
; GFX90A-NEXT: .LBB1_5: ; %while.end
; GFX90A-NEXT: v_mov_b32_e32 v1, s35
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v2
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
; GFX90A-NEXT: s_endpgm
@@ -819,14 +824,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 17, v0
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xff, v0
; GFX11-NEXT: s_movk_i32 s1, 0x7f
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT: v_lshl_or_b32 v0, v0, 3, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_or3_b32 v0, v6, v0, 0x5000
-; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0
+; GFX11-NEXT: v_add_co_u32 v0, s0, v0, s34
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s35, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB1_2 Depth 2
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index acdc457edccee..300da68d9a3b3 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -65,12 +65,8 @@ define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512-LABEL: and_mulhuw_v4i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%a1 = and <4 x i64> %a, <i64 65535, i64 65535, i64 65535, i64 65535>
@@ -1989,44 +1985,31 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) {
define <8 x i64> @zext_mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: zext_mulhuw_v8i16_lshr_i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pmulhuw %xmm1, %xmm3
-; SSE2-NEXT: pmullw %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT: psrlq $16, %xmm0
-; SSE2-NEXT: psrlq $16, %xmm1
-; SSE2-NEXT: psrlq $16, %xmm2
-; SSE2-NEXT: psrlq $16, %xmm3
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: zext_mulhuw_v8i16_lshr_i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE41-NEXT: pmulld %xmm0, %xmm3
-; SSE41-NEXT: pmulld %xmm2, %xmm1
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE41-NEXT: psrlq $16, %xmm2
-; SSE41-NEXT: psrlq $16, %xmm3
-; SSE41-NEXT: psrlq $16, %xmm0
-; SSE41-NEXT: psrlq $16, %xmm1
+; SSE41-NEXT: pmulhuw %xmm1, %xmm0
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: zext_mulhuw_v8i16_lshr_i64:
@@ -2039,11 +2022,8 @@ define <8 x i64> @zext_mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
;
; AVX512-LABEL: zext_mulhuw_v8i16_lshr_i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm0
+; AVX512-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT: retq
%a1 = zext <8 x i16> %a to <8 x i64>
%b1 = zext <8 x i16> %b to <8 x i64>
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index 30b85d3da918c..426587a84ce17 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -644,6 +644,10 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
; ILP-NEXT: sete %cl
; ILP-NEXT: cmpq %rdi, %rsi
; ILP-NEXT: sbbq $0, %rcx
+; ILP-NEXT: movl $0, %ecx
+; ILP-NEXT: sbbq %rcx, %rcx
+; ILP-NEXT: movl $0, %ecx
+; ILP-NEXT: sbbq %rcx, %rcx
; ILP-NEXT: adcq $1, %rax
; ILP-NEXT: retq
;
@@ -655,6 +659,10 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
; HYBRID-NEXT: sete %cl
; HYBRID-NEXT: cmpq %rdi, %rsi
; HYBRID-NEXT: sbbq $0, %rcx
+; HYBRID-NEXT: movl $0, %ecx
+; HYBRID-NEXT: sbbq %rcx, %rcx
+; HYBRID-NEXT: movl $0, %ecx
+; HYBRID-NEXT: sbbq %rcx, %rcx
; HYBRID-NEXT: adcq $1, %rax
; HYBRID-NEXT: retq
;
@@ -666,6 +674,10 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
; BURR-NEXT: sete %cl
; BURR-NEXT: cmpq %rdi, %rsi
; BURR-NEXT: sbbq $0, %rcx
+; BURR-NEXT: movl $0, %ecx
+; BURR-NEXT: sbbq %rcx, %rcx
+; BURR-NEXT: movl $0, %ecx
+; BURR-NEXT: sbbq %rcx, %rcx
; BURR-NEXT: adcq $1, %rax
; BURR-NEXT: retq
;
@@ -677,6 +689,10 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
; SRC-NEXT: xorl %eax, %eax
; SRC-NEXT: cmpq %rdi, %rsi
; SRC-NEXT: sbbq $0, %rcx
+; SRC-NEXT: movl $0, %ecx
+; SRC-NEXT: sbbq %rcx, %rcx
+; SRC-NEXT: movl $0, %ecx
+; SRC-NEXT: sbbq %rcx, %rcx
; SRC-NEXT: adcq $1, %rax
; SRC-NEXT: retq
;
@@ -688,6 +704,10 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
; LIN-NEXT: sete %cl
; LIN-NEXT: cmpq %rdi, %rsi
; LIN-NEXT: sbbq $0, %rcx
+; LIN-NEXT: movl $0, %ecx
+; LIN-NEXT: sbbq %rcx, %rcx
+; LIN-NEXT: movl $0, %ecx
+; LIN-NEXT: sbbq %rcx, %rcx
; LIN-NEXT: adcq $1, %rax
; LIN-NEXT: retq
%r = zext i64 %b to i256
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 16b629c06a849..e53eed4587797 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -27,11 +27,11 @@ define void @mul_2xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm1
; X86-SSE-NEXT: pxor %xmm2, %xmm2
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-SSE-NEXT: pmullw %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT: movq %xmm0, (%esi,%eax,4)
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm1, (%esi,%eax,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
@@ -61,11 +61,11 @@ define void @mul_2xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64
; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm1
; X64-SSE-NEXT: pxor %xmm2, %xmm2
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-SSE-NEXT: pmullw %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8:
@@ -1364,8 +1364,8 @@ define void @mul_2xi8_varconst1(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movd %ecx, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,255,u,u,u,u,u,u]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,255,0,u,u,u,u]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
@@ -1388,8 +1388,8 @@ define void @mul_2xi8_varconst1(ptr nocapture readonly %a, i64 %index) {
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pxor %xmm1, %xmm1
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,255,u,u,u,u,u,u]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,255,0,u,u,u,u]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
@@ -1490,10 +1490,10 @@ define void @mul_2xi8_varconst3(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT: movd %ecx, %xmm0
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,256,u,u,u,u,u,u]
; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,256,0,u,u,u,u]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
@@ -1514,10 +1514,10 @@ define void @mul_2xi8_varconst3(ptr nocapture readonly %a, i64 %index) {
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,256,u,u,u,u,u,u]
; X64-SSE-NEXT: pxor %xmm1, %xmm1
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,256,0,u,u,u,u]
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT: retq
;
>From c7fbcd1e5b6981b7ef611eb7d16ff55c7f40bdbe Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Thu, 13 Mar 2025 12:44:49 +0800
Subject: [PATCH 11/14] fix comments
---
llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
index fbc27be4e71d6..3f49b1e550595 100644
--- a/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
+++ b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
-define i64 @narrow_add(i64 noundef %a, i64 noundef %b) {
+define i64 @narrow_add(i64 %a, i64 %b) {
; CHECK-LABEL: narrow_add:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -17,7 +17,7 @@ define i64 @narrow_add(i64 noundef %a, i64 noundef %b) {
ret i64 %add
}
-define i64 @narrow_add_1(i64 noundef %a, i64 noundef %b) {
+define i64 @narrow_add_1(i64 %a, i64 %b) {
; CHECK-LABEL: narrow_add_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -73,7 +73,7 @@ define <2 x i32> @narrow_add_vec_1(<2 x i32> %a, <2 x i32> %b) #0 {
ret <2 x i32> %add
}
-define i64 @narrow_mul(i64 noundef %a, i64 noundef %b) {
+define i64 @narrow_mul(i64 %a, i64 %b) {
; CHECK-LABEL: narrow_mul:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -88,7 +88,7 @@ define i64 @narrow_mul(i64 noundef %a, i64 noundef %b) {
ret i64 %mul
}
-define i64 @narrow_mul_1(i64 noundef %a, i64 noundef %b) {
+define i64 @narrow_mul_1(i64 %a, i64 %b) {
; CHECK-LABEL: narrow_mul_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -142,7 +142,7 @@ define <2 x i32> @narrow_add_mul_1(<2 x i32> %a, <2 x i32> %b) #0 {
ret <2 x i32> %mul
}
-define i64 @no_narrow_add(i64 noundef %a, i64 noundef %b) {
+define i64 @no_narrow_add(i64 %a, i64 %b) {
; CHECK-LABEL: no_narrow_add:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -158,7 +158,7 @@ define i64 @no_narrow_add(i64 noundef %a, i64 noundef %b) {
ret i64 %add
}
-define i64 @no_narrow_add_1(i64 noundef %a, i64 noundef %b) {
+define i64 @no_narrow_add_1(i64 %a, i64 %b) {
; CHECK-LABEL: no_narrow_add_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -194,7 +194,7 @@ define <2 x i64> @no_narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
ret <2 x i64> %add
}
-define i64 @no_narrow_mul(i64 noundef %a, i64 noundef %b) {
+define i64 @no_narrow_mul(i64 %a, i64 %b) {
; CHECK-LABEL: no_narrow_mul:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
>From f946445f34ada3c23afaaa0adbd95af84be1f214 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Thu, 20 Mar 2025 09:57:34 +0800
Subject: [PATCH 12/14] fix comments
---
.../lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 18 +++++++++---------
llvm/test/lit.cfg.py | 2 +-
2 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 0294baa3e1fb6..c7a9dfd627b74 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1562,16 +1562,16 @@ void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
Type *findSmallestLegalBits(Instruction *I, int OrigBit, int MaxBitsNeeded,
const TargetLowering *TLI, const DataLayout &DL) {
- if (MaxBitsNeeded >= OrigBit) {
+ if (MaxBitsNeeded >= OrigBit)
return nullptr;
- }
+
Type *NewType = I->getType()->getWithNewBitWidth(MaxBitsNeeded);
while (OrigBit > MaxBitsNeeded) {
if (TLI->isOperationLegalOrCustom(
TLI->InstructionOpcodeToISD(I->getOpcode()),
- TLI->getValueType(DL, NewType, true))) {
+ TLI->getValueType(DL, NewType, true)))
return NewType;
- }
+
MaxBitsNeeded *= 2;
NewType = I->getType()->getWithNewBitWidth(MaxBitsNeeded);
}
@@ -1601,7 +1601,8 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, const TargetLowering *TLI,
.countMaxActiveBits();
break;
default:
- break;
+ llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
+ "Instruction::Mul.");
}
MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
@@ -1617,10 +1618,9 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, const TargetLowering *TLI,
InstructionCost NewCost =
TTI.getArithmeticInstrCost(Opc, NewType, TTI::TCK_RecipThroughput);
// New cost of narrowing 2 operands (use trunc)
- NewCost += TTI.getCastInstrCost(Instruction::Trunc, NewType, OldType,
- TTI.getCastContextHint(I),
- TTI::TCK_RecipThroughput) *
- 2;
+ NewCost += 2 * TTI.getCastInstrCost(Instruction::Trunc, NewType, OldType,
+ TTI.getCastContextHint(I),
+ TTI::TCK_RecipThroughput);
// New cost of zext narrowed result to original type
NewCost +=
TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index aad7a088551b2..50921879cd1f2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("ascii")
+ readobj_out = readobj_cmd.stdout.read().decode("utf-8")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From ab4b6ce8ca252046c4b9013f927fb57b0f086487 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Thu, 20 Mar 2025 09:58:01 +0800
Subject: [PATCH 13/14] fix lit
---
llvm/test/lit.cfg.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 50921879cd1f2..aad7a088551b2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -466,7 +466,7 @@ def have_cxx_shared_library():
print("could not exec llvm-readobj")
return False
- readobj_out = readobj_cmd.stdout.read().decode("utf-8")
+ readobj_out = readobj_cmd.stdout.read().decode("ascii")
readobj_cmd.wait()
regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")
>From 4159ffba448a4fca5a625585e41045f92b130d83 Mon Sep 17 00:00:00 2001
From: shore <372660931 at qq.com>
Date: Thu, 20 Mar 2025 09:59:18 +0800
Subject: [PATCH 14/14] fix format
---
llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index c7a9dfd627b74..f4c3ac71b48d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1589,6 +1589,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, const TargetLowering *TLI,
unsigned OrigBit = OldType->getScalarSizeInBits();
unsigned MaxBitsNeeded = OrigBit;
+
switch (Opc) {
case Instruction::Add:
MaxBitsNeeded = KnownBits::add(computeKnownBits(I->getOperand(0), DL),
More information about the llvm-commits
mailing list