[clang] [llvm] [VectorCombine] Shrink loads used in shufflevector rebroadcasts (PR #128938)
Leon Clark via cfe-commits
cfe-commits at lists.llvm.org
Fri May 30 14:05:32 PDT 2025
https://github.com/PeddleSpam updated https://github.com/llvm/llvm-project/pull/128938
>From 3a75016c68cfb3d0d7ac18add582909045fc188f Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 26 Feb 2025 15:59:02 +0000
Subject: [PATCH 01/17] [AggressiveInstCombine] Shrink loads used in
shufflevector rebroadcasts.
Attempt to shrink the size of vector loads where only some of the incoming lanes are used for rebroadcasts in shufflevector instructions.
---
.../load-shufflevector.ll | 345 ++++++++++++++++++
1 file changed, 345 insertions(+)
create mode 100644 llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
diff --git a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
new file mode 100644
index 0000000000000..3f6c8334e61cf
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
@@ -0,0 +1,345 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=aggressive-instcombine -S < %s | FileCheck %s
+
+define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: ret <8 x half> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x half> %val1
+}
+
+define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: ret <8 x half> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x half> %val1
+}
+
+define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: ret <4 x half> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+ ret <4 x half> %val1
+}
+
+define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: ret <8 x half> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x half> %val1
+}
+
+define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <8 x half> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ br label %finally
+
+finally:
+ %val3 = phi <8 x half> [ %val1, %then ], [ %val2, %else ]
+ ret <8 x half> %val3
+}
+
+define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <4 x half> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ br label %finally
+
+finally:
+ %val3 = phi <4 x half> [ %val1, %then ], [ %val2, %else ]
+ ret <4 x half> %val3
+}
+
+define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <8 x half> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x half>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ br label %finally
+
+finally:
+ %val3 = phi <8 x half> [ %val1, %then ], [ %val2, %else ]
+ ret <8 x half> %val3
+}
+
+define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: ret <8 x i32> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %val1
+}
+
+define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: ret <8 x i32> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i32> %val1
+}
+
+define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: ret <4 x i32> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+ ret <4 x i32> %val1
+}
+
+define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: ret <8 x i32> [[TMP1]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i32> %val1
+}
+
+define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <8 x i32> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ br label %finally
+
+finally:
+ %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ]
+ ret <8 x i32> %val3
+}
+
+define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <8 x i32> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ br label %finally
+
+finally:
+ %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ]
+ ret <8 x i32> %val3
+}
+
+define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <4 x i32> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ br label %finally
+
+finally:
+ %val3 = phi <4 x i32> [ %val1, %then ], [ %val2, %else ]
+ ret <4 x i32> %val3
+}
+
+define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly %arg0, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: br label %[[FINALLY:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: br label %[[FINALLY]]
+; CHECK: [[FINALLY]]:
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: ret <8 x i32> [[VAL3]]
+;
+entry:
+ %val0 = load <4 x i32>, ptr addrspace(1) %arg0, align 32
+ br i1 %cond, label %then, label %else
+
+then:
+ %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ br label %finally
+
+else:
+ %val2 = shufflevector <4 x i32> %val0, <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ br label %finally
+
+finally:
+ %val3 = phi <8 x i32> [ %val1, %then ], [ %val2, %else ]
+ ret <8 x i32> %val3
+}
>From f566b7a2649ab88654c50bd6c9031ed0e5d101a7 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 26 Feb 2025 21:18:30 +0000
Subject: [PATCH 02/17] Add implementation and update tests.
---
clang/test/CodeGenOpenCL/preserve_vec3.cl | 20 ++---
.../AggressiveInstCombine.cpp | 90 +++++++++++++++++++
.../load-shufflevector.ll | 88 +++++++++---------
3 files changed, 144 insertions(+), 54 deletions(-)
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index 49ebae6fc7013..0538eac4029bb 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
// CHECK-LABEL: define dso_local spir_kernel void @foo(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: ret void
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -47,8 +47,8 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index af994022a8ec1..a20f69a65c60c 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -916,6 +916,95 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
return true;
}
+// If `I` is a load instruction, used only by shufflevector instructions with
+// poison values, attempt to shrink the load to only the lanes being used.
+static bool shrinkLoadsForBroadcast(Instruction &I) {
+ auto *OldLoad = dyn_cast<LoadInst>(&I);
+ if (!OldLoad)
+ return false;
+
+ auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+ if (!VecTy)
+ return false;
+
+ auto IsPoisonOrUndef = [](Value *V) -> bool {
+ if (auto *C = dyn_cast<Constant>(V)) {
+ return isa<PoisonValue>(C) || isa<UndefValue>(C);
+ }
+ return false;
+ };
+
+ using IndexRange = std::pair<unsigned, unsigned>;
+ auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
+ auto OutputRange = IndexRange(VecTy->getNumElements(), 0u);
+ for (auto &Use: I.uses()) {
+ // All uses must be ShuffleVector instructions.
+ auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
+ if (!Shuffle)
+ return {};
+
+ // Get index range for value.
+ auto *Op0 = Shuffle->getOperand(0u);
+ auto *Op1 = Shuffle->getOperand(1u);
+ if (!IsPoisonOrUndef(Op1))
+ return {};
+
+ // Find the min and max indices used by the ShuffleVector instruction.
+ auto Mask = Shuffle->getShuffleMask();
+ auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
+ auto NumElems = Op0Ty->getNumElements();
+
+ for (unsigned Index: Mask) {
+ if (Index < NumElems) {
+ OutputRange.first = std::min(Index, OutputRange.first);
+ OutputRange.second = std::max(Index, OutputRange.second);
+ }
+ }
+ }
+ return OutputRange;
+ };
+
+ if (auto Indices = GetIndexRangeInShuffles()) {
+ auto OldSize = VecTy->getNumElements();
+ auto NewSize = Indices->second + 1u;
+
+ if (NewSize < OldSize) {
+ auto Builder = IRBuilder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ // Create new load of smaller vector.
+ auto *ElemTy = VecTy->getElementType();
+ auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
+ auto *NewLoad = cast<LoadInst>(
+ Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
+ NewLoad->copyMetadata(I);
+
+ // Replace all users.
+ auto OldShuffles = SmallVector<ShuffleVectorInst*, 4u>{};
+ for (auto &Use: I.uses()) {
+ auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
+
+ Builder.SetInsertPoint(Shuffle);
+ Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
+ auto *NewShuffle = Builder.CreateShuffleVector(
+ NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask()
+ );
+
+ Shuffle->replaceAllUsesWith(NewShuffle);
+ OldShuffles.push_back(Shuffle);
+ }
+
+ // Erase old users.
+ for (auto *Shuffle: OldShuffles)
+ Shuffle->eraseFromParent();
+
+ I.eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+}
+
namespace {
class StrNCmpInliner {
public:
@@ -1253,6 +1342,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
MadeChange |= tryToRecognizeTableBasedCttz(I);
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
MadeChange |= foldPatternedLoads(I, DL);
+ MadeChange |= shrinkLoadsForBroadcast(I);
// NOTE: This function introduces erasing of the instruction `I`, so it
// needs to be called at the end of this sequence, otherwise we may make
// bugs.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
index 3f6c8334e61cf..57006f2c65380 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
@@ -5,8 +5,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -19,8 +19,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -33,8 +33,8 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x half> [[TMP1]]
;
entry:
@@ -47,8 +47,8 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -61,16 +61,16 @@ define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 4
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x half> [[VAL3]]
;
entry:
@@ -94,16 +94,16 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <4 x half> [[VAL3]]
;
entry:
@@ -127,16 +127,16 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x half> [[VAL3]]
;
entry:
@@ -160,8 +160,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -174,8 +174,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -188,8 +188,8 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x i32> [[TMP1]]
;
entry:
@@ -202,8 +202,8 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -216,16 +216,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 8
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
@@ -249,16 +249,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
@@ -282,16 +282,16 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <4 x i32> [[VAL3]]
;
entry:
@@ -315,16 +315,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
>From f5c8bf557308ee766b8437ebdb2f4231d37d0c4f Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 28 Feb 2025 17:25:35 +0000
Subject: [PATCH 03/17] Fix broken tests.
---
.../builtins-systemz-zvector-constrained.c | 4 +-
.../SystemZ/builtins-systemz-zvector.c | 52 +++++++++----------
.../builtins-systemz-zvector2-constrained.c | 12 ++---
.../SystemZ/builtins-systemz-zvector2.c | 12 ++---
.../AggressiveInstCombine.cpp | 19 ++++---
5 files changed, 49 insertions(+), 50 deletions(-)
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c
index 4993df20df143..e335c363ecb48 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c
@@ -79,8 +79,8 @@ void test_core(void) {
vec_xstd2(vd, idx, ptrd);
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vrepg
+ // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vlrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
index d5d15b4dea966..422c97a77511c 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
@@ -777,80 +777,80 @@ void test_core(void) {
// CHECK: <2 x i64> splat (i64 -4503582447501313)
vsc = vec_splat(vsc, 0);
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer
- // CHECK-ASM: vrepb
+ // CHECK: shufflevector <1 x i8> %{{.*}}, <1 x i8> poison, <16 x i32> zeroinitializer
+ // CHECK-ASM: vlrepb
vsc = vec_splat(vsc, 15);
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
// CHECK-ASM: vrepb
vuc = vec_splat(vuc, 0);
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer
- // CHECK-ASM: vrepb
+ // CHECK: store volatile <16 x i8> splat (i8 {{.*}}), ptr @vuc
+ // CHECK-ASM: vst
vuc = vec_splat(vuc, 15);
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
// CHECK-ASM: vrepb
vbc = vec_splat(vbc, 0);
- // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer
- // CHECK-ASM: vrepb
+ // CHECK: shufflevector <1 x i8> %{{.*}}, <1 x i8> poison, <16 x i32> zeroinitializer
+ // CHECK-ASM: vlrepb
vbc = vec_splat(vbc, 15);
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
// CHECK-ASM: vrepb
vss = vec_splat(vss, 0);
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer
- // CHECK-ASM: vreph
+ // CHECK: shufflevector <1 x i16> %{{.*}}, <1 x i16> poison, <8 x i32> zeroinitializer
+ // CHECK-ASM: vlreph
vss = vec_splat(vss, 7);
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-ASM: vreph
vus = vec_splat(vus, 0);
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer
- // CHECK-ASM: vreph
+ // CHECK: store volatile <8 x i16> splat (i16 {{.*}}), ptr @vus
+ // CHECK-ASM: vst
vus = vec_splat(vus, 7);
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-ASM: vreph
vbs = vec_splat(vbs, 0);
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer
- // CHECK-ASM: vreph
+ // CHECK: shufflevector <1 x i16> %{{.*}}, <1 x i16> poison, <8 x i32> zeroinitializer
+ // CHECK-ASM: vlreph
vbs = vec_splat(vbs, 7);
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-ASM: vreph
vsi = vec_splat(vsi, 0);
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vrepf
+ // CHECK: shufflevector <1 x i32> %{{.*}}, <1 x i32> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vlrepf
vsi = vec_splat(vsi, 3);
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ASM: vrepf
vui = vec_splat(vui, 0);
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vrepf
+ // CHECK: store volatile <4 x i32> splat (i32 {{.*}}), ptr @vui
+ // CHECK-ASM: vst
vui = vec_splat(vui, 3);
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ASM: vrepf
vbi = vec_splat(vbi, 0);
- // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vrepf
+ // CHECK: shufflevector <1 x i32> %{{.*}}, <1 x i32> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vlrepf
vbi = vec_splat(vbi, 3);
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ASM: vrepf
vsl = vec_splat(vsl, 0);
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer
+ // CHECK: shufflevector <1 x i64> %{{.*}}, <1 x i64> poison, <2 x i32> zeroinitializer
// CHECK-ASM: vrepg
vsl = vec_splat(vsl, 1);
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
- // CHECK-ASM: vrepg
+ // CHECK-ASM: vst
vul = vec_splat(vul, 0);
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vrepg
+ // CHECK: store volatile <2 x i64> splat (i64 {{.*}}), ptr @vul
+ // CHECK-ASM: vst
vul = vec_splat(vul, 1);
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
vbl = vec_splat(vbl, 0);
- // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vrepg
+ // CHECK: shufflevector <1 x i64> %{{.*}}, <1 x i64> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vlrepg
vbl = vec_splat(vbl, 1);
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vrepg
+ // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vlrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c
index 25b3e0b68cd02..2b79df2a1886e 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c
@@ -130,14 +130,14 @@ void test_core(void) {
// CHECK-ASM: vst
vf = vec_splat(vf, 0);
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vrepf
+ // CHECK: shufflevector <1 x float> %{{.*}}, <1 x float> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vlrepf
vf = vec_splat(vf, 1);
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- // CHECK-ASM: vrepf
+ // CHECK: shufflevector <2 x float> %{{.*}}, <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ // CHECK-ASM: vst
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vrepg
+ // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vlrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
index c1ef178fcfaa9..1ccbe6df5f16d 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
@@ -254,14 +254,14 @@ void test_core(void) {
// CHECK-ASM: vstrlr
vf = vec_splat(vf, 0);
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vrepf
+ // CHECK: shufflevector <1 x float> %{{.*}}, <1 x float> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vlrepf
vf = vec_splat(vf, 1);
- // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- // CHECK-ASM: vrepf
+ // CHECK: shufflevector <2 x float> %{{.*}}, <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ // CHECK-ASM: vst
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vrepg
+ // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vlrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index a20f69a65c60c..d5e025597e281 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -916,7 +916,7 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
return true;
}
-// If `I` is a load instruction, used only by shufflevector instructions with
+// If `I` is a load instruction, used only by shufflevector instructions with
// poison values, attempt to shrink the load to only the lanes being used.
static bool shrinkLoadsForBroadcast(Instruction &I) {
auto *OldLoad = dyn_cast<LoadInst>(&I);
@@ -937,7 +937,7 @@ static bool shrinkLoadsForBroadcast(Instruction &I) {
using IndexRange = std::pair<unsigned, unsigned>;
auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
auto OutputRange = IndexRange(VecTy->getNumElements(), 0u);
- for (auto &Use: I.uses()) {
+ for (auto &Use : I.uses()) {
// All uses must be ShuffleVector instructions.
auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
if (!Shuffle)
@@ -954,7 +954,7 @@ static bool shrinkLoadsForBroadcast(Instruction &I) {
auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
auto NumElems = Op0Ty->getNumElements();
- for (unsigned Index: Mask) {
+ for (unsigned Index : Mask) {
if (Index < NumElems) {
OutputRange.first = std::min(Index, OutputRange.first);
OutputRange.second = std::max(Index, OutputRange.second);
@@ -976,26 +976,25 @@ static bool shrinkLoadsForBroadcast(Instruction &I) {
auto *ElemTy = VecTy->getElementType();
auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
auto *NewLoad = cast<LoadInst>(
- Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
+ Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
NewLoad->copyMetadata(I);
// Replace all users.
- auto OldShuffles = SmallVector<ShuffleVectorInst*, 4u>{};
- for (auto &Use: I.uses()) {
+ auto OldShuffles = SmallVector<ShuffleVectorInst *, 4u>{};
+ for (auto &Use : I.uses()) {
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
-
+
Builder.SetInsertPoint(Shuffle);
Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
auto *NewShuffle = Builder.CreateShuffleVector(
- NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask()
- );
+ NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask());
Shuffle->replaceAllUsesWith(NewShuffle);
OldShuffles.push_back(Shuffle);
}
// Erase old users.
- for (auto *Shuffle: OldShuffles)
+ for (auto *Shuffle : OldShuffles)
Shuffle->eraseFromParent();
I.eraseFromParent();
>From 157074c9fb0879d658e6b69fb541cbb633c63d18 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Tue, 4 Mar 2025 17:44:56 +0000
Subject: [PATCH 04/17] Ignore non-simple loads.
---
.../builtins-systemz-zvector-constrained.c | 4 +-
.../SystemZ/builtins-systemz-zvector.c | 52 +++++++++----------
.../builtins-systemz-zvector2-constrained.c | 12 ++---
.../SystemZ/builtins-systemz-zvector2.c | 12 ++---
.../AggressiveInstCombine.cpp | 2 +-
.../load-shufflevector.ll | 14 +++++
6 files changed, 55 insertions(+), 41 deletions(-)
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c
index e335c363ecb48..4993df20df143 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector-constrained.c
@@ -79,8 +79,8 @@ void test_core(void) {
vec_xstd2(vd, idx, ptrd);
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vlrepg
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
index 422c97a77511c..d5d15b4dea966 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
@@ -777,80 +777,80 @@ void test_core(void) {
// CHECK: <2 x i64> splat (i64 -4503582447501313)
vsc = vec_splat(vsc, 0);
- // CHECK: shufflevector <1 x i8> %{{.*}}, <1 x i8> poison, <16 x i32> zeroinitializer
- // CHECK-ASM: vlrepb
+ // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer
+ // CHECK-ASM: vrepb
vsc = vec_splat(vsc, 15);
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
// CHECK-ASM: vrepb
vuc = vec_splat(vuc, 0);
- // CHECK: store volatile <16 x i8> splat (i8 {{.*}}), ptr @vuc
- // CHECK-ASM: vst
+ // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer
+ // CHECK-ASM: vrepb
vuc = vec_splat(vuc, 15);
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
// CHECK-ASM: vrepb
vbc = vec_splat(vbc, 0);
- // CHECK: shufflevector <1 x i8> %{{.*}}, <1 x i8> poison, <16 x i32> zeroinitializer
- // CHECK-ASM: vlrepb
+ // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> zeroinitializer
+ // CHECK-ASM: vrepb
vbc = vec_splat(vbc, 15);
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> poison, <16 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
// CHECK-ASM: vrepb
vss = vec_splat(vss, 0);
- // CHECK: shufflevector <1 x i16> %{{.*}}, <1 x i16> poison, <8 x i32> zeroinitializer
- // CHECK-ASM: vlreph
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer
+ // CHECK-ASM: vreph
vss = vec_splat(vss, 7);
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-ASM: vreph
vus = vec_splat(vus, 0);
- // CHECK: store volatile <8 x i16> splat (i16 {{.*}}), ptr @vus
- // CHECK-ASM: vst
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer
+ // CHECK-ASM: vreph
vus = vec_splat(vus, 7);
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-ASM: vreph
vbs = vec_splat(vbs, 0);
- // CHECK: shufflevector <1 x i16> %{{.*}}, <1 x i16> poison, <8 x i32> zeroinitializer
- // CHECK-ASM: vlreph
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> zeroinitializer
+ // CHECK-ASM: vreph
vbs = vec_splat(vbs, 7);
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-ASM: vreph
vsi = vec_splat(vsi, 0);
- // CHECK: shufflevector <1 x i32> %{{.*}}, <1 x i32> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vlrepf
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vrepf
vsi = vec_splat(vsi, 3);
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ASM: vrepf
vui = vec_splat(vui, 0);
- // CHECK: store volatile <4 x i32> splat (i32 {{.*}}), ptr @vui
- // CHECK-ASM: vst
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vrepf
vui = vec_splat(vui, 3);
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ASM: vrepf
vbi = vec_splat(vbi, 0);
- // CHECK: shufflevector <1 x i32> %{{.*}}, <1 x i32> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vlrepf
+ // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vrepf
vbi = vec_splat(vbi, 3);
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ASM: vrepf
vsl = vec_splat(vsl, 0);
- // CHECK: shufflevector <1 x i64> %{{.*}}, <1 x i64> poison, <2 x i32> zeroinitializer
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer
// CHECK-ASM: vrepg
vsl = vec_splat(vsl, 1);
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
- // CHECK-ASM: vst
+ // CHECK-ASM: vrepg
vul = vec_splat(vul, 0);
- // CHECK: store volatile <2 x i64> splat (i64 {{.*}}), ptr @vul
- // CHECK-ASM: vst
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vrepg
vul = vec_splat(vul, 1);
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
vbl = vec_splat(vbl, 0);
- // CHECK: shufflevector <1 x i64> %{{.*}}, <1 x i64> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vlrepg
+ // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vrepg
vbl = vec_splat(vbl, 1);
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vlrepg
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c
index 2b79df2a1886e..25b3e0b68cd02 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-constrained.c
@@ -130,14 +130,14 @@ void test_core(void) {
// CHECK-ASM: vst
vf = vec_splat(vf, 0);
- // CHECK: shufflevector <1 x float> %{{.*}}, <1 x float> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vlrepf
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vrepf
vf = vec_splat(vf, 1);
- // CHECK: shufflevector <2 x float> %{{.*}}, <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- // CHECK-ASM: vst
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ // CHECK-ASM: vrepf
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vlrepg
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
index 1ccbe6df5f16d..c1ef178fcfaa9 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
@@ -254,14 +254,14 @@ void test_core(void) {
// CHECK-ASM: vstrlr
vf = vec_splat(vf, 0);
- // CHECK: shufflevector <1 x float> %{{.*}}, <1 x float> poison, <4 x i32> zeroinitializer
- // CHECK-ASM: vlrepf
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> zeroinitializer
+ // CHECK-ASM: vrepf
vf = vec_splat(vf, 1);
- // CHECK: shufflevector <2 x float> %{{.*}}, <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- // CHECK-ASM: vst
+ // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ // CHECK-ASM: vrepf
vd = vec_splat(vd, 0);
- // CHECK: shufflevector <1 x double> %{{.*}}, <1 x double> poison, <2 x i32> zeroinitializer
- // CHECK-ASM: vlrepg
+ // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> zeroinitializer
+ // CHECK-ASM: vrepg
vd = vec_splat(vd, 1);
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 1>
// CHECK-ASM: vrepg
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index d5e025597e281..ccee7591adb59 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -920,7 +920,7 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
// poison values, attempt to shrink the load to only the lanes being used.
static bool shrinkLoadsForBroadcast(Instruction &I) {
auto *OldLoad = dyn_cast<LoadInst>(&I);
- if (!OldLoad)
+ if (!OldLoad || !OldLoad->isSimple())
return false;
auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
diff --git a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
index 57006f2c65380..9978c15e90beb 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
@@ -1,6 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=aggressive-instcombine -S < %s | FileCheck %s
+define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
+; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(
+; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: ret <8 x half> [[TMP1]]
+;
+entry:
+ %val0 = load volatile <4 x half>, ptr addrspace(1) %arg0, align 32
+ %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x half> %val1
+}
+
define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
>From d6c00c06affb29eb3394491c4faa860b6e0ccbd5 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 12 Mar 2025 00:21:08 +0000
Subject: [PATCH 05/17] Move transform to VectorCombine and update tests.
---
.../AggressiveInstCombine.cpp | 89 -----------------
.../Transforms/Vectorize/VectorCombine.cpp | 99 +++++++++++++++++++
.../PhaseOrdering/X86/vec-load-combine.ll | 8 +-
.../VectorCombine/X86/load-widening.ll | 6 +-
.../VectorCombine/X86/shuffle-of-shuffles.ll | 24 ++---
.../load-shufflevector.ll | 2 +-
6 files changed, 116 insertions(+), 112 deletions(-)
rename llvm/test/Transforms/{AggressiveInstCombine => VectorCombine}/load-shufflevector.ll (99%)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index ccee7591adb59..af994022a8ec1 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -916,94 +916,6 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
return true;
}
-// If `I` is a load instruction, used only by shufflevector instructions with
-// poison values, attempt to shrink the load to only the lanes being used.
-static bool shrinkLoadsForBroadcast(Instruction &I) {
- auto *OldLoad = dyn_cast<LoadInst>(&I);
- if (!OldLoad || !OldLoad->isSimple())
- return false;
-
- auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
- if (!VecTy)
- return false;
-
- auto IsPoisonOrUndef = [](Value *V) -> bool {
- if (auto *C = dyn_cast<Constant>(V)) {
- return isa<PoisonValue>(C) || isa<UndefValue>(C);
- }
- return false;
- };
-
- using IndexRange = std::pair<unsigned, unsigned>;
- auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
- auto OutputRange = IndexRange(VecTy->getNumElements(), 0u);
- for (auto &Use : I.uses()) {
- // All uses must be ShuffleVector instructions.
- auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
- if (!Shuffle)
- return {};
-
- // Get index range for value.
- auto *Op0 = Shuffle->getOperand(0u);
- auto *Op1 = Shuffle->getOperand(1u);
- if (!IsPoisonOrUndef(Op1))
- return {};
-
- // Find the min and max indices used by the ShuffleVector instruction.
- auto Mask = Shuffle->getShuffleMask();
- auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
- auto NumElems = Op0Ty->getNumElements();
-
- for (unsigned Index : Mask) {
- if (Index < NumElems) {
- OutputRange.first = std::min(Index, OutputRange.first);
- OutputRange.second = std::max(Index, OutputRange.second);
- }
- }
- }
- return OutputRange;
- };
-
- if (auto Indices = GetIndexRangeInShuffles()) {
- auto OldSize = VecTy->getNumElements();
- auto NewSize = Indices->second + 1u;
-
- if (NewSize < OldSize) {
- auto Builder = IRBuilder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- // Create new load of smaller vector.
- auto *ElemTy = VecTy->getElementType();
- auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
- auto *NewLoad = cast<LoadInst>(
- Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
- NewLoad->copyMetadata(I);
-
- // Replace all users.
- auto OldShuffles = SmallVector<ShuffleVectorInst *, 4u>{};
- for (auto &Use : I.uses()) {
- auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
-
- Builder.SetInsertPoint(Shuffle);
- Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
- auto *NewShuffle = Builder.CreateShuffleVector(
- NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask());
-
- Shuffle->replaceAllUsesWith(NewShuffle);
- OldShuffles.push_back(Shuffle);
- }
-
- // Erase old users.
- for (auto *Shuffle : OldShuffles)
- Shuffle->eraseFromParent();
-
- I.eraseFromParent();
- return true;
- }
- }
- return false;
-}
-
namespace {
class StrNCmpInliner {
public:
@@ -1341,7 +1253,6 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
MadeChange |= tryToRecognizeTableBasedCttz(I);
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
MadeChange |= foldPatternedLoads(I, DL);
- MadeChange |= shrinkLoadsForBroadcast(I);
// NOTE: This function introduces erasing of the instruction `I`, so it
// needs to be called at the end of this sequence, otherwise we may make
// bugs.
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index e58789b5d5641..3b9c96fff3043 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -131,6 +131,7 @@ class VectorCombine {
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
bool shrinkType(Instruction &I);
+ bool shrinkLoadForShuffles(Instruction &I);
void replaceValue(Value &Old, Value &New) {
LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
@@ -3483,6 +3484,101 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
return true;
}
+// If `I` is a load instruction, used only by shufflevector instructions with
+// poison values, attempt to shrink the load to only the lanes being used.
+bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
+ auto *OldLoad = dyn_cast<LoadInst>(&I);
+ if (!OldLoad || !OldLoad->isSimple())
+ return false;
+
+ auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+ if (!VecTy)
+ return false;
+
+ auto IsPoisonOrUndef = [](Value *V) -> bool {
+ if (auto *C = dyn_cast<Constant>(V)) {
+ return isa<PoisonValue>(C) || isa<UndefValue>(C);
+ }
+ return false;
+ };
+
+ using IndexRange = std::pair<int, int>;
+ auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
+ auto OutputRange = IndexRange(VecTy->getNumElements(), -1);
+ for (auto &Use : I.uses()) {
+ // All uses must be ShuffleVector instructions.
+ auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
+ if (!Shuffle)
+ return {};
+
+ // Get index range for value.
+ auto *Op0 = Shuffle->getOperand(0u);
+ auto *Op1 = Shuffle->getOperand(1u);
+ if (!IsPoisonOrUndef(Op1))
+ return {};
+
+ // Find the min and max indices used by the ShuffleVector instruction.
+ auto Mask = Shuffle->getShuffleMask();
+ auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
+ auto NumElems = int(Op0Ty->getNumElements());
+
+ for (auto Index : Mask) {
+ if (Index >= 0 && Index < NumElems) {
+ OutputRange.first = std::min(Index, OutputRange.first);
+ OutputRange.second = std::max(Index, OutputRange.second);
+ }
+ }
+
+ if (OutputRange.second < OutputRange.first)
+ return {};
+ }
+ return OutputRange;
+ };
+
+ if (auto Indices = GetIndexRangeInShuffles()) {
+ auto OldSize = VecTy->getNumElements();
+ auto NewSize = Indices->second + 1u;
+
+ if (NewSize < OldSize) {
+ auto Builder = IRBuilder(&I);
+ Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+ // Create new load of smaller vector.
+ auto *ElemTy = VecTy->getElementType();
+ auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
+ auto *NewLoad = cast<LoadInst>(
+ Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
+ NewLoad->copyMetadata(I);
+
+ // Compare cost of old and new loads.
+ auto OldCost = TTI.getMemoryOpCost(
+ Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
+ OldLoad->getPointerAddressSpace(), CostKind);
+ auto NewCost = TTI.getMemoryOpCost(
+ Instruction::Load, NewLoad->getType(), NewLoad->getAlign(),
+ NewLoad->getPointerAddressSpace(), CostKind);
+
+ if (OldCost < NewCost || !NewCost.isValid())
+ return false;
+
+ // Replace all users.
+ for (auto &Use : I.uses()) {
+ auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
+
+ Builder.SetInsertPoint(Shuffle);
+ Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
+ auto *NewShuffle = Builder.CreateShuffleVector(
+ NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask());
+
+ replaceValue(*Shuffle, *NewShuffle);
+ }
+
+ return true;
+ }
+ }
+ return false;
+}
+
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
@@ -3561,6 +3657,9 @@ bool VectorCombine::run() {
case Instruction::BitCast:
MadeChange |= foldBitcastShuffle(I);
break;
+ case Instruction::Load:
+ MadeChange |= shrinkLoadForShuffles(I);
+ break;
default:
MadeChange |= shrinkType(I);
break;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 85f6fceb5bdbe..9218cc2d019f8 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT: ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT: ret <4 x float> [[TMP3]]
;
%2 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index 30a089818074e..07bd3966e8202 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -336,7 +336,7 @@ define <8 x float> @load_v2f32_v8f32(ptr dereferenceable(32) %p) {
define <4 x i32> @load_v2i32_v4i32(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2i32_v4i32(
-; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index b30dc9ffdc596..9a051361e2081 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,21 +47,12 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
-; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
-; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
-; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
-; SSE-NEXT: ret <4 x double> [[BLEND]]
-;
-; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
-; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
-; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
-; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
-; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; AVX-NEXT: ret <4 x double> [[BLEND]]
+; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 8
+; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT: ret <4 x double> [[BLEND]]
;
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
@@ -81,3 +72,6 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
%s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
ret <2 x float> %s2
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX: {{.*}}
+; SSE: {{.*}}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
similarity index 99%
rename from llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
rename to llvm/test/Transforms/VectorCombine/load-shufflevector.ll
index 9978c15e90beb..44724a5f1f127 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=aggressive-instcombine -S < %s | FileCheck %s
+; RUN: opt -passes=vector-combine -S < %s | FileCheck %s
define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(
>From 4d6873f736e32349472a264bfa2253423accba4e Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 14 Mar 2025 17:40:27 +0000
Subject: [PATCH 06/17] Address review comments and update tests.
---
.../Transforms/Vectorize/VectorCombine.cpp | 26 ++++++++++++----
.../VectorCombine/X86/load-widening.ll | 4 +--
.../VectorCombine/X86/shuffle-of-shuffles.ll | 4 +--
.../VectorCombine/load-shufflevector.ll | 30 +++++++++----------
4 files changed, 39 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 3b9c96fff3043..5ba06ea548422 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3528,10 +3528,11 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
OutputRange.second = std::max(Index, OutputRange.second);
}
}
-
- if (OutputRange.second < OutputRange.first)
- return {};
}
+
+ if (OutputRange.second < OutputRange.first)
+ return {};
+
return OutputRange;
};
@@ -3546,11 +3547,12 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
// Create new load of smaller vector.
auto *ElemTy = VecTy->getElementType();
auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
+ auto *PtrOp = OldLoad->getPointerOperand();
auto *NewLoad = cast<LoadInst>(
- Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
+ Builder.CreateAlignedLoad(NewVecTy, PtrOp, OldLoad->getAlign()));
NewLoad->copyMetadata(I);
- // Compare cost of old and new loads.
+ // Compare cost of old and new ops.
auto OldCost = TTI.getMemoryOpCost(
Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
OldLoad->getPointerAddressSpace(), CostKind);
@@ -3558,8 +3560,20 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
Instruction::Load, NewLoad->getType(), NewLoad->getAlign(),
NewLoad->getPointerAddressSpace(), CostKind);
- if (OldCost < NewCost || !NewCost.isValid())
+ for (auto &Use : I.uses()) {
+ auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
+ auto Mask = Shuffle->getShuffleMask();
+
+ OldCost += TTI.getShuffleCost(
+ TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind);
+ NewCost += TTI.getShuffleCost(
+ TTI::SK_PermuteSingleSrc, NewVecTy, Mask, CostKind);
+ }
+
+ if (OldCost < NewCost || !NewCost.isValid()) {
+ NewLoad->eraseFromParent();
return false;
+ }
// Replace all users.
for (auto &Use : I.uses()) {
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index 07bd3966e8202..eacc40bfa9b53 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -336,7 +336,7 @@ define <8 x float> @load_v2f32_v8f32(ptr dereferenceable(32) %p) {
define <4 x i32> @load_v2i32_v4i32(ptr dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2i32_v4i32(
-; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
@@ -443,7 +443,7 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <4 x i32> [[S]]
;
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index 9a051361e2081..eddfc57a7d256 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -49,8 +49,8 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
; CHECK-NEXT: ret <4 x double> [[BLEND]]
;
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
index 44724a5f1f127..3e302c5f43032 100644
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
@@ -19,7 +19,7 @@ define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
@@ -33,7 +33,7 @@ define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
@@ -47,7 +47,7 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x half> [[TMP1]]
;
@@ -61,7 +61,7 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
@@ -75,7 +75,7 @@ define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer
@@ -108,7 +108,7 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -141,7 +141,7 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -174,7 +174,7 @@ define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
@@ -188,7 +188,7 @@ define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
@@ -202,7 +202,7 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x i32> [[TMP1]]
;
@@ -216,7 +216,7 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
@@ -230,7 +230,7 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer
@@ -263,7 +263,7 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer
@@ -296,7 +296,7 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -329,7 +329,7 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
>From 1861c7d68ff42c231376eb215554530868a2d241 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 14 Mar 2025 17:53:21 +0000
Subject: [PATCH 07/17] Code formatting.
---
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 5ba06ea548422..45811b00d5c7c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3564,10 +3564,10 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
auto Mask = Shuffle->getShuffleMask();
- OldCost += TTI.getShuffleCost(
- TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind);
- NewCost += TTI.getShuffleCost(
- TTI::SK_PermuteSingleSrc, NewVecTy, Mask, CostKind);
+ OldCost +=
+ TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind);
+ NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy, Mask,
+ CostKind);
}
if (OldCost < NewCost || !NewCost.isValid()) {
>From 6534370abb688570aa620235ee68eccf6e23b239 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 21 Mar 2025 05:23:37 +0000
Subject: [PATCH 08/17] Add cost analysis for shufflevector ops and update
tests.
---
clang/test/CodeGenOpenCL/preserve_vec3.cl | 20 ++---
.../Transforms/Vectorize/VectorCombine.cpp | 52 +++++++----
.../PhaseOrdering/X86/vec-load-combine.ll | 8 +-
.../VectorCombine/X86/load-inseltpoison.ll | 4 +-
.../VectorCombine/X86/load-widening.ll | 4 +-
.../VectorCombine/X86/shuffle-of-shuffles.ll | 24 +++--
.../VectorCombine/load-shufflevector.ll | 88 +++++++++----------
7 files changed, 111 insertions(+), 89 deletions(-)
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index 0538eac4029bb..49ebae6fc7013 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
// CHECK-LABEL: define dso_local spir_kernel void @foo(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: ret void
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -47,8 +47,8 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 45811b00d5c7c..a355a7da67326 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -34,6 +35,7 @@
#include <numeric>
#include <queue>
#include <set>
+#include <tuple>
#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -3484,10 +3486,13 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
return true;
}
-// If `I` is a load instruction, used only by shufflevector instructions with
-// poison values, attempt to shrink the load to only the lanes being used.
+// Attempt to shrink loads that are only used by shufflevector instructions.
bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
- auto *OldLoad = dyn_cast<LoadInst>(&I);
+ auto *InputShuffle = dyn_cast<ShuffleVectorInst>(&I);
+ if (!InputShuffle)
+ return {};
+
+ auto *OldLoad = dyn_cast<LoadInst>(InputShuffle->getOperand(0u));
if (!OldLoad || !OldLoad->isSimple())
return false;
@@ -3523,7 +3528,8 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
auto NumElems = int(Op0Ty->getNumElements());
for (auto Index : Mask) {
- if (Index >= 0 && Index < NumElems) {
+ if (Index >= 0) {
+ Index %= NumElems;
OutputRange.first = std::min(Index, OutputRange.first);
OutputRange.second = std::max(Index, OutputRange.second);
}
@@ -3552,7 +3558,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
Builder.CreateAlignedLoad(NewVecTy, PtrOp, OldLoad->getAlign()));
NewLoad->copyMetadata(I);
- // Compare cost of old and new ops.
+ // Calculate costs of old and new ops.
auto OldCost = TTI.getMemoryOpCost(
Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
OldLoad->getPointerAddressSpace(), CostKind);
@@ -3560,14 +3566,25 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
Instruction::Load, NewLoad->getType(), NewLoad->getAlign(),
NewLoad->getPointerAddressSpace(), CostKind);
+ using UseEntry = std::pair<ShuffleVectorInst*, std::vector<int>>;
+ auto NewUses = SmallVector<UseEntry, 4u>();
+ auto SizeDiff = OldSize - NewSize;
+
for (auto &Use : I.uses()) {
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
- auto Mask = Shuffle->getShuffleMask();
-
- OldCost +=
- TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind);
- NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy, Mask,
- CostKind);
+ auto OldMask = Shuffle->getShuffleMask();
+
+ // Create entry for new use.
+ NewUses.push_back({Shuffle, {}});
+ auto &NewMask = NewUses.back().second;
+ for (auto Index : OldMask)
+ NewMask.push_back(Index >= int(OldSize) ? Index - SizeDiff : Index);
+
+ // Update costs.
+ OldCost += TTI.getShuffleCost(
+ TTI::SK_PermuteSingleSrc, VecTy, OldMask, CostKind);
+ NewCost += TTI.getShuffleCost(
+ TTI::SK_PermuteSingleSrc, NewVecTy, NewMask, CostKind);
}
if (OldCost < NewCost || !NewCost.isValid()) {
@@ -3575,14 +3592,15 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
return false;
}
- // Replace all users.
- for (auto &Use : I.uses()) {
- auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
+ // Replace all uses.
+ for (auto &Use : NewUses) {
+ auto *Shuffle = Use.first;
+ auto &NewMask = Use.second;
Builder.SetInsertPoint(Shuffle);
Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
auto *NewShuffle = Builder.CreateShuffleVector(
- NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask());
+ NewLoad, PoisonValue::get(NewVecTy), NewMask);
replaceValue(*Shuffle, *NewShuffle);
}
@@ -3667,13 +3685,11 @@ bool VectorCombine::run() {
MadeChange |= foldShuffleOfIntrinsics(I);
MadeChange |= foldSelectShuffle(I);
MadeChange |= foldShuffleToIdentity(I);
+ MadeChange |= shrinkLoadForShuffles(I);
break;
case Instruction::BitCast:
MadeChange |= foldBitcastShuffle(I);
break;
- case Instruction::Load:
- MadeChange |= shrinkLoadForShuffles(I);
- break;
default:
MadeChange |= shrinkType(I);
break;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 9218cc2d019f8..85f6fceb5bdbe 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT: ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT: ret <4 x float> [[TMP3]]
;
%2 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 73476308916fb..70197a756c8b3 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -577,8 +577,8 @@ define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(ptr align 16 derefer
; CHECK-LABEL: @load_v1i32_extract_insert_v8i32_extra_use(
; CHECK-NEXT: [[L:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
; CHECK-NEXT: store <1 x i32> [[L]], ptr [[STORE_PTR:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <1 x i32> [[L]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <8 x i32> [[R]]
;
%l = load <1 x i32>, ptr %p, align 4
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index eacc40bfa9b53..30a089818074e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index eddfc57a7d256..b30dc9ffdc596 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,12 +47,21 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
-; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
-; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
-; CHECK-NEXT: ret <4 x double> [[BLEND]]
+; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
+; SSE-NEXT: ret <4 x double> [[BLEND]]
+;
+; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AVX-NEXT: ret <4 x double> [[BLEND]]
;
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
@@ -72,6 +81,3 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
%s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
ret <2 x float> %s2
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
index 3e302c5f43032..56c51ce7315cf 100644
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
@@ -19,8 +19,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -33,8 +33,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -47,8 +47,8 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x half> [[TMP1]]
;
entry:
@@ -61,8 +61,8 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -75,16 +75,16 @@ define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x half> [[VAL3]]
;
entry:
@@ -108,16 +108,16 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <4 x half> [[VAL3]]
;
entry:
@@ -141,16 +141,16 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x half> [[VAL3]]
;
entry:
@@ -174,8 +174,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -188,8 +188,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -202,8 +202,8 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x i32> [[TMP1]]
;
entry:
@@ -216,8 +216,8 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -230,16 +230,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
@@ -263,16 +263,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
@@ -296,16 +296,16 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <4 x i32> [[VAL3]]
;
entry:
@@ -329,16 +329,16 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
-; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[TMP1]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ]
+; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
; CHECK-NEXT: ret <8 x i32> [[VAL3]]
;
entry:
>From 5b63d6f806920e6c21472235161a52024e45bee3 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 21 Mar 2025 05:29:14 +0000
Subject: [PATCH 09/17] Code formatting.
---
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index a355a7da67326..bd0770d8081a8 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3566,7 +3566,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
Instruction::Load, NewLoad->getType(), NewLoad->getAlign(),
NewLoad->getPointerAddressSpace(), CostKind);
- using UseEntry = std::pair<ShuffleVectorInst*, std::vector<int>>;
+ using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
auto NewUses = SmallVector<UseEntry, 4u>();
auto SizeDiff = OldSize - NewSize;
@@ -3581,10 +3581,10 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
NewMask.push_back(Index >= int(OldSize) ? Index - SizeDiff : Index);
// Update costs.
- OldCost += TTI.getShuffleCost(
- TTI::SK_PermuteSingleSrc, VecTy, OldMask, CostKind);
- NewCost += TTI.getShuffleCost(
- TTI::SK_PermuteSingleSrc, NewVecTy, NewMask, CostKind);
+ OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, OldMask,
+ CostKind);
+ NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy,
+ NewMask, CostKind);
}
if (OldCost < NewCost || !NewCost.isValid()) {
>From f2c6b30779ec30dc68deba526bd2b922f744416b Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Mon, 24 Mar 2025 23:55:57 +0000
Subject: [PATCH 10/17] Address comments.
---
.../Transforms/Vectorize/VectorCombine.cpp | 38 +++++++++----------
1 file changed, 19 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index bd0770d8081a8..93d668a2b17c6 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3490,7 +3490,7 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
auto *InputShuffle = dyn_cast<ShuffleVectorInst>(&I);
if (!InputShuffle)
- return {};
+ return false;
auto *OldLoad = dyn_cast<LoadInst>(InputShuffle->getOperand(0u));
if (!OldLoad || !OldLoad->isSimple())
@@ -3500,34 +3500,31 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
if (!VecTy)
return false;
- auto IsPoisonOrUndef = [](Value *V) -> bool {
- if (auto *C = dyn_cast<Constant>(V)) {
- return isa<PoisonValue>(C) || isa<UndefValue>(C);
- }
- return false;
- };
-
+ // Search all uses of `I`. If all uses are shufflevector ops, and the second
+ // operands are all poison values, find the minimum and maximum indices of
+ // the vector elements referenced by all shuffle masks.
+ // Otherwise return `std::nullopt`.
using IndexRange = std::pair<int, int>;
auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
- auto OutputRange = IndexRange(VecTy->getNumElements(), -1);
+ IndexRange OutputRange = IndexRange(VecTy->getNumElements(), -1);
for (auto &Use : I.uses()) {
// All uses must be ShuffleVector instructions.
auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
if (!Shuffle)
- return {};
+ return std::nullopt;
// Get index range for value.
- auto *Op0 = Shuffle->getOperand(0u);
- auto *Op1 = Shuffle->getOperand(1u);
- if (!IsPoisonOrUndef(Op1))
- return {};
+ auto *Op0 = Shuffle->getOperand(0);
+ auto *Op1 = Shuffle->getOperand(1);
+ if (!isa<PoisonValue>(Op1) && !isa<UndefValue>(Op1))
+ return std::nullopt;
// Find the min and max indices used by the ShuffleVector instruction.
- auto Mask = Shuffle->getShuffleMask();
+ ArrayRef<int> Mask = Shuffle->getShuffleMask();
auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
auto NumElems = int(Op0Ty->getNumElements());
- for (auto Index : Mask) {
+ for (int Index : Mask) {
if (Index >= 0) {
Index %= NumElems;
OutputRange.first = std::min(Index, OutputRange.first);
@@ -3537,15 +3534,18 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
}
if (OutputRange.second < OutputRange.first)
- return {};
+ return std::nullopt;
return OutputRange;
};
+ // Find the range of vector elements used by shufflevector ops, if possible.
if (auto Indices = GetIndexRangeInShuffles()) {
- auto OldSize = VecTy->getNumElements();
- auto NewSize = Indices->second + 1u;
+ unsigned OldSize = VecTy->getNumElements();
+ unsigned NewSize = Indices->second + 1u;
+ // If the range of vector elements is smaller than the full load, attempt
+ // to create a smaller load.
if (NewSize < OldSize) {
auto Builder = IRBuilder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());
>From 4dc36ee54d3083b27d3d0a52f9b2c01b36744a2d Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 26 Mar 2025 12:37:30 +0000
Subject: [PATCH 11/17] Fix conflict with subvector load widening.
---
clang/test/CodeGenOpenCL/preserve_vec3.cl | 22 +++---
.../Transforms/Vectorize/VectorCombine.cpp | 29 ++++----
.../PhaseOrdering/X86/vec-load-combine.ll | 8 +-
.../VectorCombine/X86/load-inseltpoison.ll | 10 +--
.../VectorCombine/X86/load-widening.ll | 4 +-
.../VectorCombine/X86/shuffle-of-shuffles.ll | 24 +++---
.../VectorCombine/load-shufflevector.ll | 74 +++++++++----------
7 files changed, 83 insertions(+), 88 deletions(-)
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index 49ebae6fc7013..e73657e30d884 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
// CHECK-LABEL: define dso_local spir_kernel void @foo(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: ret void
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
void kernel float3_to_double2(global float3 *a, global double2 *b) {
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 93d668a2b17c6..a410493666c9f 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -33,6 +33,7 @@
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
+#include <optional>
#include <queue>
#include <set>
#include <tuple>
@@ -3488,38 +3489,38 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
// Attempt to shrink loads that are only used by shufflevector instructions.
bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
- auto *InputShuffle = dyn_cast<ShuffleVectorInst>(&I);
- if (!InputShuffle)
- return false;
-
- auto *OldLoad = dyn_cast<LoadInst>(InputShuffle->getOperand(0u));
+ auto *OldLoad = dyn_cast<LoadInst>(&I);
if (!OldLoad || !OldLoad->isSimple())
return false;
- auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+ auto *VecTy = dyn_cast<FixedVectorType>(OldLoad->getType());
if (!VecTy)
return false;
- // Search all uses of `I`. If all uses are shufflevector ops, and the second
- // operands are all poison values, find the minimum and maximum indices of
- // the vector elements referenced by all shuffle masks.
+ // Search all uses of load. If all uses are shufflevector instructions, and
+ // the second operands are all poison values, find the minimum and maximum
+ // indices of the vector elements referenced by all shuffle masks.
// Otherwise return `std::nullopt`.
using IndexRange = std::pair<int, int>;
auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
IndexRange OutputRange = IndexRange(VecTy->getNumElements(), -1);
for (auto &Use : I.uses()) {
- // All uses must be ShuffleVector instructions.
+ // All uses must be shufflevector instructions.
auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
if (!Shuffle)
return std::nullopt;
- // Get index range for value.
+ // Ignore shufflevector instructions that have no uses.
+ if (!Shuffle->hasNUsesOrMore(1u))
+ continue;
+
+ // Ensure second operand is a poison/undef value.
auto *Op0 = Shuffle->getOperand(0);
auto *Op1 = Shuffle->getOperand(1);
if (!isa<PoisonValue>(Op1) && !isa<UndefValue>(Op1))
return std::nullopt;
- // Find the min and max indices used by the ShuffleVector instruction.
+ // Find the min and max indices used by the shufflevector instruction.
ArrayRef<int> Mask = Shuffle->getShuffleMask();
auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
auto NumElems = int(Op0Ty->getNumElements());
@@ -3539,7 +3540,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
return OutputRange;
};
- // Find the range of vector elements used by shufflevector ops, if possible.
+ // Get the range of vector elements used by shufflevector instructions.
if (auto Indices = GetIndexRangeInShuffles()) {
unsigned OldSize = VecTy->getNumElements();
unsigned NewSize = Indices->second + 1u;
@@ -3685,6 +3686,8 @@ bool VectorCombine::run() {
MadeChange |= foldShuffleOfIntrinsics(I);
MadeChange |= foldSelectShuffle(I);
MadeChange |= foldShuffleToIdentity(I);
+ break;
+ case Instruction::Load:
MadeChange |= shrinkLoadForShuffles(I);
break;
case Instruction::BitCast:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 85f6fceb5bdbe..9218cc2d019f8 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT: ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT: ret <4 x float> [[TMP3]]
;
%2 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 70197a756c8b3..605c029f12b33 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -252,8 +252,7 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc
define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -341,8 +340,7 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
@@ -577,8 +575,8 @@ define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(ptr align 16 derefer
; CHECK-LABEL: @load_v1i32_extract_insert_v8i32_extra_use(
; CHECK-NEXT: [[L:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 4
; CHECK-NEXT: store <1 x i32> [[L]], ptr [[STORE_PTR:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P]], align 4
-; CHECK-NEXT: [[R:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <1 x i32> [[L]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <8 x i32> [[R]]
;
%l = load <1 x i32>, ptr %p, align 4
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index 30a089818074e..eacc40bfa9b53 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index b30dc9ffdc596..eddfc57a7d256 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,21 +47,12 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
-; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
-; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
-; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
-; SSE-NEXT: ret <4 x double> [[BLEND]]
-;
-; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
-; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
-; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
-; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
-; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; AVX-NEXT: ret <4 x double> [[BLEND]]
+; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
+; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT: ret <4 x double> [[BLEND]]
;
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
@@ -81,3 +72,6 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
%s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
ret <2 x float> %s2
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX: {{.*}}
+; SSE: {{.*}}
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
index 56c51ce7315cf..c42df19a0b7a6 100644
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
@@ -19,8 +19,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_1(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -33,8 +33,8 @@ define <8 x half> @shuffle_v4_v8f16_r0_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -47,8 +47,8 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x half> [[TMP1]]
;
entry:
@@ -61,8 +61,8 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -75,13 +75,13 @@ define <8 x half> @shuffle_v4_v8f16_cond_r0_1(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -108,13 +108,13 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -141,13 +141,13 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x half> [[VAL0]], <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -174,8 +174,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_1(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -188,8 +188,8 @@ define <8 x i32> @shuffle_v4_v8i32_r0_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -202,8 +202,8 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
; CHECK-NEXT: ret <4 x i32> [[TMP1]]
;
entry:
@@ -216,8 +216,8 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -230,13 +230,13 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_1(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -263,13 +263,13 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r0_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -296,13 +296,13 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -329,13 +329,13 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
>From 1092f5be2850a90f46c139ff63c180105359c5c8 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 26 Mar 2025 12:51:14 +0000
Subject: [PATCH 12/17] Fix build bot false positive for undef.
---
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index a410493666c9f..69960acab9128 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3514,7 +3514,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
if (!Shuffle->hasNUsesOrMore(1u))
continue;
- // Ensure second operand is a poison/undef value.
+ // Ensure second operand is a poison value.
auto *Op0 = Shuffle->getOperand(0);
auto *Op1 = Shuffle->getOperand(1);
if (!isa<PoisonValue>(Op1) && !isa<UndefValue>(Op1))
>From 265f9600e5139bab9a170950d3a594138395bb7f Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 7 May 2025 11:12:44 +0100
Subject: [PATCH 13/17] Address comments.
---
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 69960acab9128..e28195a24844c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3505,23 +3505,19 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
IndexRange OutputRange = IndexRange(VecTy->getNumElements(), -1);
for (auto &Use : I.uses()) {
- // All uses must be shufflevector instructions.
- auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
- if (!Shuffle)
+ // Ensure all uses match the required pattern.
+ User *Shuffle = Use.getUser();
+ Value *Op0 = nullptr;
+ ArrayRef<int> Mask;
+
+ if (!match(Shuffle, m_Shuffle(m_Value(Op0), m_Undef(), m_Mask(Mask))))
return std::nullopt;
// Ignore shufflevector instructions that have no uses.
if (!Shuffle->hasNUsesOrMore(1u))
continue;
- // Ensure second operand is a poison value.
- auto *Op0 = Shuffle->getOperand(0);
- auto *Op1 = Shuffle->getOperand(1);
- if (!isa<PoisonValue>(Op1) && !isa<UndefValue>(Op1))
- return std::nullopt;
-
// Find the min and max indices used by the shufflevector instruction.
- ArrayRef<int> Mask = Shuffle->getShuffleMask();
auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
auto NumElems = int(Op0Ty->getNumElements());
>From 6ca4bfade60e9f6816b873f1a389bd7aa94461b6 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Mon, 19 May 2025 14:57:21 +0100
Subject: [PATCH 14/17] Address review comments.
---
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index d07a68986486c..6d6b43b768c05 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3514,12 +3514,12 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
return std::nullopt;
// Ignore shufflevector instructions that have no uses.
- if (!Shuffle->hasNUsesOrMore(1u))
+ if (Shuffle->use_empty())
continue;
// Find the min and max indices used by the shufflevector instruction.
- auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
- auto NumElems = int(Op0Ty->getNumElements());
+ FixedVectorType *Op0Ty = cast<FixedVectorType>(Op0->getType());
+ int NumElems = static_cast<int>(Op0Ty->getNumElements());
for (int Index : Mask) {
if (Index >= 0) {
>From 03101703a2589cb09c45491db6e5b01c9e0ebdbe Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 30 May 2025 19:57:20 +0100
Subject: [PATCH 15/17] Address comments.
---
.../Transforms/Vectorize/VectorCombine.cpp | 64 +++++++++----------
1 file changed, 30 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 6d6b43b768c05..94e68ded24730 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3493,24 +3493,26 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
if (!OldLoad || !OldLoad->isSimple())
return false;
- auto *VecTy = dyn_cast<FixedVectorType>(OldLoad->getType());
- if (!VecTy)
+ auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
+ if (!OldLoadTy)
return false;
+ unsigned const OldNumElements = OldLoadTy->getNumElements();
+
// Search all uses of load. If all uses are shufflevector instructions, and
// the second operands are all poison values, find the minimum and maximum
// indices of the vector elements referenced by all shuffle masks.
// Otherwise return `std::nullopt`.
using IndexRange = std::pair<int, int>;
auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
- IndexRange OutputRange = IndexRange(VecTy->getNumElements(), -1);
+ IndexRange OutputRange = IndexRange(OldNumElements, -1);
for (auto &Use : I.uses()) {
// Ensure all uses match the required pattern.
User *Shuffle = Use.getUser();
- Value *Op0 = nullptr;
ArrayRef<int> Mask;
- if (!match(Shuffle, m_Shuffle(m_Value(Op0), m_Undef(), m_Mask(Mask))))
+ if (!match(Shuffle,
+ m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
return std::nullopt;
// Ignore shufflevector instructions that have no uses.
@@ -3518,12 +3520,8 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
continue;
// Find the min and max indices used by the shufflevector instruction.
- FixedVectorType *Op0Ty = cast<FixedVectorType>(Op0->getType());
- int NumElems = static_cast<int>(Op0Ty->getNumElements());
-
for (int Index : Mask) {
- if (Index >= 0) {
- Index %= NumElems;
+ if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
OutputRange.first = std::min(Index, OutputRange.first);
OutputRange.second = std::max(Index, OutputRange.second);
}
@@ -3538,34 +3536,29 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
// Get the range of vector elements used by shufflevector instructions.
if (auto Indices = GetIndexRangeInShuffles()) {
- unsigned OldSize = VecTy->getNumElements();
- unsigned NewSize = Indices->second + 1u;
+ unsigned const NewNumElements = Indices->second + 1u;
// If the range of vector elements is smaller than the full load, attempt
// to create a smaller load.
- if (NewSize < OldSize) {
+ if (NewNumElements < OldNumElements) {
auto Builder = IRBuilder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());
- // Create new load of smaller vector.
- auto *ElemTy = VecTy->getElementType();
- auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
- auto *PtrOp = OldLoad->getPointerOperand();
- auto *NewLoad = cast<LoadInst>(
- Builder.CreateAlignedLoad(NewVecTy, PtrOp, OldLoad->getAlign()));
- NewLoad->copyMetadata(I);
-
// Calculate costs of old and new ops.
- auto OldCost = TTI.getMemoryOpCost(
+ Type *ElemTy = OldLoadTy->getElementType();
+ FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
+ Value *PtrOp = OldLoad->getPointerOperand();
+
+ InstructionCost OldCost = TTI.getMemoryOpCost(
Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
OldLoad->getPointerAddressSpace(), CostKind);
- auto NewCost = TTI.getMemoryOpCost(
- Instruction::Load, NewLoad->getType(), NewLoad->getAlign(),
- NewLoad->getPointerAddressSpace(), CostKind);
+ InstructionCost NewCost = TTI.getMemoryOpCost(
+ Instruction::Load, NewLoadTy, OldLoad->getAlign(),
+ OldLoad->getPointerAddressSpace(), CostKind);
using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
auto NewUses = SmallVector<UseEntry, 4u>();
- auto SizeDiff = OldSize - NewSize;
+ auto SizeDiff = OldNumElements - NewNumElements;
for (auto &Use : I.uses()) {
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
@@ -3575,19 +3568,22 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
NewUses.push_back({Shuffle, {}});
auto &NewMask = NewUses.back().second;
for (auto Index : OldMask)
- NewMask.push_back(Index >= int(OldSize) ? Index - SizeDiff : Index);
+ NewMask.push_back(Index >= int(NewNumElements) ? Index - SizeDiff : Index);
// Update costs.
- OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, OldMask,
- CostKind);
- NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewVecTy,
+ OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, OldLoadTy,
+ OldMask, CostKind);
+ NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, NewLoadTy,
NewMask, CostKind);
}
- if (OldCost < NewCost || !NewCost.isValid()) {
- NewLoad->eraseFromParent();
+ if (OldCost < NewCost || !NewCost.isValid())
return false;
- }
+
+ // Create new load of smaller vector.
+ auto *NewLoad = cast<LoadInst>(
+ Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
+ NewLoad->copyMetadata(I);
// Replace all uses.
for (auto &Use : NewUses) {
@@ -3597,7 +3593,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
Builder.SetInsertPoint(Shuffle);
Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
auto *NewShuffle = Builder.CreateShuffleVector(
- NewLoad, PoisonValue::get(NewVecTy), NewMask);
+ NewLoad, PoisonValue::get(NewLoadTy), NewMask);
replaceValue(*Shuffle, *NewShuffle);
}
>From 6a9dd0bda4d190263aba89df570ef3f06f8fa09e Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 30 May 2025 20:04:38 +0100
Subject: [PATCH 16/17] Formatting.
---
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 94e68ded24730..52aa63f1d999f 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3511,7 +3511,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
User *Shuffle = Use.getUser();
ArrayRef<int> Mask;
- if (!match(Shuffle,
+ if (!match(Shuffle,
m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
return std::nullopt;
@@ -3548,13 +3548,13 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
Type *ElemTy = OldLoadTy->getElementType();
FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
Value *PtrOp = OldLoad->getPointerOperand();
-
+
InstructionCost OldCost = TTI.getMemoryOpCost(
Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
OldLoad->getPointerAddressSpace(), CostKind);
- InstructionCost NewCost = TTI.getMemoryOpCost(
- Instruction::Load, NewLoadTy, OldLoad->getAlign(),
- OldLoad->getPointerAddressSpace(), CostKind);
+ InstructionCost NewCost =
+ TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
+ OldLoad->getPointerAddressSpace(), CostKind);
using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
auto NewUses = SmallVector<UseEntry, 4u>();
@@ -3568,7 +3568,8 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
NewUses.push_back({Shuffle, {}});
auto &NewMask = NewUses.back().second;
for (auto Index : OldMask)
- NewMask.push_back(Index >= int(NewNumElements) ? Index - SizeDiff : Index);
+ NewMask.push_back(Index >= int(NewNumElements) ? Index - SizeDiff
+ : Index);
// Update costs.
OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, OldLoadTy,
>From 019e39fc49bb83e40eb02f4eb7235a82045bf262 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 30 May 2025 22:05:01 +0100
Subject: [PATCH 17/17] Correct renaming and remove function-style casts.
---
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 52aa63f1d999f..7a878e3854322 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3568,8 +3568,9 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
NewUses.push_back({Shuffle, {}});
auto &NewMask = NewUses.back().second;
for (auto Index : OldMask)
- NewMask.push_back(Index >= int(NewNumElements) ? Index - SizeDiff
- : Index);
+ NewMask.push_back(Index >= static_cast<int>(OldNumElements)
+ ? Index - SizeDiff
+ : Index);
// Update costs.
OldCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, OldLoadTy,
More information about the cfe-commits
mailing list