[Mlir-commits] [clang] [clang-tools-extra] [compiler-rt] [lld] [lldb] [llvm] [mlir] simplifyBinaryIntrinsic: Return nan if snan is passed to maxnum/minnum (PR #180105)
YunQiang Su
llvmlistbot at llvm.org
Sun Feb 8 03:45:55 PST 2026
https://github.com/wzssyqa updated https://github.com/llvm/llvm-project/pull/180105
>From 392b203b01b57d764850ea73c367df4f14d1c527 Mon Sep 17 00:00:00 2001
From: YunQiang Su <yunqiang at isrc.iscas.ac.cn>
Date: Wed, 17 Sep 2025 19:52:51 +0800
Subject: [PATCH 01/33] simplifyBinaryIntrinsic: Return nan if snan is passed
to maxnum/minnum
Fixes: #138303
---
llvm/lib/Analysis/InstructionSimplify.cpp | 5 +-
.../AMDGPU/fcanonicalize-elimination.ll | 4 +-
llvm/test/Transforms/EarlyCSE/commute.ll | 61 +++++++++++++++++++
3 files changed, 66 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 0e0c092271a38..a3e0994b72986 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -6668,6 +6668,7 @@ static MinMaxOptResult OptimizeConstMinMax(const Constant *RHSConst,
assert(OutNewConstVal != nullptr);
bool PropagateNaN = IID == Intrinsic::minimum || IID == Intrinsic::maximum;
+ bool PropagateNaN_S = IID == Intrinsic::minnum || IID == Intrinsic::maxnum;
bool ReturnsOtherForAllNaNs =
IID == Intrinsic::minimumnum || IID == Intrinsic::maximumnum;
bool IsMin = IID == Intrinsic::minimum || IID == Intrinsic::minnum ||
@@ -6686,12 +6687,14 @@ static MinMaxOptResult OptimizeConstMinMax(const Constant *RHSConst,
// minnum(x, qnan) -> x
// maxnum(x, qnan) -> x
+ // minnum(x, snan) -> qnan
+ // maxnum(x, snan) -> qnan
// minimum(X, nan) -> qnan
// maximum(X, nan) -> qnan
// minimumnum(X, nan) -> x
// maximumnum(X, nan) -> x
if (CAPF.isNaN()) {
- if (PropagateNaN) {
+ if (PropagateNaN || (PropagateNaN_S && CAPF.isSignaling())) {
*OutNewConstVal = ConstantFP::get(CFP->getType(), CAPF.makeQuiet());
return MinMaxOptResult::UseNewConstVal;
} else if (ReturnsOtherForAllNaNs || !CAPF.isSignaling()) {
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index a87570ef5d848..a2b02fd1e0c3f 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -500,9 +500,7 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(ptr addrspace
; FIXME: Should there be more checks here? minnum with sNaN operand might get simplified away.
; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
-; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]]
-; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]]
-; GFX9: v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]]
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1) %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
diff --git a/llvm/test/Transforms/EarlyCSE/commute.ll b/llvm/test/Transforms/EarlyCSE/commute.ll
index edafeccd3c8cc..62fae0f544812 100644
--- a/llvm/test/Transforms/EarlyCSE/commute.ll
+++ b/llvm/test/Transforms/EarlyCSE/commute.ll
@@ -830,6 +830,67 @@ define float @maxnum(float %a, float %b) {
ret float %r
}
+define float @maxnum_const_snan(float %x) {
+; CHECK-LABEL: @maxnum_const_snan(
+; CHECK-NEXT: ret float 0x7FFC000000000000
+;
+ %r = call float @llvm.minnum.f32(float %x, float 0x7FF4000000000000)
+ ret float %r
+}
+
+define double @minnum_const_snan(double %x) {
+; CHECK-LABEL: @minnum_const_snan(
+; CHECK-NEXT: ret double 0x7FFC000000000000
+;
+ %r = call double @llvm.minnum.f64(double %x, double 0x7FF4000000000000)
+ ret double %r
+}
+
+define float @maxnum_const_qnan(float %x) {
+; CHECK-LABEL: @maxnum_const_qnan(
+; CHECK-NEXT: ret float [[X:%.*]]
+;
+ %r = call float @llvm.minnum.f32(float %x, float 0x7FF8000000000000)
+ ret float %r
+}
+
+define double @minnum_const_qnan(double %x) {
+; CHECK-LABEL: @minnum_const_qnan(
+; CHECK-NEXT: ret double [[X:%.*]]
+;
+ %r = call double @llvm.minnum.f64(double %x, double 0x7FF8000000000000)
+ ret double %r
+}
+
+define <2 x float> @maxnum_const_snan_v2f32(<2 x float> %a) {
+; CHECK-LABEL: @maxnum_const_snan_v2f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <2 x float> splat (float 0x7FFC000000000000)
+;
+entry:
+ %r = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> <float 0x7FF4000000000000, float 0x7FF4000000000000>)
+ ret <2 x float> %r
+}
+define <2 x float> @maxnum_const_qnan_v2f32(<2 x float> %a) {
+; CHECK-LABEL: @maxnum_const_qnan_v2f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret <2 x float> [[A:%.*]]
+;
+entry:
+ %r = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>)
+ ret <2 x float> %r
+}
+define <2 x float> @maxnum_const_mixednan_v2f32(<2 x float> %a) {
+; CHECK-LABEL: @maxnum_const_mixednan_v2f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[A:%.*]], <2 x float> <float 0x7FF8000000000000, float 0x7FF4000000000000>)
+; CHECK-NEXT: ret <2 x float> [[R]]
+;
+entry:
+ %r = tail call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> <float 0x7FF8000000000000, float 0x7FF4000000000000>)
+ ret <2 x float> %r
+}
+
define <2 x float> @minnum(<2 x float> %a, <2 x float> %b) {
; CHECK-LABEL: @minnum(
; CHECK-NEXT: [[X:%.*]] = call fast <2 x float> @llvm.minnum.v2f32(<2 x float> [[A:%.*]], <2 x float> [[B:%.*]])
>From 717989352c2743d93fb47fa0406ace184452bef8 Mon Sep 17 00:00:00 2001
From: YunQiang Su <yunqiang at isrc.iscas.ac.cn>
Date: Fri, 6 Feb 2026 16:01:24 +0800
Subject: [PATCH 02/33] Update test
---
.../Transforms/InstSimplify/ConstProp/min-max.ll | 12 ++++--------
.../test/Transforms/InstSimplify/fminmax-folds.ll | 15 +++++----------
2 files changed, 9 insertions(+), 18 deletions(-)
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/min-max.ll b/llvm/test/Transforms/InstSimplify/ConstProp/min-max.ll
index 84bec15d6ed32..a633d29179896 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/min-max.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/min-max.ll
@@ -97,8 +97,7 @@ define float @minnum_float_qnan_p0() {
define float @minnum_float_p0_snan() {
; CHECK-LABEL: @minnum_float_p0_snan(
-; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.minnum.f32(float 0.000000e+00, float 0x7FF4000000000000)
-; CHECK-NEXT: ret float [[MIN]]
+; CHECK-NEXT: ret float 0x7FFC000000000000
;
%min = call float @llvm.minnum.f32(float 0.0, float 0x7FF4000000000000)
ret float %min
@@ -106,8 +105,7 @@ define float @minnum_float_p0_snan() {
define float @minnum_float_snan_p0() {
; CHECK-LABEL: @minnum_float_snan_p0(
-; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.minnum.f32(float 0x7FF4000000000000, float 0.000000e+00)
-; CHECK-NEXT: ret float [[MIN]]
+; CHECK-NEXT: ret float 0x7FFC000000000000
;
%min = call float @llvm.minnum.f32(float 0x7FF4000000000000, float 0.0)
ret float %min
@@ -207,8 +205,7 @@ define float @maxnum_float_qnan_p0() {
define float @maxnum_float_p0_snan() {
; CHECK-LABEL: @maxnum_float_p0_snan(
-; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.maxnum.f32(float 0.000000e+00, float 0x7FF4000000000000)
-; CHECK-NEXT: ret float [[MAX]]
+; CHECK-NEXT: ret float 0x7FFC000000000000
;
%max = call float @llvm.maxnum.f32(float 0.0, float 0x7FF4000000000000)
ret float %max
@@ -216,8 +213,7 @@ define float @maxnum_float_p0_snan() {
define float @maxnum_float_snan_p0() {
; CHECK-LABEL: @maxnum_float_snan_p0(
-; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.maxnum.f32(float 0x7FF4000000000000, float 0.000000e+00)
-; CHECK-NEXT: ret float [[MAX]]
+; CHECK-NEXT: ret float 0x7FFC000000000000
;
%max = call float @llvm.maxnum.f32(float 0x7FF4000000000000, float 0.0)
ret float %max
diff --git a/llvm/test/Transforms/InstSimplify/fminmax-folds.ll b/llvm/test/Transforms/InstSimplify/fminmax-folds.ll
index 7544f7190df89..c2c4b060d7c54 100644
--- a/llvm/test/Transforms/InstSimplify/fminmax-folds.ll
+++ b/llvm/test/Transforms/InstSimplify/fminmax-folds.ll
@@ -43,10 +43,8 @@ define void @minmax_qnan_f32(float %x, ptr %minnum_res, ptr %maxnum_res, ptr %mi
; Note that maxnum/minnum return qnan here for snan inputs, unlike maximumnum/minimumnum
define void @minmax_snan_f32(float %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) {
; CHECK-LABEL: @minmax_snan_f32(
-; CHECK-NEXT: [[MINNUM:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float 0x7FF4000000000000)
-; CHECK-NEXT: store float [[MINNUM]], ptr [[MINNUM_RES:%.*]], align 4
-; CHECK-NEXT: [[MAXNUM:%.*]] = call float @llvm.maxnum.f32(float [[X]], float 0x7FF4000000000000)
-; CHECK-NEXT: store float [[MAXNUM]], ptr [[MAXNUM_RES:%.*]], align 4
+; CHECK-NEXT: store float 0x7FFC000000000000, ptr [[MINMUM_RES:%.*]], align 4
+; CHECK-NEXT: store float 0x7FFC000000000000, ptr [[MAXMUM_RES:%.*]], align 4
; CHECK-NEXT: store float 0x7FFC000000000000, ptr [[MINIMUM_RES:%.*]], align 4
; CHECK-NEXT: store float 0x7FFC000000000000, ptr [[MAXIMUM_RES:%.*]], align 4
; CHECK-NEXT: store float [[X]], ptr [[MINIMUMNUM_RES:%.*]], align 4
@@ -100,10 +98,8 @@ define void @minmax_qnan_nxv2f64_op0(<vscale x 2 x double> %x, ptr %minnum_res,
; Note that maxnum/minnum return qnan here for snan inputs, unlike maximumnum/minimumnum
define void @minmax_snan_nxv2f64_op1(<vscale x 2 x double> %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) {
; CHECK-LABEL: @minmax_snan_nxv2f64_op1(
-; CHECK-NEXT: [[MINNUM:%.*]] = call <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> splat (double 0x7FF400DEAD00DEAD), <vscale x 2 x double> [[X:%.*]])
-; CHECK-NEXT: store <vscale x 2 x double> [[MINNUM]], ptr [[MINNUM_RES:%.*]], align 16
-; CHECK-NEXT: [[MAXNUM:%.*]] = call <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> splat (double 0x7FF400DEAD00DEAD), <vscale x 2 x double> [[X]])
-; CHECK-NEXT: store <vscale x 2 x double> [[MAXNUM]], ptr [[MAXNUM_RES:%.*]], align 16
+; CHECK-NEXT: store <vscale x 2 x double> splat (double 0x7FFC00DEAD00DEAD), ptr [[MINMUM_RES:%.*]], align 16
+; CHECK-NEXT: store <vscale x 2 x double> splat (double 0x7FFC00DEAD00DEAD), ptr [[MAXMUM_RES:%.*]], align 16
; CHECK-NEXT: store <vscale x 2 x double> splat (double 0x7FFC00DEAD00DEAD), ptr [[MINIMUM_RES:%.*]], align 16
; CHECK-NEXT: store <vscale x 2 x double> splat (double 0x7FFC00DEAD00DEAD), ptr [[MAXIMUM_RES:%.*]], align 16
; CHECK-NEXT: store <vscale x 2 x double> [[X]], ptr [[MINIMUMNUM_RES:%.*]], align 16
@@ -640,8 +636,7 @@ define void @minmax_mixed_pos_inf_poison_snan_v3f32(<3 x float> %x, ptr %minnum_
; CHECK-LABEL: @minmax_mixed_pos_inf_poison_snan_v3f32(
; CHECK-NEXT: [[MINNUM:%.*]] = call nnan <3 x float> @llvm.minnum.v3f32(<3 x float> <float poison, float 0x7FF0000000000000, float 0x7FF4000000000000>, <3 x float> [[X:%.*]])
; CHECK-NEXT: store <3 x float> [[MINNUM]], ptr [[MINNUM_RES:%.*]], align 16
-; CHECK-NEXT: [[MAXNUM:%.*]] = call nnan <3 x float> @llvm.maxnum.v3f32(<3 x float> <float poison, float 0x7FF0000000000000, float 0x7FF4000000000000>, <3 x float> [[X]])
-; CHECK-NEXT: store <3 x float> [[MAXNUM]], ptr [[MAXNUM_RES:%.*]], align 16
+; CHECK-NEXT: store <3 x float> <float poison, float 0x7FF0000000000000, float 0x7FFC000000000000>, ptr [[MAXNUM_RES:%.*]], align 16
; CHECK-NEXT: [[MINIMUM:%.*]] = call nnan <3 x float> @llvm.minimum.v3f32(<3 x float> <float poison, float 0x7FF0000000000000, float 0x7FF4000000000000>, <3 x float> [[X]])
; CHECK-NEXT: store <3 x float> [[MINIMUM]], ptr [[MINIMUM_RES:%.*]], align 16
; CHECK-NEXT: store <3 x float> <float poison, float 0x7FF0000000000000, float 0x7FFC000000000000>, ptr [[MAXIMUM_RES:%.*]], align 16
>From 95e778579d66083c1bfeaa6006e170964a9d7786 Mon Sep 17 00:00:00 2001
From: Kito Cheng <kito.cheng at sifive.com>
Date: Fri, 6 Feb 2026 10:35:24 +0800
Subject: [PATCH 03/33] [RISCV] Fix P-extension instruction names per spec 0.19
(#179961)
Fix instruction naming to match P-extension specification 0.19:
- pnsari.b -> pnsrari.b (Packed Narrowing Shift Right Arithmetic
Rounding)
- pnsari.h -> pnsrari.h
- nsari -> nsrari
- paax.dhx -> paas.dhx (Packed Average Add/Sub, not Add/Add-Cross)
The instruction encodings remain unchanged as they were already correct.
Ref: https://www.jhauser.us/RISCV/ext-P/RVP-baseInstrs-Sail-019.txt
---
llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 8 ++++----
llvm/test/MC/RISCV/rv32p-valid.s | 16 ++++++++--------
2 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index f789e6123616f..9c7f9a86611f8 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -1269,9 +1269,9 @@ let Predicates = [HasStdExtP, IsRV32] in {
def PNSRAI_H : RVPNarrowingShiftH_ri<0b100, "pnsrai.h">;
def NSRAI : RVPNarrowingShiftW_ri<0b100, "nsrai">;
- def PNSARI_B : RVPNarrowingShiftB_ri<0b101, "pnsari.b">;
- def PNSARI_H : RVPNarrowingShiftH_ri<0b101, "pnsari.h">;
- def NSARI : RVPNarrowingShiftW_ri<0b101, "nsari">;
+ def PNSRARI_B : RVPNarrowingShiftB_ri<0b101, "pnsrari.b">;
+ def PNSRARI_H : RVPNarrowingShiftH_ri<0b101, "pnsrari.h">;
+ def NSRARI : RVPNarrowingShiftW_ri<0b101, "nsrari">;
def PNCLIPI_B : RVPNarrowingShiftB_ri<0b110, "pnclipi.b">;
def PNCLIPI_H : RVPNarrowingShiftH_ri<0b110, "pnclipi.h">;
@@ -1435,7 +1435,7 @@ let Predicates = [HasStdExtP, IsRV32] in {
def PSAS_DHX : RVPPairBinaryExchanged_rr<0b0010, 0b00, "psas.dhx">;
def PSSA_DHX : RVPPairBinaryExchanged_rr<0b0010, 0b10, "pssa.dhx">;
- def PAAX_DHX : RVPPairBinaryExchanged_rr<0b0011, 0b00, "paax.dhx">;
+ def PAAS_DHX : RVPPairBinaryExchanged_rr<0b0011, 0b00, "paas.dhx">;
def PASA_DHX : RVPPairBinaryExchanged_rr<0b0011, 0b10, "pasa.dhx">;
def PMSEQ_DH : RVPPairBinaryExchanged_rr<0b1000, 0b00, "pmseq.dh", Commutable=1>;
diff --git a/llvm/test/MC/RISCV/rv32p-valid.s b/llvm/test/MC/RISCV/rv32p-valid.s
index c601786cccc64..007887164b9d3 100644
--- a/llvm/test/MC/RISCV/rv32p-valid.s
+++ b/llvm/test/MC/RISCV/rv32p-valid.s
@@ -906,15 +906,15 @@ pnsrai.h s0, a0, 2
# CHECK-ASM-AND-OBJ: nsrai a4, t3
# CHECK-ASM: encoding: [0x1b,0xc7,0x4e,0x44]
nsrai a4, t3, 4
-# CHECK-ASM-AND-OBJ: pnsari.b t5, t5
+# CHECK-ASM-AND-OBJ: pnsrari.b t5, t5
# CHECK-ASM: encoding: [0x1b,0xcf,0x0f,0x51]
-pnsari.b t5, t5, 0
-# CHECK-ASM-AND-OBJ: pnsari.h t1, a4
+pnsrari.b t5, t5, 0
+# CHECK-ASM-AND-OBJ: pnsrari.h t1, a4
# CHECK-ASM: encoding: [0x1b,0xc3,0x37,0x52]
-pnsari.h t1, a4, 3
-# CHECK-ASM-AND-OBJ: nsari s0, t1
+pnsrari.h t1, a4, 3
+# CHECK-ASM-AND-OBJ: nsrari s0, t1
# CHECK-ASM: encoding: [0x1b,0xc4,0x53,0x54]
-nsari s0, t1, 5
+nsrari s0, t1, 5
# CHECK-ASM-AND-OBJ: pnclipi.b t1, a4
# CHECK-ASM: encoding: [0x1b,0xc3,0x77,0x61]
pnclipi.b t1, a4, 7
@@ -1266,9 +1266,9 @@ psas.dhx a2, a2, s0
# CHECK-ASM-AND-OBJ: pssa.dhx t3, t3, t3
# CHECK-ASM: encoding: [0x1b,0xee,0xde,0x95]
pssa.dhx t3, t3, t3
-# CHECK-ASM-AND-OBJ: paax.dhx t3, t3, a4
+# CHECK-ASM-AND-OBJ: paas.dhx t3, t3, a4
# CHECK-ASM: encoding: [0x1b,0xee,0xfe,0x98]
-paax.dhx t3, t3, a4
+paas.dhx t3, t3, a4
# CHECK-ASM-AND-OBJ: pasa.dhx a0, t1, t1
# CHECK-ASM: encoding: [0x1b,0xe5,0x73,0x9c]
pasa.dhx a0, t1, t1
>From 74dda51c3b2891bb3a010038ab2c0887ab5f66d8 Mon Sep 17 00:00:00 2001
From: Eric Christopher <echristo at gmail.com>
Date: Thu, 5 Feb 2026 18:35:32 -0800
Subject: [PATCH 04/33] [CMake][TableGen] Fix Ninja depslog error with implicit
outputs on Ninja <1.10 (#179842)
Ninja versions prior to 1.10 cannot handle depfile mode when CMake
generates build rules with implicit outputs (the `| ${cmake_ninja_workdir}`
syntax used for IDE support). Ninja's depslog interprets these as
multiple outputs and rejects them with the error:
ninja: error: build.ninja:XXXX: multiple outputs aren't (yet?) supported
by depslog; bring this up on the mailing list if it affects you
This primarily affected builds where CMake generates NATIVE subdirectory
builds for host tools.
This patch modifies TableGen.cmake to:
1. Detect the Ninja version at configure time
2. Disable depfile mode (fall back to globbing .td files) when:
- Ninja version is < 1.10, OR
- The tablegen invocation produces multiple outputs (e.g. -gen-register-info)
The fallback mode maintains correct dependency tracking by explicitly
globbing all .td files in include directories, ensuring tablegen reruns
when dependencies change.
Tested with:
- Ninja 1.8.2 (now works - previously failed with depslog error)
- Ninja 1.13.2
- CMake 4.2.3
- build scenarios with NATIVE tool builds
This commit used a significant amount of tooling to construct.
---
llvm/cmake/modules/TableGen.cmake | 46 +++++++++++++++++++++++++++++--
1 file changed, 43 insertions(+), 3 deletions(-)
diff --git a/llvm/cmake/modules/TableGen.cmake b/llvm/cmake/modules/TableGen.cmake
index 84c03cd6432ed..1bac44c70d2c8 100644
--- a/llvm/cmake/modules/TableGen.cmake
+++ b/llvm/cmake/modules/TableGen.cmake
@@ -24,6 +24,14 @@ function(tablegen project ofn)
# Filter out any empty include items.
list(REMOVE_ITEM tblgen_includes "")
+ # Check for multi-output tablegen invocations BEFORE deciding on depfile mode.
+ # Ninja's depslog cannot handle multiple outputs with depfile, so we must use
+ # fallback mode (globbing) for these cases.
+ set(has_extra_outputs FALSE)
+ if("-gen-register-info" IN_LIST ARGN)
+ set(has_extra_outputs TRUE)
+ endif()
+
# Use depfile instead of globbing arbitrary *.td(s) for Ninja. We force
# CMake versions older than v3.30 on Windows to use the fallback behavior
# due to a depfile parsing bug on Windows paths in versions prior to 3.30.
@@ -32,10 +40,28 @@ function(tablegen project ofn)
# behavior as v3.22 and earlier fail to parse some depfiles that get
# generated, and this behavior was fixed in CMake commit
# e04a352cca523eba2ac0d60063a3799f5bb1c69e.
+ # CRITICAL: Ninja <1.10 has a depslog limitation: it cannot handle depfile
+ # mode when CMake generates implicit outputs (absolute path aliases for IDE
+ # support). For multi-output rules OR when using Ninja <1.10, we MUST use
+ # fallback mode (glob .td files) to avoid "multiple outputs aren't supported
+ # by depslog" errors.
cmake_policy(GET CMP0116 cmp0116_state)
+
+ # Check Ninja version to avoid depslog errors with implicit outputs
+ set(ninja_version_supports_depfile TRUE)
+ if(CMAKE_GENERATOR MATCHES "Ninja")
+ execute_process(COMMAND ${CMAKE_MAKE_PROGRAM} --version
+ OUTPUT_VARIABLE ninja_version
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if(ninja_version VERSION_LESS "1.10")
+ set(ninja_version_supports_depfile FALSE)
+ endif()
+ endif()
if(CMAKE_GENERATOR MATCHES "Ninja" AND cmp0116_state STREQUAL NEW
AND NOT (CMAKE_HOST_WIN32 AND CMAKE_VERSION VERSION_LESS 3.30)
- AND NOT (CMAKE_VERSION VERSION_LESS 3.23))
+ AND NOT (CMAKE_VERSION VERSION_LESS 3.23)
+ AND NOT has_extra_outputs
+ AND ninja_version_supports_depfile)
# CMake emits build targets as relative paths but Ninja doesn't identify
# absolute path (in *.d) as relative path (in build.ninja). Post CMP0116,
# CMake handles this discrepancy for us, otherwise we use the fallback
@@ -128,7 +154,7 @@ function(tablegen project ofn)
# ("${${project}_TABLEGEN_TARGET}" STREQUAL "${${project}_TABLEGEN_EXE}")
# but lets us having smaller and cleaner code here.
set(tablegen_exe ${${project}_TABLEGEN_EXE})
- set(tablegen_depends ${${project}_TABLEGEN_TARGET} ${tablegen_exe})
+ set(tablegen_target ${${project}_TABLEGEN_TARGET})
if(LLVM_PARALLEL_TABLEGEN_JOBS)
set(LLVM_TABLEGEN_JOB_POOL JOB_POOL tablegen_job_pool)
@@ -136,6 +162,20 @@ function(tablegen project ofn)
set(LLVM_TABLEGEN_JOB_POOL "")
endif()
+ # For Ninja with multiple outputs, we cannot add the target to DEPENDS due to
+ # depslog limitations. Instead, rely on the implicit tool dependency from COMMAND
+ # and the globbed .td files for proper dependency tracking.
+ # For single outputs or non-Ninja generators, include the target in DEPENDS.
+ set(tablegen_target_dep)
+ if(NOT EXTRA_OUTPUTS)
+ # Single output: safe to add explicit target dependency
+ set(tablegen_target_dep ${tablegen_target})
+ elseif(NOT CMAKE_GENERATOR MATCHES "Ninja")
+ # Multiple outputs but not Ninja: Ninja's depslog is not a constraint
+ set(tablegen_target_dep ${tablegen_target})
+ endif()
+ # Multiple outputs + Ninja: Don't add target dependency; rely on COMMAND implicit tracking
+
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} ${EXTRA_OUTPUTS}
COMMAND ${tablegen_exe} ${ARG_UNPARSED_ARGUMENTS}
${tblgen_includes}
@@ -146,7 +186,7 @@ function(tablegen project ofn)
# The file in LLVM_TARGET_DEFINITIONS may be not in the current
# directory and local_tds may not contain it, so we must
# explicitly list it here:
- DEPENDS ${ARG_DEPENDS} ${tablegen_depends}
+ DEPENDS ${ARG_DEPENDS} ${tablegen_target_dep}
${global_tds}
${LLVM_TARGET_DEFINITIONS_ABSOLUTE}
${LLVM_TARGET_DEPENDS}
>From 321503c402d213cc2bbe3018c9909a7bd7dad86b Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Thu, 5 Feb 2026 18:38:20 -0800
Subject: [PATCH 05/33] [lldb] Add a new way of loading files from a shared
cache (#179881)
Taking advantage of a few new SPI in macOS 26.4 libdyld, it is possible
for lldb to load binaries out of a shared cache binary blob, instead of
needing discrete files on disk. lldb has had one special case where it
has done this for years -- if the debuggee process and lldb itself are
using the same shared cache, it could create ObjectFiles based on its
own memory contents. This new method requires only the shared cache on
disk, not depending on it being mapped into lldb's address space
already.
In HostInfoMacOSX.mm, we create an array of binaries in lldb's shared
cache, by one of two methods depending on the availability of SPI/SDKs.
This PR adds a new third method for loading lldb's shared cache off disk
as a proof of concept. It will prefer this new method when the needed
SPI are available at runtime. There is also a user setting to disable
this new method in case we uncover a problem as it is deployed.
I did change the internal store of the shared cache files from a single
array, to being organized by shared cache UUIDs, so we can have multiple
shared caches indexed in the future.
In HostInfoBase.h's SharedCacheImageInfo class, you can now create an
ImageInfo with a DataExtractorSP or a void* baton. I added GetUUID and
GetExtractor methods, and the latter will use the libdyld SPI to map the
segments for a specific binary into lldb's memory and return a
DataExtractorSP.
The setting is currently called symbols.shared-cache-binary-loading.
In DynamicLoaderDarwin::FindTargetModuleForImageInfo there was an
ordering mistake where we would always consult the HostInfoMacOSX.mm
shared cache provider, instead of checking lldb's own global module
cache first when looking for a binary, resulting in creating a new
Module repeatedly for shared cache binaries with the new method, parsing
the symbol table repeatedly. I fixed the ordering so we look at existing
Modules before we check the shared cache for one.
In ObjectFileMachOTest, it tests a TEXT and a DATA symbol, checking that
the contents of the function/data object match the bytes we got from the
shared cache. The test was using a DATA_DIRTY symbol, which was fine
when using lldb's own shared cache memory, but when we worked on the
shared cache binary on-disk directly, we were seeing different values
for the bytes because of relocations in there. I changed this to a
constant DATA symbol.
rdar://148939795
---------
Co-authored-by: Jonas Devlieghere <jonas at devlieghere.com>
Co-authored-by: Alex Langford <nirvashtzero at gmail.com>
---
lldb/include/lldb/Core/ModuleList.h | 2 +
lldb/include/lldb/Host/HostInfoBase.h | 35 ++-
lldb/source/Core/CoreProperties.td | 4 +
lldb/source/Core/ModuleList.cpp | 10 +
lldb/source/Host/macosx/objcxx/CMakeLists.txt | 1 +
.../Host/macosx/objcxx/HostInfoMacOSX.mm | 243 ++++++++++++++++--
.../MacOSX-DYLD/DynamicLoaderDarwin.cpp | 19 +-
.../Platform/MacOSX/PlatformDarwinDevice.cpp | 10 +-
.../SymbolLocatorDebugSymbols.cpp | 10 +-
.../unittests/ObjectFile/MachO/CMakeLists.txt | 1 +
.../ObjectFile/MachO/TestObjectFileMachO.cpp | 19 +-
11 files changed, 310 insertions(+), 44 deletions(-)
diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h
index dd17f7558e2b4..6f7224fdeb0b3 100644
--- a/lldb/include/lldb/Core/ModuleList.h
+++ b/lldb/include/lldb/Core/ModuleList.h
@@ -81,6 +81,8 @@ class ModuleListProperties : public Properties {
bool SetClangModulesCachePath(const FileSpec &path);
bool GetEnableExternalLookup() const;
bool SetEnableExternalLookup(bool new_value);
+ bool GetSharedCacheBinaryLoading() const;
+ bool SetSharedCacheBinaryLoading(bool new_value);
bool GetEnableLLDBIndexCache() const;
bool SetEnableLLDBIndexCache(bool new_value);
uint64_t GetLLDBIndexCacheMaxByteSize();
diff --git a/lldb/include/lldb/Host/HostInfoBase.h b/lldb/include/lldb/Host/HostInfoBase.h
index 670fee19fca3d..149810ff53924 100644
--- a/lldb/include/lldb/Host/HostInfoBase.h
+++ b/lldb/include/lldb/Host/HostInfoBase.h
@@ -10,6 +10,7 @@
#define LLDB_HOST_HOSTINFOBASE_H
#include "lldb/Utility/ArchSpec.h"
+#include "lldb/Utility/DataExtractor.h"
#include "lldb/Utility/FileSpec.h"
#include "lldb/Utility/UUID.h"
#include "lldb/Utility/UserIDResolver.h"
@@ -28,8 +29,38 @@ namespace lldb_private {
class FileSpec;
struct SharedCacheImageInfo {
- UUID uuid;
- lldb::DataExtractorSP extractor_sp;
+ SharedCacheImageInfo()
+ : m_uuid(), m_extractor_sp(), m_create_data_extractor(nullptr),
+ m_image_baton(nullptr) {}
+ SharedCacheImageInfo(UUID uuid, lldb::DataExtractorSP extractor_sp)
+ : m_uuid(uuid), m_extractor_sp(extractor_sp),
+ m_create_data_extractor(nullptr), m_image_baton(nullptr) {}
+ SharedCacheImageInfo(
+ UUID uuid, lldb::DataExtractorSP (*create_data_extractor)(void *image),
+ void *image_baton)
+ : m_uuid(uuid), m_extractor_sp(),
+ m_create_data_extractor(create_data_extractor),
+ m_image_baton(image_baton) {}
+
+ lldb::DataExtractorSP GetExtractor() {
+ if (!m_extractor_sp && m_image_baton)
+ m_extractor_sp = m_create_data_extractor(m_image_baton);
+ return m_extractor_sp;
+ }
+ const UUID &GetUUID() const { return m_uuid; }
+ void *GetImageBaton();
+ void SetExtractor(lldb::DataExtractorSP extractor_sp) {
+ m_extractor_sp = extractor_sp;
+ }
+ void SetImageBaton(void *image_baton) { m_image_baton = image_baton; }
+ void SetDataExtractorCreateFunction(
+ lldb::DataExtractorSP (*create_data_extractor)(void *image));
+
+private:
+ UUID m_uuid;
+ lldb::DataExtractorSP m_extractor_sp;
+ lldb::DataExtractorSP (*m_create_data_extractor)(void *image);
+ void *m_image_baton;
};
namespace {
diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td
index 2bc62464f91bd..63efcae3d15d3 100644
--- a/lldb/source/Core/CoreProperties.td
+++ b/lldb/source/Core/CoreProperties.td
@@ -14,6 +14,10 @@ let Definition = "modulelist" in {
DefaultEnumValue<"eSymbolDownloadOff">,
EnumValues<"OptionEnumValues(g_auto_download_enum_values)">,
Desc<"On macOS, automatically download symbols with dsymForUUID (or an equivalent script/binary) for relevant images in the debug session.">;
+ def SharedCacheBinaryLoading: Property<"shared-cache-binary-loading", "Boolean">,
+ Global,
+ DefaultTrue,
+ Desc<"On macOS, load the binaries from a shared cache blob directly, instead of loading them from lldb's own in-process shared cache.">;
def ClangModulesCachePath: Property<"clang-modules-cache-path", "FileSpec">,
Global,
DefaultStringValue<"">,
diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp
index 613e469dc6318..fb4a80740200d 100644
--- a/lldb/source/Core/ModuleList.cpp
+++ b/lldb/source/Core/ModuleList.cpp
@@ -118,6 +118,16 @@ SymbolDownload ModuleListProperties::GetSymbolAutoDownload() const {
g_modulelist_properties[idx].default_uint_value));
}
+bool ModuleListProperties::GetSharedCacheBinaryLoading() const {
+ const uint32_t idx = ePropertySharedCacheBinaryLoading;
+ return GetPropertyAtIndexAs<bool>(
+ idx, g_modulelist_properties[idx].default_uint_value != 0);
+}
+
+bool ModuleListProperties::SetSharedCacheBinaryLoading(bool new_value) {
+ return SetPropertyAtIndex(ePropertySharedCacheBinaryLoading, new_value);
+}
+
FileSpec ModuleListProperties::GetClangModulesCachePath() const {
const uint32_t idx = ePropertyClangModulesCachePath;
return GetPropertyAtIndexAs<FileSpec>(idx, {});
diff --git a/lldb/source/Host/macosx/objcxx/CMakeLists.txt b/lldb/source/Host/macosx/objcxx/CMakeLists.txt
index 1d7573335b8ec..a47a1e5086eee 100644
--- a/lldb/source/Host/macosx/objcxx/CMakeLists.txt
+++ b/lldb/source/Host/macosx/objcxx/CMakeLists.txt
@@ -14,6 +14,7 @@ add_lldb_library(lldbHostMacOSXObjCXX NO_PLUGIN_DEPENDENCIES
Support
TargetParser
LINK_LIBS
+ lldbCore
lldbUtility
${EXTRA_LIBS}
)
diff --git a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
index 8effe4cc169e0..114b91fc0b4d3 100644
--- a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
+++ b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "lldb/Host/macosx/HostInfoMacOSX.h"
+#include "lldb/Core/ModuleList.h"
#include "lldb/Host/FileSystem.h"
#include "lldb/Host/Host.h"
#include "lldb/Host/HostInfo.h"
@@ -16,6 +17,7 @@
#include "lldb/Utility/LLDBLog.h"
#include "lldb/Utility/Log.h"
#include "lldb/Utility/Timer.h"
+#include "lldb/Utility/VirtualDataExtractor.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
@@ -30,6 +32,7 @@
// C inclues
#include <cstdlib>
+#include <dlfcn.h>
#include <sys/sysctl.h>
#include <sys/syslimits.h>
#include <sys/types.h>
@@ -65,6 +68,7 @@
#include <TargetConditionals.h> // for TARGET_OS_TV, TARGET_OS_WATCH
+using namespace lldb;
using namespace lldb_private;
std::optional<std::string> HostInfoMacOSX::GetOSBuildString() {
@@ -649,30 +653,229 @@ static bool ResolveAndVerifyCandidateSupportDir(FileSpec &path) {
dyld_shared_cache_dylib_text_info;
}
-extern "C" int dyld_shared_cache_iterate_text(
+// All available on at least macOS 12
+extern "C" {
+int dyld_shared_cache_iterate_text(
const uuid_t cacheUuid,
void (^callback)(const dyld_shared_cache_dylib_text_info *info));
-extern "C" uint8_t *_dyld_get_shared_cache_range(size_t *length);
-extern "C" bool _dyld_get_shared_cache_uuid(uuid_t uuid);
+uint8_t *_dyld_get_shared_cache_range(size_t *length);
+bool _dyld_get_shared_cache_uuid(uuid_t uuid);
+bool dyld_image_for_each_segment_info(void *image,
+ void (^)(const char *segmentName,
+ uint64_t vmAddr, uint64_t vmSize,
+ int perm));
+const char *dyld_shared_cache_file_path(void);
+bool dyld_shared_cache_for_file(const char *filePath,
+ void (^block)(void *cache));
+void dyld_shared_cache_copy_uuid(void *cache, uuid_t *uuid);
+uint64_t dyld_shared_cache_get_base_address(void *cache);
+void dyld_shared_cache_for_each_image(void *cache, void (^block)(void *image));
+bool dyld_image_copy_uuid(void *cache, uuid_t *uuid);
+const char *dyld_image_get_installname(void *image);
+const char *dyld_image_get_file_path(void *image);
+}
namespace {
class SharedCacheInfo {
public:
- const UUID &GetUUID() const { return m_uuid; }
- const llvm::StringMap<SharedCacheImageInfo> &GetImages() const {
- return m_images;
+ llvm::StringMap<SharedCacheImageInfo> &GetImages() {
+ return m_caches[m_host_uuid];
}
SharedCacheInfo();
private:
bool CreateSharedCacheInfoWithInstrospectionSPIs();
+ void CreateSharedCacheInfoLLDBsVirtualMemory();
+ bool CreateHostSharedCacheImageList();
+
+ // Given the UUID and filepath to a shared cache on the local debug host
+ // system, open it and add all of the binary images to m_caches.
+ bool CreateSharedCacheImageList(UUID uuid, std::string filepath);
- llvm::StringMap<SharedCacheImageInfo> m_images;
- UUID m_uuid;
+ std::map<UUID, llvm::StringMap<SharedCacheImageInfo>> m_caches;
+ UUID m_host_uuid;
+
+ // macOS 26.4 and newer
+ void (*m_dyld_image_retain_4HWTrace)(void *image);
+ void (*m_dyld_image_release_4HWTrace)(void *image);
+ dispatch_data_t (*m_dyld_image_segment_data_4HWTrace)(
+ void *image, const char *segmentName);
};
+
+} // namespace
+
+// Resolve the optional macOS 26.4 SPI at runtime, record the UUID of the
+// shared cache lldb itself is running with, then index that cache using the
+// first strategy that succeeds.
+SharedCacheInfo::SharedCacheInfo() {
+  // macOS 26.4 and newer. dlsym() returns null when a symbol is not present,
+  // so these pointers must be null-checked before they are called.
+  m_dyld_image_retain_4HWTrace =
+      reinterpret_cast<decltype(m_dyld_image_retain_4HWTrace)>(
+          dlsym(RTLD_DEFAULT, "dyld_image_retain_4HWTrace"));
+  m_dyld_image_release_4HWTrace =
+      reinterpret_cast<decltype(m_dyld_image_release_4HWTrace)>(
+          dlsym(RTLD_DEFAULT, "dyld_image_release_4HWTrace"));
+  m_dyld_image_segment_data_4HWTrace =
+      reinterpret_cast<decltype(m_dyld_image_segment_data_4HWTrace)>(
+          dlsym(RTLD_DEFAULT, "dyld_image_segment_data_4HWTrace"));
+
+  // Zero the buffer first: the result of _dyld_get_shared_cache_uuid() is not
+  // checked, and without this a failed lookup would feed uninitialized stack
+  // bytes into m_host_uuid.
+  uuid_t dsc_uuid = {};
+  _dyld_get_shared_cache_uuid(dsc_uuid);
+  m_host_uuid = UUID(dsc_uuid);
+
+  // First choice, only when the user setting allows it: index the binaries
+  // from the host's shared cache file.
+  if (ModuleList::GetGlobalModuleListProperties()
+          .GetSharedCacheBinaryLoading() &&
+      CreateHostSharedCacheImageList())
+    return;
+
+  // Second choice: the dyld introspection SPI, when lldb was built with it.
+  if (CreateSharedCacheInfoWithInstrospectionSPIs())
+    return;
+
+  // Last resort: walk the shared cache mapped into lldb's own memory.
+  CreateSharedCacheInfoLLDBsVirtualMemory();
+}
+
+// One segment of a shared cache image: its name and virtual address range as
+// reported by dyld_image_for_each_segment_info, plus the location and size of
+// the copy that dispatch_data_create_map exposes in lldb's address space.
+// NOTE(review): no member has a default initializer, so `data` and `size`
+// are only meaningful after dispatch_data_create_map has filled them in.
+struct segment {
+  std::string name;
+  uint64_t vmaddr;
+  size_t vmsize;
+
+  // Mapped into lldb's own address space via libdispatch:
+  const void *data;
+  size_t size;
+};
+
+// Build a DataExtractor that covers every segment of the shared cache image
+// `image`, using the macOS 26.4 dyld_image_segment_data_4HWTrace SPI.
+//
+// Returns an empty DataExtractorSP when the SPI cannot be resolved with
+// dlsym or when the image reports no segments. Otherwise returns a
+// VirtualDataExtractor over an unowned buffer spanning the lowest to the
+// highest mapped segment address, with one lookup-table entry per segment.
+static DataExtractorSP map_shared_cache_binary_segments(void *image) {
+  // dyld_image_segment_data_4HWTrace can't be called on
+  // multiple threads simultaneously.
+  static std::mutex g_mutex;
+  std::lock_guard<std::mutex> guard(g_mutex);
+
+  // Resolve the SPI once per process; the pointer stays null if the symbol
+  // is not available.
+  static dispatch_data_t (*g_dyld_image_segment_data_4HWTrace)(
+      void *image, const char *segmentName);
+  static std::once_flag g_once_flag;
+  std::call_once(g_once_flag, [&]() {
+    g_dyld_image_segment_data_4HWTrace =
+        (dispatch_data_t(*)(void *, const char *))dlsym(
+            RTLD_DEFAULT, "dyld_image_segment_data_4HWTrace");
+  });
+  if (!g_dyld_image_segment_data_4HWTrace)
+    return {};
+
+  // Collect every segment reported for this image together with its mapping.
+  __block std::vector<segment> segments;
+  __block void *image_copy = image;
+  dyld_image_for_each_segment_info(
+      image,
+      ^(const char *segmentName, uint64_t vmAddr, uint64_t vmSize, int perm) {
+        segment seg;
+        seg.name = segmentName;
+        seg.vmaddr = vmAddr;
+        seg.vmsize = vmSize;
+
+        // NOTE(review): data_from_libdyld is not checked for null, and the
+        // dispatch_data_t returned by dispatch_data_create_map is discarded.
+        // Presumably the mapping is meant to stay alive for the rest of the
+        // process so the unowned buffer below remains valid -- confirm.
+        dispatch_data_t data_from_libdyld =
+            g_dyld_image_segment_data_4HWTrace(image_copy, segmentName);
+        (void)dispatch_data_create_map(data_from_libdyld, &seg.data, &seg.size);
+
+        segments.push_back(seg);
+      });
+
+  if (!segments.size())
+    return {};
+
+  Log *log = GetLog(LLDBLog::Modules);
+  bool log_verbosely = log && log->GetVerbose();
+  for (const segment &seg : segments) {
+    if (log_verbosely)
+      LLDB_LOGF(
+          log,
+          "image %p %s vmaddr 0x%llx vmsize 0x%zx mapped to lldb vm addr %p",
+          image, seg.name.c_str(), seg.vmaddr, seg.vmsize, seg.data);
+  }
+
+  // Calculate the virtual address range in lldb's
+  // address space (lowest memory address to highest) so
+  // we can contain the entire range in an unowned data buffer.
+  uint64_t min_lldb_vm_addr = UINT64_MAX;
+  uint64_t max_lldb_vm_addr = 0;
+  // Calculate the minimum shared cache address seen; we want the first
+  // segment, __TEXT, at "vm offset" 0 in our DataExtractor.
+  // A __DATA segment which is at the __TEXT vm addr + 0x1000 needs to be
+  // listed as offset 0x1000.
+  uint64_t min_file_vm_addr = UINT64_MAX;
+  for (const segment &seg : segments) {
+    min_lldb_vm_addr = std::min(min_lldb_vm_addr, (uint64_t)seg.data);
+    // NOTE(review): the upper bound uses seg.vmsize rather than the mapped
+    // seg.size -- confirm the two always agree for shared cache segments.
+    max_lldb_vm_addr =
+        std::max(max_lldb_vm_addr, (uint64_t)seg.data + seg.vmsize);
+    min_file_vm_addr = std::min(min_file_vm_addr, (uint64_t)seg.vmaddr);
+  }
+  DataBufferSP data_sp = std::make_shared<DataBufferUnowned>(
+      (uint8_t *)min_lldb_vm_addr, max_lldb_vm_addr - min_lldb_vm_addr);
+  // One entry per segment: its address relative to the lowest segment
+  // vmaddr, its vmsize, and where its mapping starts inside data_sp.
+  VirtualDataExtractor::LookupTable remap_table;
+  for (const segment &seg : segments)
+    remap_table.Append(VirtualDataExtractor::LookupTable::Entry(
+        (uint64_t)seg.vmaddr - min_file_vm_addr, (uint64_t)seg.vmsize,
+        (uint64_t)seg.data - (uint64_t)min_lldb_vm_addr));
+
+  return std::make_shared<VirtualDataExtractor>(data_sp, remap_table);
}
+// Scan the binaries in the shared cache file at `filepath` using the
+// macOS 26.4 libdyld SPI and, if the file's UUID matches `uuid`, record
+// every image under that UUID's entry in m_caches.
+//
+// Returns false if the required SPI is unavailable, if
+// dyld_shared_cache_for_file() reports failure for `filepath`, or if the
+// file's UUID does not match `uuid`. Returns true otherwise.
+bool SharedCacheInfo::CreateSharedCacheImageList(UUID uuid,
+                                                 std::string filepath) {
+  if (!m_dyld_image_retain_4HWTrace || !m_dyld_image_release_4HWTrace ||
+      !m_dyld_image_segment_data_4HWTrace)
+    return false;
+
+  __block bool uuid_mismatch = false;
+  // Keep the SPI's own result: without it, a cache file that was never
+  // handed to the block would be reported as indexed successfully.
+  const bool cache_visited =
+      dyld_shared_cache_for_file(filepath.c_str(), ^(void *cache) {
+        uuid_t sc_uuid;
+        dyld_shared_cache_copy_uuid(cache, &sc_uuid);
+        UUID this_cache(sc_uuid, sizeof(uuid_t));
+        if (this_cache != uuid) {
+          uuid_mismatch = true;
+          return;
+        }
+
+        dyld_shared_cache_for_each_image(cache, ^(void *image) {
+          // Images without a UUID are skipped.
+          uuid_t uuid_tmp;
+          if (!dyld_image_copy_uuid(image, &uuid_tmp))
+            return;
+          UUID image_uuid(uuid_tmp, sizeof(uuid_t));
+          const char *installname = dyld_image_get_installname(image);
+
+          Log *log = GetLog(LLDBLog::Modules);
+          if (log && log->GetVerbose())
+            LLDB_LOGF(log, "sc file %s image %p", installname, image);
+
+          // Retain the image handle because it is stored in the
+          // SharedCacheImageInfo and handed to
+          // map_shared_cache_binary_segments later.
+          m_dyld_image_retain_4HWTrace(image);
+          // File the image under the UUID this cache was verified against,
+          // not unconditionally under the host's UUID.
+          m_caches[uuid][installname] = SharedCacheImageInfo(
+              image_uuid, map_shared_cache_binary_segments, image);
+        });
+      });
+
+  return cache_visited && !uuid_mismatch;
+}
+
+// Get the filename and UUID of lldb's own shared cache, then index the
+// binaries in it using the macOS 26.4 and newer libdyld SPI.
+//
+// Returns false if dyld reports no shared cache file path, if no valid UUID
+// could be read from that file, or if CreateSharedCacheImageList() fails.
+// Returns true otherwise.
+bool SharedCacheInfo::CreateHostSharedCacheImageList() {
+  // Constructing a std::string from a null pointer is undefined behavior, so
+  // check the SPI's result before using it.
+  // NOTE(review): presumably null means the process has no shared cache file
+  // -- confirm against the dyld SPI.
+  const char *host_shared_cache_path = dyld_shared_cache_file_path();
+  if (!host_shared_cache_path)
+    return false;
+  std::string host_shared_cache_file = host_shared_cache_path;
+
+  // Visit the cache file once just to read its UUID. If the block never
+  // runs, host_sc_uuid keeps its default-constructed value and the
+  // IsValid() check below rejects it.
+  __block UUID host_sc_uuid;
+  dyld_shared_cache_for_file(host_shared_cache_file.c_str(), ^(void *cache) {
+    uuid_t sc_uuid;
+    dyld_shared_cache_copy_uuid(cache, &sc_uuid);
+    host_sc_uuid = UUID(sc_uuid, sizeof(uuid_t));
+  });
+
+  if (!host_sc_uuid.IsValid())
+    return false;
+
+  return CreateSharedCacheImageList(host_sc_uuid, host_shared_cache_file);
+}
+
+// Index the binaries in lldb's own shared cache memory, using
+// libdyld SPI present on macOS 12 and newer, when building against
+// the internal SDK, and add an entry to the m_caches map.
bool SharedCacheInfo::CreateSharedCacheInfoWithInstrospectionSPIs() {
#if defined(SDK_HAS_NEW_DYLD_INTROSPECTION_SPIS)
dyld_process_t dyld_process = dyld_process_create_for_current_task();
@@ -713,33 +916,31 @@ static bool ResolveAndVerifyCandidateSupportDir(FileSpec &path) {
lldb::DataBufferSP data_sp = std::make_shared<DataBufferUnowned>(
(uint8_t *)minVmAddr, maxVmAddr - minVmAddr);
lldb::DataExtractorSP extractor_sp = std::make_shared<DataExtractor>(data_sp);
- m_images[dyld_image_get_installname(image)] = SharedCacheImageInfo{
- UUID(uuid, 16), extractor_sp};
+ m_caches[m_host_uuid][dyld_image_get_installname(image)] =
+ SharedCacheImageInfo{UUID(uuid, 16), extractor_sp};
});
return true;
#endif
return false;
}
-SharedCacheInfo::SharedCacheInfo() {
- if (CreateSharedCacheInfoWithInstrospectionSPIs())
- return;
-
+// Index the binaries in lldb's own shared cache memory using
+// libdyld SPI available on macOS 10.13 or newer, add an entry to
+// m_caches.
+void SharedCacheInfo::CreateSharedCacheInfoLLDBsVirtualMemory() {
size_t shared_cache_size;
uint8_t *shared_cache_start =
_dyld_get_shared_cache_range(&shared_cache_size);
- uuid_t dsc_uuid;
- _dyld_get_shared_cache_uuid(dsc_uuid);
- m_uuid = UUID(dsc_uuid);
dyld_shared_cache_iterate_text(
- dsc_uuid, ^(const dyld_shared_cache_dylib_text_info *info) {
- lldb::DataBufferSP data_sp = std::make_shared<DataBufferUnowned>(
+ m_host_uuid.GetBytes().data(),
+ ^(const dyld_shared_cache_dylib_text_info *info) {
+ lldb::DataBufferSP buffer_sp = std::make_shared<DataBufferUnowned>(
shared_cache_start + info->textSegmentOffset,
shared_cache_size - info->textSegmentOffset);
lldb::DataExtractorSP extractor_sp =
- std::make_shared<DataExtractor>(data_sp);
- m_images[info->path] =
+ std::make_shared<DataExtractor>(buffer_sp);
+ m_caches[m_host_uuid][info->path] =
SharedCacheImageInfo{UUID(info->dylibUuid, 16), extractor_sp};
});
}
diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp
index a3b0001466bbb..00cae1c6cea1e 100644
--- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp
+++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp
@@ -124,7 +124,12 @@ ModuleSP DynamicLoaderDarwin::FindTargetModuleForImageInfo(
if (module_sp || !can_create)
return module_sp;
- if (HostInfo::GetArchitecture().IsCompatibleMatch(target.GetArchitecture())) {
+ // See if we have this binary in the Target or the global Module
+ // cache already.
+ module_sp = target.GetOrCreateModule(module_spec, /*notify=*/false);
+
+ if (!module_sp &&
+ HostInfo::GetArchitecture().IsCompatibleMatch(target.GetArchitecture())) {
// When debugging on the host, we are most likely using the same shared
// cache as our inferior. The dylibs from the shared cache might not
// exist on the filesystem, so let's use the images in our own memory
@@ -135,18 +140,18 @@ ModuleSP DynamicLoaderDarwin::FindTargetModuleForImageInfo(
// If we found it and it has the correct UUID, let's proceed with
// creating a module from the memory contents.
- if (image_info.uuid &&
- (!module_spec.GetUUID() || module_spec.GetUUID() == image_info.uuid)) {
- ModuleSpec shared_cache_spec(module_spec.GetFileSpec(), image_info.uuid,
- image_info.extractor_sp);
+ if (image_info.GetUUID() &&
+ (!module_spec.GetUUID() ||
+ module_spec.GetUUID() == image_info.GetUUID())) {
+ ModuleSpec shared_cache_spec(module_spec.GetFileSpec(),
+ image_info.GetUUID(),
+ image_info.GetExtractor());
module_sp =
target.GetOrCreateModule(shared_cache_spec, false /* notify */);
}
}
// We'll call Target::ModulesDidLoad after all the modules have been
// added to the target, don't let it be called for every one.
- if (!module_sp)
- module_sp = target.GetOrCreateModule(module_spec, false /* notify */);
if (!module_sp || module_sp->GetObjectFile() == nullptr) {
llvm::Expected<ModuleSP> module_sp_or_err = m_process->ReadModuleFromMemory(
image_info.file_spec, image_info.address);
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp
index a6dc6759c16df..c1a04e801107e 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinDevice.cpp
@@ -323,10 +323,12 @@ lldb_private::Status PlatformDarwinDevice::GetSharedModuleWithLocalCache(
// If we found it and it has the correct UUID, let's proceed with
// creating a module from the memory contents.
- if (image_info.uuid &&
- (!module_spec.GetUUID() || module_spec.GetUUID() == image_info.uuid)) {
- ModuleSpec shared_cache_spec(module_spec.GetFileSpec(), image_info.uuid,
- image_info.extractor_sp);
+ if (image_info.GetUUID() &&
+ (!module_spec.GetUUID() ||
+ module_spec.GetUUID() == image_info.GetUUID())) {
+ ModuleSpec shared_cache_spec(module_spec.GetFileSpec(),
+ image_info.GetUUID(),
+ image_info.GetExtractor());
err = ModuleList::GetSharedModule(shared_cache_spec, module_sp,
old_modules, did_create_ptr);
if (module_sp) {
diff --git a/lldb/source/Plugins/SymbolLocator/DebugSymbols/SymbolLocatorDebugSymbols.cpp b/lldb/source/Plugins/SymbolLocator/DebugSymbols/SymbolLocatorDebugSymbols.cpp
index 1e51dda15d6e9..f4b572c9e88ac 100644
--- a/lldb/source/Plugins/SymbolLocator/DebugSymbols/SymbolLocatorDebugSymbols.cpp
+++ b/lldb/source/Plugins/SymbolLocator/DebugSymbols/SymbolLocatorDebugSymbols.cpp
@@ -208,8 +208,9 @@ std::optional<ModuleSpec> SymbolLocatorDebugSymbols::LocateExecutableObjectFile(
// If we found it and it has the correct UUID, let's proceed with
// creating a module from the memory contents.
- if (image_info.uuid && (!module_spec.GetUUID() ||
- module_spec.GetUUID() == image_info.uuid)) {
+ if (image_info.GetUUID() &&
+ (!module_spec.GetUUID() ||
+ module_spec.GetUUID() == image_info.GetUUID())) {
success = true;
return_module_spec.GetFileSpec() = module_spec.GetFileSpec();
LLDB_LOGF(log,
@@ -650,8 +651,9 @@ static int LocateMacOSXFilesUsingDebugSymbols(const ModuleSpec &module_spec,
// If we found it and it has the correct UUID, let's proceed with
// creating a module from the memory contents.
- if (image_info.uuid && (!module_spec.GetUUID() ||
- module_spec.GetUUID() == image_info.uuid)) {
+ if (image_info.GetUUID() &&
+ (!module_spec.GetUUID() ||
+ module_spec.GetUUID() == image_info.GetUUID())) {
success = true;
return_module_spec.GetFileSpec() = module_spec.GetFileSpec();
LLDB_LOGF(log,
diff --git a/lldb/unittests/ObjectFile/MachO/CMakeLists.txt b/lldb/unittests/ObjectFile/MachO/CMakeLists.txt
index b6c4225114a36..1b071ff1bd738 100644
--- a/lldb/unittests/ObjectFile/MachO/CMakeLists.txt
+++ b/lldb/unittests/ObjectFile/MachO/CMakeLists.txt
@@ -6,5 +6,6 @@ add_lldb_unittest(ObjectFileMachOTests
lldbPluginSymbolFileSymtab
lldbCore
lldbUtilityHelpers
+ lldbPluginPlatformMacOSX
LLVMTestingSupport
)
diff --git a/lldb/unittests/ObjectFile/MachO/TestObjectFileMachO.cpp b/lldb/unittests/ObjectFile/MachO/TestObjectFileMachO.cpp
index 3adb642c1108e..5b516fc2582f5 100644
--- a/lldb/unittests/ObjectFile/MachO/TestObjectFileMachO.cpp
+++ b/lldb/unittests/ObjectFile/MachO/TestObjectFileMachO.cpp
@@ -6,12 +6,14 @@
//
//===----------------------------------------------------------------------===//
-#include "lldb/Host/HostInfo.h"
#include "Plugins/ObjectFile/Mach-O/ObjectFileMachO.h"
+#include "Plugins/Platform/MacOSX/PlatformMacOSX.h"
+#include "Plugins/Platform/MacOSX/PlatformRemoteMacOSX.h"
#include "TestingSupport/SubsystemRAII.h"
#include "TestingSupport/TestUtilities.h"
#include "lldb/Core/Module.h"
#include "lldb/Host/FileSystem.h"
+#include "lldb/Host/HostInfo.h"
#include "lldb/lldb-defines.h"
#include "gtest/gtest.h"
@@ -19,6 +21,7 @@
#include <dlfcn.h>
#endif
+using namespace lldb;
using namespace lldb_private;
using namespace llvm;
@@ -30,12 +33,16 @@ class ObjectFileMachOTest : public ::testing::Test {
#if defined(__APPLE__)
TEST_F(ObjectFileMachOTest, ModuleFromSharedCacheInfo) {
+ ArchSpec arch("arm64-apple-macosx-");
+
+ Platform::SetHostPlatform(PlatformRemoteMacOSX::CreateInstance(true, &arch));
+
SharedCacheImageInfo image_info =
HostInfo::GetSharedCacheImageInfo("/usr/lib/libobjc.A.dylib");
- EXPECT_TRUE(image_info.uuid);
- EXPECT_TRUE(image_info.extractor_sp);
+ EXPECT_TRUE(image_info.GetUUID());
+ EXPECT_TRUE(image_info.GetExtractor());
- ModuleSpec spec(FileSpec(), UUID(), image_info.extractor_sp);
+ ModuleSpec spec(FileSpec(), UUID(), image_info.GetExtractor());
lldb::ModuleSP module = std::make_shared<Module>(spec);
ObjectFile *OF = module->GetObjectFile();
ASSERT_TRUE(llvm::isa<ObjectFileMachO>(OF));
@@ -74,13 +81,13 @@ TEST_F(ObjectFileMachOTest, ModuleFromSharedCacheInfo) {
// Read a symbol from the __TEXT segment...
check_symbol("objc_msgSend");
// ... and one from the __DATA segment
- check_symbol("OBJC_CLASS_$_NSObject");
+ check_symbol("OBJC_IVAR_$_NSObject.isa");
}
TEST_F(ObjectFileMachOTest, IndirectSymbolsInTheSharedCache) {
SharedCacheImageInfo image_info = HostInfo::GetSharedCacheImageInfo(
"/System/Library/Frameworks/AppKit.framework/Versions/C/AppKit");
- ModuleSpec spec(FileSpec(), UUID(), image_info.extractor_sp);
+ ModuleSpec spec(FileSpec(), UUID(), image_info.GetExtractor());
lldb::ModuleSP module = std::make_shared<Module>(spec);
ObjectFile *OF = module->GetObjectFile();
>From f4aa0c034a283017c2d54450590a116651b36896 Mon Sep 17 00:00:00 2001
From: Keith Smiley <keithbsmiley at gmail.com>
Date: Thu, 5 Feb 2026 18:54:44 -0800
Subject: [PATCH 06/33] [clang-apply-replacements] Change cleanup to only
happen with --format (#178763)
Cleanup can result in many changes unrelated to the given replacements.
This change makes cleanup run only if the user actually wants
clang-apply-replacements to format their code outside of the
replacements.
---
.../tool/ClangApplyReplacementsMain.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang-tools-extra/clang-apply-replacements/tool/ClangApplyReplacementsMain.cpp b/clang-tools-extra/clang-apply-replacements/tool/ClangApplyReplacementsMain.cpp
index 76de8bd877d03..83e47d09f5b53 100644
--- a/clang-tools-extra/clang-apply-replacements/tool/ClangApplyReplacementsMain.cpp
+++ b/clang-tools-extra/clang-apply-replacements/tool/ClangApplyReplacementsMain.cpp
@@ -139,7 +139,7 @@ int main(int argc, char **argv) {
return 1;
tooling::ApplyChangesSpec Spec;
- Spec.Cleanup = true;
+ Spec.Cleanup = DoFormat;
Spec.Format = DoFormat ? tooling::ApplyChangesSpec::kAll
: tooling::ApplyChangesSpec::kNone;
Spec.Style = DoFormat ? FormatStyle : format::getNoStyle();
>From 1f2ab9cab2a44978c7fa97e55f9008fedf577103 Mon Sep 17 00:00:00 2001
From: Snehasish Kumar <mail at snehasish.net>
Date: Thu, 5 Feb 2026 19:20:32 -0800
Subject: [PATCH 07/33] [InstCombine][profcheck] Fix profile metadata
propagation in takeLog2 (#179331)
Pass the select inst to Builder.CreateSelect so that profile metadata is retained.
Assisted-by: gemini
---
.../Transforms/InstCombine/InstCombineMulDivRem.cpp | 8 +++++++-
llvm/test/Transforms/InstCombine/cttz.ll | 11 ++++++++---
llvm/utils/profcheck-xfail.txt | 7 -------
3 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 759af82ebbbdc..8d053bd499fce 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -41,6 +41,10 @@
using namespace llvm;
using namespace PatternMatch;
+namespace llvm {
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+}
+
/// The specific integer value is used in a context where it is known to be
/// non-zero. If this allows us to simplify the computation, do so and return
/// the new operand, otherwise return null.
@@ -1619,7 +1623,9 @@ Value *InstCombinerImpl::takeLog2(Value *Op, unsigned Depth, bool AssumeNonZero,
if (Value *LogY =
takeLog2(SI->getOperand(2), Depth, AssumeNonZero, DoFold))
return IfFold([&]() {
- return Builder.CreateSelect(SI->getOperand(0), LogX, LogY);
+ return Builder.CreateSelect(SI->getOperand(0), LogX, LogY, "",
+ ProfcheckDisableMetadataFixes ? nullptr
+ : SI);
});
// log2(umin(X, Y)) -> umin(log2(X), log2(Y))
diff --git a/llvm/test/Transforms/InstCombine/cttz.ll b/llvm/test/Transforms/InstCombine/cttz.ll
index 829213b24e93e..b3291e7058896 100644
--- a/llvm/test/Transforms/InstCombine/cttz.ll
+++ b/llvm/test/Transforms/InstCombine/cttz.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals smart
; RUN: opt < %s -S -passes=instcombine | FileCheck %s
declare i32 @llvm.cttz.i32(i32, i1)
@@ -341,16 +341,18 @@ define i8 @fold_ctz_log2_maybe_z_okay(i8 %x, i8 %y, i1 %c) {
; CHECK-LABEL: @fold_ctz_log2_maybe_z_okay(
; CHECK-NEXT: [[X:%.*]] = add i8 [[X1:%.*]], 1
; CHECK-NEXT: [[Y:%.*]] = add i8 [[Y1:%.*]], 2
-; CHECK-NEXT: [[V_V:%.*]] = select i1 [[C:%.*]], i8 [[X]], i8 [[Y]]
+; CHECK-NEXT: [[V_V:%.*]] = select i1 [[C:%.*]], i8 [[X]], i8 [[Y]], !prof [[PROF0:![0-9]+]]
; CHECK-NEXT: ret i8 [[V_V]]
;
%p2 = shl i8 2, %x
%p2_2 = shl i8 4, %y
- %v = select i1 %c, i8 %p2, i8 %p2_2
+ %v = select i1 %c, i8 %p2, i8 %p2_2, !prof !0
%r = call i8 @llvm.cttz(i8 %v, i1 true)
ret i8 %r
}
+!0 = !{!"branch_weights", i32 1, i32 2}
+
define i8 @fold_clz_log2(i8 %x) {
; CHECK-LABEL: @fold_clz_log2(
; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[X:%.*]], i8 5)
@@ -390,3 +392,6 @@ define i9 @fold_clz_log2_i9(i9 %x) {
%r = call i9 @llvm.ctlz(i9 %v, i1 true)
ret i9 %r
}
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 2}
+;.
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index 6cd61c060dd6e..468fdc1f4c986 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -208,13 +208,10 @@ Transforms/IndVarSimplify/invalidate-modified-lcssa-phi.ll
Transforms/IndVarSimplify/pr45835.ll
Transforms/IndVarSimplify/preserving-debugloc-rem-div.ll
Transforms/InstCombine/2004-09-20-BadLoadCombine.ll
-Transforms/InstCombine/2005-04-07-UDivSelectCrash.ll
Transforms/InstCombine/add-shl-mul-umax.ll
Transforms/InstCombine/and2.ll
Transforms/InstCombine/and-fcmp.ll
Transforms/InstCombine/and-or-icmps.ll
-Transforms/InstCombine/apint-div1.ll
-Transforms/InstCombine/apint-div2.ll
Transforms/InstCombine/atomic.ll
Transforms/InstCombine/binop-cast.ll
Transforms/InstCombine/binop-select-cast-of-select-cond.ll
@@ -225,9 +222,6 @@ Transforms/InstCombine/canonicalize-clamp-like-pattern-between-negative-and-posi
Transforms/InstCombine/canonicalize-clamp-like-pattern-between-zero-and-positive-threshold.ll
Transforms/InstCombine/cast-mul-select.ll
Transforms/InstCombine/clamp-to-minmax.ll
-Transforms/InstCombine/cttz.ll
-Transforms/InstCombine/div.ll
-Transforms/InstCombine/div-shift.ll
Transforms/InstCombine/fabs.ll
Transforms/InstCombine/fcmp-select.ll
Transforms/InstCombine/ffs-1.ll
@@ -259,7 +253,6 @@ Transforms/InstCombine/minmax-fp.ll
Transforms/InstCombine/minmax-intrinsics.ll
Transforms/InstCombine/mul-inseltpoison.ll
Transforms/InstCombine/mul.ll
-Transforms/InstCombine/mul-pow2.ll
Transforms/InstCombine/multiple-uses-load-bitcast-select.ll
Transforms/InstCombine/nested-select.ll
Transforms/InstCombine/or-fcmp.ll
>From ad1f8b8b7508a2af3271c4d8024ed48931a51555 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 5 Feb 2026 19:26:25 -0800
Subject: [PATCH 08/33] [MLIR][XeGPU] Refactor layout propagation utilities
(#179016)
This PR refactors layout propagation into two distinct components:
result/anchor layout setup and source layout inference from the result.
For operations that require a specific result layout due to semantic or
hardware constraints, the propagation logic explicitly sets up the
result or anchor layout. Otherwise, it infers the source layout from the
backward-propagated consumer layout.
The result or anchor layout may differ from the backward-propagated
consumer layout; any such discrepancies are resolved via the existing
layout-conflict mechanism.
**This PR introduces the following utility functions:**
Source layout inference:
> inferBroadcastSourceLayout()
> inferMultiReductionSourceLayout()
> inferBitCastSourceLayout()
> inferShapeCastSourceLayout()
> inferInsertStridedSliceSourceLayout()
Result / anchor layout setup:
> setupMultiReductionResultLayout()
> setupBitCastResultLayout()
> setupInsertStridedSliceResultLayout()
> setupLoadMatrixAnchorLayout()
> setupStoreMatrixAnchorLayout()
> setupLoadGatherAnchorLayout()
> setupStoreScatterAnchorLayout()
The subgroup-distribution-related code changes have been split out into a
separate PR: https://github.com/llvm/llvm-project/pull/179018/changes.
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 47 +-
.../Dialect/XeGPU/Transforms/Transforms.h | 6 -
.../XeGPU/Transforms/XeGPULayoutImpl.h | 168 ++++
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 52 +-
.../mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h | 24 +-
.../mlir/Dialect/XeGPU/uArch/uArchBase.h | 44 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 247 ++++-
.../Dialect/XeGPU/Transforms/CMakeLists.txt | 1 +
.../XeGPU/Transforms/XeGPUBlocking.cpp | 1 +
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 851 ++++++++++++++++++
.../Transforms/XeGPUPeepHoleOptimizer.cpp | 1 +
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 531 +++++------
.../Transforms/XeGPUSubgroupDistribute.cpp | 6 +-
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 1 +
.../Transforms/XeGPUWgToSgDistribute.cpp | 33 +-
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 160 ++--
.../XeGPU/propagate-layout-inst-data.mlir | 88 +-
.../XeGPU/propagate-layout-subgroup.mlir | 41 +
mlir/test/Dialect/XeGPU/propagate-layout.mlir | 137 ++-
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 8 +-
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 1 +
21 files changed, 1901 insertions(+), 547 deletions(-)
create mode 100644 mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
create mode 100644 mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index c146105c6b826..898fb7e1d8e6d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -226,16 +226,31 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
InterfaceMethod<"Derive a new layout with sg_data, inst_data and lane_data set to 1 for the specified unit dims",
"xegpu::DistributeLayoutAttr",
"setUnitDimData",
- /*args=*/(ins "const llvm::SetVector<int64_t>": $unitDims)>,
+ /*args=*/(ins "const SmallVector<int64_t>": $unitDims)>,
InterfaceMethod<"Derive a new layout with sg_lane and lane_layout set to 1 for the specified unit dims",
"xegpu::DistributeLayoutAttr",
"setUnitDimLayout",
- /*args=*/(ins "const llvm::SetVector<int64_t>": $unitDims)>,
+ /*args=*/(ins "const SmallVector<int64_t>": $unitDims)>,
InterfaceMethod<[{Delinearizes a linear ID into its multidimensional
indices based on the effective layout level.}],
"FailureOr<SmallVector<Value>>",
"delinearizeId",
(ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId)>,
+ InterfaceMethod<[{Derive a new layout with sg_data, inst_data and lane_data set to the
+ specified values for the given dimension. Passing -1 for any parameter
+ preserves its original value.}],
+ "xegpu::DistributeLayoutAttr",
+ "setDimData",
+ (ins "int64_t": $dim,
+ "int64_t": $sgData,
+ "int64_t": $instData,
+ "int64_t": $laneData)>,
+ InterfaceMethod<[{Derive a new layout by collapsing dimensions.
+ `dimGroup` specifies a group of adjacent dimensions that are collapsed into
+ a single dimension in the derived layout.}],
+ "xegpu::DistributeLayoutAttr",
+ "collapseDims",
+ (ins "SmallVector<int64_t>": $dimGroup)>,
InterfaceMethod<[{Generates instructions to compute multidimensional coordinates for dist units
assigned to a level identified by linearId. The shape parameter
represents the higher-level problem size. Each level may access
@@ -501,10 +516,20 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
}
//set the layout for the sepcified unit dims: sg_data, inst_data and lane_data to 1
- DistributeLayoutAttr setUnitDimData(SetVector<int64_t> unitDims) const;
+ DistributeLayoutAttr setUnitDimData(SmallVector<int64_t> unitDims) const;
//set the layout for the sepcified unit dims: sg_lane and lane_layout to 1
- DistributeLayoutAttr setUnitDimLayout(SetVector<int64_t> unitDims) const;
+ DistributeLayoutAttr setUnitDimLayout(SmallVector<int64_t> unitDims) const;
+
+ // Derive a new layout with sg_data, inst_data and lane_data set to the
+ // specified values for the given dimension. Passing -1 for any parameter
+ // preserves its original value.
+ DistributeLayoutAttr setDimData(int64_t dim, int64_t sgData, int64_t instData, int64_t laneData);
+
+ // Derive a new layout by collapsing dimensions.
+ // `dimGroup` specifies a group of adjacent dimensions
+ // that are collapsed into a single dimension in the derived layout.
+ DistributeLayoutAttr collapseDims(SmallVector<int64_t> dimGroup);
/// Delinearizes a linear ID into its multidimensional indices
/// based on the effective level of the layout.
@@ -672,10 +697,20 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
}
//set the layout for the sepcified unit dims: sg_data, inst_data and lane_data to 1
- DistributeLayoutAttr setUnitDimData(SetVector<int64_t> unitDims) const;
+ DistributeLayoutAttr setUnitDimData(SmallVector<int64_t> unitDims) const;
//set the layout for the sepcified unit dims: sg_lane and lane_layout to 1
- DistributeLayoutAttr setUnitDimLayout(SetVector<int64_t> unitDims) const;
+ DistributeLayoutAttr setUnitDimLayout(SmallVector<int64_t> unitDims) const;
+
+ // Derive a new layout with sg_data, inst_data and lane_data set to the
+ // specified values for the given dimension. Passing -1 for any parameter
+ // preserves its original value.
+ DistributeLayoutAttr setDimData(int64_t dim, int64_t sgData, int64_t instData, int64_t laneData);
+
+ // Derive a new layout by collapsing dimensions.
+ // `dimGroup` specifies a group of adjacent dimensions
+ // that are collapsed into a single dimension in the derived layout.
+ DistributeLayoutAttr collapseDims(SmallVector<int64_t> dimGroup);
/// flatten a nested SliceAttr, e.g., for 2-level nested SliceAttr
/// #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 12]>, dims = [0]>, dims = [0]>
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index fede329990be4..ea01975da582f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -103,12 +103,6 @@ void populateXeGPUSgToWiDistributeTypeConversionAndLegality(
void populateXeGPUUnrollPatterns(RewritePatternSet &patterns,
const UnrollOptions &options);
-enum class LayoutKind { Lane, InstData, Subgroup };
-LogicalResult propagateLayouts(OpBuilder &builder, Operation *target,
- LayoutKind layoutKind, bool printOnly = false);
-
-LogicalResult resolveLayoutConflicts(Operation *target);
-
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
new file mode 100644
index 0000000000000..182607c22c584
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -0,0 +1,168 @@
+//===- XeGPULayoutImpl.h - Layout utility functions ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_XEGPU_UTILS_XeGPULayoutImpl_H_
+#define MLIR_DIALECT_XEGPU_UTILS_XeGPULayoutImpl_H_
+
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OpDefinition.h"
+
+namespace mlir {
+
+class VectorType;
+class OpOperand;
+class OpResult;
+class OpBuilder;
+class ValueRange;
+class TypeConverter;
+class OpFoldResult;
+
+namespace xegpu {
+class DistributeLayoutAttr;
+class LayoutAttr;
+class TensorDescType;
+} // namespace xegpu
+
+namespace xegpu {
+
+enum class LayoutKind { Lane, InstData, Subgroup };
+
+LogicalResult propagateLayouts(OpBuilder &builder, Operation *target,
+ LayoutKind layoutKind, bool printOnly = false);
+
+LogicalResult resolveLayoutConflicts(Operation *target);
+
+/// [to-be-deprecated] Set the DistributeLayoutAttr for each OpOperand and
+/// OpResult of the given operation. If the operation contains regions, it is
+/// also applied recursively to the contained operations.
+/// TODO: To be replaced by recoverTemporaryLayouts()
+void recoverTemporaryLayoutsDeprecated(Operation *op);
+
+/// Attach layout attributes to all vector-type operands of operations within
+/// the given operation's nested region. Reports an error if any vector operand
+/// lacks a layout attribute.
+bool recoverTemporaryLayouts(Operation *rootOp);
+
+/// Removes the LayoutAttr for a given OpOperand or OpResult if it exists.
+template <typename T,
+ typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
+ std::is_same_v<T, OpResult>>>
+void removeLayoutAttr(const T &operandOrResult);
+
+/// Removes the DistributeLayoutAttr for each OpOperand and OpResult of the
+/// given operation if they exist. If the operation contains regions, it is also
+/// applied recursively to the contained operations
+void removeLayoutAttrs(Operation *op);
+
+/// Updates the NamedAttribute sequence by dropping sg-layout and
+/// sg-data information from any DistributeLayoutAttr found.
+SmallVector<NamedAttribute>
+dropSgLayoutAndDataOnAttrs(ArrayRef<NamedAttribute> attrs);
+
+/// Updates the NamedAttribute sequence by dropping inst-data information from
+/// any DistributeLayoutAttr found.
+SmallVector<NamedAttribute> dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs);
+
+/// Infers the source layout attribute for a broadcast operation given the
+/// result layout attribute, result shape, and source shape.
+DistributeLayoutAttr inferBroadcastSourceLayout(DistributeLayoutAttr resLayout,
+ ArrayRef<int64_t> resShape,
+ ArrayRef<int64_t> srcShape);
+
+/// Infers the source layout attribute for a reduction operation given the
+/// result layout attribute and reduced dims.
+DistributeLayoutAttr
+inferMultiReductionSourceLayout(DistributeLayoutAttr resLayout,
+ SmallVector<int64_t> reduceDims);
+
+/// Infers the source layout attribute for a bitcast operation given the
+/// result layout attribute, result element type bitwidth, and source element
+/// type bitwidth.
+DistributeLayoutAttr inferBitCastSourceLayout(DistributeLayoutAttr resLayout,
+ int resElemTyBitWidth,
+ int srcElemTyBitWidth);
+
+/// Infers the source layout attribute for a shape cast operation given the
+/// result layout attribute, result shape, and source shape.
+DistributeLayoutAttr inferShapeCastSourceLayout(DistributeLayoutAttr resLayout,
+ ArrayRef<int64_t> resShape,
+ ArrayRef<int64_t> srcShape);
+
+/// Infers the source layout attribute for an insert strided slice operation
+/// given the result layout attribute, result shape, and source shape. Removes
+/// leading dimensions from the result layout to match the source shape size.
+DistributeLayoutAttr
+inferInsertStridedSliceSourceLayout(DistributeLayoutAttr resLayout,
+ ArrayRef<int64_t> resShape,
+ ArrayRef<int64_t> srcShape);
+
+/// Sets up layout for reduction operations by creating a SliceAttr for the
+/// result.
+///
+/// This function first attempts to construct a source layout that, when
+/// sliced along reduction dimensions, produces a result layout compatible
+/// with the consumer's preferred layout. This minimizes data redistribution
+/// overhead. The SliceAttr for the result is then created based on the
+/// derived source layout and the specified reduction dimensions.
+SliceAttr setupMultiReductionResultLayout(LayoutKind layoutKind,
+ VectorType srcVectorTy,
+ DistributeLayoutAttr consumerLayout,
+ SmallVector<int64_t> reductionDims,
+ const uArch::uArch *uArch);
+
+/// Setup the result layout attribute for a bitcast operation based on element
+/// type bitwidths. This ensures the source layout can always be derived from
+/// the result layout.
+///
+/// When casting from a narrower to a wider element type (srcElemTyBitWidth <
+/// resElemTyBitWidth), the result layout's innermost dimension data sizes
+/// (inst_data, lane_data) are scaled up by the bitwidth ratio. This maintains
+/// the invariant that the source layout can be recovered by adjusting the
+/// result layout based on bitwidth ratio of input vs output.
+DistributeLayoutAttr setupBitCastResultLayout(
+ LayoutKind layoutKind, VectorType srcVectorTy, VectorType resVectorTy,
+ DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch);
+
+/// Sets up the result layout for an insert strided slice operation.
+/// Creates a result layout based on the specified layout kind (InstData or
+/// Lane).
+DistributeLayoutAttr setupInsertStridedSliceResultLayout(
+ LayoutKind layoutKind, VectorType srcVectorTy, VectorType resVectorTy,
+ DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch);
+
+/// Sets up the anchor layout for a load gather operation.
+DistributeLayoutAttr
+setupLoadGatherAnchorLayout(LayoutKind layoutKind, VectorType vectorTy,
+ int chunkSize, DistributeLayoutAttr consumerLayout,
+ const uArch::uArch *uArch);
+
+/// Sets up the anchor layout for load matrix operation.
+DistributeLayoutAttr
+setupLoadMatrixAnchorLayout(LayoutKind layoutKind, VectorType vectorTy,
+ DistributeLayoutAttr consumerLayout,
+ const uArch::uArch *uArch);
+
+/// Sets up the anchor layout for a store scatter operation.
+DistributeLayoutAttr setupStoreScatterAnchorLayout(LayoutKind layoutKind,
+ VectorType vectorTy,
+ int chunkSize,
+ const uArch::uArch *uArch);
+
+/// Sets up the anchor layout for a store matrix operation.
+DistributeLayoutAttr setupStoreMatrixAnchorLayout(LayoutKind layoutKind,
+ VectorType vectorTy,
+ const uArch::uArch *uArch);
+
+} // namespace xegpu
+
+} // namespace mlir
+
+#endif // MLIR_DIALECT_XEGPU_UTILS_XeGPULayoutImpl_H_
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 700db5f9dd9be..4443f86d1e4e2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -137,12 +137,6 @@ template <typename T>
int getLargestDivisor(T dim, ArrayRef<T> candidates,
ArrayRef<T> candidateMultiples = {});
-/// Return the attribute name for the OpOperand to attach DistributeLayoutAttr
-std::string getTemporaryLayoutName(const OpOperand &operand);
-
-/// Return the attribute name for the OpResult to attach DistributeLayoutAttr
-std::string getTemporaryLayoutName(const OpResult result);
-
/// Retrieves the DistributeLayoutAttr associated with a given Value. For
/// TensorDescType values, the DistributeLayoutAttr is extracted from the
/// TensorDescType itself. For other values, it is obtained from the attributes
@@ -155,26 +149,6 @@ DistributeLayoutAttr getDistributeLayoutAttr(const Value value);
/// found, it will check the operand itself and its defining op.
DistributeLayoutAttr getDistributeLayoutAttr(const OpOperand &opr);
-/// Removes the LayoutAttr for a given OpOperand or OpResult if it exists.
-template <typename T,
- typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
- std::is_same_v<T, OpResult>>>
-void removeLayoutAttr(const T &operandOrResult);
-
-/// Removes the DistributeLayoutAttr for each OpOperand and OpResult of the
-/// given operation if they exist. If the operation contains regions, it is also
-/// applied recursively to the contained operations
-void removeLayoutAttrs(Operation *op);
-
-/// Updates the NamedAttribute sequence by dropping sg-layout and
-/// sg-data information from any DistributeLayoutAttr found.
-SmallVector<NamedAttribute>
-dropSgLayoutAndDataOnAttrs(ArrayRef<NamedAttribute> attrs);
-
-/// Updates the NamedAttribute sequence by dropping inst-data information from
-/// any DistributeLayoutAttr found.
-SmallVector<NamedAttribute> dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs);
-
/// [to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult
/// user should use setAnchorLayout instead
void setDistributeLayoutAttr(const OpResult &Result,
@@ -185,6 +159,12 @@ void setDistributeLayoutAttr(const OpResult &Result,
void setDistributeLayoutAttr(const OpOperand &opr,
const DistributeLayoutAttr layout);
+/// Return the attribute name for the OpOperand to attach DistributeLayoutAttr
+std::string getTemporaryLayoutName(const OpOperand &operand);
+
+/// Return the attribute name for the OpResult to attach DistributeLayoutAttr
+std::string getTemporaryLayoutName(const OpResult result);
+
/// get and set distribute layout attribute for non-anchor operations
/// (and offsets/masks of load/store ops before we get rid of their temp attrs)
template <typename T,
@@ -198,17 +178,6 @@ template <typename T,
void setTemporaryLayout(const T &operandOrResult,
const DistributeLayoutAttr layout);
-/// [to-be-deprecated] Set the DistributeLayoutAttr for each OpOperand and
-/// OpResult of of the given operation. If the operation contains regions, it is
-/// also applied recursively to the contained operations operation.
-/// TODO: To be replaced by recoverTemporaryLayouts()
-void recoverTemporaryLayoutsDeprecated(Operation *op);
-
-/// Attach layout attributes to all vector-type operands of operations within
-/// the given operation's region. Reports an error if any vector operand lacks
-/// a layout attribute.
-bool recoverTemporaryLayouts(Operation *rootOp);
-
/// Helper function to check if the layout is packed. Layout is packed if it is
/// 2D and lane_data[0] != 1 (data packed from col dimension).
/// TODO: Move to target info.
@@ -217,6 +186,15 @@ bool requirePacked(const LayoutAttr layout);
/// Helper function to check if the layout requires a transpose effect.
bool requireTranspose(const LayoutAttr layout, const uArch::uArch *uArch);
+// Check if dst shape is an expansion of src shape by inserting unit dimensions.
+bool matchUnitDimExpansion(ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
+ SmallVector<int64_t> &expandedUnitDims);
+
+// Checks if dst shape is an expansion of src shape where each dimension in src
+// is split into one or more consecutive dimensions in dst
+bool matchSplitDimExpansion(ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
+ SmallVector<SmallVector<int64_t>> &splitDimGroups);
+
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
index 05b4dbdbb0317..0341e4248767a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
+++ b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
@@ -216,15 +216,19 @@ struct SubgroupMatrixMultiplyAcc : public Instruction,
};
struct SpirvLoadGatherInstruction : public LoadGatherInstructionInterface {
- int32_t getMaxLaneLoadStoreSize(int32_t bitWidth) const override {
- return 16;
- }
+ int32_t getMaxLaneLoadSize(int32_t bitWidth) const override { return 16; }
};
struct SpirvStoreScatterInstruction : public StoreScatterInstructionInterface {
- int32_t getMaxLaneLoadStoreSize(int32_t bitWidth) const override {
- return 16;
- }
+ int32_t getMaxLaneStoreSize(int32_t bitWidth) const override { return 16; }
+};
+
+struct LoadMatrixInstruction : public LoadMatrixInstructionInterface {
+ int32_t getMaxLaneLoadSize(int32_t bitWidth) const override { return 16; }
+};
+
+struct StoreMatrixInstruction : public StoreMatrixInstructionInterface {
+ int32_t getMaxLaneStoreSize(int32_t bitWidth) const override { return 16; }
};
//===----------------------------------------------------------------------===//
@@ -239,9 +243,11 @@ struct PVCuArch final : public Xe2Plus {
static const Subgroup2DBlockPrefetchInstruction prefetchNdInst;
static const SpirvStoreScatterInstruction storeScatterInst;
static const SpirvLoadGatherInstruction loadGatherInst;
- static const Instruction *arr[] = {&dpasInst, &loadNdInst,
- &storeNdInst, &prefetchNdInst,
- &storeScatterInst, &loadGatherInst};
+ static const StoreMatrixInstruction storeMatrixInst;
+ static const LoadMatrixInstruction loadMatrixInst;
+ static const Instruction *arr[] = {
+ &dpasInst, &loadNdInst, &storeNdInst, &prefetchNdInst,
+ &storeScatterInst, &loadGatherInst, &storeMatrixInst, &loadMatrixInst};
return arr;
}
diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
index ee3d5a5a8c398..0c8673e602c46 100644
--- a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
+++ b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
@@ -40,7 +40,9 @@ enum class InstructionKind {
Subgroup2DBlockLoad, // Subgroup-level 2D block load instruction
Subgroup2DBlockPrefetch, // Subgroup-level 2D block prefetch instruction
StoreScatter, // Lane-level store (scalar, vector)
- LoadGather // Lane-level load (scalar, vector)
+ LoadGather, // Lane-level load (scalar, vector)
+ StoreMatrix, // Lane-level matrix store to slm
+ LoadMatrix // Lane-level matrix load from slm
// @TODO: Add more instructions as needed
};
@@ -71,6 +73,10 @@ struct Instruction {
return "store";
case InstructionKind::LoadGather:
return "load";
+ case InstructionKind::StoreMatrix:
+ return "store_matrix";
+ case InstructionKind::LoadMatrix:
+ return "load_matrix";
}
llvm_unreachable("Unknown InstructionKind");
}
@@ -254,6 +260,17 @@ struct MMAInstructionInterface {
// Common instructions (shared across architectures)
//===----------------------------------------------------------------------===//
+struct LoadGatherInstructionInterface : public Instruction {
+ LoadGatherInstructionInterface()
+ : Instruction(InstructionKind::LoadGather, InstructionScope::Lane) {}
+ static bool classof(const Instruction *B) {
+ return B->getInstructionKind() == InstructionKind::LoadGather;
+ }
+
+ virtual int32_t getMaxLaneLoadSize(int32_t bitWidth) const = 0;
+ virtual ~LoadGatherInstructionInterface() = default;
+};
+
struct StoreScatterInstructionInterface : public Instruction {
StoreScatterInstructionInterface()
: Instruction(InstructionKind::StoreScatter, InstructionScope::Lane) {}
@@ -261,19 +278,30 @@ struct StoreScatterInstructionInterface : public Instruction {
return B->getInstructionKind() == InstructionKind::StoreScatter;
}
- virtual int32_t getMaxLaneLoadStoreSize(int32_t bitWidth) const = 0;
+ virtual int32_t getMaxLaneStoreSize(int32_t bitWidth) const = 0;
virtual ~StoreScatterInstructionInterface() = default;
};
-struct LoadGatherInstructionInterface : public Instruction {
- LoadGatherInstructionInterface()
- : Instruction(InstructionKind::LoadGather, InstructionScope::Lane) {}
+struct LoadMatrixInstructionInterface : public Instruction {
+ LoadMatrixInstructionInterface()
+ : Instruction(InstructionKind::LoadMatrix, InstructionScope::Lane) {}
static bool classof(const Instruction *B) {
- return B->getInstructionKind() == InstructionKind::LoadGather;
+ return B->getInstructionKind() == InstructionKind::LoadMatrix;
}
- virtual int32_t getMaxLaneLoadStoreSize(int32_t bitWidth) const = 0;
- virtual ~LoadGatherInstructionInterface() = default;
+ virtual int32_t getMaxLaneLoadSize(int32_t bitWidth) const = 0;
+ virtual ~LoadMatrixInstructionInterface() = default;
+};
+
+struct StoreMatrixInstructionInterface : public Instruction {
+ StoreMatrixInstructionInterface()
+ : Instruction(InstructionKind::StoreMatrix, InstructionScope::Lane) {}
+ static bool classof(const Instruction *B) {
+ return B->getInstructionKind() == InstructionKind::StoreMatrix;
+ }
+
+ virtual int32_t getMaxLaneStoreSize(int32_t bitWidth) const = 0;
+ virtual ~StoreMatrixInstructionInterface() = default;
};
} // namespace uArch
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 217065b12c598..d99557e68f0ec 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -398,7 +398,7 @@ bool LayoutAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) {
// set the layout for unit dims: sg_data, inst_data and lane_data to 1
DistributeLayoutAttr
-LayoutAttr::setUnitDimData(SetVector<int64_t> unitDims) const {
+LayoutAttr::setUnitDimData(SmallVector<int64_t> unitDims) const {
auto sgDataOpt = getSgData();
auto instDataOpt = getInstData();
auto laneDataOpt = getLaneData();
@@ -407,15 +407,14 @@ LayoutAttr::setUnitDimData(SetVector<int64_t> unitDims) const {
SmallVector<int32_t> instData;
SmallVector<int32_t> laneData;
- if (sgDataOpt) {
+ if (sgDataOpt)
sgData = llvm::to_vector(sgDataOpt.asArrayRef());
- }
- if (instDataOpt) {
+
+ if (instDataOpt)
instData = llvm::to_vector(instDataOpt.asArrayRef());
- }
- if (laneDataOpt) {
+
+ if (laneDataOpt)
laneData = llvm::to_vector(laneDataOpt.asArrayRef());
- }
for (auto dim : unitDims) {
if (dim < static_cast<int64_t>(sgData.size()))
@@ -440,19 +439,17 @@ LayoutAttr::setUnitDimData(SetVector<int64_t> unitDims) const {
// set the layout for the sepcified unit dims: sg_lane and lane_layout to 1
DistributeLayoutAttr
-LayoutAttr::setUnitDimLayout(SetVector<int64_t> unitDims) const {
+LayoutAttr::setUnitDimLayout(SmallVector<int64_t> unitDims) const {
auto sgLayoutOpt = getSgLayout();
auto laneLayoutOpt = getLaneLayout();
SmallVector<int32_t> sgLayout;
SmallVector<int32_t> laneLayout;
- if (sgLayoutOpt) {
+ if (sgLayoutOpt)
sgLayout = llvm::to_vector(sgLayoutOpt.asArrayRef());
- }
- if (laneLayoutOpt) {
+ if (laneLayoutOpt)
laneLayout = llvm::to_vector(laneLayoutOpt.asArrayRef());
- }
for (auto dim : unitDims) {
if (dim < static_cast<int64_t>(sgLayout.size()))
@@ -471,6 +468,174 @@ LayoutAttr::setUnitDimLayout(SetVector<int64_t> unitDims) const {
getLaneData(), getOrder());
}
+// Derive a new layout with sg_data, inst_data and lane_data set to the
+// specified values for the given dimension
+DistributeLayoutAttr LayoutAttr::setDimData(int64_t dim, int64_t sgData,
+ int64_t instData,
+ int64_t laneData) {
+
+ SmallVector<int64_t> sgDataVec = getEffectiveSgDataAsInt();
+ SmallVector<int64_t> instDataVec = getEffectiveInstDataAsInt();
+ SmallVector<int64_t> laneDataVec = getEffectiveLaneDataAsInt();
+
+ if (dim < static_cast<int64_t>(sgDataVec.size()) && sgData != -1)
+ sgDataVec[dim] = sgData;
+ if (dim < static_cast<int64_t>(instDataVec.size()) && instData != -1)
+ instDataVec[dim] = instData;
+ if (dim < static_cast<int64_t>(laneDataVec.size()) && laneData != -1)
+ laneDataVec[dim] = laneData;
+
+ SmallVector<int32_t> sgDataVec32(sgDataVec.begin(), sgDataVec.end());
+ SmallVector<int32_t> instDataVec32(instDataVec.begin(), instDataVec.end());
+ SmallVector<int32_t> laneDataVec32(laneDataVec.begin(), laneDataVec.end());
+
+ return LayoutAttr::get(
+ getContext(), getSgLayout(),
+ sgDataVec.empty() ? DenseI32ArrayAttr()
+ : DenseI32ArrayAttr::get(getContext(), sgDataVec32),
+ instDataVec.empty() ? DenseI32ArrayAttr()
+ : DenseI32ArrayAttr::get(getContext(), instDataVec32),
+ getLaneLayout(),
+ laneDataVec.empty() ? DenseI32ArrayAttr()
+ : DenseI32ArrayAttr::get(getContext(), laneDataVec32),
+ getOrder());
+}
+
+// Derive a new layout by collapsing dimensions.
+// `dimGroup` specifies a group of adjacent dimensions
+// that are collapsed into a single dimension in the derived layout.
+DistributeLayoutAttr LayoutAttr::collapseDims(SmallVector<int64_t> dimGroup) {
+
+ SmallVector<int64_t> sgLayout = getEffectiveSgLayoutAsInt();
+ SmallVector<int64_t> sgData = getEffectiveSgDataAsInt();
+ SmallVector<int64_t> instData = getEffectiveInstDataAsInt();
+ SmallVector<int64_t> laneLayout = getEffectiveLaneLayoutAsInt();
+ SmallVector<int64_t> laneData = getEffectiveLaneDataAsInt();
+
+ DenseI32ArrayAttr orderAttr = getOrder();
+ SmallVector<int32_t> orderVec;
+ if (orderAttr && !orderAttr.empty()) {
+ orderVec = llvm::to_vector(
+ llvm::map_range(orderAttr.asArrayRef(),
+ [](int32_t idx) { return static_cast<int32_t>(idx); }));
+ }
+
+ SmallVector<int64_t> sortedDimGroup = dimGroup;
+ llvm::sort(sortedDimGroup);
+ int64_t dimBeforeCurrent = -1;
+ for (auto dimIdx : sortedDimGroup) {
+ // When order is present, adjacency is checked on the order values (e.g.
+ // [3, 2, 1, 0]); otherwise it is checked on the dim indices (e.g. [0, 1, 2,
+ // 3]). NOTE(review): the order check below expects values increasing by 1.
+ if (dimBeforeCurrent >= 0) {
+ if (!orderVec.empty()) {
+ int64_t orderBefore = orderVec[dimBeforeCurrent];
+ int64_t orderCurrent = orderVec[dimIdx];
+ if (orderBefore != (orderCurrent - 1))
+ llvm::report_fatal_error(
+ "dimensions being collapsed must be adjacent in order");
+ } else {
+ if (dimIdx != (dimBeforeCurrent + 1))
+ llvm::report_fatal_error(
+ "dimensions being collapsed must be adjacent");
+ }
+ }
+ dimBeforeCurrent = dimIdx;
+ }
+
+ int firstDim = sortedDimGroup.front();
+
+ // collapse the dimensions in dimGroup into one dimension by multiplying their
+ // sizes together
+
+ if (!sgLayout.empty()) {
+ int64_t collapsedSglayout = 1, collapsedSgData = 1;
+ for (auto dimIdx : dimGroup) {
+ collapsedSglayout *= sgLayout[dimIdx];
+ collapsedSgData *= sgData[dimIdx];
+ }
+ for (auto dimIdx : llvm::reverse(sortedDimGroup)) {
+ sgLayout.erase(sgLayout.begin() + dimIdx, sgLayout.begin() + dimIdx + 1);
+ sgData.erase(sgData.begin() + dimIdx, sgData.begin() + dimIdx + 1);
+ }
+ sgLayout.insert(sgLayout.begin() + firstDim, collapsedSglayout);
+ sgData.insert(sgData.begin() + firstDim, collapsedSgData);
+ }
+
+ if (!instData.empty()) {
+ int64_t collapsedInstData = 1;
+ for (auto dimIdx : dimGroup)
+ collapsedInstData *= instData[dimIdx];
+ for (auto dimIdx : llvm::reverse(sortedDimGroup))
+ instData.erase(instData.begin() + dimIdx, instData.begin() + dimIdx + 1);
+ instData.insert(instData.begin() + firstDim, collapsedInstData);
+ }
+
+ if (!laneLayout.empty()) {
+ int64_t collapsedLaneLayout = 1, collapsedLaneData = 1;
+ for (auto dimIdx : dimGroup) {
+ collapsedLaneLayout *= laneLayout[dimIdx];
+ collapsedLaneData *= laneData[dimIdx];
+ }
+ for (auto dimIdx : llvm::reverse(sortedDimGroup)) {
+ laneLayout.erase(laneLayout.begin() + dimIdx,
+ laneLayout.begin() + dimIdx + 1);
+ laneData.erase(laneData.begin() + dimIdx, laneData.begin() + dimIdx + 1);
+ }
+ laneLayout.insert(laneLayout.begin() + firstDim, collapsedLaneLayout);
+ laneData.insert(laneData.begin() + firstDim, collapsedLaneData);
+ }
+
+ // Go through the values inside collapsedOrder and re-map the order values
+ // to be in the range [0, N-1], where N is the number of dimensions in the
+ // collapsed shape. For example, collapsing dim group {2, 3} of order [1, 2,
+ // 3, 4] gives new order [1, 3, 4]; the code below remaps it to [1, 2, 3].
+ SmallVector<int32_t> collapsedOrder;
+ if (!orderVec.empty()) {
+
+ for (auto dimIdx : llvm::reverse(sortedDimGroup)) {
+ if (dimIdx != firstDim)
+ orderVec.erase(orderVec.begin() + dimIdx,
+ orderVec.begin() + dimIdx + 1);
+ }
+
+ // say we have orderVec = {5, 3, 2, 1, 0}
+ // Create indices [0, 1, 2, 3, 4]
+ SmallVector<size_t> indices =
+ llvm::to_vector(llvm::seq<size_t>(0, orderVec.size()));
+
+ // Sort indices based on corresponding values
+ llvm::sort(indices,
+ [&](size_t a, size_t b) { return orderVec[a] < orderVec[b]; });
+ collapsedOrder = llvm::to_vector(llvm::map_range(
+ indices, [&](size_t i) { return static_cast<int32_t>(i); }));
+ }
+
+ // Create collapsed layout
+ SmallVector<int32_t> sgLayout32(sgLayout.begin(), sgLayout.end());
+ SmallVector<int32_t> sgData32(sgData.begin(), sgData.end());
+ SmallVector<int32_t> instData32(instData.begin(), instData.end());
+ SmallVector<int32_t> laneLayout32(laneLayout.begin(), laneLayout.end());
+ SmallVector<int32_t> laneData32(laneData.begin(), laneData.end());
+
+ auto collapsedLayout = xegpu::LayoutAttr::get(
+ getContext(),
+ sgLayout32.empty() ? DenseI32ArrayAttr()
+ : DenseI32ArrayAttr::get(getContext(), sgLayout32),
+ sgData32.empty() ? DenseI32ArrayAttr()
+ : DenseI32ArrayAttr::get(getContext(), sgData32),
+ instData32.empty() ? DenseI32ArrayAttr()
+ : DenseI32ArrayAttr::get(getContext(), instData32),
+ laneLayout32.empty() ? DenseI32ArrayAttr()
+ : DenseI32ArrayAttr::get(getContext(), laneLayout32),
+ laneData32.empty() ? DenseI32ArrayAttr()
+ : DenseI32ArrayAttr::get(getContext(), laneData32),
+ collapsedOrder.empty()
+ ? DenseI32ArrayAttr()
+ : DenseI32ArrayAttr::get(getContext(), collapsedOrder));
+ return collapsedLayout;
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_SliceAttr
//===----------------------------------------------------------------------===//
@@ -624,12 +789,12 @@ bool SliceAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) {
// shape is of rank 2, if we want to set unit dim [0] in sliced space, it maps
// to dim [0] in parent space; if we want to set unit dim [1] in sliced space,
// it maps to dim [2] in parent space.
-static SetVector<int64_t>
-mapSlicedDimsToParentSpace(const SetVector<int64_t> &dimsToMap,
+static SmallVector<int64_t>
+mapSlicedDimsToParentSpace(const SmallVector<int64_t> &dimsToMap,
ArrayRef<int64_t> sliceDims) {
- // Rather than recovering the exact parent rank, we compute a safe upper bound
- // so that dimsToMap can be adjusted safely. This upper bound is defined as
- // max(dimsToMap, sliceDims) + 1 + sliceDims.size().
+ // Rather than recovering the exact parent rank, we compute a safe upper
+ // bound so that dimsToMap can be adjusted safely. This upper bound is
+ // defined as max(dimsToMap, sliceDims) + 1 + sliceDims.size().
int64_t maxDim = -1;
maxDim =
std::max(maxDim, *std::max_element(sliceDims.begin(), sliceDims.end()));
@@ -648,10 +813,10 @@ mapSlicedDimsToParentSpace(const SetVector<int64_t> &dimsToMap,
}
// Map unit dims from sliced space to parent space
- SetVector<int64_t> adjustUnitDims;
+ SmallVector<int64_t> adjustUnitDims;
for (auto dim : dimsToMap) {
int64_t mappedDim = remainingDims[dim];
- adjustUnitDims.insert(mappedDim);
+ adjustUnitDims.push_back(mappedDim);
}
return adjustUnitDims;
@@ -659,12 +824,12 @@ mapSlicedDimsToParentSpace(const SetVector<int64_t> &dimsToMap,
// set the layout for unit dims: sg_data, inst_data and lane_data to 1
DistributeLayoutAttr
-SliceAttr::setUnitDimData(SetVector<int64_t> unitDims) const {
+SliceAttr::setUnitDimData(SmallVector<int64_t> unitDims) const {
DistributeLayoutAttr parentLayout = getParent();
ArrayRef<int64_t> sliceDims = getDims().asArrayRef();
- SetVector<int64_t> adjustUnitDims =
+ SmallVector<int64_t> adjustUnitDims =
mapSlicedDimsToParentSpace(unitDims, sliceDims);
return SliceAttr::get(getContext(),
@@ -673,18 +838,51 @@ SliceAttr::setUnitDimData(SetVector<int64_t> unitDims) const {
// set the layout for the sepcified unit dims: sg_lane and lane_layout to 1
DistributeLayoutAttr
-SliceAttr::setUnitDimLayout(SetVector<int64_t> unitDims) const {
+SliceAttr::setUnitDimLayout(SmallVector<int64_t> unitDims) const {
DistributeLayoutAttr parentLayout = getParent();
ArrayRef<int64_t> sliceDims = getDims().asArrayRef();
- SetVector<int64_t> adjustUnitDims =
+ SmallVector<int64_t> adjustUnitDims =
mapSlicedDimsToParentSpace(unitDims, sliceDims);
return SliceAttr::get(
getContext(), parentLayout.setUnitDimLayout(adjustUnitDims), getDims());
}
+// Derive a new layout with sg_data, inst_data and lane_data set to the
+// specified values for the given dimension
+DistributeLayoutAttr SliceAttr::setDimData(int64_t dim, int64_t sgData,
+ int64_t instData, int64_t laneData) {
+ ArrayRef<int64_t> sliceDims = getDims().asArrayRef();
+ auto parent = getParent();
+
+ SmallVector<int64_t> dimSet;
+ dimSet.push_back(dim);
+ SmallVector<int64_t> adjustDims =
+ mapSlicedDimsToParentSpace(dimSet, sliceDims);
+ return SliceAttr::get(
+ getContext(),
+ parent.setDimData(adjustDims[0], sgData, instData, laneData), getDims());
+}
+
+// Derive a new layout by collapsing dimensions.
+// `dimGroup` specifies a group of adjacent dimensions
+// that are collapsed into a single dimension in the derived layout.
+DistributeLayoutAttr SliceAttr::collapseDims(SmallVector<int64_t> dimGroup) {
+
+ // Map the dims to collapse from sliced space to parent space
+ SmallVector<int64_t> sliceDims = llvm::to_vector(getDims().asArrayRef());
+
+ SmallVector<int64_t> dimsInParentSpace =
+ mapSlicedDimsToParentSpace(dimGroup, sliceDims);
+
+ auto collapsedParent = getParent().collapseDims(dimsInParentSpace);
+
+ return SliceAttr::get(getContext(), collapsedParent,
+ DenseI64ArrayAttr::get(getContext(), sliceDims));
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_RangeAttr
//===----------------------------------------------------------------------===//
@@ -820,7 +1018,8 @@ TensorDescType::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
return emitError() << "unsupported element type " << elementType
<< ": expected integer or float";
- // for gather and scatter ops, Low-precision types are packed in 32-bit units.
+ // for gather and scatter ops, Low-precision types are packed in 32-bit
+ // units.
unsigned bitWidth = elementType.getIntOrFloatBitWidth();
int chunkAlignmentFactor =
bitWidth < xegpu::uArch::generalPackedFormatBitSize
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 47a3f371164fd..cf99c3a68e7f8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -8,6 +8,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
XeGPUPropagateLayout.cpp
XeGPUVectorLinearize.cpp
XeGPUPeepHoleOptimizer.cpp
+ XeGPULayoutImpl.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index c00b7d42d48a6..7ca1b957bbd01 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -12,6 +12,7 @@
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Pass/PassManager.h"
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
new file mode 100644
index 0000000000000..6e908cbf8c1e0
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -0,0 +1,851 @@
+//===- XeGPULayoutImpl.cpp - MLIR Utilities for XeGPUOps ------------------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements layout utility functions for XeGPU dialect
+// transformation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/ValueRange.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <cstdint>
+#include <numeric>
+
+using namespace mlir;
+
+/// Deprecated variant of recoverTemporaryLayouts: for every operation visited
+/// by walking `op`, looks up the layout of each operand and each result with
+/// getDistributeLayoutAttr and re-attaches it with setDistributeLayoutAttr.
+/// No type filtering is done and a missing layout is not reported.
+void xegpu::recoverTemporaryLayoutsDeprecated(Operation *op) {
+  op->walk([&](Operation *nested) {
+    // Operands first, then results.
+    for (OpOperand &operand : nested->getOpOperands())
+      setDistributeLayoutAttr(operand, getDistributeLayoutAttr(operand.get()));
+
+    for (OpResult res : nested->getOpResults())
+      setDistributeLayoutAttr(res, getDistributeLayoutAttr(res));
+  });
+}
+
+/// Returns a copy of `attrs` in which every DistributeLayoutAttr value has
+/// been replaced by the result of dropSgLayoutAndData(). A layout attribute
+/// whose stripped form is null is omitted; all other attributes are copied
+/// unchanged, in their original order.
+SmallVector<NamedAttribute>
+xegpu::dropSgLayoutAndDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
+  SmallVector<NamedAttribute> result;
+  result.reserve(attrs.size());
+
+  for (const NamedAttribute &entry : attrs) {
+    auto layout = dyn_cast<xegpu::DistributeLayoutAttr>(entry.getValue());
+    if (!layout) {
+      // Not a layout attribute: keep it as is.
+      result.push_back(entry);
+      continue;
+    }
+    if (auto stripped = layout.dropSgLayoutAndData())
+      result.emplace_back(entry.getName(), stripped);
+  }
+
+  return result;
+}
+
+/// Returns a copy of `attrs` in which every DistributeLayoutAttr value has
+/// been replaced by the result of dropInstData(). A layout attribute whose
+/// stripped form is null is omitted; all other attributes are copied
+/// unchanged, in their original order.
+SmallVector<NamedAttribute>
+xegpu::dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
+  SmallVector<NamedAttribute> result;
+  result.reserve(attrs.size());
+
+  for (const NamedAttribute &entry : attrs) {
+    auto layout = dyn_cast<xegpu::DistributeLayoutAttr>(entry.getValue());
+    if (!layout) {
+      // Not a layout attribute: keep it as is.
+      result.push_back(entry);
+      continue;
+    }
+    if (auto stripped = layout.dropInstData())
+      result.emplace_back(entry.getName(), stripped);
+  }
+
+  return result;
+}
+
+// Walks the operations under `rootOp` and, for every operand of VectorType,
+// looks up its distribute layout and attaches it to that operand. If a vector
+// operand has no layout, an error is emitted on the owning operation and the
+// walk is interrupted. Returns true when the walk ran to completion and false
+// when it was interrupted.
+bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
+  WalkResult status = rootOp->walk([&](Operation *op) -> WalkResult {
+    for (OpOperand &use : op->getOpOperands()) {
+      Value value = use.get();
+      // Only vector-typed operands carry layouts.
+      if (!isa<VectorType>(value.getType()))
+        continue;
+      if (auto layout = xegpu::getDistributeLayoutAttr(value)) {
+        xegpu::setDistributeLayoutAttr(use, layout);
+        continue;
+      }
+      op->emitError("Could not find layout attribute for operand ")
+          << use.getOperandNumber() << " of operation " << op->getName();
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  return !status.wasInterrupted();
+}
+
+/// Removes the temporary layout attribute associated with `operandOrResult`
+/// from its owner operation. The attribute name comes from
+/// getTemporaryLayoutName; nothing happens unless the owner holds a
+/// DistributeLayoutAttr under that name.
+template <typename T, typename>
+void xegpu::removeLayoutAttr(const T &operandOrResult) {
+  std::string attrName = xegpu::getTemporaryLayoutName(operandOrResult);
+  Operation *ownerOp = operandOrResult.getOwner();
+  if (!ownerOp->hasAttrOfType<DistributeLayoutAttr>(attrName))
+    return;
+  ownerOp->removeAttr(attrName);
+}
+
+// Instantiate removeLayoutAttr for OpResult.
+template void
+xegpu::removeLayoutAttr<mlir::OpResult>(const mlir::OpResult &result);
+
+// Instantiate removeLayoutAttr for OpOperand.
+template void
+xegpu::removeLayoutAttr<mlir::OpOperand>(const mlir::OpOperand &operand);
+
+/// Strips every attribute whose value is a DistributeLayoutAttr from each
+/// operation visited by walking `op`.
+void xegpu::removeLayoutAttrs(Operation *op) {
+  op->walk([&](Operation *nested) {
+    // Gather the names first; removal happens only after the scan of the
+    // attribute list has finished.
+    SmallVector<StringAttr> layoutAttrNames;
+    for (NamedAttribute entry : nested->getAttrs()) {
+      if (!isa<DistributeLayoutAttr>(entry.getValue()))
+        continue;
+      layoutAttrNames.push_back(entry.getName());
+    }
+    for (StringAttr name : layoutAttrNames)
+      nested->removeAttr(name);
+  });
+}
+
+/// Derives the layout of a broadcast source from the layout of its result.
+/// When the result has more dimensions than the source, the source layout is
+/// the result layout sliced over the extra leading dimensions
+/// [0, resRank - srcRank); otherwise the result layout is returned as is.
+xegpu::DistributeLayoutAttr
+xegpu::inferBroadcastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
+                                  ArrayRef<int64_t> resShape,
+                                  ArrayRef<int64_t> srcShape) {
+  int rankGap = resShape.size() - srcShape.size();
+
+  // No extra result dims: nothing to slice away.
+  if (rankGap <= 0)
+    return resLayout;
+
+  // Low-rank to high-rank broadcast (e.g., 1D to 2D): the missing leading
+  // dims 0 .. rankGap-1 are the ones being broadcast.
+  SmallVector<int64_t> leadingDims(rankGap);
+  std::iota(leadingDims.begin(), leadingDims.end(), 0);
+
+  MLIRContext *ctx = resLayout.getContext();
+  return xegpu::SliceAttr::get(ctx, resLayout,
+                               DenseI64ArrayAttr::get(ctx, leadingDims));
+}
+
+/// Infers the source layout attribute for a reduction operation given the
+/// result layout attribute and reduced dims.
+///
+/// The result layout must be a SliceAttr whose sliced dims equal
+/// `reduceDims`; the returned source layout is that slice's parent layout.
+xegpu::DistributeLayoutAttr
+xegpu::inferMultiReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout,
+                                       SmallVector<int64_t> reduceDims) {
+
+  assert(isa<xegpu::SliceAttr>(resLayout) &&
+         "reduction result layout must be slice layout");
+
+  // The layout is required to be a slice, so use cast<> rather than an
+  // unchecked dyn_cast<> whose null result would be dereferenced below.
+  auto sliceLayout = cast<xegpu::SliceAttr>(resLayout);
+
+  // Compare inside the assert so that no local is left unused when asserts
+  // are compiled out.
+  assert(ArrayRef<int64_t>(reduceDims) == sliceLayout.getDims().asArrayRef() &&
+         "reduction dims must match with slice dims");
+
+  return sliceLayout.getParent();
+}
+
+/// Computes the layout of a bitcast source from the layout of its result.
+/// Only the innermost dimension (rank - 1) is adjusted: its sg/inst/lane data
+/// are multiplied by resBitWidth / srcBitWidth when the source element is not
+/// wider than the result element, and divided by srcBitWidth / resBitWidth
+/// otherwise (asserting divisibility). A data kind that is absent from
+/// `resLayout` is handed to setDimData as -1.
+xegpu::DistributeLayoutAttr
+xegpu::inferBitCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
+                                int resElemTyBitWidth, int srcElemTyBitWidth) {
+  SmallVector<int64_t> sgData = resLayout.getEffectiveSgDataAsInt();
+  SmallVector<int64_t> instData = resLayout.getEffectiveInstDataAsInt();
+  SmallVector<int64_t> laneData = resLayout.getEffectiveLaneDataAsInt();
+
+  // Each value stays at -1 when the corresponding data is not present.
+  int64_t newSgData = -1;
+  int64_t newInstData = -1;
+  int64_t newLaneData = -1;
+
+  const bool srcNotWider = srcElemTyBitWidth <= resElemTyBitWidth;
+  if (srcNotWider) {
+    // Source elements are narrower (or equal): scale the data up.
+    int bitWidthRatio = resElemTyBitWidth / srcElemTyBitWidth;
+    if (!sgData.empty())
+      newSgData = sgData.back() * bitWidthRatio;
+    if (!instData.empty())
+      newInstData = instData.back() * bitWidthRatio;
+    if (!laneData.empty())
+      newLaneData = laneData.back() * bitWidthRatio;
+  } else {
+    // Source elements are wider: scale the data down.
+    int bitWidthRatio = srcElemTyBitWidth / resElemTyBitWidth;
+    if (!sgData.empty()) {
+      assert((sgData.back() % bitWidthRatio) == 0 &&
+             "sgData not divisible by bitWidthRatio");
+      newSgData = sgData.back() / bitWidthRatio;
+    }
+    if (!instData.empty()) {
+      assert((instData.back() % bitWidthRatio) == 0 &&
+             "instData not divisible by bitWidthRatio");
+      newInstData = instData.back() / bitWidthRatio;
+    }
+    if (!laneData.empty()) {
+      assert((laneData.back() % bitWidthRatio) == 0 &&
+             "laneData not divisible by bitWidthRatio");
+      newLaneData = laneData.back() / bitWidthRatio;
+    }
+  }
+
+  int64_t innermostDim = resLayout.getRank() - 1;
+  xegpu::DistributeLayoutAttr srcLayout = resLayout.setDimData(
+      innermostDim, newSgData, newInstData, newLaneData);
+  return srcLayout;
+}
+
+/// Derives the layout of the source of an insert strided slice from the
+/// layout of its result by dropping the leading (resRank - srcRank) dims.
+/// If the result layout has inst data, the source gets the trailing inst
+/// data; otherwise, if it has a lane layout, the source gets the trailing
+/// lane layout and lane data; otherwise a null layout is returned.
+xegpu::DistributeLayoutAttr xegpu::inferInsertStridedSliceSourceLayout(
+    xegpu::DistributeLayoutAttr resLayout, ArrayRef<int64_t> resShape,
+    ArrayRef<int64_t> srcShape) {
+  int srcShapeSize = srcShape.size();
+  int resShapeSize = resShape.size();
+  int dimDiff = resShapeSize - srcShapeSize;
+
+  assert(isa<xegpu::LayoutAttr>(resLayout) &&
+         "insertStridedSlice result layout must be plain layout");
+
+  auto context = resLayout.getContext();
+  auto resInstData = resLayout.getEffectiveInstDataAsInt();
+  auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
+  auto resLaneData = resLayout.getEffectiveLaneDataAsInt();
+
+  // Copies the trailing `srcShapeSize` entries of a result-layout field,
+  // i.e. skips the `dimDiff` leading dims that the source does not have.
+  auto dropLeadingDims = [&](ArrayRef<int64_t> resField) {
+    SmallVector<int> srcField(srcShapeSize);
+    for (int i = 0; i < srcShapeSize; ++i)
+      srcField[i] = resField[i + dimDiff];
+    return srcField;
+  };
+
+  if (!resInstData.empty()) {
+    SmallVector<int> inferredInstData = dropLeadingDims(resInstData);
+    return xegpu::LayoutAttr::get(context, inferredInstData);
+  }
+
+  if (resLaneLayout.empty())
+    return nullptr;
+
+  SmallVector<int> inferredLaneLayout = dropLeadingDims(resLaneLayout);
+  SmallVector<int> inferredLaneData = dropLeadingDims(resLaneData);
+  return xegpu::LayoutAttr::get(context, inferredLaneLayout, inferredLaneData);
+}
+
+/// Infers the source layout attribute for a shape cast operation given the
+/// result layout attribute, result shape, and source shape.
+///
+/// Three shape patterns are handled (see the comments in the body). A shape
+/// pair that matches none of them, or a collapse whose result layout carries
+/// neither inst data nor a lane layout, reaches llvm_unreachable.
+xegpu::DistributeLayoutAttr
+xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
+                                  ArrayRef<int64_t> resShape,
+                                  ArrayRef<int64_t> srcShape) {
+
+  // There are three use cases:
+  // 1. expand dims of low-rank dimensions (e.g., 1D to 2D): to set up the
+  //    tensor before broadcast
+  // 2. split dim of a high-rank dimension (e.g., 1D to 2D): to set up the
+  //    tensor for multi-stage reduction
+  // 3. combine all dims into a single dim and put it in the innermost dim in
+  //    2D as [1, combinedData] or [combinedData]. Say, [2, 4, 8] -> [1, 64]
+  //    or [64].
+  // Use cases are only supported after workgroup distribution,
+  // like cross-sg reduction saves multidimension data to
+  // 1D slm buffer, shapecast inserted by cse/canonicalization passes.
+
+  // Use case 1: Shapes only differ by expanding unit dimensions, for broadcast
+  SmallVector<int64_t> expandedUnitDims;
+
+  if (xegpu::matchUnitDimExpansion(srcShape, resShape, expandedUnitDims)) {
+    // create a slice layout for the source by removing the expanded unit dims
+    auto sliceDimsAttr = DenseI64ArrayAttr::get(
+        resLayout.getContext(), ArrayRef<int64_t>(expandedUnitDims));
+    auto srcLayout =
+        xegpu::SliceAttr::get(resLayout.getContext(), resLayout, sliceDimsAttr);
+    return srcLayout;
+  }
+
+  // Use case 2: Dim split from source to result, for multi-stage reduction
+  SmallVector<SmallVector<int64_t>> splitDimGroups;
+  if (xegpu::matchSplitDimExpansion(srcShape, resShape, splitDimGroups)) {
+    auto srcLayout = resLayout;
+    for (const auto &dimGroup : splitDimGroups)
+      srcLayout = srcLayout.collapseDims(dimGroup);
+
+    return srcLayout;
+  }
+
+  // Use case 3: Collapse to innermost dim, for cross-sg reduction to SLM
+  auto matchCollapseToInnermostDim = [&](ArrayRef<int64_t> src,
+                                         ArrayRef<int64_t> dst) -> bool {
+    // only one non-unit dim in dst which is the innermost dim
+    if ((dst.size() != 2) && (dst.size() != 1))
+      return false;
+    // Accumulate in int64_t so the accumulator type matches the elements.
+    int64_t srcSize = std::accumulate(src.begin(), src.end(), int64_t{1},
+                                      std::multiplies<int64_t>());
+    if (dst.size() == 1)
+      return (dst[0] == srcSize);
+    return (dst[0] == 1) && (dst[1] == srcSize);
+  };
+
+  if (matchCollapseToInnermostDim(srcShape, resShape)) {
+    int srcShapeSize = srcShape.size();
+    int resShapeSize = resShape.size();
+    auto context = resLayout.getContext();
+    auto resInstData = resLayout.getEffectiveInstDataAsInt();
+    auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
+    auto resLaneData = resLayout.getEffectiveLaneDataAsInt();
+
+    // Extract layout info from result's innermost dimension and apply to
+    // source's innermost dimension while setting all other dimensions to 1.
+    // The inferred layout is restricted by srcShape to ensure it fits within
+    // the source dimensions.
+    // Example 1:
+    //   srcShape=[8, 16, 32], resShape=[1, 4096]
+    //   resInstData=[1, 16]
+    //   -> inferredInstData=[1, 1, min(16, 32)]=[1, 1, 16]
+    // Example 2:
+    //   srcShape=[4, 8, 64], resShape=[2048]
+    //   resLaneLayout=[16], resLaneData=[2]
+    //   -> inferredLaneLayout=[1, 1, 16]
+    //   -> inferredLaneData=[1, 1, min(2, 64/16)]=[1, 1, 2]
+
+    if (resInstData.size() != 0) {
+      // assert resInstData must be 1 for all but the innermost dim
+      for (int i = 0; i < resShapeSize - 1; i++) {
+        assert(resInstData[i] == 1 &&
+               "only innermost dim can have non-unit instData");
+      }
+      SmallVector<int> inferredInstData(srcShapeSize, 1);
+      inferredInstData[srcShapeSize - 1] =
+          std::min(resInstData[resShapeSize - 1], srcShape[srcShapeSize - 1]);
+      return xegpu::LayoutAttr::get(context, inferredInstData);
+    }
+
+    if (resLaneLayout.size() != 0) {
+      // assert resLaneData must be 1 for all but the innermost dim
+      for (int i = 0; i < resShapeSize - 1; i++) {
+        assert(resLaneData[i] == 1 &&
+               "only innermost dim can have non-unit laneData");
+      }
+      assert(srcShape.back() % resLaneLayout.back() == 0 &&
+             "source innermost dim must be divisible by result lane layout");
+      SmallVector<int> inferredLaneLayout(srcShapeSize, 1);
+      SmallVector<int> inferredLaneData(srcShapeSize, 1);
+      inferredLaneLayout.back() = resLaneLayout.back();
+      inferredLaneData.back() = std::min(
+          resLaneData.back(), srcShape.back() / inferredLaneLayout.back());
+      return xegpu::LayoutAttr::get(context, inferredLaneLayout,
+                                    inferredLaneData);
+    }
+  }
+  // llvm_unreachable does not return, so no trailing return is needed.
+  llvm_unreachable("running into unsupported shape cast scenarios");
+}
+
+/// Sets up layout for reduction operations by creating a SliceAttr for the
+/// result.
+///
+/// Algorithm Overview:
+/// This function attempts to construct a source layout that, when sliced along
+/// reduction dimensions, produces a result layout compatible with the
+/// consumer layout.
+///
+/// For subgroup layouts, it first tries to align the source layout's subgroup
+/// layout and data with the consumer's layout on non-reduction dimensions.
+/// Then, it distributes remaining subgroups across reduction dimensions. This
+/// avoids subgroup data redistribution overhead between the reduced result and
+/// its consumer.
+///
+/// InstData requires {1, ..., min(maxReduceVectorSize, srcShape), subgroupSize}
+/// Lane Layout requires {1, ..., 1, subgroupSize}
+/// Lane data requires {1, ..., min(maxReduceVectorSize, srcShape), 1}
+///
+/// Examples:
+/// 1. Subgroup layout - Row reduction on 2D tensor:
+/// srcShape=[32, 64], reductionDims=[1], resShape=[32], subgroupSize=16,
+/// workgroupSize=32
+/// Consumer Layout:
+/// #xegpu.slice<#xegpu.layout<sg_layout=[4, 8], sg_data=[8, 8]>,
+/// dims = [1]>
+/// Result: srcLayout with sgLayout=[4, 8], sgData=[8, 8] (matches consumer
+/// on non-reduction dim, minimizing data redistribution on reduction dim)
+/// 2. Subgroup layout - Same example above but consumer has different layout:
+/// sgLayout=[32], sgData=[1]
+/// Result: srcLayout with sgLayout=[32,1], sgData=[1, 64]
+/// (distributes all subgroups on non reduction dim)
+///
+/// 3. InstData layout - Column reduction:
+/// srcShape=[32, 64], reductionDims=[0], subgroupSize=16
+/// Result: instData=[1, 16] (maxReduceVectorSize=1, subgroupSize on
+/// innermost)
+///
+/// 4. Lane layout - Multi-dimensional reduction:
+/// srcShape=[16, 32, 64], reductionDims=[1], subgroupSize=16
+/// Result: laneLayout=[1, 1, 16], laneData=[1, 1, 1]
+/// (subgroupSize on innermost dim, max vector size on reduction dim)
+
+xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
+ xegpu::LayoutKind layoutKind, VectorType srcVecTy,
+ DistributeLayoutAttr consumerLayout, SmallVector<int64_t> reductionDims,
+ const xegpu::uArch::uArch *uArch) {
+
+ auto srcShape = srcVecTy.getShape();
+ int srcRank = srcShape.size();
+ auto context = consumerLayout.getContext();
+
+ // Reduction layout requires at least 2D tensors; lower ranks yield null.
+ if (srcRank < 2)
+ return nullptr;
+
+ // Helper lambda converting an int64 array into a DenseI32ArrayAttr.
+ auto toInt32Attr = [&](ArrayRef<int64_t> vec) {
+ SmallVector<int32_t> vec32(vec.begin(), vec.end());
+ return DenseI32ArrayAttr::get(context, vec32);
+ };
+
+ // Plain layout: parent of the flattened slice consumer, else the consumer.
+ xegpu::SliceAttr consumerSliceLayout =
+ dyn_cast<xegpu::SliceAttr>(consumerLayout);
+ DistributeLayoutAttr plainLayout =
+ consumerSliceLayout ? consumerSliceLayout.flatten().getParent()
+ : consumerLayout;
+
+ const int subgroupSize = uArch->getSubgroupSize();
+ int64_t maxReduceVectorSize = 1; // could be extended to SPIR-V vector size
+
+ xegpu::DistributeLayoutAttr srcLayout;
+
+ if (layoutKind == xegpu::LayoutKind::Subgroup) {
+ auto sgLayoutVec = plainLayout.getEffectiveSgLayoutAsInt();
+ const int workgroupSize = std::accumulate(
+ sgLayoutVec.begin(), sgLayoutVec.end(), 1, std::multiplies<int64_t>());
+ SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank);
+ SmallVector<int64_t> consumerSgLayout =
+ consumerLayout.getEffectiveSgLayoutAsInt();
+ int remainingSgCount = workgroupSize;
+ int consumerIdx = consumerSgLayout.size() - 1;
+
+ // First pass (innermost dim first): reuse consumer sg_layout on kept dims
+ for (int i = srcRank - 1; i >= 0; i--) {
+ if (!llvm::is_contained(reductionDims, i) && consumerIdx >= 0) {
+ sgLayout[i] = consumerSgLayout[consumerIdx];
+ assert((srcShape[i] % sgLayout[i] == 0) &&
+ "source shape not divisible by consumer sg_layout");
+ sgData[i] = srcShape[i] / sgLayout[i];
+ remainingSgCount /= sgLayout[i];
+ consumerIdx--;
+ }
+ }
+
+ // Second pass: Distribute remaining subgroups across reduction dimensions
+ for (int i = srcRank - 1; i >= 0; i--) {
+ if (llvm::is_contained(reductionDims, i)) {
+ sgLayout[i] =
+ std::min(srcShape[i], static_cast<int64_t>(remainingSgCount));
+ assert((srcShape[i] % sgLayout[i] == 0) &&
+ "source shape not divisible by sg_layout");
+ sgData[i] = srcShape[i] / sgLayout[i];
+ remainingSgCount /= sgLayout[i];
+ }
+ }
+
+ assert(remainingSgCount == 1 && "not all subgroups distributed");
+ srcLayout = xegpu::LayoutAttr::get(
+ context, toInt32Attr(sgLayout), toInt32Attr(sgData),
+ /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
+ /*lane_data =*/nullptr, /*order =*/nullptr);
+
+ } else if (layoutKind == xegpu::LayoutKind::InstData) {
+
+ SmallVector<int64_t> instData(srcRank, 1);
+ instData[srcRank - 2] =
+ std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
+ instData[srcRank - 1] = subgroupSize;
+ srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
+
+ } else if (layoutKind == xegpu::LayoutKind::Lane) {
+
+ SmallVector<int64_t> laneLayout(srcRank, 1), laneData(srcRank, 1);
+ laneLayout[srcRank - 1] = subgroupSize;
+ laneData[srcRank - 2] =
+ std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
+ srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(laneLayout),
+ toInt32Attr(laneData),
+ consumerLayout.getOrder());
+ }
+
+ return xegpu::SliceAttr::get(context, srcLayout,
+ DenseI64ArrayAttr::get(context, reductionDims));
+}
+
+/// Chooses the result layout of a bitcast from the layout its consumer
+/// expects.
+///
+/// If the result element type is at least as wide as the source element type,
+/// the consumer layout is returned unchanged. Otherwise (narrowing cast, with
+/// bitWidthRatio = srcBitWidth / resBitWidth) only the innermost dimension is
+/// touched, depending on `layoutKind`:
+///   - Subgroup: sgData of the innermost dim is passed through as is.
+///   - InstData: instData is doubled until it is a multiple of
+///     subgroupSize * bitWidthRatio or exceeds the source innermost dim.
+///   - Lane: laneData is doubled until it is a multiple of bitWidthRatio or
+///     exceeds the source innermost dim.
+/// The chosen value is applied with setDimData; the data kinds that were not
+/// selected are passed as -1.
+///
+/// Examples:
+///   1. f32 -> f16 (ratio 2), consumer instData=[1, 16], subgroupSize=16,
+///      source shape [8, 32]: result instData=[1, 32].
+///   2. f32 -> i8 (ratio 4), consumer instData=[1, 16], subgroupSize=16,
+///      source shape [4, 128]: result instData=[1, 64].
+///   3. i8 -> i32 (widening), consumer laneLayout=[1, 16], laneData=[1, 4]:
+///      the consumer layout is returned directly.
+xegpu::DistributeLayoutAttr xegpu::setupBitCastResultLayout(
+    xegpu::LayoutKind layoutKind, VectorType srcVecTy, VectorType resVecTy,
+    DistributeLayoutAttr consumerLayout, const xegpu::uArch::uArch *uArch) {
+  int srcElemTyBitWidth = srcVecTy.getElementType().getIntOrFloatBitWidth();
+  int resElemTyBitWidth = resVecTy.getElementType().getIntOrFloatBitWidth();
+
+  ArrayRef<int64_t> srcShape = srcVecTy.getShape();
+  SmallVector<int64_t> sgData = consumerLayout.getEffectiveSgDataAsInt();
+  SmallVector<int64_t> instData = consumerLayout.getEffectiveInstDataAsInt();
+  SmallVector<int64_t> laneData = consumerLayout.getEffectiveLaneDataAsInt();
+  const int subgroupSize = uArch->getSubgroupSize();
+
+  // Same-width or widening cast: nothing to adjust.
+  if (srcElemTyBitWidth <= resElemTyBitWidth)
+    return consumerLayout;
+
+  // Narrowing cast: grow the innermost-dim data so that it can later be
+  // divided by the ratio to get back to the source layout.
+  size_t dim = srcShape.size() - 1;
+  int bitWidthRatio = srcElemTyBitWidth / resElemTyBitWidth;
+  int innermostDimLaneLayout = subgroupSize;
+  int64_t sgDataValue = -1;
+  int64_t instDataValue = -1;
+  int64_t laneDataValue = -1;
+
+  if (layoutKind == xegpu::LayoutKind::Subgroup) {
+    assert(sgData.size() == srcShape.size() &&
+           "sgData must be available for all dimensions");
+    sgDataValue = sgData[dim];
+  } else if (layoutKind == xegpu::LayoutKind::InstData) {
+    assert(instData.size() == srcShape.size() &&
+           "instData must be available for all dimensions");
+    // Keep doubling until the value still fits an instruction after it is
+    // divided by bitWidthRatio.
+    for (instDataValue = instData[dim];
+         (instDataValue <= srcShape[dim]) &&
+         (instDataValue % (innermostDimLaneLayout * bitWidthRatio) != 0);
+         instDataValue *= 2) {
+    }
+    assert((srcShape[dim] % instDataValue) == 0 &&
+           "srcShape, instData, and lanelayout for innermost must be 2^n !");
+  } else if (layoutKind == xegpu::LayoutKind::Lane) {
+    assert(laneData.size() == srcShape.size() &&
+           "laneData must be available for all dimensions");
+    for (laneDataValue = laneData[dim];
+         (laneDataValue <= srcShape[dim]) &&
+         (laneDataValue % bitWidthRatio != 0);
+         laneDataValue *= 2) {
+    }
+  }
+
+  return consumerLayout.setDimData(dim, sgDataValue, instDataValue,
+                                   laneDataValue);
+}
+
+/// Sets up the result layout for an insert strided slice operation.
+/// Creates a result layout based on the specified layout kind (InstData or
+/// Lane).
+/// Subgroup layout is currently not supported for this operation; a null
+/// layout is returned for it (and for any other unhandled layout kind).
+/// InstData layout is first set to be {1, .., subgroupSize}.
+/// Lane layout is first set to be {1, ..., subgroupSize} with lane data {1,
+/// ..., 1}. The instData and laneData are then adjusted to contain packed
+/// data if the consumer layout's innermost dimension already uses the packed
+/// size and the source innermost dimension is large enough.
+///
+/// Examples:
+///   1. InstData layout without packing:
+///      resShape=[8, 32], subgroupSize=16, bitwidth=32
+///      packingFactor=1, packedDataSize=16
+///      consumerLayout: instData=[1, 16]
+///      Result: instData=[1, 16]
+///
+///   2. InstData layout with packing:
+///      resShape=[8, 64], subgroupSize=16, bitwidth=8, packingFactor=4
+///      consumerLayout: instData=[1, 64]
+///      Result: instData=[1, 64] (adjusted for packed data)
+///
+///   3. Lane layout without packing:
+///      resShape=[4, 64], subgroupSize=16, bitwidth=32
+///      consumerLayout: laneLayout=[1, 16], laneData=[1, 1]
+///      Result: laneLayout=[1, 16], laneData=[1, 1]
+///
+///   4. Lane layout with packing:
+///      resShape=[4, 64], subgroupSize=16, bitwidth=16, packingFactor=2
+///      consumerLayout: laneLayout=[1, 16], laneData=[1, 2]
+///      Result: laneLayout=[1, 16], laneData=[1, 2] (adjusted for packed data)
+xegpu::DistributeLayoutAttr xegpu::setupInsertStridedSliceResultLayout(
+    xegpu::LayoutKind layoutKind, VectorType srcVectorTy,
+    VectorType resVectorTy, xegpu::DistributeLayoutAttr consumerLayout,
+    const xegpu::uArch::uArch *uArch) {
+
+  // Subgroup layout assignment is not supported for insertStridedSlice.
+  // Return a null layout explicitly; an `assert(true && "...")` here would
+  // never fire and only hides the fact that null is the result.
+  if (layoutKind == xegpu::LayoutKind::Subgroup)
+    return nullptr;
+
+  auto subgroupSize = uArch->getSubgroupSize();
+  auto context = resVectorTy.getContext();
+  int resShapeSize = resVectorTy.getShape().size();
+  auto srcShape = srcVectorTy.getShape();
+
+  // Elements narrower than the packed-format size are packed several per
+  // unit; packedDataSize is the innermost extent covering a full subgroup of
+  // packed units.
+  const unsigned packingSize{uArch->getGeneralPackedFormatBitSize()};
+  unsigned bitwidth = resVectorTy.getElementType().getIntOrFloatBitWidth();
+  int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
+  int packedDataSize = subgroupSize * packingFactor;
+
+  if (layoutKind == xegpu::LayoutKind::InstData) {
+    assert(srcShape.back() >= subgroupSize &&
+           "source innermost dim must be >= subgroupSize");
+    SmallVector<int64_t> consumerInstData =
+        consumerLayout.getEffectiveInstDataAsInt();
+    SmallVector<int> instData(resShapeSize, 1);
+    instData.back() = subgroupSize;
+    if (consumerInstData.back() == packedDataSize &&
+        srcShape.back() >= packedDataSize)
+      instData.back() = packedDataSize;
+    return xegpu::LayoutAttr::get(context, instData);
+  }
+
+  if (layoutKind == xegpu::LayoutKind::Lane) {
+    SmallVector<int64_t> consumerLaneData =
+        consumerLayout.getEffectiveLaneDataAsInt();
+    SmallVector<int> laneLayout(resShapeSize, 1);
+    SmallVector<int> laneData(resShapeSize, 1);
+    laneLayout.back() = subgroupSize;
+    if (consumerLaneData.back() == packingFactor &&
+        srcShape.back() >= packedDataSize)
+      laneData.back() = packingFactor;
+    return xegpu::LayoutAttr::get(context, laneLayout, laneData);
+  }
+
+  return nullptr;
+}
+
+/// Sets up the anchor layout for load gather and load matrix operation.
+/// load matrix lowers to load gather and 1d block load. All of them share the
+/// same layout setup logic.
+/// For Subgroup layout, uses the consumer layout directly.
+/// non-chunked loads:
+///   InstData = {1, ..., min(consumer, maxLaneLoadSize * subgroupSize)}
+///   LaneLayout = {1, ..., subgroupSize}
+///   lane_data = {1, ..., min(consumer, maxLaneLoadSize)}
+/// chunked loads (2D only):
+///   InstData = {subgroupSize, min(consumer, maxLaneLoadSize)}
+///   LaneLayout = {subgroupSize, 1}
+///   lane_data = {1, min(consumer, maxLaneLoadSize)}
+/// Returns a null layout for any other layout kind.
+static xegpu::DistributeLayoutAttr setupGenericLoadAnchorLayout(
+    xegpu::LayoutKind layoutKind, mlir::MLIRContext *context,
+    xegpu::DistributeLayoutAttr consumerLayout, bool isChunkedLoad,
+    int maxChunkSize, int valShapeSize, int subgroupSize) {
+
+  if (layoutKind == xegpu::LayoutKind::Subgroup)
+    return consumerLayout;
+
+  SmallVector<int64_t> consumerInstData =
+      consumerLayout.getEffectiveInstDataAsInt();
+  SmallVector<int64_t> consumerLaneData =
+      consumerLayout.getEffectiveLaneDataAsInt();
+
+  SmallVector<int> instData(valShapeSize, 1);
+  SmallVector<int> laneLayout(valShapeSize, 1);
+  SmallVector<int> laneData(valShapeSize, 1);
+
+  if (!isChunkedLoad) {
+    if (layoutKind == xegpu::LayoutKind::InstData) {
+      instData[valShapeSize - 1] =
+          std::min(static_cast<int>(consumerInstData[valShapeSize - 1]),
+                   maxChunkSize * subgroupSize);
+      return xegpu::LayoutAttr::get(context, instData);
+    }
+    if (layoutKind == xegpu::LayoutKind::Lane) {
+      laneLayout.back() = subgroupSize;
+      laneData.back() =
+          std::min(static_cast<int>(consumerLaneData.back()), maxChunkSize);
+      return xegpu::LayoutAttr::get(context, laneLayout, laneData);
+    }
+    return nullptr;
+  }
+
+  // Chunked load: dim 0 is spread over the lanes, dim 1 is the per-lane
+  // chunk.
+  assert(valShapeSize == 2 && "Chunked Load must access 2D tensor tile.");
+  if (layoutKind == xegpu::LayoutKind::InstData) {
+    instData[0] = subgroupSize;
+    instData[1] =
+        std::min(static_cast<int>(consumerInstData[1]), maxChunkSize);
+    return xegpu::LayoutAttr::get(context, instData);
+  }
+  if (layoutKind == xegpu::LayoutKind::Lane) {
+    laneLayout[0] = subgroupSize;
+    laneData[1] =
+        std::min(static_cast<int>(consumerLaneData[1]), maxChunkSize);
+    return xegpu::LayoutAttr::get(context, laneLayout, laneData);
+  }
+  return nullptr;
+}
+
+/// Sets up the anchor layout for a load gather operation.
+///
+/// The per-lane load limit is taken from the uArch's LoadGather instruction
+/// description; a `chunkSize` greater than 1 selects the chunked-load layout.
+/// Returns a null layout if the uArch does not provide a
+/// SpirvLoadGatherInstruction for InstructionKind::LoadGather.
+xegpu::DistributeLayoutAttr xegpu::setupLoadGatherAnchorLayout(
+    xegpu::LayoutKind layoutKind, VectorType resVecTy, int chunkSize,
+    xegpu::DistributeLayoutAttr consumerLayout, const uArch::uArch *uArch) {
+
+  const int subgroupSize = uArch->getSubgroupSize();
+  int resShapeSize = resVecTy.getShape().size();
+  auto context = resVecTy.getContext();
+  auto elemBitWidth = resVecTy.getElementType().getIntOrFloatBitWidth();
+
+  // dyn_cast_if_present tolerates a null lookup result; bail out instead of
+  // dereferencing a null instruction description.
+  const auto *uArchInstruction =
+      llvm::dyn_cast_if_present<xegpu::uArch::SpirvLoadGatherInstruction>(
+          uArch->getInstruction(xegpu::uArch::InstructionKind::LoadGather));
+  if (!uArchInstruction)
+    return nullptr;
+  int maxChunkSize = uArchInstruction->getMaxLaneLoadSize(elemBitWidth);
+
+  return setupGenericLoadAnchorLayout(layoutKind, context, consumerLayout,
+                                      (chunkSize > 1), maxChunkSize,
+                                      resShapeSize, subgroupSize);
+}
+
+/// Sets up the anchor layout for load matrix operation.
+/// The layout is always set up as a non-chunked load.
+/// Returns a null layout if the uArch does not provide a
+/// LoadMatrixInstruction for InstructionKind::LoadMatrix.
+/// TODO: enhance load matrix to indicate lowering to chunked load or not.
+xegpu::DistributeLayoutAttr
+xegpu::setupLoadMatrixAnchorLayout(xegpu::LayoutKind layoutKind,
+                                   VectorType resVecTy,
+                                   xegpu::DistributeLayoutAttr consumerLayout,
+                                   const xegpu::uArch::uArch *uArch) {
+
+  const int subgroupSize = uArch->getSubgroupSize();
+  int resShapeSize = resVecTy.getShape().size();
+  auto context = resVecTy.getContext();
+  auto elemBitWidth = resVecTy.getElementType().getIntOrFloatBitWidth();
+
+  // dyn_cast_if_present tolerates a null lookup result; bail out instead of
+  // dereferencing a null instruction description.
+  const auto *uArchInstruction =
+      llvm::dyn_cast_if_present<xegpu::uArch::LoadMatrixInstruction>(
+          uArch->getInstruction(xegpu::uArch::InstructionKind::LoadMatrix));
+  if (!uArchInstruction)
+    return nullptr;
+  int maxChunkSize = uArchInstruction->getMaxLaneLoadSize(elemBitWidth);
+
+  return setupGenericLoadAnchorLayout(layoutKind, context, consumerLayout,
+                                      /*isChunkedLoad=*/false, maxChunkSize,
+                                      resShapeSize, subgroupSize);
+}
+
+/// Sets up the anchor layout for store scatter and store matrix operation.
+/// store matrix lowers to store scatter and 1d block store. All of them share
+/// the same layout setup logic. Subgroup layout is not supported yet: a null
+/// layout is returned for it (and for any other unhandled layout kind).
+/// non-chunked stores:
+///   InstData = {1, ..., subgroupSize}
+///   LaneLayout = {1, ..., subgroupSize}
+///   lane_data = {1, ..., 1}
+/// chunked stores (2D only):
+///   InstData = {subgroupSize, min(srcVec, maxLaneStoreSize)}
+///   LaneLayout = {subgroupSize, 1}
+///   lane_data = {1, min(srcVec, maxLaneStoreSize)}
+static xegpu::DistributeLayoutAttr
+setupGenericStoreAnchorLayout(xegpu::LayoutKind layoutKind,
+                              mlir::MLIRContext *context, bool isChunkedStore,
+                              int maxChunkSize, ArrayRef<int64_t> srcShape,
+                              int subgroupSize) {
+
+  // Subgroup layout assignment is not supported for storeScatter. Return a
+  // null layout explicitly; an `assert(true && "...")` here would never fire.
+  if (layoutKind == xegpu::LayoutKind::Subgroup)
+    return nullptr;
+
+  int srcShapeSize = srcShape.size();
+  SmallVector<int> instData(srcShapeSize, 1);
+  SmallVector<int> laneLayout(srcShapeSize, 1);
+  SmallVector<int> laneData(srcShapeSize, 1);
+
+  if (!isChunkedStore) {
+    if (layoutKind == xegpu::LayoutKind::InstData) {
+      instData[srcShapeSize - 1] = subgroupSize;
+      return xegpu::LayoutAttr::get(context, instData);
+    }
+    if (layoutKind == xegpu::LayoutKind::Lane) {
+      laneLayout[srcShapeSize - 1] = subgroupSize;
+      return xegpu::LayoutAttr::get(context, laneLayout, laneData);
+    }
+    return nullptr;
+  }
+
+  // Chunked store: dim 0 is spread over the lanes, dim 1 is the per-lane
+  // chunk.
+  assert(srcShapeSize == 2 && "Chunked Store must access 2D tensor tile.");
+  if (layoutKind == xegpu::LayoutKind::InstData) {
+    instData[0] = subgroupSize;
+    instData[1] = std::min(static_cast<int>(srcShape[1]), maxChunkSize);
+    return xegpu::LayoutAttr::get(context, instData);
+  }
+  if (layoutKind == xegpu::LayoutKind::Lane) {
+    laneLayout[0] = subgroupSize;
+    laneData[1] = std::min(static_cast<int>(srcShape[1]), maxChunkSize);
+    return xegpu::LayoutAttr::get(context, laneLayout, laneData);
+  }
+  return nullptr;
+}
+
+/// Sets up the anchor layout for a store scatter operation.
+///
+/// The per-lane store limit is taken from the uArch's StoreScatter
+/// instruction description; a `chunkSize` greater than 1 selects the
+/// chunked-store layout. Returns a null layout if the uArch does not provide
+/// a SpirvStoreScatterInstruction for InstructionKind::StoreScatter.
+xegpu::DistributeLayoutAttr
+xegpu::setupStoreScatterAnchorLayout(xegpu::LayoutKind layoutKind,
+                                     VectorType srcVecTy, int chunkSize,
+                                     const uArch::uArch *uArch) {
+
+  const int subgroupSize = uArch->getSubgroupSize();
+  ArrayRef<int64_t> srcShape = srcVecTy.getShape();
+  auto context = srcVecTy.getContext();
+  auto elemBitWidth = srcVecTy.getElementType().getIntOrFloatBitWidth();
+
+  // dyn_cast_if_present tolerates a null lookup result; bail out instead of
+  // dereferencing a null instruction description.
+  const auto *uArchInstruction =
+      llvm::dyn_cast_if_present<xegpu::uArch::SpirvStoreScatterInstruction>(
+          uArch->getInstruction(xegpu::uArch::InstructionKind::StoreScatter));
+  if (!uArchInstruction)
+    return nullptr;
+  int maxChunkSize = uArchInstruction->getMaxLaneStoreSize(elemBitWidth);
+
+  return setupGenericStoreAnchorLayout(layoutKind, context, (chunkSize > 1),
+                                       maxChunkSize, srcShape, subgroupSize);
+}
+
+/// Sets up the anchor layout for a store matrix operation.
+/// The layout is always set up as a non-chunked store.
+/// Returns a null layout if the uArch does not provide a
+/// StoreMatrixInstruction for InstructionKind::StoreMatrix.
+xegpu::DistributeLayoutAttr
+xegpu::setupStoreMatrixAnchorLayout(xegpu::LayoutKind layoutKind,
+                                    VectorType srcVecTy,
+                                    const xegpu::uArch::uArch *uArch) {
+
+  const int subgroupSize = uArch->getSubgroupSize();
+  ArrayRef<int64_t> srcShape = srcVecTy.getShape();
+  auto context = srcVecTy.getContext();
+  auto elemBitWidth = srcVecTy.getElementType().getIntOrFloatBitWidth();
+
+  // dyn_cast_if_present tolerates a null lookup result; bail out instead of
+  // dereferencing a null instruction description.
+  const auto *uArchInstruction =
+      llvm::dyn_cast_if_present<xegpu::uArch::StoreMatrixInstruction>(
+          uArch->getInstruction(xegpu::uArch::InstructionKind::StoreMatrix));
+  if (!uArchInstruction)
+    return nullptr;
+  int maxChunkSize = uArchInstruction->getMaxLaneStoreSize(elemBitWidth);
+
+  return setupGenericStoreAnchorLayout(layoutKind, context,
+                                       /*isChunkedStore=*/false, maxChunkSize,
+                                       srcShape, subgroupSize);
+}
\ No newline at end of file
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
index 6a3e533fb2df4..8694bca974df1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
@@ -16,6 +16,7 @@
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/Dialect/XeGPU/uArch/uArchBase.h"
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index faafb7e8cee61..ccfab7350e351 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -15,7 +15,7 @@
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
-#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/Attributes.h"
@@ -127,6 +127,7 @@ struct LayoutInfo {
}
Attribute get() { return storage; }
+ void set(const xegpu::DistributeLayoutAttr &layout) { storage = layout; }
};
SmallVector<int> LayoutInfo::getLaneLayout() const {
@@ -307,27 +308,6 @@ static LayoutInfo getSIMTLayoutInfoBlockIO(Ty ty,
ty.getContext(), {1, uArch->getSubgroupSize()}, {1, packingFactor}));
}
-/// Helper to get the default layout for a vector type.
-static LayoutInfo getSIMTLayoutInfoScatterIO(VectorType vectorTy,
- const xegpu::uArch::uArch *uArch) {
- // Expecting a 1D or 2D vector.
- assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
- "Expected 1D or 2D vector.");
- // Expecting int or float element type.
- assert(vectorTy.getElementType().isIntOrFloat() &&
- "Expected int or float element type.");
- // If the rank is 1, then return default layout for 1D vector.
- const unsigned packingSize{uArch->getGeneralPackedFormatBitSize()};
- if (vectorTy.getRank() == 1)
- return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1, uArch);
- // Packing factor is determined by the element type bitwidth.
- unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
- int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
- return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(),
- {uArch->getSubgroupSize(), 1},
- {1, packingFactor}));
-}
-
/// Helper Function to get the expected layouts for DPAS operands. `lane_data`
/// is set according to the following criteria:
/// * For A operand, the data must be packed in minimum
@@ -417,11 +397,19 @@ class LayoutInfoPropagation
void visitShapeCastOp(vector::ShapeCastOp shapeCast,
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
+ void
+ visitInsertStridedSliceOp(vector::InsertStridedSliceOp insertStridedSlice,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
+
+ void visitLoadMatrixOp(xegpu::LoadMatrixOp load,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitStoreMatrixOp(xegpu::StoreMatrixOp store,
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
bool hasParamsOfLayoutKind(xegpu::DistributeLayoutAttr anchorLayout);
public:
@@ -497,6 +493,12 @@ LogicalResult LayoutInfoPropagation::visitOperation(
.Case([&](vector::ShapeCastOp shapeCastOp) {
visitShapeCastOp(shapeCastOp, operands, results);
})
+ .Case([&](vector::InsertStridedSliceOp insertStridedSliceOp) {
+ visitInsertStridedSliceOp(insertStridedSliceOp, operands, results);
+ })
+ .Case([&](xegpu::LoadMatrixOp loadMatrixOp) {
+ visitLoadMatrixOp(loadMatrixOp, operands, results);
+ })
.Case([&](xegpu::StoreMatrixOp storeMatrixOp) {
visitStoreMatrixOp(storeMatrixOp, operands, results);
})
@@ -646,32 +648,45 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
// The layout of the result must be present.
- LayoutInfo resultLayout = results[0]->getValue();
- if (!resultLayout.isAssigned())
- return;
- // We only consider 2D -> 1D reductions at this point.
- VectorType resultTy = llvm::dyn_cast<VectorType>(reduction.getDestType());
- if (!resultTy || resultTy.getRank() != 1) {
- reduction.emitWarning("Expecting output type to be 1D vector.");
+ LayoutInfo resLayoutInfo = results[0]->getValue();
+ if (!resLayoutInfo.isAssigned())
return;
- }
+
+ VectorType sourceTy = reduction.getSourceVectorType();
+ SmallVector<int64_t> reductionDims(reduction.getReductionDims());
+
auto uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
- // Given that the result is 1D, the layout of the operand should be 2D with
- // default layout.
- LayoutInfo operandLayout = getDefaultSIMTLayoutInfo(
- reduction->getContext(), 2, uArch->getSubgroupSize());
- propagateIfChanged(operands[0], operands[0]->meet(operandLayout));
+ auto consumerLayoutAttr =
+ dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+
+ // The result layout represents the layout requirements of the operation.
+ // It is recorded as an anchor layout or a temporary layout.
+ // It must be honored for the current op and may conflict with the layout
+ // propagated from the consumer op; the conflict is resolved in a later phase
+ // by converting the required result layout to the consumer layout.
+ auto requiredResLayoutAttr = xegpu::setupMultiReductionResultLayout(
+ layoutKind, sourceTy, consumerLayoutAttr, reductionDims, uArch);
+
+ xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);
+
+ // derive the source layout from the dominant layout and reduction dims
+ auto srcLayoutAttr = xegpu::inferMultiReductionSourceLayout(
+ requiredResLayoutAttr, reductionDims);
+
+ propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
// Accumulator should have the same layout as the result.
- propagateIfChanged(operands[1], operands[1]->meet(resultLayout));
+ propagateIfChanged(operands[1],
+ operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
}
void LayoutInfoPropagation::visitVectorBroadCastOp(
vector::BroadcastOp broadcast, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
// The layout of the result must be present.
- LayoutInfo resultLayout = results[0]->getValue();
- if (!resultLayout.isAssigned())
+ LayoutInfo resLayoutInfo = results[0]->getValue();
+ if (!resLayoutInfo.isAssigned())
return;
+
// Only consider vector to vector broadcasts for now.
VectorType resultTy = broadcast.getResultVectorType();
VectorType sourceTy = dyn_cast<VectorType>(broadcast.getSourceType());
@@ -679,55 +694,41 @@ void LayoutInfoPropagation::visitVectorBroadCastOp(
if (!sourceTy)
return;
- // Hanlding broadcast from low-rank to high-rank (e.g., 1D to 2D) case.
- if (sourceTy.getRank() != resultTy.getRank()) {
- auto sourceDims = sourceTy.getShape();
- auto resultDims = resultTy.getShape();
- SmallVector<int64_t> bcastDims;
- auto dimDiff = resultTy.getRank() - sourceTy.getRank();
- // adding the missing leading dims
- for (int i = 0; i < dimDiff; i++)
- bcastDims.push_back(i);
-
- // for the rest dims in the resultTy, if sourceTy dim is 1, then it's
- // broadcasted dim
- for (size_t i = 0; i < sourceDims.size(); i++)
- if ((sourceDims[i] == 1) && (resultDims[i + dimDiff] != 1))
- bcastDims.push_back(i + dimDiff);
-
- // create a slice layout for the source
- xegpu::SliceAttr sliceLayout = xegpu::SliceAttr::get(
- broadcast->getContext(),
- cast<xegpu::DistributeLayoutAttr>(resultLayout.get()),
- DenseI64ArrayAttr::get(broadcast->getContext(), bcastDims));
-
- propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sliceLayout)));
- return;
- }
- propagateIfChanged(operands[0], operands[0]->meet(resultLayout));
+ auto srcShape = sourceTy.getShape();
+ auto resShape = resultTy.getShape();
+ size_t dimDiff = resultTy.getRank() - sourceTy.getRank();
+ // Warn only for the mixed case: added leading dims plus stretched unit dims.
+ for (size_t i = 0; i < srcShape.size(); i++)
+ if (dimDiff > 0 && (srcShape[i] == 1) && (resShape[i + dimDiff] != 1))
+ broadcast.emitWarning("broadcast must either from low-rank or same-rank "
+ "with unit-dim, mixed scenario is not supported!");
+
+ auto resultLayoutAttr =
+ dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+
+ xegpu::DistributeLayoutAttr srcLayoutAttr =
+ xegpu::inferBroadcastSourceLayout(resultLayoutAttr, resShape, srcShape);
+
+ propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
+ return;
}
void LayoutInfoPropagation::visitShapeCastOp(
vector::ShapeCastOp shapeCast, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
// The layout of the result must be present.
- LayoutInfo resultLayout = results[0]->getValue();
- if (!resultLayout.isAssigned())
+ LayoutInfo resLayoutInfo = results[0]->getValue();
+ if (!resLayoutInfo.isAssigned())
return;
- VectorType sourceTy = shapeCast.getSourceVectorType();
- VectorType resultTy = shapeCast.getResultVectorType();
- // Shape cast layout propagation only supports 1D -> 2D shape casts.
- // TODO: Support kD -> nD shape casts (k < n, n >= 2) where expanded dims are
- // unit dimensions and non-unit dims match.
- if (sourceTy.getRank() != 1 || resultTy.getRank() != 2) {
- shapeCast.emitWarning("Expecting shape cast to be 1D -> 2D.");
- return;
- }
- int64_t slicedDim = resultTy.getShape()[0] == 1 ? 0 : 1;
- xegpu::SliceAttr sliceLayout = xegpu::SliceAttr::get(
- shapeCast->getContext(), cast<xegpu::LayoutAttr>(resultLayout.get()),
- DenseI64ArrayAttr::get(shapeCast->getContext(), {slicedDim}));
- propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sliceLayout)));
+ ArrayRef<int64_t> resShape = shapeCast.getResultVectorType().getShape();
+ ArrayRef<int64_t> srcShape = shapeCast.getSourceVectorType().getShape();
+ auto resultLayoutAttr =
+ dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+
+ xegpu::DistributeLayoutAttr srcLayoutAttr =
+ xegpu::inferShapeCastSourceLayout(resultLayoutAttr, resShape, srcShape);
+
+ propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
}
/// Propagate the layout of the result tensor to the source tensor descriptor
@@ -748,7 +749,6 @@ void LayoutInfoPropagation::visitUpdateNdOffsetOp(
void LayoutInfoPropagation::visitDpasOp(
xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
-
LayoutInfo dpasALayout;
LayoutInfo dpasBLayout;
LayoutInfo dpasCDLayout;
@@ -945,7 +945,6 @@ void LayoutInfoPropagation::visitDpasOp(
void LayoutInfoPropagation::visitStoreNdOp(
xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
-
LayoutInfo storeLayout;
xegpu::DistributeLayoutAttr anchorLayout = store.getLayoutAttr();
if (hasParamsOfLayoutKind(anchorLayout)) {
@@ -986,7 +985,7 @@ void LayoutInfoPropagation::visitStoreNdOp(
storeLayout =
getSIMTLayoutInfoBlockIO(store.getValueType(), uArch,
uArchInstruction->getPackedFormatBitSize());
- else { // LayoutKind::Subgroup
+ else { // xegpu::LayoutKind::Subgroup
auto sgSize = uArch->getSubgroupSize();
auto numSgOrErr = getNumSg(store, sgSize);
if (failed(numSgOrErr)) {
@@ -1026,7 +1025,6 @@ void LayoutInfoPropagation::visitStoreNdOp(
void LayoutInfoPropagation::visitLoadNdOp(
xegpu::LoadNdOp load, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
-
LayoutInfo loadLayout;
xegpu::DistributeLayoutAttr anchorLayout = load.getLayoutAttr();
if (hasParamsOfLayoutKind(anchorLayout)) {
@@ -1072,66 +1070,60 @@ void LayoutInfoPropagation::visitVectorBitcastOp(
vector::BitCastOp bitcast, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
// Need the layout of bitcast result to propagate to the operands.
- LayoutInfo resultLayout = results[0]->getValue();
- if (!resultLayout.isAssigned())
- return;
- int inElemTyBitWidth =
- bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth();
- int outElemTyBitWidth =
- bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
- // If the element bit widths are the same, then the layout does not change.
- if (inElemTyBitWidth == outElemTyBitWidth) {
- propagateIfChanged(operands[0], operands[0]->meet(resultLayout));
+ LayoutInfo resLayoutInfo = results[0]->getValue();
+ if (!resLayoutInfo.isAssigned())
return;
- }
- // Check if the result layout is valid. i.e. result vector can be distributed.
- auto resultLaneLayout = resultLayout.getLaneLayout();
- auto resultLaneData = resultLayout.getLaneData();
- if (failed(xegpu::getDistributedVectorType(
- bitcast.getResultVectorType(),
- xegpu::LayoutAttr::get(bitcast->getContext(), resultLaneLayout,
- resultLaneData)))) {
- bitcast.emitWarning(
- "Result vector type can not be evenly distributed across lanes.");
- return;
- }
- int64_t rank = bitcast.getSourceVectorType().getRank();
- // Bitcast is a `narrowing` if the input element type bit width larger than
- // the output element type bit width. eg. f32 -> f16 is a narrowing bitcast.
- bool isNarrowing = inElemTyBitWidth > outElemTyBitWidth;
- int bitCastRatio = isNarrowing ? inElemTyBitWidth / outElemTyBitWidth
- : outElemTyBitWidth / inElemTyBitWidth;
- SmallVector<int> sourceLaneLayout =
- resultLayout.getLaneLayout(); // Lane layout does not change for bitcast.
- SmallVector<int> outData = resultLayout.getLaneData();
-
- // TODO: Currently we assume that bitcasts does not require cross lane
- // communication. So each lane must own the required number of elements to
- // perform the bitcast locally without cross-lane communication.
- int outInnerBitsPerLane = outData[rank - 1] * outElemTyBitWidth;
- if (outInnerBitsPerLane < inElemTyBitWidth) {
- bitcast.emitWarning(
- "Narrowing bitcast with cross lane communication is not supported.");
- return;
- }
- // Check if each lane owns a single element in all dimensions except the
- // innermost dimension.
- SmallVector<int> sourceLaneData(outData.begin(), outData.end() - 1);
- if (llvm::any_of(sourceLaneData, [](int64_t d) { return d != 1; })) {
- bitcast.emitWarning("Each lane must not own multiple elements in any "
- "dimension other than "
- "the innermost dimension.");
+
+ auto srcVecType = bitcast.getSourceVectorType();
+ auto resVecType = bitcast.getResultVectorType();
+
+ auto consumerLayoutAttr =
+ dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+ auto uArch = getUArch(xegpu::getChipStr(bitcast).value_or(""));
+ auto requiredResLayoutAttr = setupBitCastResultLayout(
+ layoutKind, srcVecType, resVecType, consumerLayoutAttr, uArch);
+
+ xegpu::setTemporaryLayout(bitcast->getResult(0), requiredResLayoutAttr);
+
+ int inElemTyBitWidth = srcVecType.getElementType().getIntOrFloatBitWidth();
+ int outElemTyBitWidth = resVecType.getElementType().getIntOrFloatBitWidth();
+
+ // Derive the source layout from the required result layout and bit widths.
+ auto srcLayoutAttr = xegpu::inferBitCastSourceLayout(
+ requiredResLayoutAttr, outElemTyBitWidth, inElemTyBitWidth);
+
+ propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
+}
+/// Propagate the insert_strided_slice result layout to its two operands.
+void LayoutInfoPropagation::visitInsertStridedSliceOp(
+ vector::InsertStridedSliceOp insertStridedSlice,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ // The layout of the result must be present.
+ LayoutInfo resLayoutInfo = results[0]->getValue();
+ if (!resLayoutInfo.isAssigned())
return;
- }
- // Decide lane data based on whether the bitcast is narrowing or widening.
- int64_t innerMostLaneData = isNarrowing ? outData[rank - 1] / bitCastRatio
- : outData[rank - 1] * bitCastRatio;
- sourceLaneData.push_back(innerMostLaneData);
-
- propagateIfChanged(
- operands[0],
- operands[0]->meet(LayoutInfo(xegpu::LayoutAttr::get(
- bitcast->getContext(), sourceLaneLayout, sourceLaneData))));
+
+ auto srcVecType = insertStridedSlice.getSourceVectorType();
+ auto resVecType = insertStridedSlice.getDestVectorType();
+
+ auto consumerLayoutAttr =
+ dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+ auto uArch = getUArch(xegpu::getChipStr(insertStridedSlice).value_or(""));
+ // Layout this op requires for its result, given the lattice's result layout.
+ auto requiredResLayoutAttr = xegpu::setupInsertStridedSliceResultLayout(
+ layoutKind, srcVecType, resVecType, consumerLayoutAttr, uArch);
+ // Record the required layout on the result as a temporary layout.
+ xegpu::setTemporaryLayout(insertStridedSlice->getResult(0),
+ requiredResLayoutAttr);
+ // Infer the source layout from the required result layout and both shapes.
+ auto srcLayoutAttr = xegpu::inferInsertStridedSliceSourceLayout(
+ requiredResLayoutAttr, resVecType.getShape(), srcVecType.getShape());
+ // Operand 0 gets the source layout; operand 1 the required result layout.
+ propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
+ propagateIfChanged(operands[1],
+ operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
+ return;
}
/// Propagate the layout of the result to the tensor descriptor, mask and offset
@@ -1139,97 +1131,56 @@ void LayoutInfoPropagation::visitVectorBitcastOp(
void LayoutInfoPropagation::visitLoadGatherOp(
xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
-
- LayoutInfo loadLayout;
- LayoutInfo maskLayout;
+ xegpu::DistributeLayoutAttr requiredAnchorLayoutAttr;
+ xegpu::DistributeLayoutAttr anchorLayoutAttr = load.getLayoutAttr();
auto uArch = getUArch(getChipStr(load).value_or(""));
- const int subgroupSize = uArch->getSubgroupSize();
- xegpu::DistributeLayoutAttr anchorLayout = load.getLayoutAttr();
- if (hasParamsOfLayoutKind(anchorLayout)) {
- loadLayout = LayoutInfo(anchorLayout);
- maskLayout = loadLayout;
- } else {
- LayoutInfo valueLayout = results[0]->getValue();
- // Need the layout of the value to propagate to the tensor descriptor.
- if (!valueLayout.isAssigned())
- return;
+ auto subgroupSize = uArch->getSubgroupSize();
+ VectorType resVecTy = load.getValueType();
+ int chunkSize = load.getChunkSize().value_or(1);
- auto resAttr = dyn_cast<xegpu::DistributeLayoutAttr>(valueLayout.get());
- auto instDataIncoming = resAttr.getEffectiveInstDataAsInt();
- if (auto sliceAttr = dyn_cast<xegpu::SliceAttr>(resAttr))
- instDataIncoming = SmallVector<int64_t>(
- cast<xegpu::LayoutAttr>(sliceAttr.flatten().getParent())
- .getInstData()
- .asArrayRef());
+ LayoutInfo resLayoutInfo = results[0]->getValue();
+ if (!resLayoutInfo.isAssigned())
+ return;
+ auto consumerLayoutAttr =
+ dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
- VectorType payloadTy = load.getValueType();
- if (!payloadTy) {
+ if (hasParamsOfLayoutKind(anchorLayoutAttr)) {
+ requiredAnchorLayoutAttr = anchorLayoutAttr;
+ } else {
+ if (!resVecTy) {
load.emitWarning("Not propagating, non-vector payload supplied.");
return;
}
- const auto *uArchInstruction =
- dyn_cast<xegpu::uArch::LoadGatherInstructionInterface>(
- uArch->getInstruction(xegpu::uArch::InstructionKind::LoadGather));
-
- // Check if value inst_data complies with uArch
- if (layoutKind == xegpu::LayoutKind::InstData) {
- // Each lane loads either one element
- SmallVector<int> instDataUarch{subgroupSize};
- // Or multiple elements as 2D with lane's elements in the inner dimension
- if (payloadTy.getRank() != 1) {
- if (payloadTy.getRank() != 2) {
- load.emitWarning("Expected 2D payload for LoadGatherOp.");
- return;
- }
- int elemBitWidth = payloadTy.getElementTypeBitWidth();
- instDataUarch.push_back((
- std::min(static_cast<int>(payloadTy.getShape().back()),
- uArchInstruction->getMaxLaneLoadStoreSize(elemBitWidth))));
- }
- // If inst data does not match, enforce the uArch-based one
- if (!llvm::equal(instDataIncoming, instDataUarch)) {
- xegpu::LayoutAttr sourceAttr = dyn_cast<xegpu::LayoutAttr>(resAttr);
- if (auto sliceAttr = dyn_cast<xegpu::SliceAttr>(resAttr)) {
- sourceAttr = cast<xegpu::LayoutAttr>(sliceAttr.flatten().getParent());
- }
- assert(sourceAttr);
- xegpu::DistributeLayoutAttr updatedLayoutAttr = xegpu::LayoutAttr::get(
- load.getContext(), sourceAttr.getSgLayout(), sourceAttr.getSgData(),
- DenseI32ArrayAttr::get(load.getContext(), instDataUarch),
- sourceAttr.getLaneLayout(), sourceAttr.getLaneData(),
- sourceAttr.getOrder());
-
- if (auto sliceAttr = dyn_cast<xegpu::SliceAttr>(resAttr))
- updatedLayoutAttr = xegpu::SliceAttr::get(
- load.getContext(), updatedLayoutAttr, sliceAttr.getDims());
- valueLayout = LayoutInfo(updatedLayoutAttr);
- }
- }
- loadLayout = valueLayout;
- load.setLayoutAttr(dyn_cast<xegpu::DistributeLayoutAttr>(loadLayout.get()));
+ requiredAnchorLayoutAttr = xegpu::setupLoadGatherAnchorLayout(
+ layoutKind, resVecTy, chunkSize, consumerLayoutAttr, uArch);
+ load.setLayoutAttr(requiredAnchorLayoutAttr);
}
- // If no user-defined anchor or we deal with a chunked op, set the default
- // mask layout.
- // Rank 1 data : Keep the mask layout aligned with data.
- // Rank >1 data: Enforce the default xegpu 1D layout for mask.
- if (!hasParamsOfLayoutKind(anchorLayout) ||
- load.getValueType().getRank() > 1) {
+ auto maskLayoutAttr = requiredAnchorLayoutAttr;
+ // Special handling mask layout for chunked ops: Enforce the default xegpu 1D
+ // layout for mask.
+ if (chunkSize > 1) {
if (layoutKind == xegpu::LayoutKind::InstData)
- maskLayout = LayoutInfo(
- xegpu::LayoutAttr::get(load->getContext(), {subgroupSize}));
+ maskLayoutAttr =
+ xegpu::LayoutAttr::get(load->getContext(), {subgroupSize});
else if (layoutKind == xegpu::LayoutKind::Lane)
- maskLayout =
- getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize);
+ maskLayoutAttr =
+ xegpu::LayoutAttr::get(load->getContext(), {subgroupSize}, {1});
+ else
+ assert(false &&
+ "chunked LoadGatherOp should not be used at workgroup level");
}
+ LayoutInfo maskLayoutInfo = LayoutInfo(maskLayoutAttr);
+ auto loadLayoutInfo = LayoutInfo(requiredAnchorLayoutAttr);
+
// Propagate the new layout to the tensor descriptor operand.
if (isa<xegpu::TensorDescType>(load.getSourceType()))
- propagateIfChanged(operands[0], operands[0]->meet(loadLayout));
+ propagateIfChanged(operands[0], operands[0]->meet(loadLayoutInfo));
// Propagate the new layout to the mask and optional offset operand.
- propagateIfChanged(operands[1], operands[1]->meet(maskLayout));
+ propagateIfChanged(operands[1], operands[1]->meet(maskLayoutInfo));
if (load.getOffsets())
- propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
+ propagateIfChanged(operands[2], operands[2]->meet(maskLayoutInfo));
}
/// Propagate the layout of the descriptor to the vector offset operand in
@@ -1254,109 +1205,97 @@ void LayoutInfoPropagation::visitStoreScatterOp(
xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- LayoutInfo payloadLayout;
- LayoutInfo maskLayout;
- xegpu::DistributeLayoutAttr anchorLayout = storeScatter.getLayoutAttr();
+ xegpu::DistributeLayoutAttr requiredAnchorLayoutAttr;
+ xegpu::DistributeLayoutAttr anchorLayoutAttr = storeScatter.getLayoutAttr();
auto uArch = getUArch(getChipStr(storeScatter).value_or(""));
- const int subgroupSize = uArch->getSubgroupSize();
+ auto subgroupSize = uArch->getSubgroupSize();
+ VectorType srcVecTy = storeScatter.getValueType();
+ int chunkSize = storeScatter.getChunkSize().value_or(1);
- if (hasParamsOfLayoutKind(anchorLayout)) {
- payloadLayout = LayoutInfo(anchorLayout);
- maskLayout = payloadLayout;
+ if (hasParamsOfLayoutKind(anchorLayoutAttr)) {
+ requiredAnchorLayoutAttr = anchorLayoutAttr;
} else {
- // Currently, for 2D StoreScatterOp we expect that the height dimension of
- // the tensor descriptor is equal to the subgroup size. This is ensured by
- // the op verifier.
- VectorType payloadTy = storeScatter.getValueType();
- if (!payloadTy) {
+ if (!srcVecTy) {
storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
return;
}
-
- if (layoutKind == xegpu::LayoutKind::InstData) {
- const auto *uArchInstruction =
- dyn_cast<xegpu::uArch::StoreScatterInstructionInterface>(
- uArch->getInstruction(
- xegpu::uArch::InstructionKind::StoreScatter));
- const int subgroupSize = uArch->getSubgroupSize();
- SmallVector<int> instDataUarch{subgroupSize};
- if (payloadTy.getRank() != 1) {
- if (payloadTy.getRank() != 2) {
- storeScatter.emitWarning("Expected 2D payload for StoreScatterOp.");
- return;
- }
- int elemBitWidth = payloadTy.getElementTypeBitWidth();
- instDataUarch.push_back((
- std::min(static_cast<int>(payloadTy.getShape().back()),
- uArchInstruction->getMaxLaneLoadStoreSize(elemBitWidth))));
- }
- payloadLayout = LayoutInfo(
- xegpu::LayoutAttr::get(storeScatter.getContext(), instDataUarch));
- } else {
- auto payloadShape = payloadTy.getShape();
- if (payloadShape.size() > 1)
- assert(payloadShape[0] == subgroupSize &&
- "Expected the first dimension of 2D tensor descriptor to be "
- "equal to "
- "subgroup size.");
- payloadLayout = getSIMTLayoutInfoScatterIO(payloadTy, uArch);
- }
-
- storeScatter.setLayoutAttr(
- dyn_cast<xegpu::DistributeLayoutAttr>(payloadLayout.get()));
+ requiredAnchorLayoutAttr = xegpu::setupStoreScatterAnchorLayout(
+ layoutKind, srcVecTy, chunkSize, uArch);
+ storeScatter.setLayoutAttr(requiredAnchorLayoutAttr);
}
- // If no user-defined anchor or we deal with a chunked op, set the default
- // mask layout.
- // Rank 1 data : Keep the mask layout aligned with data.
- // Rank >1 data: Enforce the default xegpu 1D layout for mask.
- if (!hasParamsOfLayoutKind(anchorLayout) ||
- storeScatter.getValueType().getRank() > 1) {
+ LayoutInfo srcLayoutInfo = LayoutInfo(requiredAnchorLayoutAttr);
+ auto maskLayoutAttr = requiredAnchorLayoutAttr;
+ // Special handling mask layout for chunked ops: Enforce the default xegpu 1D
+ // layout for mask.
+ if (chunkSize > 1) {
if (layoutKind == xegpu::LayoutKind::InstData)
- maskLayout = LayoutInfo(
- xegpu::LayoutAttr::get(storeScatter->getContext(), {subgroupSize}));
+ maskLayoutAttr =
+ xegpu::LayoutAttr::get(storeScatter->getContext(), {subgroupSize});
else if (layoutKind == xegpu::LayoutKind::Lane)
- maskLayout =
- getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize);
+ maskLayoutAttr = xegpu::LayoutAttr::get(storeScatter->getContext(),
+ {subgroupSize}, {1});
+ else
+ assert(false &&
+ "chunked StoreScatterOp should not be used at workgroup level");
}
+ LayoutInfo maskLayoutInfo = LayoutInfo(maskLayoutAttr);
+
// Propagate the payload operand layout
- propagateIfChanged(operands[0], operands[0]->meet(payloadLayout));
+ propagateIfChanged(operands[0], operands[0]->meet(srcLayoutInfo));
// Propagate the destination (if tdesc) operand layout
if (isa<xegpu::TensorDescType>(storeScatter.getDestType()))
- propagateIfChanged(operands[1], operands[1]->meet(payloadLayout));
+ propagateIfChanged(operands[1], operands[1]->meet(srcLayoutInfo));
// Propagate the new layout to the mask and optional offset operand.
- propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
+ propagateIfChanged(operands[2], operands[2]->meet(maskLayoutInfo));
if (storeScatter.getOffsets())
- propagateIfChanged(operands[3], operands[3]->meet(maskLayout));
+ propagateIfChanged(operands[3], operands[3]->meet(maskLayoutInfo));
+}
+
+void LayoutInfoPropagation::visitLoadMatrixOp(
+ xegpu::LoadMatrixOp loadMatrixOp, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ // The layout of the result must be present, as in the other visitors.
+ LayoutInfo resLayoutInfo = results[0]->getValue();
+ if (!resLayoutInfo.isAssigned())
+ return;
+ auto consumerLayoutAttr =
+ dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+ xegpu::DistributeLayoutAttr anchorLayout = loadMatrixOp.getLayoutAttr();
+
+ // Only need to set the anchor layout; no need to propagate to memdesc and
+ // offset.
+ if (!hasParamsOfLayoutKind(anchorLayout)) {
+ VectorType resVecTy =
+ llvm::cast<VectorType>(loadMatrixOp.getRes().getType());
+ assert(resVecTy.getRank() == 2 && "Expecting 2D vector for load matrix.");
+ auto uArch = getUArch(getChipStr(loadMatrixOp).value_or(""));
+ auto requiredAnchorLayoutAttr = xegpu::setupLoadMatrixAnchorLayout(
+ layoutKind, resVecTy, consumerLayoutAttr, uArch);
+ loadMatrixOp.setLayoutAttr(requiredAnchorLayoutAttr);
+ }
}
// Store matrix is a flavor of scattered store for 2D shapes.
void LayoutInfoPropagation::visitStoreMatrixOp(
xegpu::StoreMatrixOp storeMatrix, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- Value operand = storeMatrix.getData();
- unsigned index =
- std::distance(storeMatrix.operand_begin(),
- llvm::find(storeMatrix->getOperands(), operand));
-
xegpu::DistributeLayoutAttr anchorLayout = storeMatrix.getLayoutAttr();
LayoutInfo layout;
if (hasParamsOfLayoutKind(anchorLayout)) {
layout = LayoutInfo(anchorLayout);
} else {
- VectorType payloadTy = llvm::cast<VectorType>(operand.getType());
- assert(payloadTy.getRank() == 2 && "Expecting 2D vector for store matrix.");
+ VectorType srcVecTy =
+ llvm::cast<VectorType>(storeMatrix.getData().getType());
+ assert(srcVecTy.getRank() == 2 && "Expecting 2D vector for store matrix.");
auto uArch = getUArch(getChipStr(storeMatrix).value_or(""));
- SmallVector<int> instData = {1, uArch->getSubgroupSize()};
- if (layoutKind == xegpu::LayoutKind::InstData)
- layout = LayoutInfo(
- xegpu::LayoutAttr::get(storeMatrix.getContext(), instData));
- else
- layout = getSIMTLayoutInfoScatterIO(payloadTy, uArch);
+ auto requiredAnchorLayoutAttr =
+ xegpu::setupStoreMatrixAnchorLayout(layoutKind, srcVecTy, uArch);
+ storeMatrix.setLayoutAttr(requiredAnchorLayoutAttr);
+ layout = LayoutInfo(requiredAnchorLayoutAttr);
}
- propagateIfChanged(operands[index], operands[index]->meet(layout));
+ propagateIfChanged(operands[0], operands[0]->meet(layout));
}
namespace {
@@ -1736,10 +1675,24 @@ LogicalResult xegpu::propagateLayouts(OpBuilder &builder, Operation *target,
LayoutInfo layout = analysis.getLayoutInfo(val);
if (!layout.isAssigned())
return {};
+ if (auto opResult = dyn_cast<OpResult>(val)) {
+
+ Operation *defOp = opResult.getDefiningOp();
+ if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
+ auto anchorLayout = anchorOp.getAnchorLayout();
+ if (anchorLayout != nullptr)
+ return anchorLayout;
+ }
+ xegpu::DistributeLayoutAttr requiredResLayoutAttr =
+ xegpu::getTemporaryLayout(opResult);
+ if (requiredResLayoutAttr != nullptr)
+ return requiredResLayoutAttr;
+ }
xegpu::DistributeLayoutAttr layoutAttr =
cast<xegpu::DistributeLayoutAttr>(layout.get());
if (layout.isSliceLayout())
return cast<xegpu::SliceAttr>(layoutAttr);
+
return cast<xegpu::LayoutAttr>(layoutAttr);
};
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index a3d96b45f27b6..5cd9772204590 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -14,6 +14,7 @@
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/AffineMap.h"
@@ -1532,8 +1533,9 @@ struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern {
}
// case 2: source and result have same rank
if (rankDiff == 0) {
- SetVector<int64_t> broadcastUnitDims =
- broadcastOp.computeBroadcastedUnitDims();
+ auto broadcastUnitDimsSet = broadcastOp.computeBroadcastedUnitDims();
+ SmallVector<int64_t> broadcastUnitDims(broadcastUnitDimsSet.begin(),
+ broadcastUnitDimsSet.end());
bool isEqualTo = sourceLayout.isEqualTo(resultLayout);
if (!isEqualTo)
return rewriter.notifyMatchFailure(
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 8f4e2bb0451d8..2b1bd4d73a576 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -15,6 +15,7 @@
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/DebugLog.h"
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 45a002b63abd6..f37d25108dbcb 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -19,6 +19,7 @@
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Transforms/DialectConversion.h"
#include <optional>
@@ -1113,27 +1114,10 @@ struct WgToSgVectorShapeCastOp
return failure();
ArrayRef<int64_t> srcShape = srcType.getShape();
- llvm::SetVector<int64_t> expandedUnitDims;
-
- // Check if shapes only differ by expanding unit dimensions (like
- // expand_dims)
- auto checkOnlyExpandUnitDims = [&](ArrayRef<int64_t> src,
- ArrayRef<int64_t> dst) -> bool {
- // All unit dimensions in dst that don't appear in src are the expanded
- // unit dimensions
- size_t srcIdx = 0;
- for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx)
- if (srcIdx < src.size() && src[srcIdx] == dst[dstIdx])
- srcIdx++;
- else if (dst[dstIdx] == 1)
- expandedUnitDims.insert(dstIdx);
- else
- return false;
- return srcIdx == src.size();
- };
- xegpu::DistributeLayoutAttr layoutToDistribute = layout;
- if (checkOnlyExpandUnitDims(srcShape, wgShape)) {
+ xegpu::DistributeLayoutAttr layoutToDistribute = layout;
+ SmallVector<int64_t> expandedUnitDims;
+ if (xegpu::matchUnitDimExpansion(srcShape, wgShape, expandedUnitDims)) {
xegpu::DistributeLayoutAttr sourceLayout =
xegpu::getTemporaryLayout(op->getOpOperand(0));
@@ -1488,15 +1472,8 @@ struct WgToSgMultiDimReductionOp
SmallVector<OpFoldResult> storeOffsets2D = {rowOffsetStore, colOffset};
- auto storeMatrixLayout = xegpu::SliceAttr::get(
- rewriter.getContext(),
- xegpu::LayoutAttr::get(rewriter.getContext(), /*sg_layout =*/nullptr,
- /*sg_data =*/nullptr,
- /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
- /*lane_data =*/nullptr, /*order =*/nullptr),
- dyn_cast<xegpu::SliceAttr>(layout).getDims());
xegpu::StoreMatrixOp::create(rewriter, loc, storeData, memDesc.getResult(),
- storeOffsets2D, /*layout=*/storeMatrixLayout);
+ storeOffsets2D, /*layout=*/nullptr);
gpu::BarrierOp::create(rewriter, loc);
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 7e28c756f2d72..c47fd92fe46d7 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -366,111 +366,6 @@ template void xegpu::setTemporaryLayout<mlir::OpOperand>(
const mlir::OpOperand &operand,
const mlir::xegpu::DistributeLayoutAttr layout);
-void xegpu::recoverTemporaryLayoutsDeprecated(Operation *op) {
- op->walk([&](Operation *nestOp) {
- for (OpOperand &opr : nestOp->getOpOperands()) {
- auto layout = getDistributeLayoutAttr(opr.get());
- setDistributeLayoutAttr(opr, layout);
- }
-
- for (OpResult result : nestOp->getOpResults()) {
- auto layout = getDistributeLayoutAttr(result);
- setDistributeLayoutAttr(result, layout);
- }
- });
-}
-
-/// Attach layout attributes to all vector-type operands of operations within
-/// the given operation's region. Reports an error if any vector operand lacks
-/// a layout attribute.
-bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
- auto result = rootOp->walk([&](Operation *op) {
- for (OpOperand &operand : op->getOpOperands()) {
- // Layouts are needed for vector type only.
- if (!isa<VectorType>(operand.get().getType()))
- continue;
- auto layout = xegpu::getDistributeLayoutAttr(operand.get());
- if (!layout) {
- op->emitWarning("Could not find layout attribute for operand ")
- << operand.getOperandNumber() << " of operation " << op->getName();
- continue;
- }
- xegpu::setDistributeLayoutAttr(operand, layout);
- }
- return WalkResult::advance();
- });
- return !result.wasInterrupted();
-}
-
-template <typename T, typename>
-void xegpu::removeLayoutAttr(const T &operandOrResult) {
- Operation *owner = operandOrResult.getOwner();
- std::string name = xegpu::getTemporaryLayoutName(operandOrResult);
- if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
- owner->removeAttr(name);
-}
-
-SmallVector<NamedAttribute>
-xegpu::dropSgLayoutAndDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
- SmallVector<NamedAttribute> out;
- out.reserve(attrs.size());
-
- for (auto attr : attrs) {
- if (auto dist = dyn_cast<xegpu::DistributeLayoutAttr>(attr.getValue())) {
- auto newLayout = dist.dropSgLayoutAndData();
- if (newLayout)
- out.emplace_back(attr.getName(), newLayout);
- } else {
- out.push_back(attr);
- }
- }
-
- return out;
-}
-
-SmallVector<NamedAttribute>
-xegpu::dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
- SmallVector<NamedAttribute> out;
- out.reserve(attrs.size());
-
- for (auto attr : attrs) {
- if (auto dist = dyn_cast<xegpu::DistributeLayoutAttr>(attr.getValue())) {
- auto newLayout = dist.dropInstData();
- if (newLayout)
- out.emplace_back(attr.getName(), newLayout);
- } else {
- out.push_back(attr);
- }
- }
-
- return out;
-}
-
-// Explicit instantiation for OpResult
-template void
-xegpu::removeLayoutAttr<mlir::OpResult>(const mlir::OpResult &result);
-
-// Explicit instantiation for OpOperand
-template void
-xegpu::removeLayoutAttr<mlir::OpOperand>(const mlir::OpOperand &operand);
-
-void xegpu::removeLayoutAttrs(Operation *op) {
- op->walk([&](Operation *nestOp) {
- for (OpOperand &opr : nestOp->getOpOperands())
- removeLayoutAttr(opr);
- for (OpResult result : nestOp->getOpResults())
- removeLayoutAttr(result);
- if (op->hasAttrOfType<DistributeLayoutAttr>("layout"))
- op->removeAttr("layout");
- if (op->hasAttrOfType<DistributeLayoutAttr>("layout_a"))
- op->removeAttr("layout_a");
- if (op->hasAttrOfType<DistributeLayoutAttr>("layout_b"))
- op->removeAttr("layout_b");
- if (op->hasAttrOfType<DistributeLayoutAttr>("layout_cd"))
- op->removeAttr("layout_cd");
- });
-}
-
SmallVector<Value>
xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
Value value, ArrayRef<int64_t> shape) {
@@ -786,3 +681,58 @@ bool xegpu::requireTranspose(const xegpu::LayoutAttr layout,
return false;
return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
}
+
+// Check if dst shape is an expansion of src shape by inserting unit dimensions.
+// Returns true if src's dims appear in dst in order with only unit dims added
+// around them. expandedUnitDims is reset on entry and receives the dst indices
+// of the added unit dims; on a false return it may be partially filled.
+// Example: src=[2,3], dst=[1,2,3,1] -> true, expandedUnitDims=[0,3]
+bool xegpu::matchUnitDimExpansion(ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
+                                  SmallVector<int64_t> &expandedUnitDims) {
+  // Drop stale indices so a reused vector does not accumulate across calls.
+  expandedUnitDims.clear();
+  size_t srcIdx = 0;
+  for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx)
+    if (srcIdx < src.size() && src[srcIdx] == dst[dstIdx])
+      srcIdx++;
+    else if (dst[dstIdx] == 1)
+      expandedUnitDims.push_back(dstIdx);
+    else
+      return false;
+  return srcIdx == src.size();
+}
+
+// Checks whether dst is obtained from src by splitting each src dimension into
+// one or more consecutive dst dimensions whose product equals that dimension.
+// splitDimGroups is reset on entry and receives, per src dimension, the dst
+// indices it was split into; on a false return it may be partially filled.
+// Example: src=[6,4], dst=[2,3,2,2] -> true, splitDimGroups=[[0,1],[2,3]]
+bool xegpu::matchSplitDimExpansion(
+    ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
+    SmallVector<SmallVector<int64_t>> &splitDimGroups) {
+  splitDimGroups.clear();
+
+  // Grow a group of dst dims until its product reaches the current src dim.
+  size_t srcPos = 0;
+  int64_t groupProduct = 1;
+  SmallVector<int64_t> groupDims;
+  for (size_t dstPos = 0; dstPos < dst.size(); ++dstPos) {
+    // dst dims left over once every src dim is covered: not a split.
+    if (srcPos >= src.size())
+      return false;
+    groupProduct *= dst[dstPos];
+    groupDims.push_back(dstPos);
+
+    // Overshooting the src dim means no grouping can match it.
+    if (groupProduct > src[srcPos])
+      return false;
+    if (groupProduct < src[srcPos])
+      continue;
+    // Exact match: record srcPos -> groupDims and start the next group.
+    splitDimGroups.push_back(groupDims);
+    groupDims.clear();
+    groupProduct = 1;
+    ++srcPos;
+  }
+  return srcPos == src.size();
+}
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 9de2881d05d0b..b6c172ecf4ae0 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -217,7 +217,7 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<1024xf32>) {
// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<true> : vector<16xi1>
// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<12> : vector<16xindex>
-// CHECK: %[[LOADED:.*]] = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{layout = #xegpu.slice<#xegpu.layout<inst_data = [16, 16]>, dims = [0]>}> :
+// CHECK: %[[LOADED:.*]] = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{layout = #xegpu.layout<inst_data = [16]>}> :
// CHECK-SAME: memref<1024xf32>, vector<16xindex>, vector<16xi1> -> vector<16xf32>
// CHECK: %[[BCAST:.*]] = vector.broadcast %[[LOADED]] {layout_result_0 = #xegpu.layout<inst_data = [16, 16]>} : vector<16xf32> to vector<16x16xf32>
// CHECK: xegpu.store %[[BCAST]], %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 16 : i64, layout = #xegpu.layout<inst_data = [16, 16]>}> :
@@ -234,3 +234,89 @@ func.func @scatter_ops_chunksize_slice(%src: memref<1024xf32>) {
return
}
}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @insert_strided_slice_inst_data_no_packing(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x32xf32>) {
+// CHECK: %[[CST_SMALL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 16]>} dense<1.000000e+00> : vector<4x16xf32>
+// CHECK: %[[CST_LARGE:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 16]>} dense<0.000000e+00> : vector<8x32xf32>
+// CHECK: %[[INSERT:.*]] = vector.insert_strided_slice %[[CST_SMALL]], %[[CST_LARGE]] {layout_result_0 = #xegpu.layout<inst_data = [1, 16]>, offsets = [0, 0], strides = [1, 1]} : vector<4x16xf32> into vector<8x32xf32>
+// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: xegpu.store_nd %[[INSERT]], %[[TDESC]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+func.func @insert_strided_slice_inst_data_no_packing(%arg0: memref<8x32xf32>) {
+ %c0 = arith.constant 0 : index
+ %cst_small = arith.constant dense<1.0> : vector<4x16xf32>
+ %cst_large = arith.constant dense<0.0> : vector<8x32xf32>
+ %insert = vector.insert_strided_slice %cst_small, %cst_large {offsets = [0, 0], strides = [1, 1]} : vector<4x16xf32> into vector<8x32xf32>
+ %tdesc = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32>
+ xegpu.store_nd %insert, %tdesc : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32>
+ return
+}
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @insert_strided_slice_inst_data_with_packing(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x64xi8>) {
+// CHECK: %[[CST_SMALL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 64]>} dense<1> : vector<4x64xi8>
+// CHECK: %[[CST_LARGE:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 64]>} dense<0> : vector<8x64xi8>
+// CHECK: %[[INSERT:.*]] = vector.insert_strided_slice %[[CST_SMALL]], %[[CST_LARGE]] {layout_result_0 = #xegpu.layout<inst_data = [1, 64]>, offsets = [0, 0], strides = [1, 1]} : vector<4x64xi8> into vector<8x64xi8>
+func.func @insert_strided_slice_inst_data_with_packing(%arg0: memref<8x64xi8>) {
+ %c0 = arith.constant 0 : index
+ %cst_small = arith.constant dense<1> : vector<4x64xi8>
+ %cst_large = arith.constant dense<0> : vector<8x64xi8>
+ %insert = vector.insert_strided_slice %cst_small, %cst_large {offsets = [0, 0], strides = [1, 1]} : vector<4x64xi8> into vector<8x64xi8>
+ %tdesc = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x64xi8> -> !xegpu.tensor_desc<8x64xi8, #xegpu.layout<inst_data = [8, 64]>>
+ xegpu.store_nd %insert, %tdesc <{layout = #xegpu.layout<inst_data = [8, 64]>}>: vector<8x64xi8>, !xegpu.tensor_desc<8x64xi8, #xegpu.layout<inst_data = [8, 64]>>
+ return
+}
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_shape_cast_expand_non_unit_dims(
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[STEP:.*]]], %[[CST:.*]] <{layout = #xegpu.layout<inst_data = [16]>}> : memref<1024xf16>, vector<1024xindex>, vector<1024xi1> -> vector<1024xf16>
+// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} : vector<1024xf16> to vector<8x8x16xf16>
+// CHECK: %[[CST_0:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [1, 1, 16]>, dims = [0]>} dense<0.000000e+00> : vector<8x16xf16>
+// CHECK: %[[CST_1:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [1, 16]>, dims = [0]>} dense<0.000000e+00> : vector<16xf16>
+// CHECK: %[[REDUCE_0:.*]] = vector.multi_reduction <add>, %[[CAST]], %[[CST_0]] {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [1, 1, 16]>, dims = [0]>} [0] : vector<8x8x16xf16> to vector<8x16xf16>
+// CHECK: %[[REDUCE_1:.*]] = vector.multi_reduction <add>, %[[REDUCE_0]], %[[CST_1]] {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [1, 16]>, dims = [0]>} [0] : vector<8x16xf16> to vector<16xf16>
+func.func @vector_shape_cast_expand_non_unit_dims(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+ %cst = arith.constant dense<true> : vector<1024xi1>
+ %0 = vector.step : vector<1024xindex>
+ %1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<1024xindex>, vector<1024xi1> -> vector<1024xf16>
+ %2 = vector.shape_cast %1 : vector<1024xf16> to vector<8x8x16xf16>
+ %cst_0 = arith.constant dense<0.000000e+00> : vector<8x16xf16>
+ %cst_1 = arith.constant dense<0.000000e+00> : vector<16xf16>
+ %3 = vector.multi_reduction <add>, %2, %cst_0 [0] : vector<8x8x16xf16> to vector<8x16xf16>
+ %4 = vector.multi_reduction <add>, %3, %cst_1 [0] : vector<8x16xf16> to vector<16xf16>
+ %cst_2 = arith.constant dense<true> : vector<16xi1>
+ %cst_3 = arith.constant dense<1> : vector<16xindex>
+ xegpu.store %4, %arg1[%cst_3], %cst_2 <{layout = #xegpu.layout<inst_data = [16]>}> : vector<16xf16>, memref<16xf16>, vector<16xindex>, vector<16xi1>
+ return
+ }
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_shape_cast_expand_and_merge(
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [32]>} dense<true> : vector<256xi1>
+// CHECK: %[[STEP:.*]] = vector.step {layout_result_0 = #xegpu.layout<inst_data = [32]>} : vector<256xindex>
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[STEP]]], %[[CST]] <{layout = #xegpu.layout<inst_data = [32]>}> : memref<256xf16>, vector<256xindex>, vector<256xi1> -> vector<256xf16>
+// CHECK: %[[CAST_0:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 32]>} : vector<256xf16> to vector<2x4x32xf16>
+// CHECK: %[[CAST_1:.*]] = vector.shape_cast %[[CAST_0]] {layout_result_0 = #xegpu.layout<inst_data = [1, 32]>} : vector<2x4x32xf16> to vector<1x256xf16>
+// CHECK: %[[CAST_2:.*]] = vector.shape_cast %[[CAST_1]] {layout_result_0 = #xegpu.layout<inst_data = [32]>} : vector<1x256xf16> to vector<256xf16>
+// CHECK: xegpu.store %[[CAST_2]], %arg1[%[[STEP]]], %[[CST]] <{layout = #xegpu.layout<inst_data = [32]>}> : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1>
+func.func @vector_shape_cast_expand_and_merge(%arg0: memref<256xf16>, %arg1: memref<256xf16>) {
+ %cst = arith.constant dense<true> : vector<256xi1>
+ %0 = vector.step : vector<256xindex>
+ %1 = xegpu.load %arg0[%0], %cst : memref<256xf16>, vector<256xindex>, vector<256xi1> -> vector<256xf16>
+ %2 = vector.shape_cast %1 : vector<256xf16> to vector<2x4x32xf16>
+
+ %4 = vector.shape_cast %2 : vector<2x4x32xf16> to vector<1x256xf16>
+ %5 = vector.shape_cast %4 : vector<1x256xf16> to vector<256xf16>
+ xegpu.store %5, %arg1[%0], %cst <{layout = #xegpu.layout<inst_data = [32] >}> : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1>
+ return
+ }
+}
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
index 29e5b51627fb6..190b54912488f 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
@@ -123,3 +123,44 @@ gpu.module @test {
gpu.return
}
}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: vector_row_reduction
+// CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [1]>}
+ gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes
+ {known_block_size = array<i32: 1, 32, 1>} {
+ %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
+ %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
+ %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
+ %reduce = vector.multi_reduction <add>, %load, %cst [1] : vector<32x64xf32> to vector<32xf32>
+ %tdesc_dst = xegpu.create_nd_tdesc %dst : memref<32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<sg_layout = [32], sg_data = [1]>>
+ xegpu.store_nd %reduce, %tdesc_dst <{layout = #xegpu.layout<sg_layout = [32], sg_data = [1]>}>
+ : vector<32xf32>, !xegpu.tensor_desc<32xf32, #xegpu.layout<sg_layout = [32], sg_data = [1]>>
+ gpu.return
+ }
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: vector_nest_reduction
+ gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes
+ {known_block_size = array<i32: 1, 32, 1>} {
+ %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
+ %cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
+ %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
+ %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x128xf32> -> vector<32x128xf32>
+ %bcast1 = vector.broadcast %load: vector<32x128xf32> to vector<4x32x128xf32>
+
+ // CHECK: %[[BCAST1:.*]] = vector.broadcast %{{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>} : vector<32x128xf32> to vector<4x32x128xf32>
+ // CHECK: %[[BCAST:.*]] = vector.multi_reduction <add>, %[[BCAST1]], %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>, dims = [0]>} [0] : vector<4x32x128xf32> to vector<32x128xf32>
+ // CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[BCAST]], %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [8, 16]>, dims = [1]>} [1] : vector<32x128xf32> to vector<32xf32>
+
+ %bcast = vector.multi_reduction <add>, %bcast1, %cst1 [0]: vector<4x32x128xf32> to vector<32x128xf32>
+ %reduce = vector.multi_reduction <add>, %bcast, %cst [1] : vector<32x128xf32> to vector<32xf32>
+ %mask = arith.constant dense<1>: vector<32xi1>
+ %offset = vector.step : vector<32xindex>
+ xegpu.store %reduce, %dst[%offset], %mask {layout = #xegpu.slice<#xegpu.layout<sg_layout=[4, 8], sg_data=[8, 16]>, dims = [1]>} : vector<32xf32>, memref<32xf32>, vector<32xindex>, vector<32xi1>
+ gpu.return
+ }
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index f4859fe324b19..17c9ec131ed70 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -104,21 +104,18 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor
gpu.module @test {
// CHECK-LABEL: func.func @load_gather_with_chunksize(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+// CHECK: %[[OFFSET:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
// CHECK-SAME: dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
-// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> ->
-// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
-// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>, vector<16xi1> -> vector<16x16xf16>
+// CHECK-NEXT: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+// CHECK-NEXT: %{{.*}} = xegpu.load %arg1[%[[OFFSET]]], %[[MASK]] <{chunk_size = 16 : i64, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x16xf16>
func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
%1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
- %cst_0 = arith.constant dense<true> : vector<16xi1>
- %2 = xegpu.create_tdesc %arg1, %cst : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>
- %3 = xegpu.load %2, %cst_0 : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>, vector<16xi1> -> vector<16x16xf16>
+ %offset = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
+ %mask = arith.constant dense<true> : vector<16xi1>
+ %3 = xegpu.load %arg1[%offset], %mask <{chunk_size=16}>
+ : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x16xf16>
%4 = vector.transpose %3, [1, 0] : vector<16x16xf16> to vector<16x16xf16>
%5 = xegpu.dpas %1, %4 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
%6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
@@ -151,16 +148,15 @@ func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf
gpu.module @test {
// CHECK-LABEL: func.func @store_scatter_with_chunksize(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) {
-// CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> ->
-// CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.store %{{.*}}, %[[T0]], %{{.*}} : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>,
-// CHECK-SAME: #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, vector<16xi1>
+// CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>} dense<1.000000e+00> : vector<16x8xf32>
+// CHECK-NEXT: %[[CST_0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+// CHECK-NEXT: %[[CST_1:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
+// CHECK-NEXT: xegpu.store %[[CST]], %[[ARG0]][%[[CST_1]]], %[[CST_0]] <{chunk_size = 8 : i64, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>}> : vector<16x8xf32>, memref<128xf32>, vector<16xindex>, vector<16xi1>
func.func @store_scatter_with_chunksize(%arg0: memref<128xf32>) {
- %cst = arith.constant dense<1.000000e+00> : vector<16x8xf32>
- %cst_0 = arith.constant dense<true> : vector<16xi1>
- %cst_1 = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
- %0 = xegpu.create_tdesc %arg0, %cst_1 : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
- xegpu.store %cst, %0, %cst_0 : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1>
+ %val = arith.constant dense<1.000000e+00> : vector<16x8xf32>
+ %mask = arith.constant dense<true> : vector<16xi1>
+ %offset = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
+ xegpu.store %val, %arg0[%offset], %mask <{chunk_size = 8}>: vector<16x8xf32>, memref<128xf32>, vector<16xindex>, vector<16xi1>
return
}
}
@@ -184,9 +180,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
+// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>}>
// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
@@ -320,8 +316,9 @@ func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_require_cross_lane_shuffle(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32>
-// CHECK: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
+// CHECK-SAME: !xegpu.tensor_desc<8x16xi32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
// CHECK-SAME: vector<8x16xi32> to vector<8x32xi16>
func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %arg1: memref<8x32xi16>) {
%c0 = arith.constant 0 : index
@@ -483,7 +480,7 @@ func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t
gpu.module @test {
// CHECK-LABEL: func.func @vector_outer_reduction(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
-// CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [0] : vector<16x16xf32> to vector<16xf32>
+// CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf32> to vector<16xf32>
func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<0.000000e+00> : vector<16xf32>
%0 = vector.multi_reduction <add>, %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32>
@@ -495,7 +492,7 @@ func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor
gpu.module @test {
// CHECK-LABEL: func.func @vector_inner_reduction(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
-// CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [1] : vector<16x16xf32> to vector<16xf32>
+// CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1] : vector<16x16xf32> to vector<16xf32>
func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<0.000000e+00> : vector<16xf32>
%0 = vector.multi_reduction <add>, %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32>
@@ -642,6 +639,52 @@ func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc
}
// -----
gpu.module @test {
+// CHECK-LABEL: func.func @vector_shape_cast_expand_non_unit_dims(
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[STEP:.*]]], %[[CST:.*]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : memref<1024xf16>, vector<1024xindex>, vector<1024xi1> -> vector<1024xf16>
+// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>} : vector<1024xf16> to vector<8x8x16xf16>
+// CHECK: %[[CST_0:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [0]>} dense<0.000000e+00> : vector<8x16xf16>
+// CHECK: %[[CST_1:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} dense<0.000000e+00> : vector<16xf16>
+// CHECK: %[[REDUCE_0:.*]] = vector.multi_reduction <add>, %[[CAST]], %[[CST_0]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [0]>} [0] : vector<8x8x16xf16> to vector<8x16xf16>
+// CHECK: %[[REDUCE_1:.*]] = vector.multi_reduction <add>, %[[REDUCE_0]], %[[CST_1]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<8x16xf16> to vector<16xf16>
+func.func @vector_shape_cast_expand_non_unit_dims(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+ %cst = arith.constant dense<true> : vector<1024xi1>
+ %0 = vector.step : vector<1024xindex>
+ %1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<1024xindex>, vector<1024xi1> -> vector<1024xf16>
+ %2 = vector.shape_cast %1 : vector<1024xf16> to vector<8x8x16xf16>
+ %cst_0 = arith.constant dense<0.000000e+00> : vector<8x16xf16>
+ %cst_1 = arith.constant dense<0.000000e+00> : vector<16xf16>
+ %3 = vector.multi_reduction <add>, %2, %cst_0 [0] : vector<8x8x16xf16> to vector<8x16xf16>
+ %4 = vector.multi_reduction <add>, %3, %cst_1 [0] : vector<8x16xf16> to vector<16xf16>
+ %cst_2 = arith.constant dense<true> : vector<16xi1>
+ %cst_3 = arith.constant dense<1> : vector<16xindex>
+ xegpu.store %4, %arg1[%cst_3], %cst_2 <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1] >}> : vector<16xf16>, memref<16xf16>, vector<16xindex>, vector<16xi1>
+ return
+ }
+}
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_shape_cast_expand_and_merge(
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [2]>} dense<true> : vector<256xi1>
+// CHECK: %[[STEP:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [2]>} : vector<256xindex>
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[STEP]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [2]>}> : memref<256xf16>, vector<256xindex>, vector<256xi1> -> vector<256xf16>
+// CHECK: %[[CAST_0:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 2]>} : vector<256xf16> to vector<2x4x32xf16>
+// CHECK: %[[CAST_1:.*]] = vector.shape_cast %[[CAST_0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} : vector<2x4x32xf16> to vector<1x256xf16>
+// CHECK: %[[CAST_2:.*]] = vector.shape_cast %[[CAST_1]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [2]>} : vector<1x256xf16> to vector<256xf16>
+// CHECK: xegpu.store %[[CAST_2]], %arg1[%[[STEP]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [2]>}> : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1>
+func.func @vector_shape_cast_expand_and_merge(%arg0: memref<256xf16>, %arg1: memref<256xf16>) {
+ %cst = arith.constant dense<true> : vector<256xi1>
+ %0 = vector.step : vector<256xindex>
+ %1 = xegpu.load %arg0[%0], %cst : memref<256xf16>, vector<256xindex>, vector<256xi1> -> vector<256xf16>
+ %2 = vector.shape_cast %1 : vector<256xf16> to vector<2x4x32xf16>
+
+ %4 = vector.shape_cast %2 : vector<2x4x32xf16> to vector<1x256xf16>
+ %5 = vector.shape_cast %4 : vector<1x256xf16> to vector<256xf16>
+ xegpu.store %5, %arg1[%0], %cst <{layout = #xegpu.layout<lane_layout = [16], lane_data = [2] >}> : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1>
+ return
+ }
+}
+// -----
+gpu.module @test {
// CHECK-LABEL: func.func @vector_broadcast_1d_to_2d_broadcast_along_row(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
@@ -702,12 +745,50 @@ func.func @vector_broadcast_scalar_to_vector(%arg0: !xegpu.tensor_desc<16x16xf16
// -----
gpu.module @test {
// CHECK-LABEL: func.func @store_matrix(
-// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} dense<0.000000e+00> : vector<16x16xf16>
-// CHECK-NEXT: xegpu.store_matrix %[[CST]], %arg0[8, 8] : vector<16x16xf16>, !xegpu.mem_desc<16x64xf16>
-
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<16x16xf16>
+// CHECK-NEXT: xegpu.store_matrix %[[CST]], %arg0[8, 8] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
func.func @store_matrix(%arg0: !xegpu.mem_desc<16x64xf16>) {
%cst = arith.constant dense<0.0000> : vector<16x16xf16>
xegpu.store_matrix %cst, %arg0[8, 8]: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16>
return
}
}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @insert_strided_slice_lane_layout_no_packing(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x64xf32>) {
+// CHECK: %[[CST_SMALL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.000000e+00> : vector<2x32xf32>
+// CHECK: %[[CST_LARGE:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<4x64xf32>
+// CHECK: %[[INSERT:.*]] = vector.insert_strided_slice %[[CST_SMALL]], %[[CST_LARGE]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, offsets = [0, 0], strides = [1, 1]} : vector<2x32xf32> into vector<4x64xf32>
+// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<4x64xf32> -> !xegpu.tensor_desc<4x64xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: xegpu.store_nd %[[INSERT]], %[[TDESC]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<4x64xf32>, !xegpu.tensor_desc<4x64xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+func.func @insert_strided_slice_lane_layout_no_packing(%arg0: memref<4x64xf32>) {
+ %c0 = arith.constant 0 : index
+ %cst_small = arith.constant dense<1.0> : vector<2x32xf32>
+ %cst_large = arith.constant dense<0.0> : vector<4x64xf32>
+ %insert = vector.insert_strided_slice %cst_small, %cst_large {offsets = [0, 0], strides = [1, 1]} : vector<2x32xf32> into vector<4x64xf32>
+ %tdesc = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<4x64xf32> -> !xegpu.tensor_desc<4x64xf32>
+ xegpu.store_nd %insert, %tdesc : vector<4x64xf32>, !xegpu.tensor_desc<4x64xf32>
+ return
+}
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @insert_strided_slice_lane_layout_with_packing(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x64xf16>) {
+// CHECK: %[[CST_SMALL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} dense<1.000000e+00> : vector<2x32xf16>
+// CHECK: %[[CST_LARGE:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} dense<0.000000e+00> : vector<4x64xf16>
+// CHECK: %[[INSERT:.*]] = vector.insert_strided_slice %[[CST_SMALL]], %[[CST_LARGE]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, offsets = [0, 0], strides = [1, 1]} : vector<2x32xf16> into vector<4x64xf16>
+func.func @insert_strided_slice_lane_layout_with_packing(%arg0: memref<4x64xf16>) {
+ %c0 = arith.constant 0 : index
+ %cst_small = arith.constant dense<1.0> : vector<2x32xf16>
+ %cst_large = arith.constant dense<0.0> : vector<4x64xf16>
+ %insert = vector.insert_strided_slice %cst_small, %cst_large {offsets = [0, 0], strides = [1, 1]} : vector<2x32xf16> into vector<4x64xf16>
+ %tdesc = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<4x64xf16> -> !xegpu.tensor_desc<4x64xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>>
+ xegpu.store_nd %insert, %tdesc <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}>: vector<4x64xf16>, !xegpu.tensor_desc<4x64xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>>
+ return
+}
+}
+
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 1fc2328d09046..9cb96775b4ee4 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -674,7 +674,7 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[MUL3:.*]] = arith.muli %[[AFFINE3]], %[[C1:.*]] : index
// CHECK-DAG: %[[ADD2:.*]] = arith.addi %[[ADD1]], %[[MUL3]] : index
// CHECK-DAG: %[[COL_OFFSET:.*]] = arith.muli %[[ADD2]], %[[C32:.*]] : index
- // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]][%[[ROW_OFFSET]], %[[COL_OFFSET]]] <{layout = #xegpu.slice<#xegpu.layout<>, dims = [1]>}>: vector<1x32xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+ // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]][%[[ROW_OFFSET]], %[[COL_OFFSET]]] : vector<1x32xf32>, !xegpu.mem_desc<32x32xf32>, index, index
// CHECK-DAG: gpu.barrier
// CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]][%[[C0:.*]], %[[COL_OFFSET]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<32x32xf32>
// CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32>
@@ -717,7 +717,7 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[MUL4:.*]] = arith.muli {{.*}}, %[[C1:.*]] : index
// CHECK-DAG: %[[ADD1:.*]] = arith.addi %[[C0:.*]], %[[MUL4]] : index
// CHECK-DAG: %[[COL_OFFSET:.*]] = arith.muli %[[ADD1]], %[[C32:.*]] : index
- // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]][%[[ROW_OFFSET]], %[[COL_OFFSET]]] <{layout = #xegpu.slice<#xegpu.layout<>, dims = [0]>}>: vector<1x32xf32>, !xegpu.mem_desc<8x128xf32>, index, index
+ // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]][%[[ROW_OFFSET]], %[[COL_OFFSET]]] : vector<1x32xf32>, !xegpu.mem_desc<8x128xf32>, index, index
// CHECK-DAG: gpu.barrier
// CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]][%[[C0:.*]], %[[COL_OFFSET]]] : !xegpu.mem_desc<8x128xf32>, index, index -> vector<8x32xf32>
// CHECK-DAG: %[[CST_CROSS_SG_1:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32>
@@ -766,7 +766,7 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[MUL4:.*]] = arith.muli {{.*}}, %[[C2:.*]] : index
// CHECK-DAG: %[[ADD3:.*]] = arith.addi %[[ADD2]], %[[MUL4]] : index
// CHECK-DAG: %[[COL_OFFSET:.*]] = arith.muli %[[ADD3]], %[[C1:.*]] : index
- // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]][%[[ROW_OFFSET]], %[[COL_OFFSET]]] <{layout = #xegpu.slice<#xegpu.layout<>, dims = [2, 3]>}>: vector<1x1xf32>, !xegpu.mem_desc<16x4xf32>, index, index
+ // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]][%[[ROW_OFFSET]], %[[COL_OFFSET]]] : vector<1x1xf32>, !xegpu.mem_desc<16x4xf32>, index, index
// CHECK-DAG: gpu.barrier
// CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]][%[[C0:.*]], %[[COL_OFFSET]]] : !xegpu.mem_desc<16x4xf32>, index, index -> vector<16x1xf32>
// CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
@@ -810,7 +810,7 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[MUL4:.*]] = arith.muli {{.*}}, %[[C2:.*]] : index
// CHECK-DAG: %[[ADD3:.*]] = arith.addi %[[ADD2]], %[[MUL4]] : index
// CHECK-DAG: %[[COL_OFFSET:.*]] = arith.muli %[[ADD3]], %[[C256:.*]] : index
- // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]][%[[ROW_OFFSET]], %[[COL_OFFSET]]] <{layout = #xegpu.slice<#xegpu.layout<>, dims = [2, 3]>}>: vector<1x256xf32>, !xegpu.mem_desc<16x1024xf32>, index, index
+ // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]][%[[ROW_OFFSET]], %[[COL_OFFSET]]] : vector<1x256xf32>, !xegpu.mem_desc<16x1024xf32>, index, index
// CHECK-DAG: gpu.barrier
// CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]][%[[C0:.*]], %[[COL_OFFSET]]] : !xegpu.mem_desc<16x1024xf32>, index, index -> vector<16x256xf32>
// CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<0.000000e+00> : vector<256xf32>
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 405e974500e08..20bcb24a301e6 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -14,6 +14,7 @@
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Value.h"
>From c20be65fe84b3e348014cd96e36ddccfae5fe0e6 Mon Sep 17 00:00:00 2001
From: Rainer Orth <ro at gcc.gnu.org>
Date: Fri, 6 Feb 2026 05:07:17 +0100
Subject: [PATCH 09/33] [ubsan][test] Skip Misc/Posix/static-link.cpp on
Solaris (#175464)
The `UBSan-Standalone-x86_64 :: TestCases/Misc/Posix/static-link.cpp`
test currently `FAIL`s on Solaris/x86_64 with
```
ld: fatal: option '-z record' is incompatible with building a static executable
```
One cannot create static executables on Solaris since no `libc.a` is
delivered, so this patch skips the test.
Tested on `x86_64-pc-solaris2.11`.
---
compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp b/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp
index 081eec049e3fc..daa76b7322669 100644
--- a/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp
+++ b/compiler-rt/test/ubsan/TestCases/Misc/Posix/static-link.cpp
@@ -6,7 +6,7 @@
// UNSUPPORTED: i386-target-arch, internal_symbolizer
// Does not link.
-// UNSUPPORTED: darwin
+// UNSUPPORTED: darwin,target={{.*solaris.*}}
#include <signal.h>
#include <stdio.h>
>From 2e46ed8bea63fbb63b96c5295be9b34ab23b2cfa Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Thu, 5 Feb 2026 20:09:49 -0800
Subject: [PATCH 10/33] [msan][NFCI] Add NEON vector compare tests (#177261)
Forked from llvm/test/CodeGen/AArch64/arm64-vcmp.ll
---
.../MemorySanitizer/AArch64/arm64-vcmp.ll | 795 ++++++++++++++++++
1 file changed, 795 insertions(+)
create mode 100644 llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcmp.ll
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcmp.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcmp.ll
new file mode 100644
index 0000000000000..d8326071ef411
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcmp.ll
@@ -0,0 +1,795 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=msan -S | FileCheck %s
+;
+; Handled strictly (suboptimal):
+; - llvm.aarch64.neon.facge
+; - llvm.aarch64.neon.facgt
+;
+; Handled heuristically: (none)
+;
+; Forked from llvm/test/CodeGen/AArch64/arm64-vcmp.ll
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+define void @fcmltz_4s(<4 x float> %a, ptr %p) nounwind sanitize_memory {
+; CHECK-LABEL: define void @fcmltz_4s(
+; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[_MSPROP]] to <4 x i1>
+; CHECK-NEXT: [[TEMP:%.*]] = fcmp olt <4 x float> [[A]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
+; CHECK-NEXT: [[TEMP2:%.*]] = sext <4 x i1> [[TEMP]] to <4 x i16>
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB5]]:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store <4 x i16> [[_MSPROP1]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <4 x i16> [[TEMP2]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %temp = fcmp olt <4 x float> %a, zeroinitializer
+ %temp2 = sext <4 x i1> %temp to <4 x i16>
+ store <4 x i16> %temp2, ptr %p, align 8
+ ret void
+}
+
+define <2 x i32> @facge_2s(<2 x float> %A, <2 x float> %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <2 x i32> @facge_2s(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD]] to i64
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP13]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB15:.*]], label %[[BB16:.*]], !prof [[PROF1]]
+; CHECK: [[BB15]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB16]]:
+; CHECK-NEXT: [[TEMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i32> [[TEMP3]]
+;
+ %temp3 = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %A, <2 x float> %B)
+ ret <2 x i32> %temp3
+}
+
+define <4 x i32> @facge_4s(<4 x float> %A, <4 x float> %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <4 x i32> @facge_4s(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB15:.*]], label %[[BB16:.*]], !prof [[PROF1]]
+; CHECK: [[BB15]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB16]]:
+; CHECK-NEXT: [[TEMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TEMP3]]
+;
+ %temp3 = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %A, <4 x float> %B)
+ ret <4 x i32> %temp3
+}
+
+define <2 x i64> @facge_2d(<2 x double> %A, <2 x double> %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @facge_2d(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB15:.*]], label %[[BB16:.*]], !prof [[PROF1]]
+; CHECK: [[BB15]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB16]]:
+; CHECK-NEXT: [[TEMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[A]], <2 x double> [[B]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TEMP3]]
+;
+ %temp3 = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %A, <2 x double> %B)
+ ret <2 x i64> %temp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x i32> @facgt_2s(<2 x float> %A, <2 x float> %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <2 x i32> @facgt_2s(
+; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD]] to i64
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP13]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB15:.*]], label %[[BB16:.*]], !prof [[PROF1]]
+; CHECK: [[BB15]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB16]]:
+; CHECK-NEXT: [[TEMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i32> [[TEMP3]]
+;
+ %temp3 = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %A, <2 x float> %B)
+ ret <2 x i32> %temp3
+}
+
+define <4 x i32> @facgt_4s(<4 x float> %A, <4 x float> %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <4 x i32> @facgt_4s(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB15:.*]], label %[[BB16:.*]], !prof [[PROF1]]
+; CHECK: [[BB15]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB16]]:
+; CHECK-NEXT: [[TEMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[TEMP3]]
+;
+ %temp3 = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %A, <4 x float> %B)
+ ret <4 x i32> %temp3
+}
+
+define <2 x i64> @facgt_2d(<2 x double> %A, <2 x double> %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @facgt_2d(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB15:.*]], label %[[BB16:.*]], !prof [[PROF1]]
+; CHECK: [[BB15]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB16]]:
+; CHECK-NEXT: [[TEMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[A]], <2 x double> [[B]])
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[TEMP3]]
+;
+ %temp3 = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %A, <2 x double> %B)
+ ret <2 x i64> %temp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @facge_s(float %A, float %B) nounwind sanitize_memory {
+; CHECK-LABEL: define i32 @facge_s(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[MASK:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float [[A]], float [[B]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[MASK]]
+;
+ %mask = call i32 @llvm.aarch64.neon.facge.i32.f32(float %A, float %B)
+ ret i32 %mask
+}
+
+define i64 @facge_d(double %A, double %B) nounwind sanitize_memory {
+; CHECK-LABEL: define i64 @facge_d(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double [[A]], double [[B]])
+; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i64 [[MASK]]
+;
+ %mask = call i64 @llvm.aarch64.neon.facge.i64.f64(double %A, double %B)
+ ret i64 %mask
+}
+
+declare i64 @llvm.aarch64.neon.facge.i64.f64(double, double)
+declare i32 @llvm.aarch64.neon.facge.i32.f32(float, float)
+
+define i32 @facgt_s(float %A, float %B) nounwind sanitize_memory {
+; CHECK-LABEL: define i32 @facgt_s(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[MASK:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float [[A]], float [[B]])
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i32 [[MASK]]
+;
+ %mask = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %A, float %B)
+ ret i32 %mask
+}
+
+define i64 @facgt_d(double %A, double %B) nounwind sanitize_memory {
+; CHECK-LABEL: define i64 @facgt_d(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[MASK:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double [[A]], double [[B]])
+; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i64 [[MASK]]
+;
+ %mask = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %A, double %B)
+ ret i64 %mask
+}
+
+declare i64 @llvm.aarch64.neon.facgt.i64.f64(double, double)
+declare i32 @llvm.aarch64.neon.facgt.i32.f32(float, float)
+
+define <8 x i8> @cmtst_8b(ptr %A, ptr %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <8 x i8> @cmtst_8b(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[TEMP1:%.*]] = load <8 x i8>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[TEMP2:%.*]] = load <8 x i8>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i8> [[_MSLD]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i8> [[TEMP1]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i8> [[_MSLD]], [[TEMP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i8> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i8> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[COMMONBITS:%.*]] = and <8 x i8> [[TEMP1]], [[TEMP2]]
+; CHECK-NEXT: [[TMP18:%.*]] = xor <8 x i8> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i8> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i8> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = xor <8 x i8> [[TMP19]], splat (i8 -1)
+; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i8> [[TMP21]], [[TMP18]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq <8 x i8> [[TMP22]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[MASK:%.*]] = icmp ne <8 x i8> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP:%.*]] = sext <8 x i1> [[_MSPROP_ICMP]] to <8 x i8>
+; CHECK-NEXT: [[RES:%.*]] = sext <8 x i1> [[MASK]] to <8 x i8>
+; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i8> [[RES]]
+;
+ %temp1 = load <8 x i8>, ptr %A
+ %temp2 = load <8 x i8>, ptr %B
+ %commonbits = and <8 x i8> %temp1, %temp2
+ %mask = icmp ne <8 x i8> %commonbits, zeroinitializer
+ %res = sext <8 x i1> %mask to <8 x i8>
+ ret <8 x i8> %res
+}
+
+define <16 x i8> @cmtst_16b(ptr %A, ptr %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <16 x i8> @cmtst_16b(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[TEMP1:%.*]] = load <16 x i8>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 16
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[TEMP2:%.*]] = load <16 x i8>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16
+; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i8> [[_MSLD]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i8> [[TEMP1]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i8> [[_MSLD]], [[TEMP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i8> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i8> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[COMMONBITS:%.*]] = and <16 x i8> [[TEMP1]], [[TEMP2]]
+; CHECK-NEXT: [[TMP18:%.*]] = xor <16 x i8> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i8> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i8> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = xor <16 x i8> [[TMP19]], splat (i8 -1)
+; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i8> [[TMP21]], [[TMP18]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq <16 x i8> [[TMP22]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[MASK:%.*]] = icmp ne <16 x i8> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP:%.*]] = sext <16 x i1> [[_MSPROP_ICMP]] to <16 x i8>
+; CHECK-NEXT: [[RES:%.*]] = sext <16 x i1> [[MASK]] to <16 x i8>
+; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES]]
+;
+ %temp1 = load <16 x i8>, ptr %A
+ %temp2 = load <16 x i8>, ptr %B
+ %commonbits = and <16 x i8> %temp1, %temp2
+ %mask = icmp ne <16 x i8> %commonbits, zeroinitializer
+ %res = sext <16 x i1> %mask to <16 x i8>
+ ret <16 x i8> %res
+}
+
+define <4 x i16> @cmtst_4h(ptr %A, ptr %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <4 x i16> @cmtst_4h(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[TEMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[TEMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[TMP13:%.*]] = and <4 x i16> [[_MSLD]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <4 x i16> [[TEMP1]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <4 x i16> [[_MSLD]], [[TEMP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i16> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i16> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[COMMONBITS:%.*]] = and <4 x i16> [[TEMP1]], [[TEMP2]]
+; CHECK-NEXT: [[TMP18:%.*]] = xor <4 x i16> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = or <4 x i16> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <4 x i16> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = xor <4 x i16> [[TMP19]], splat (i16 -1)
+; CHECK-NEXT: [[TMP22:%.*]] = and <4 x i16> [[TMP21]], [[TMP18]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq <4 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <4 x i1> [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[MASK:%.*]] = icmp ne <4 x i16> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP:%.*]] = sext <4 x i1> [[_MSPROP_ICMP]] to <4 x i16>
+; CHECK-NEXT: [[RES:%.*]] = sext <4 x i1> [[MASK]] to <4 x i16>
+; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i16> [[RES]]
+;
+ %temp1 = load <4 x i16>, ptr %A
+ %temp2 = load <4 x i16>, ptr %B
+ %commonbits = and <4 x i16> %temp1, %temp2
+ %mask = icmp ne <4 x i16> %commonbits, zeroinitializer
+ %res = sext <4 x i1> %mask to <4 x i16>
+ ret <4 x i16> %res
+}
+
+define <8 x i16> @cmtst_8h(ptr %A, ptr %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <8 x i16> @cmtst_8h(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[TEMP1:%.*]] = load <8 x i16>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 16
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[TEMP2:%.*]] = load <8 x i16>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16
+; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i16> [[_MSLD]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i16> [[TEMP1]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i16> [[_MSLD]], [[TEMP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i16> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i16> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[COMMONBITS:%.*]] = and <8 x i16> [[TEMP1]], [[TEMP2]]
+; CHECK-NEXT: [[TMP18:%.*]] = xor <8 x i16> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i16> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = xor <8 x i16> [[TMP19]], splat (i16 -1)
+; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i16> [[TMP21]], [[TMP18]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq <8 x i16> [[TMP22]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[MASK:%.*]] = icmp ne <8 x i16> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP:%.*]] = sext <8 x i1> [[_MSPROP_ICMP]] to <8 x i16>
+; CHECK-NEXT: [[RES:%.*]] = sext <8 x i1> [[MASK]] to <8 x i16>
+; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES]]
+;
+ %temp1 = load <8 x i16>, ptr %A
+ %temp2 = load <8 x i16>, ptr %B
+ %commonbits = and <8 x i16> %temp1, %temp2
+ %mask = icmp ne <8 x i16> %commonbits, zeroinitializer
+ %res = sext <8 x i1> %mask to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <2 x i32> @cmtst_2s(ptr %A, ptr %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <2 x i32> @cmtst_2s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[TEMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[TEMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[TMP13:%.*]] = and <2 x i32> [[_MSLD]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <2 x i32> [[TEMP1]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <2 x i32> [[_MSLD]], [[TEMP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <2 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <2 x i32> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[COMMONBITS:%.*]] = and <2 x i32> [[TEMP1]], [[TEMP2]]
+; CHECK-NEXT: [[TMP18:%.*]] = xor <2 x i32> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = or <2 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <2 x i32> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = xor <2 x i32> [[TMP19]], splat (i32 -1)
+; CHECK-NEXT: [[TMP22:%.*]] = and <2 x i32> [[TMP21]], [[TMP18]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq <2 x i32> [[TMP22]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <2 x i1> [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[MASK:%.*]] = icmp ne <2 x i32> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP:%.*]] = sext <2 x i1> [[_MSPROP_ICMP]] to <2 x i32>
+; CHECK-NEXT: [[RES:%.*]] = sext <2 x i1> [[MASK]] to <2 x i32>
+; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i32> [[RES]]
+;
+ %temp1 = load <2 x i32>, ptr %A
+ %temp2 = load <2 x i32>, ptr %B
+ %commonbits = and <2 x i32> %temp1, %temp2
+ %mask = icmp ne <2 x i32> %commonbits, zeroinitializer
+ %res = sext <2 x i1> %mask to <2 x i32>
+ ret <2 x i32> %res
+}
+
+define <4 x i32> @cmtst_4s(ptr %A, ptr %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <4 x i32> @cmtst_4s(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[TEMP1:%.*]] = load <4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[TEMP2:%.*]] = load <4 x i32>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16
+; CHECK-NEXT: [[TMP13:%.*]] = and <4 x i32> [[_MSLD]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <4 x i32> [[TEMP1]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <4 x i32> [[_MSLD]], [[TEMP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[COMMONBITS:%.*]] = and <4 x i32> [[TEMP1]], [[TEMP2]]
+; CHECK-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = or <4 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <4 x i32> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = xor <4 x i32> [[TMP19]], splat (i32 -1)
+; CHECK-NEXT: [[TMP22:%.*]] = and <4 x i32> [[TMP21]], [[TMP18]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq <4 x i32> [[TMP22]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <4 x i1> [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[MASK:%.*]] = icmp ne <4 x i32> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP:%.*]] = sext <4 x i1> [[_MSPROP_ICMP]] to <4 x i32>
+; CHECK-NEXT: [[RES:%.*]] = sext <4 x i1> [[MASK]] to <4 x i32>
+; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x i32> [[RES]]
+;
+ %temp1 = load <4 x i32>, ptr %A
+ %temp2 = load <4 x i32>, ptr %B
+ %commonbits = and <4 x i32> %temp1, %temp2
+ %mask = icmp ne <4 x i32> %commonbits, zeroinitializer
+ %res = sext <4 x i1> %mask to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @cmtst_2d(ptr %A, ptr %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @cmtst_2d(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[TEMP1:%.*]] = load <2 x i64>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK: [[BB8]]:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[TEMP2:%.*]] = load <2 x i64>, ptr [[B]], align 16
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16
+; CHECK-NEXT: [[TMP13:%.*]] = and <2 x i64> [[_MSLD]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <2 x i64> [[TEMP1]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <2 x i64> [[_MSLD]], [[TEMP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <2 x i64> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <2 x i64> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[COMMONBITS:%.*]] = and <2 x i64> [[TEMP1]], [[TEMP2]]
+; CHECK-NEXT: [[TMP18:%.*]] = xor <2 x i64> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = or <2 x i64> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <2 x i64> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = xor <2 x i64> [[TMP19]], splat (i64 -1)
+; CHECK-NEXT: [[TMP22:%.*]] = and <2 x i64> [[TMP21]], [[TMP18]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq <2 x i64> [[TMP22]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <2 x i1> [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[MASK:%.*]] = icmp ne <2 x i64> [[COMMONBITS]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP:%.*]] = sext <2 x i1> [[_MSPROP_ICMP]] to <2 x i64>
+; CHECK-NEXT: [[RES:%.*]] = sext <2 x i1> [[MASK]] to <2 x i64>
+; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x i64> [[RES]]
+;
+ %temp1 = load <2 x i64>, ptr %A
+ %temp2 = load <2 x i64>, ptr %B
+ %commonbits = and <2 x i64> %temp1, %temp2
+ %mask = icmp ne <2 x i64> %commonbits, zeroinitializer
+ %res = sext <2 x i1> %mask to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <1 x i64> @fcmeq_d(<1 x double> %A, <1 x double> %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <1 x i64> @fcmeq_d(
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <1 x i64> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TST:%.*]] = fcmp oeq <1 x double> [[A]], [[B]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[MASK:%.*]] = sext <1 x i1> [[TST]] to <1 x i64>
+; CHECK-NEXT: store <1 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <1 x i64> [[MASK]]
+;
+ %tst = fcmp oeq <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmge_d(<1 x double> %A, <1 x double> %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <1 x i64> @fcmge_d(
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <1 x i64> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TST:%.*]] = fcmp oge <1 x double> [[A]], [[B]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[MASK:%.*]] = sext <1 x i1> [[TST]] to <1 x i64>
+; CHECK-NEXT: store <1 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <1 x i64> [[MASK]]
+;
+ %tst = fcmp oge <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmle_d(<1 x double> %A, <1 x double> %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <1 x i64> @fcmle_d(
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <1 x i64> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TST:%.*]] = fcmp ole <1 x double> [[A]], [[B]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[MASK:%.*]] = sext <1 x i1> [[TST]] to <1 x i64>
+; CHECK-NEXT: store <1 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <1 x i64> [[MASK]]
+;
+ %tst = fcmp ole <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmgt_d(<1 x double> %A, <1 x double> %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <1 x i64> @fcmgt_d(
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <1 x i64> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TST:%.*]] = fcmp ogt <1 x double> [[A]], [[B]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[MASK:%.*]] = sext <1 x i1> [[TST]] to <1 x i64>
+; CHECK-NEXT: store <1 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <1 x i64> [[MASK]]
+;
+ %tst = fcmp ogt <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmlt_d(<1 x double> %A, <1 x double> %B) nounwind sanitize_memory {
+; CHECK-LABEL: define <1 x i64> @fcmlt_d(
+; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <1 x i64> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TST:%.*]] = fcmp olt <1 x double> [[A]], [[B]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[MASK:%.*]] = sext <1 x i1> [[TST]] to <1 x i64>
+; CHECK-NEXT: store <1 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <1 x i64> [[MASK]]
+;
+ %tst = fcmp olt <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmnez_d(<1 x i64> %A) nounwind sanitize_memory {
+; CHECK-LABEL: define <1 x i64> @cmnez_d(
+; CHECK-SAME: <1 x i64> [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = xor <1 x i64> [[A]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = or <1 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <1 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = xor <1 x i64> [[TMP3]], splat (i64 -1)
+; CHECK-NEXT: [[TMP6:%.*]] = and <1 x i64> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <1 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <1 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT: [[TST:%.*]] = icmp ne <1 x i64> [[A]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP:%.*]] = sext <1 x i1> [[_MSPROP_ICMP]] to <1 x i64>
+; CHECK-NEXT: [[MASK:%.*]] = sext <1 x i1> [[TST]] to <1 x i64>
+; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <1 x i64> [[MASK]]
+;
+ %tst = icmp ne <1 x i64> %A, zeroinitializer
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
>From caf7e2becad04efd265f0c140e93bd04c3101874 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Thu, 5 Feb 2026 20:34:24 -0800
Subject: [PATCH 11/33] [msan][NFCI] Generalize
handleAVX512VectorGenericMaskedFP (#179850)
handleAVX512VectorGenericMaskedFP() assumes there is one vector of data
(excluding the write-through vector and the mask). This patch generalizes
it to allow multiple vectors of data, whose shadows are OR'ed together
before the all-or-nothing shadow is computed.
Future work can apply this to intrinsics such as:
```
<16 x float> @llvm.x86.avx512.mask.scalef.ps.512
(<16 x float>, <16 x float>, <16 x float>, i16, i32)
WriteThru A B Mask Rounding
```
---
.../Instrumentation/MemorySanitizer.cpp | 88 +++++++++++++------
1 file changed, 59 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index bb9657c0ec353..72eb0d8c3ff41 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -5036,7 +5036,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // Handle llvm.x86.avx512.* instructions that take a vector of floating-point
+ // Handle llvm.x86.avx512.* instructions that take vector(s) of floating-point
// values and perform an operation whose shadow propagation should be handled
// as all-or-nothing [*], with masking provided by a vector and a mask
// supplied as an integer.
@@ -5050,44 +5050,63 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
//
// <2 x double> @llvm.x86.avx512.rcp14.pd.128
// (<2 x double>, <2 x double>, i8)
+ // A WriteThru Mask
//
// <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512
// (<8 x double>, i32, <8 x double>, i8, i32)
// A Imm WriteThru Mask Rounding
//
- // All operands other than A and WriteThru (e.g., Mask, Imm, Rounding) must
- // be fully initialized.
+ // <16 x float> @llvm.x86.avx512.mask.scalef.ps.512
+ // (<16 x float>, <16 x float>, <16 x float>, i16, i32)
+ // WriteThru A B Mask Rnd
+ //
+ // All operands other than A, B, ..., and WriteThru (e.g., Mask, Imm,
+ // Rounding) must be fully initialized.
//
- // Dst[i] = Mask[i] ? some_op(A[i]) : WriteThru[i]
- // Dst_shadow[i] = Mask[i] ? all_or_nothing(A_shadow[i]) : WriteThru_shadow[i]
- void handleAVX512VectorGenericMaskedFP(IntrinsicInst &I, unsigned AIndex,
+ // Dst[i] = Mask[i] ? some_op(A[i], B[i], ...)
+ // : WriteThru[i]
+ // Dst_shadow[i] = Mask[i] ? all_or_nothing(A_shadow[i] | B_shadow[i] | ...)
+ // : WriteThru_shadow[i]
+ void handleAVX512VectorGenericMaskedFP(IntrinsicInst &I,
+ SmallVector<unsigned, 4> DataIndices,
unsigned WriteThruIndex,
unsigned MaskIndex) {
IRBuilder<> IRB(&I);
unsigned NumArgs = I.arg_size();
- assert(AIndex < NumArgs);
+
assert(WriteThruIndex < NumArgs);
assert(MaskIndex < NumArgs);
- assert(AIndex != WriteThruIndex);
- assert(AIndex != MaskIndex);
assert(WriteThruIndex != MaskIndex);
-
- Value *A = I.getOperand(AIndex);
Value *WriteThru = I.getOperand(WriteThruIndex);
- Value *Mask = I.getOperand(MaskIndex);
- assert(isFixedFPVector(A));
- assert(isFixedFPVector(WriteThru));
-
- [[maybe_unused]] unsigned ANumElements =
- cast<FixedVectorType>(A->getType())->getNumElements();
unsigned OutputNumElements =
cast<FixedVectorType>(WriteThru->getType())->getNumElements();
- assert(ANumElements == OutputNumElements);
+
+ assert(DataIndices.size() > 0);
+
+ bool isData[16] = {false};
+ assert(NumArgs <= 16);
+ for (unsigned i : DataIndices) {
+ assert(i < NumArgs);
+ assert(i != WriteThruIndex);
+ assert(i != MaskIndex);
+
+ isData[i] = true;
+
+ Value *A = I.getOperand(i);
+ assert(isFixedFPVector(A));
+ [[maybe_unused]] unsigned ANumElements =
+ cast<FixedVectorType>(A->getType())->getNumElements();
+ assert(ANumElements == OutputNumElements);
+ }
+
+ Value *Mask = I.getOperand(MaskIndex);
+
+ assert(isFixedFPVector(WriteThru));
for (unsigned i = 0; i < NumArgs; ++i) {
- if (i != AIndex && i != WriteThruIndex) {
+ if (!isData[i] && i != WriteThruIndex) {
// Imm, Mask, Rounding etc. are "control" data, hence we require that
// they be fully initialized.
assert(I.getOperand(i)->getType()->isIntegerTy());
@@ -5096,24 +5115,32 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
// The mask has 1 bit per element of A, but a minimum of 8 bits.
- if (Mask->getType()->getScalarSizeInBits() == 8 && ANumElements < 8)
- Mask = IRB.CreateTrunc(Mask, Type::getIntNTy(*MS.C, ANumElements));
- assert(Mask->getType()->getScalarSizeInBits() == ANumElements);
+ if (Mask->getType()->getScalarSizeInBits() == 8 && OutputNumElements < 8)
+ Mask = IRB.CreateTrunc(Mask, Type::getIntNTy(*MS.C, OutputNumElements));
+ assert(Mask->getType()->getScalarSizeInBits() == OutputNumElements);
assert(I.getType() == WriteThru->getType());
Mask = IRB.CreateBitCast(
Mask, FixedVectorType::get(IRB.getInt1Ty(), OutputNumElements));
- Value *AShadow = getShadow(A);
+ Value *DataShadow = nullptr;
+ for (unsigned i : DataIndices) {
+ Value *A = I.getOperand(i);
+ if (DataShadow)
+ DataShadow = IRB.CreateOr(DataShadow, getShadow(A));
+ else
+ DataShadow = getShadow(A);
+ }
// All-or-nothing shadow
- AShadow = IRB.CreateSExt(IRB.CreateICmpNE(AShadow, getCleanShadow(AShadow)),
- AShadow->getType());
+ DataShadow =
+ IRB.CreateSExt(IRB.CreateICmpNE(DataShadow, getCleanShadow(DataShadow)),
+ DataShadow->getType());
Value *WriteThruShadow = getShadow(WriteThru);
- Value *Shadow = IRB.CreateSelect(Mask, AShadow, WriteThruShadow);
+ Value *Shadow = IRB.CreateSelect(Mask, DataShadow, WriteThruShadow);
setShadow(&I, Shadow);
setOriginForNaryOp(I);
@@ -6607,7 +6634,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_512:
case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_256:
case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_128:
- handleAVX512VectorGenericMaskedFP(I, /*AIndex=*/0, /*WriteThruIndex=*/1,
+ handleAVX512VectorGenericMaskedFP(I, /*DataIndices=*/{0},
+ /*WriteThruIndex=*/1,
/*MaskIndex=*/2);
break;
@@ -6659,7 +6687,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx512fp16_mask_rcp_ph_512:
case Intrinsic::x86_avx512fp16_mask_rcp_ph_256:
case Intrinsic::x86_avx512fp16_mask_rcp_ph_128:
- handleAVX512VectorGenericMaskedFP(I, /*AIndex=*/0, /*WriteThruIndex=*/1,
+ handleAVX512VectorGenericMaskedFP(I, /*DataIndices=*/{0},
+ /*WriteThruIndex=*/1,
/*MaskIndex=*/2);
break;
@@ -6715,7 +6744,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::x86_avx10_mask_rndscale_bf16_512:
case Intrinsic::x86_avx10_mask_rndscale_bf16_256:
case Intrinsic::x86_avx10_mask_rndscale_bf16_128:
- handleAVX512VectorGenericMaskedFP(I, /*AIndex=*/0, /*WriteThruIndex=*/2,
+ handleAVX512VectorGenericMaskedFP(I, /*DataIndices=*/{0},
+ /*WriteThruIndex=*/2,
/*MaskIndex=*/3);
break;
>From 90786e97dabe946ebb7933f8efc7215379c910d1 Mon Sep 17 00:00:00 2001
From: Snehasish Kumar <mail at snehasish.net>
Date: Thu, 5 Feb 2026 21:09:06 -0800
Subject: [PATCH 12/33] [InstCombine][profcheck] Fix profile metadata
propagation for umax in InstCombine (#179332)
Select instructions created from the expansion of an umax intrinsic do
not have profile data even though the function may have profile data.
This is because PGO instrumentation does not support intrinsics.
Assisted-by: gemini
---
llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 6 ++++--
llvm/test/Transforms/InstCombine/add-shl-mul-umax.ll | 12 +++++++++---
llvm/utils/profcheck-xfail.txt | 1 -
3 files changed, 13 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 3559bcff4d932..652e94d5a9fbd 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -51,6 +51,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
@@ -2117,8 +2118,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
return nullptr;
Value *Cmp = Builder.CreateICmpEQ(X, ConstantInt::get(X->getType(), 0));
- Value *NewSelect =
- Builder.CreateSelect(Cmp, ConstantInt::get(X->getType(), 1), A);
+ Value *NewSelect = nullptr;
+ NewSelect = Builder.CreateSelectWithUnknownProfile(
+ Cmp, ConstantInt::get(X->getType(), 1), A, DEBUG_TYPE);
return replaceInstUsesWith(*II, NewSelect);
};
diff --git a/llvm/test/Transforms/InstCombine/add-shl-mul-umax.ll b/llvm/test/Transforms/InstCombine/add-shl-mul-umax.ll
index a219503456432..385bab52a8f32 100644
--- a/llvm/test/Transforms/InstCombine/add-shl-mul-umax.ll
+++ b/llvm/test/Transforms/InstCombine/add-shl-mul-umax.ll
@@ -11,12 +11,12 @@
; Positive Test Cases for `shl`
-define i64 @test_shl_by_2(i64 %x) {
+define i64 @test_shl_by_2(i64 %x) !prof !0 {
; CHECK-LABEL: define i64 @test_shl_by_2(
-; CHECK-SAME: i64 [[X:%.*]]) {
+; CHECK-SAME: i64 [[X:%.*]]) !prof [[PROF0:![0-9]+]] {
; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i64 [[X]], 2
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[X]], 0
-; CHECK-NEXT: [[MAX:%.*]] = select i1 [[TMP1]], i64 1, i64 [[TMP2]]
+; CHECK-NEXT: [[MAX:%.*]] = select i1 [[TMP1]], i64 1, i64 [[TMP2]], !prof [[PROF1:![0-9]+]]
; CHECK-NEXT: ret i64 [[MAX]]
;
%x1 = add i64 %x, 1
@@ -351,3 +351,9 @@ define i64 @test_mul_multi_use_mul(i64 %x) {
%max = call i64 @llvm.umax.i64(i64 %mul, i64 %x1)
ret i64 %max
}
+
+!0 = !{!"function_entry_count", i64 1}
+;.
+; CHECK: [[PROF0]] = !{!"function_entry_count", i64 1}
+; CHECK: [[PROF1]] = !{!"unknown", !"instcombine"}
+;.
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index 468fdc1f4c986..f8526550257cf 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -208,7 +208,6 @@ Transforms/IndVarSimplify/invalidate-modified-lcssa-phi.ll
Transforms/IndVarSimplify/pr45835.ll
Transforms/IndVarSimplify/preserving-debugloc-rem-div.ll
Transforms/InstCombine/2004-09-20-BadLoadCombine.ll
-Transforms/InstCombine/add-shl-mul-umax.ll
Transforms/InstCombine/and2.ll
Transforms/InstCombine/and-fcmp.ll
Transforms/InstCombine/and-or-icmps.ll
>From 55245651ade64196e12736c437ee3e31aa5abd13 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Thu, 5 Feb 2026 21:30:45 -0800
Subject: [PATCH 13/33] [ELF][test] Consolidate invalid-eh-frame tests
Merge 9 individual invalid-eh-frame test files into 2 categorized files
using split-file:
- eh-frame-invalid-cie.s: CIE structure errors (too small, unexpected
end, failed string/LEB128 reads)
- eh-frame-invalid-fde-encoding.s: EhReader::getFdeEncoding errors
(unknown augmentation string, unknown FDE encoding, unsupported
aligned encoding, corrupted CIE)
---
lld/test/ELF/eh-frame-invalid-cie.s | 77 ++++++++++++
lld/test/ELF/eh-frame-invalid-fde-encoding.s | 122 +++++++++++++++++++
lld/test/ELF/invalid-eh-frame.s | 17 ---
lld/test/ELF/invalid-eh-frame2.s | 23 ----
lld/test/ELF/invalid-eh-frame3.s | 21 ----
lld/test/ELF/invalid-eh-frame4.s | 30 -----
lld/test/ELF/invalid-eh-frame5.s | 28 -----
lld/test/ELF/invalid-eh-frame6.s | 32 -----
lld/test/ELF/invalid-eh-frame7.s | 30 -----
lld/test/ELF/invalid-eh-frame8.s | 30 -----
lld/test/ELF/invalid-eh-frame9.s | 15 ---
11 files changed, 199 insertions(+), 226 deletions(-)
create mode 100644 lld/test/ELF/eh-frame-invalid-cie.s
create mode 100644 lld/test/ELF/eh-frame-invalid-fde-encoding.s
delete mode 100644 lld/test/ELF/invalid-eh-frame.s
delete mode 100644 lld/test/ELF/invalid-eh-frame2.s
delete mode 100644 lld/test/ELF/invalid-eh-frame3.s
delete mode 100644 lld/test/ELF/invalid-eh-frame4.s
delete mode 100644 lld/test/ELF/invalid-eh-frame5.s
delete mode 100644 lld/test/ELF/invalid-eh-frame6.s
delete mode 100644 lld/test/ELF/invalid-eh-frame7.s
delete mode 100644 lld/test/ELF/invalid-eh-frame8.s
delete mode 100644 lld/test/ELF/invalid-eh-frame9.s
diff --git a/lld/test/ELF/eh-frame-invalid-cie.s b/lld/test/ELF/eh-frame-invalid-cie.s
new file mode 100644
index 0000000000000..0a40a1121a9db
--- /dev/null
+++ b/lld/test/ELF/eh-frame-invalid-cie.s
@@ -0,0 +1,77 @@
+# REQUIRES: x86
+## Test CIE structure errors in .eh_frame.
+
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 too-small.s -o too-small.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 unexpected-end.s -o unexpected-end.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 failed-string.s -o failed-string.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 failed-leb128.s -o failed-leb128.o
+
+# RUN: not ld.lld --eh-frame-hdr too-small.o 2>&1 | FileCheck %s --check-prefix=TOO-SMALL --implicit-check-not=error:
+# RUN: not ld.lld --eh-frame-hdr unexpected-end.o 2>&1 | FileCheck %s --check-prefix=UNEXPECTED-END
+# RUN: not ld.lld --eh-frame-hdr failed-string.o 2>&1 | FileCheck %s --check-prefix=FAILED-STRING --implicit-check-not=error:
+# RUN: not ld.lld --eh-frame-hdr failed-leb128.o 2>&1 | FileCheck %s --check-prefix=FAILED-LEB128
+
+# TOO-SMALL: error: corrupted .eh_frame: CIE is too small
+# TOO-SMALL-NEXT: >>> defined in too-small.o:(.eh_frame+0x0)
+
+# UNEXPECTED-END: error: corrupted .eh_frame: unexpected end of CIE
+# UNEXPECTED-END-NEXT: >>> defined in unexpected-end.o:(.eh_frame+0x8)
+
+# FAILED-STRING: error: corrupted .eh_frame: corrupted CIE (failed to read string)
+# FAILED-STRING-NEXT: >>> defined in failed-string.o:(.eh_frame+0x9)
+
+# FAILED-LEB128: error: corrupted .eh_frame: corrupted CIE (failed to read LEB128)
+# FAILED-LEB128-NEXT: >>> defined in failed-leb128.o:(.eh_frame+0xc)
+
+#--- too-small.s
+.section .eh_frame,"a",@unwind
+ .byte 0x03
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+
+#--- unexpected-end.s
+.section .eh_frame,"a",@unwind
+ .byte 0x04
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+
+#--- failed-string.s
+.section .eh_frame,"a",@unwind
+.align 1
+ .byte 0x08
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x01
+ .byte 0x01
+ .byte 0x01
+ .byte 0x01
+
+#--- failed-leb128.s
+.section .eh_frame,"a",@unwind
+ .byte 0x08
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x01
+ .byte 0x01
+ .byte 0x00
+ .byte 0x01
diff --git a/lld/test/ELF/eh-frame-invalid-fde-encoding.s b/lld/test/ELF/eh-frame-invalid-fde-encoding.s
new file mode 100644
index 0000000000000..a4802d533ae70
--- /dev/null
+++ b/lld/test/ELF/eh-frame-invalid-fde-encoding.s
@@ -0,0 +1,122 @@
+# REQUIRES: x86
+## Test EhReader::getFdeEncoding errors in .eh_frame.
+
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 unknown-aug.s -o unknown-aug.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 corrupted.s -o corrupted.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 unknown-fde-encoding.s -o unknown-fde-encoding.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 aligned-encoding.s -o aligned-encoding.o
+
+# RUN: not ld.lld --eh-frame-hdr unknown-aug.o 2>&1 | FileCheck %s --check-prefix=UNKNOWN-AUG -DPREFIX=error --implicit-check-not=error:
+# RUN: ld.lld --eh-frame-hdr unknown-aug.o --noinhibit-exec 2>&1 | FileCheck %s --check-prefix=UNKNOWN-AUG -DPREFIX=warning
+# RUN: not ld.lld --eh-frame-hdr corrupted.o 2>&1 | FileCheck %s --check-prefix=CORRUPTED --implicit-check-not=error:
+# RUN: not ld.lld --eh-frame-hdr unknown-fde-encoding.o 2>&1 | FileCheck %s --check-prefix=UNKNOWN-FDE --implicit-check-not=error:
+# RUN: ld.lld --eh-frame-hdr unknown-fde-encoding.o --noinhibit-exec
+# RUN: not ld.lld --eh-frame-hdr aligned-encoding.o 2>&1 | FileCheck %s --check-prefix=ALIGNED --implicit-check-not=error:
+
+# UNKNOWN-AUG: [[PREFIX]]: corrupted .eh_frame: unknown .eh_frame augmentation string: {{.+}}
+
+# CORRUPTED: error: corrupted .eh_frame: corrupted CIE
+
+# UNKNOWN-FDE: error: corrupted .eh_frame: unknown FDE encoding
+# UNKNOWN-FDE-NEXT: >>> defined in unknown-fde-encoding.o:(.eh_frame+0xe)
+
+# ALIGNED: error: corrupted .eh_frame: DW_EH_PE_aligned encoding is not supported
+
+#--- unknown-aug.s
+.section .eh_frame,"a",@unwind
+ .byte 0x0E
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x01
+ .byte 0x01
+ .byte 0x00
+ .byte 0x01
+
+ .byte 0x01 # LEB128
+ .byte 0x01 # LEB128
+
+ .byte 0x01
+ .byte 0x01
+ .byte 0x01
+ .byte 0x01
+
+#--- corrupted.s
+.section .eh_frame,"a",@unwind
+ .byte 0x0E
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x01
+
+ .byte 0x50 # Augmentation string: 'P','\0'
+ .byte 0x00
+
+ .byte 0x01
+
+ .byte 0x01 # LEB128
+ .byte 0x01 # LEB128
+
+ .byte 0x03
+ .byte 0x01
+ .byte 0x01
+ .byte 0x01
+
+#--- unknown-fde-encoding.s
+.section .eh_frame,"a",@unwind
+ .byte 0x0E
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x01
+
+ .byte 0x50 # Augmentation string: 'P','\0'
+ .byte 0x00
+
+ .byte 0x01
+
+ .byte 0x01 # LEB128
+ .byte 0x01 # LEB128
+
+ .byte 0x01
+ .byte 0x01
+ .byte 0x01
+ .byte 0x01
+
+#--- aligned-encoding.s
+.section .eh_frame,"a",@unwind
+ .byte 0x0E
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x00
+ .byte 0x01
+
+ .byte 0x50 # Augmentation string: 'P','\0'
+ .byte 0x00
+
+ .byte 0x01
+
+ .byte 0x01 # LEB128
+ .byte 0x01 # LEB128
+
+ .byte 0x51
+ .byte 0x01
+ .byte 0x01
+ .byte 0x01
diff --git a/lld/test/ELF/invalid-eh-frame.s b/lld/test/ELF/invalid-eh-frame.s
deleted file mode 100644
index ccf433c6a6755..0000000000000
--- a/lld/test/ELF/invalid-eh-frame.s
+++ /dev/null
@@ -1,17 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s
-
-# CHECK: error: corrupted .eh_frame: unexpected end of CIE
-# CHECK-NEXT: >>> defined in {{.*}}:(.eh_frame+0x8)
-
-.section .eh_frame,"a",@unwind
- .byte 0x04
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
diff --git a/lld/test/ELF/invalid-eh-frame2.s b/lld/test/ELF/invalid-eh-frame2.s
deleted file mode 100644
index 01f38738519b6..0000000000000
--- a/lld/test/ELF/invalid-eh-frame2.s
+++ /dev/null
@@ -1,23 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s
-# RUN: ld.lld --eh-frame-hdr %t -o /dev/null --noinhibit-exec
-
-# CHECK: error: corrupted .eh_frame: corrupted CIE (failed to read string)
-# CHECK-NEXT: >>> defined in {{.*}}:(.eh_frame+0x9)
-
-.section .eh_frame,"a",@unwind
-.align 1
- .byte 0x08
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x01
- .byte 0x01
- .byte 0x01
- .byte 0x01
diff --git a/lld/test/ELF/invalid-eh-frame3.s b/lld/test/ELF/invalid-eh-frame3.s
deleted file mode 100644
index 6f7c7d957ab69..0000000000000
--- a/lld/test/ELF/invalid-eh-frame3.s
+++ /dev/null
@@ -1,21 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s
-
-# CHECK: error: corrupted .eh_frame: corrupted CIE (failed to read LEB128)
-# CHECK-NEXT: >>> defined in {{.*}}:(.eh_frame+0xc)
-
-.section .eh_frame,"a",@unwind
- .byte 0x08
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x01
- .byte 0x01
- .byte 0x00
- .byte 0x01
diff --git a/lld/test/ELF/invalid-eh-frame4.s b/lld/test/ELF/invalid-eh-frame4.s
deleted file mode 100644
index 51f276bc0014e..0000000000000
--- a/lld/test/ELF/invalid-eh-frame4.s
+++ /dev/null
@@ -1,30 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERROR --implicit-check-not=error:
-# RUN: ld.lld --eh-frame-hdr %t -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s --check-prefix=WARN --implicit-check-not=error:
-
-# ERROR: error: corrupted .eh_frame: unknown .eh_frame augmentation string: {{.+}}
-# WARN: warning: corrupted .eh_frame: unknown .eh_frame augmentation string: {{.+}}
-
-.section .eh_frame,"a",@unwind
- .byte 0x0E
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x01
- .byte 0x01
- .byte 0x00
- .byte 0x01
-
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
- .byte 0x01
- .byte 0x01
- .byte 0x01
- .byte 0x01
diff --git a/lld/test/ELF/invalid-eh-frame5.s b/lld/test/ELF/invalid-eh-frame5.s
deleted file mode 100644
index af86ed94d121f..0000000000000
--- a/lld/test/ELF/invalid-eh-frame5.s
+++ /dev/null
@@ -1,28 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error:
-
-# CHECK: error: corrupted .eh_frame: unknown .eh_frame augmentation string: {{.+}}
-
-.section .eh_frame,"a",@unwind
- .byte 0x0E
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x03
- .byte 0x01
- .byte 0x00
- .byte 0x01
-
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
- .byte 0x01
- .byte 0x01
- .byte 0x01
- .byte 0x01
diff --git a/lld/test/ELF/invalid-eh-frame6.s b/lld/test/ELF/invalid-eh-frame6.s
deleted file mode 100644
index 6888419da3e3d..0000000000000
--- a/lld/test/ELF/invalid-eh-frame6.s
+++ /dev/null
@@ -1,32 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s
-# RUN: ld.lld --eh-frame-hdr %t -o /dev/null --noinhibit-exec
-
-# CHECK: error: corrupted .eh_frame: unknown FDE encoding
-# CHECK-NEXT: >>> defined in {{.*}}:(.eh_frame+0xe)
-
-.section .eh_frame,"a",@unwind
- .byte 0x0E
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x01
-
- .byte 0x50 # Augmentation string: 'P','\0'
- .byte 0x00
-
- .byte 0x01
-
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
- .byte 0x01
- .byte 0x01
- .byte 0x01
- .byte 0x01
diff --git a/lld/test/ELF/invalid-eh-frame7.s b/lld/test/ELF/invalid-eh-frame7.s
deleted file mode 100644
index 6955d51e7aef6..0000000000000
--- a/lld/test/ELF/invalid-eh-frame7.s
+++ /dev/null
@@ -1,30 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s
-
-# CHECK: error: corrupted .eh_frame: DW_EH_PE_aligned encoding is not supported
-
-.section .eh_frame,"a",@unwind
- .byte 0x0E
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x01
-
- .byte 0x50 # Augmentation string: 'P','\0'
- .byte 0x00
-
- .byte 0x01
-
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
- .byte 0x51
- .byte 0x01
- .byte 0x01
- .byte 0x01
diff --git a/lld/test/ELF/invalid-eh-frame8.s b/lld/test/ELF/invalid-eh-frame8.s
deleted file mode 100644
index 856fddb19c025..0000000000000
--- a/lld/test/ELF/invalid-eh-frame8.s
+++ /dev/null
@@ -1,30 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s
-
-# CHECK: error: corrupted .eh_frame: corrupted CIE
-
-.section .eh_frame,"a",@unwind
- .byte 0x0E
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x01
-
- .byte 0x50 # Augmentation string: 'P','\0'
- .byte 0x00
-
- .byte 0x01
-
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
- .byte 0x03
- .byte 0x01
- .byte 0x01
- .byte 0x01
diff --git a/lld/test/ELF/invalid-eh-frame9.s b/lld/test/ELF/invalid-eh-frame9.s
deleted file mode 100644
index 436b34bb3a802..0000000000000
--- a/lld/test/ELF/invalid-eh-frame9.s
+++ /dev/null
@@ -1,15 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s
-
-# CHECK: error: corrupted .eh_frame: CIE is too small
-
-.section .eh_frame,"a",@unwind
- .byte 0x03
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
>From a3eda32d98b22e8cfe4b9a542840ef9485236593 Mon Sep 17 00:00:00 2001
From: paperchalice <liujunchang97 at outlook.com>
Date: Fri, 6 Feb 2026 13:32:12 +0800
Subject: [PATCH 14/33] [NVPTX] Remove `NoSignedZerosFPMath` uses (#180086)
Users should use `nsz` flag instead.
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2 +-
llvm/test/CodeGen/NVPTX/fma-relu-contract.ll | 30 ++++++++++----------
2 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 8c07d96fcd010..e5a492fa90fbd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2499,7 +2499,7 @@ def NVPTX_fma_oneuse_and_nnan : PatFrag<(ops node:$a, node:$b, node:$c),
// PatFrag is for a fmaxnum node with nsz
def NVPTX_fmaxnum_nsz : PatFrag<(ops node:$a, node:$b),
(fmaxnum node:$a, node:$b), [{
- return N->getFlags().hasNoSignedZeros() || TM.Options.NoSignedZerosFPMath;
+ return N->getFlags().hasNoSignedZeros();
}]>;
class FMARELUInst<RegTyInfo t, bit allow_ftz, PatFrag zero_pat>
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
index 92293ab171a12..6b261411d6144 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -52,7 +52,7 @@ define half @fma_f16_expanded_no_nans(half %a, half %b, half %c) #0 {
%1 = fmul half %a, %b
%2 = fadd half %1, %c
%3 = fcmp ogt half %2, 0.0
- %4 = select i1 %3, half %2, half 0.0
+ %4 = select nsz i1 %3, half %2, half 0.0
ret half %4
}
@@ -113,7 +113,7 @@ define half @fma_f16_expanded_no_nans_multiple_uses_of_fma(half %a, half %b, hal
%1 = fmul half %a, %b
%2 = fadd half %1, %c
%3 = fcmp ogt half %2, 0.0
- %4 = select i1 %3, half %2, half 0.0
+ %4 = select nsz i1 %3, half %2, half 0.0
%5 = fadd half %2, 7.0
%6 = fadd half %4, %5
ret half %6
@@ -212,7 +212,7 @@ define half @fma_f16_expanded_maxnum_no_nans(half %a, half %b, half %c) #0 {
; CHECK-SM70-NEXT: ret;
%1 = fmul half %a, %b
%2 = fadd half %1, %c
- %3 = call half @llvm.maxnum.f16(half %2, half 0.0)
+ %3 = call nsz half @llvm.maxnum.f16(half %2, half 0.0)
ret half %3
}
@@ -332,7 +332,7 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
%1 = fmul bfloat %a, %b
%2 = fadd bfloat %1, %c
%3 = fcmp ogt bfloat %2, 0.0
- %4 = select i1 %3, bfloat %2, bfloat 0.0
+ %4 = select nsz i1 %3, bfloat %2, bfloat 0.0
ret bfloat %4
}
@@ -428,7 +428,7 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
%1 = fmul bfloat %a, %b
%2 = fadd bfloat %1, %c
%3 = fcmp ogt bfloat %2, 0.0
- %4 = select i1 %3, bfloat %2, bfloat 0.0
+ %4 = select nsz i1 %3, bfloat %2, bfloat 0.0
%5 = fadd bfloat %2, 7.0
%6 = fadd bfloat %4, %5
ret bfloat %6
@@ -491,7 +491,7 @@ define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c)
; CHECK-SM70-NEXT: ret;
%1 = fmul bfloat %a, %b
%2 = fadd bfloat %1, %c
- %3 = call bfloat @llvm.maxnum.bf16(bfloat %2, bfloat 0.0)
+ %3 = call nsz bfloat @llvm.maxnum.bf16(bfloat %2, bfloat 0.0)
ret bfloat %3
}
@@ -541,7 +541,7 @@ define <2 x half> @fma_f16x2_expanded_no_nans(<2 x half> %a, <2 x half> %b, <2 x
%1 = fmul <2 x half> %a, %b
%2 = fadd <2 x half> %1, %c
%3 = fcmp ogt <2 x half> %2, <half 0.0, half 0.0>
- %4 = select <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0>
+ %4 = select nsz <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0>
ret <2 x half> %4
}
@@ -606,7 +606,7 @@ define <2 x half> @fma_f16x2_expanded_no_nans_multiple_uses_of_fma(<2 x half> %a
%1 = fmul <2 x half> %a, %b
%2 = fadd <2 x half> %1, %c
%3 = fcmp ogt <2 x half> %2, <half 0.0, half 0.0>
- %4 = select <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0>
+ %4 = select nsz <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0>
%5 = fadd <2 x half> %2, <half 7.0, half 7.0>
%6 = fadd <2 x half> %4, %5
ret <2 x half> %6
@@ -662,7 +662,7 @@ define <2 x half> @fma_f16x2_expanded_unsafe_with_nans(<2 x half> %a, <2 x half>
%1 = fmul <2 x half> %a, %b
%2 = fadd <2 x half> %1, %c
%3 = fcmp ogt <2 x half> %2, <half 0.0, half 0.0>
- %4 = select <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0>
+ %4 = select nsz <2 x i1> %3, <2 x half> %2, <2 x half> <half 0.0, half 0.0>
ret <2 x half> %4
}
@@ -713,7 +713,7 @@ define <2 x half> @fma_f16x2_expanded_maxnum_no_nans(<2 x half> %a, <2 x half> %
; CHECK-SM70-NEXT: ret;
%1 = fmul <2 x half> %a, %b
%2 = fadd <2 x half> %1, %c
- %3 = call <2 x half> @llvm.maxnum.f16x2(<2 x half> %2, <2 x half> <half 0.0, half 0.0>)
+ %3 = call nsz <2 x half> @llvm.maxnum.f16x2(<2 x half> %2, <2 x half> <half 0.0, half 0.0>)
ret <2 x half> %3
}
@@ -795,7 +795,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x
%1 = fmul <2 x bfloat> %a, %b
%2 = fadd <2 x bfloat> %1, %c
%3 = fcmp ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0>
- %4 = select <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
+ %4 = select nsz <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
ret <2 x bfloat> %4
}
@@ -873,7 +873,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %
%1 = fmul <2 x bfloat> %a, %b
%2 = fadd <2 x bfloat> %1, %c
%3 = fcmp ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0>
- %4 = select <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
+ %4 = select nsz <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
ret <2 x bfloat> %4
}
@@ -1017,7 +1017,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
%1 = fmul <2 x bfloat> %a, %b
%2 = fadd <2 x bfloat> %1, %c
%3 = fcmp ogt <2 x bfloat> %2, <bfloat 0.0, bfloat 0.0>
- %4 = select <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
+ %4 = select nsz <2 x i1> %3, <2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>
%5 = fadd <2 x bfloat> %2, <bfloat 7.0, bfloat 7.0>
%6 = fadd <2 x bfloat> %4, %5
ret <2 x bfloat> %6
@@ -1105,8 +1105,8 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
; CHECK-SM70-NEXT: ret;
%1 = fmul <2 x bfloat> %a, %b
%2 = fadd <2 x bfloat> %1, %c
- %3 = call <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
+ %3 = call nsz <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %2, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
ret <2 x bfloat> %3
}
-attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+attributes #0 = { "no-nans-fp-math"="true" }
>From 5e635ebdce18ba733d57c871983924f289ee3158 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Thu, 5 Feb 2026 21:35:38 -0800
Subject: [PATCH 15/33] [msan] Handle NEON bfmmla (#176264)
This patch adapts handleNEONMatrixMultiply() (used for integer matrix
multiply: smmla/ummla/usmmla) to floating-point (bfmmla).
---
.../Instrumentation/MemorySanitizer.cpp | 132 ++++++++++++------
.../aarch64-bf16-dotprod-intrinsics.ll | 33 ++---
.../MemorySanitizer/AArch64/aarch64-matmul.ll | 4 +-
3 files changed, 107 insertions(+), 62 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 72eb0d8c3ff41..98c4866cd7825 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -5440,16 +5440,27 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
- // <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8
- // (<4 x i32> %R, <16 x i8> %X, <16 x i8> %Y)
- // <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8
- // (<4 x i32> %R, <16 x i8> %X, <16 x i8> %Y)
- // <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8
- // (<4 x i32> R%, <16 x i8> %X, <16 x i8> %Y)
+ // Integer matrix multiplication:
+ // - <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8
+ // (<4 x i32> %R, <16 x i8> %X, <16 x i8> %Y)
+ // - <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8
+ // (<4 x i32> %R, <16 x i8> %X, <16 x i8> %Y)
+ // - <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8
+ // (<4 x i32> %R, <16 x i8> %X, <16 x i8> %Y)
//
// Note:
- // - < 4 x *> is a 2x2 matrix
- // - <16 x *> is a 2x8 matrix and 8x2 matrix respectively
+ // - <4 x i32> is a 2x2 matrix
+ // - <16 x i8> %X and %Y are 2x8 and 8x2 matrices respectively
+ //
+ // 2x8 %X 8x2 %Y
+ // [ X01 X02 X03 X04 X05 X06 X07 X08 ] [ Y01 Y09 ]
+ // [ X09 X10 X11 X12 X13 X14 X15 X16 ] x [ Y02 Y10 ]
+ // [ Y03 Y11 ]
+ // [ Y04 Y12 ]
+ // [ Y05 Y13 ]
+ // [ Y06 Y14 ]
+ // [ Y07 Y15 ]
+ // [ Y08 Y16 ]
//
// The general shadow propagation approach is:
// 1) get the shadows of the input matrices %X and %Y
@@ -5463,14 +5474,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// TODO: consider allowing multiplication of zero with an uninitialized value
// to result in an initialized value.
//
- // TODO: handle floating-point matrix multiply using ummla on the shadows:
- // case Intrinsic::aarch64_neon_bfmmla:
- // handleNEONMatrixMultiply(I, /*ARows=*/ 2, /*ACols=*/ 4,
- // /*BRows=*/ 4, /*BCols=*/ 2);
+ // Floating-point matrix multiplication:
+ // - <4 x float> @llvm.aarch64.neon.bfmmla
+ // (<4 x float> %R, <8 x bfloat> %X, <8 x bfloat> %Y)
+ // %X and %Y are 2x4 and 4x2 matrices respectively
+ //
+ // Although there are half as many elements of %X and %Y compared to the
+ // integer case, each element is twice the bit-width. Thus, we can reuse the
+ // shadow propagation logic if we cast the shadows to the same type as the
+ // integer case, and apply ummla to the shadows:
//
- void handleNEONMatrixMultiply(IntrinsicInst &I, unsigned int ARows,
- unsigned int ACols, unsigned int BRows,
- unsigned int BCols) {
+ // 2x4 %X 4x2 %Y
+ // [ A01:A02 A03:A04 A05:A06 A07:A08 ] [ B01:B02 B09:B10 ]
+ // [ A09:A10 A11:A12 A13:A14 A15:A16 ] x [ B03:B04 B11:B12 ]
+ // [ B05:B06 B13:B14 ]
+ // [ B07:B08 B15:B16 ]
+ //
+ // For example, consider multiplying the first row of %X with the first
+ // column of %Y. We want to know if
+ // A01:A02*B01:B02 + A03:A04*B03:B04 + A05:A06*B05:B06 + A07:A08*B07:B08 is
+ // fully initialized, which will be true if and only if (A01, A02, ..., A08)
+ // and (B01, B02, ..., B08) are each fully initialized. This latter condition
+ // is equivalent to what is tested by the instrumentation for the integer
+ // form.
+ void handleNEONMatrixMultiply(IntrinsicInst &I) {
IRBuilder<> IRB(&I);
assert(I.arg_size() == 3);
@@ -5488,47 +5515,70 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
[[maybe_unused]] FixedVectorType *ATy = cast<FixedVectorType>(A->getType());
[[maybe_unused]] FixedVectorType *BTy = cast<FixedVectorType>(B->getType());
- assert(ACols == BRows);
- assert(ATy->getNumElements() == ARows * ACols);
- assert(BTy->getNumElements() == BRows * BCols);
- assert(RTy->getNumElements() == ARows * BCols);
+ Value *ShadowR = getShadow(&I, 0);
+ Value *ShadowA = getShadow(&I, 1);
+ Value *ShadowB = getShadow(&I, 2);
+
+ // We will use ummla to compute the shadow. These are the types it expects.
+ // These are also the types of the corresponding shadows.
+ FixedVectorType *ExpectedRTy =
+ FixedVectorType::get(IntegerType::get(*MS.C, 32), 4);
+ FixedVectorType *ExpectedATy =
+ FixedVectorType::get(IntegerType::get(*MS.C, 8), 16);
+ FixedVectorType *ExpectedBTy =
+ FixedVectorType::get(IntegerType::get(*MS.C, 8), 16);
- LLVM_DEBUG(dbgs() << "### R: " << *RTy->getElementType() << "\n");
- LLVM_DEBUG(dbgs() << "### A: " << *ATy->getElementType() << "\n");
if (RTy->getElementType()->isIntegerTy()) {
- // Types are not identical e.g., <4 x i32> %R, <16 x i8> %A
+ // Types of R and A/B are not identical e.g., <4 x i32> %R, <16 x i8> %A
assert(ATy->getElementType()->isIntegerTy());
+
+ assert(RTy == ExpectedRTy);
+ assert(ATy == ExpectedATy);
+ assert(BTy == ExpectedBTy);
} else {
- assert(RTy->getElementType()->isFloatingPointTy());
assert(ATy->getElementType()->isFloatingPointTy());
+ assert(BTy->getElementType()->isFloatingPointTy());
+
+ // Technically, what we care about is that:
+ // getShadowTy(RTy)->canLosslesslyBitCastTo(ExpectedRTy) etc.
+ // but that is equivalent.
+ assert(RTy->canLosslesslyBitCastTo(ExpectedRTy));
+ assert(ATy->canLosslesslyBitCastTo(ExpectedATy));
+ assert(BTy->canLosslesslyBitCastTo(ExpectedBTy));
+
+ ShadowA = IRB.CreateBitCast(ShadowA, getShadowTy(ExpectedATy));
+ ShadowB = IRB.CreateBitCast(ShadowB, getShadowTy(ExpectedBTy));
}
assert(ATy->getElementType() == BTy->getElementType());
- Value *ShadowR = getShadow(&I, 0);
- Value *ShadowA = getShadow(&I, 1);
- Value *ShadowB = getShadow(&I, 2);
+ // From this point on, use Expected{R,A,B}Ty.
// If the value is fully initialized, the shadow will be 000...001.
// Otherwise, the shadow will be all zero.
// (This is the opposite of how we typically handle shadows.)
- ShadowA = IRB.CreateZExt(IRB.CreateICmpEQ(ShadowA, getCleanShadow(A)),
- ShadowA->getType());
- ShadowB = IRB.CreateZExt(IRB.CreateICmpEQ(ShadowB, getCleanShadow(B)),
- ShadowB->getType());
-
- Value *ShadowAB = IRB.CreateIntrinsic(
- I.getType(), I.getIntrinsicID(), {getCleanShadow(R), ShadowA, ShadowB});
-
+ ShadowA =
+ IRB.CreateZExt(IRB.CreateICmpEQ(ShadowA, getCleanShadow(ExpectedATy)),
+ getShadowTy(ExpectedATy));
+ ShadowB =
+ IRB.CreateZExt(IRB.CreateICmpEQ(ShadowB, getCleanShadow(ExpectedBTy)),
+ getShadowTy(ExpectedBTy));
+
+ Value *ShadowAB =
+ IRB.CreateIntrinsic(ExpectedRTy, Intrinsic::aarch64_neon_ummla,
+ {getCleanShadow(ExpectedRTy), ShadowA, ShadowB});
+
+ // ummla multiplies a 2x8 matrix with an 8x2 matrix. If all entries of the
+ // input matrices are equal to 0x1, all entries of the output matrix will
+ // be 0x8.
Value *FullyInit = ConstantVector::getSplat(
- RTy->getElementCount(),
- ConstantInt::get(cast<VectorType>(getShadowTy(R))->getElementType(),
- ACols));
+ ExpectedRTy->getElementCount(),
+ ConstantInt::get(ExpectedRTy->getElementType(), 0x8));
ShadowAB = IRB.CreateSExt(IRB.CreateICmpNE(ShadowAB, FullyInit),
ShadowAB->getType());
- ShadowR = IRB.CreateSExt(IRB.CreateICmpNE(ShadowR, getCleanShadow(R)),
- ShadowR->getType());
+ ShadowR = IRB.CreateSExt(
+ IRB.CreateICmpNE(ShadowR, getCleanShadow(ExpectedRTy)), ExpectedRTy);
setShadow(&I, IRB.CreateOr(ShadowAB, ShadowR));
setOriginForNaryOp(I);
@@ -6980,8 +7030,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::aarch64_neon_smmla:
case Intrinsic::aarch64_neon_ummla:
case Intrinsic::aarch64_neon_usmmla:
- handleNEONMatrixMultiply(I, /*ARows=*/2, /*ACols=*/8, /*BRows=*/8,
- /*BCols=*/2);
+ case Intrinsic::aarch64_neon_bfmmla:
+ handleNEONMatrixMultiply(I);
break;
// <2 x i32> @llvm.aarch64.neon.{u,s,us}dot.v2i32.v8i8
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-bf16-dotprod-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-bf16-dotprod-intrinsics.ll
index d166c8c77900f..08c675899d997 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-bf16-dotprod-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-bf16-dotprod-intrinsics.ll
@@ -5,8 +5,7 @@
; simplified to only have the cases that directly pass the function parameters
; to the intrinsic.
;
-; Strictly handled:
-; - llvm.aarch64.neon.bfmmla
+; Strictly handled: (none)
;
; Heuristically handled: (none)
@@ -71,21 +70,20 @@ define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bflo
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]]
-; CHECK: [[BB6]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB7]]:
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <16 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i1> [[TMP5]] to <16 x i8>
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <16 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i1> [[TMP7]] to <16 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> [[TMP6]], <16 x i8> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP9]], splat (i32 8)
+; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> [[TMP11]], [[TMP13]]
; CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <4 x i32> [[TMP14]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
;
entry:
@@ -152,6 +150,3 @@ declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bflo
declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
-;.
-; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
-;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-matmul.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-matmul.ll
index 5112a65d5ff30..71bd3fc26eb8e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-matmul.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/aarch64-matmul.ll
@@ -22,7 +22,7 @@ define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) sa
; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i1> [[TMP3]] to <16 x i8>
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <16 x i8> [[TMP2]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i1> [[TMP5]] to <16 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> [[TMP4]], <16 x i8> [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> [[TMP4]], <16 x i8> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <4 x i32> [[TMP7]], splat (i32 8)
; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
@@ -76,7 +76,7 @@ define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) s
; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i1> [[TMP3]] to <16 x i8>
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <16 x i8> [[TMP2]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i1> [[TMP5]] to <16 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> [[TMP4]], <16 x i8> [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> [[TMP4]], <16 x i8> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <4 x i32> [[TMP7]], splat (i32 8)
; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
>From d79e1307310a99d25dbba1229bb188a6eacbedc3 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Thu, 5 Feb 2026 21:35:47 -0800
Subject: [PATCH 16/33] [RISCV] Reorder the operands for RISCVISD::PPAIRE_DB.
NFC (#180111)
Order the operands so that the low and high part of the rs1 regpair are
first, followed by the low and high part of the rs2 regpair.
Also change the type to use v4i8 for the result so that it's only
shuffling elements, not combining elements into a larger element.
I'm planning to add ADDD and SUBD opcodes that will be defined with the
same operand order allowing RISCVISelDAGToDAG.cpp code to be shared.
---
llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 4 ++--
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 12 ++++++------
llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 12 +++++++-----
3 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 1c88b667b61aa..e2e2e33e0607e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1906,14 +1906,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
SDValue Ops[] = {
CurDAG->getTargetConstant(RISCV::GPRPairRegClassID, DL, MVT::i32), Val0,
- CurDAG->getTargetConstant(RISCV::sub_gpr_even, DL, MVT::i32), Val2,
+ CurDAG->getTargetConstant(RISCV::sub_gpr_even, DL, MVT::i32), Val1,
CurDAG->getTargetConstant(RISCV::sub_gpr_odd, DL, MVT::i32)};
SDValue RegPair0 =
SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
MVT::Untyped, Ops),
0);
SDValue Ops1[] = {
- CurDAG->getTargetConstant(RISCV::GPRPairRegClassID, DL, MVT::i32), Val1,
+ CurDAG->getTargetConstant(RISCV::GPRPairRegClassID, DL, MVT::i32), Val2,
CurDAG->getTargetConstant(RISCV::sub_gpr_even, DL, MVT::i32), Val3,
CurDAG->getTargetConstant(RISCV::sub_gpr_odd, DL, MVT::i32)};
SDValue RegPair1 =
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6e4aaa67ff8b8..1303fa8383409 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4603,7 +4603,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
if (VT != MVT::v4i8)
return SDValue();
- // <4 x i8> BUILD_VECTOR a, b, c, d -> PACK(PPACK.DH pair(a, b), pair(c, d))
+ // <4 x i8> BUILD_VECTOR a, b, c, d -> PACK(PPACK.DH pair(a, c), pair(b, d))
SDValue Val0 =
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i8, Op->getOperand(0));
SDValue Val1 =
@@ -4612,17 +4612,17 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i8, Op->getOperand(2));
SDValue Val3 =
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i8, Op->getOperand(3));
- SDValue PackDH =
- DAG.getNode(RISCVISD::PPAIRE_DB, DL, {MVT::v2i16, MVT::v2i16},
- {Val0, Val1, Val2, Val3});
+ SDValue PPairDB =
+ DAG.getNode(RISCVISD::PPAIRE_DB, DL, {MVT::v4i8, MVT::v4i8},
+ {Val0, Val2, Val1, Val3});
return DAG.getNode(
ISD::BITCAST, DL, MVT::v4i8,
SDValue(
DAG.getMachineNode(
RISCV::PACK, DL, MVT::i32,
- {DAG.getNode(ISD::BITCAST, DL, MVT::i32, PackDH.getValue(0)),
- DAG.getNode(ISD::BITCAST, DL, MVT::i32, PackDH.getValue(1))}),
+ {DAG.getNode(ISD::BITCAST, DL, MVT::i32, PPairDB.getValue(0)),
+ DAG.getNode(ISD::BITCAST, DL, MVT::i32, PPairDB.getValue(1))}),
0));
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index 9c7f9a86611f8..d399010a6a777 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -24,11 +24,13 @@ def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> {
let RenderMethod = "addSImm8UnsignedOperands";
}
-// (<2 x i16>, <2 x i16>) PPAIRE_DB (<4 x i8>, <4 x i8>, <4 x i8>, <4 x i8>)
-def SDT_RISCVPPairE_DB
- : SDTypeProfile<2, 4, [SDTCisVT<0, v2i16>, SDTCisSameAs<0, 1>,
- SDTCisVT<2, v4i8>, SDTCisSameAs<0, 3>,
- SDTCisSameAs<0, 4>, SDTCisSameAs<0, 5>]>;
+// (rdlo, rdhi) PPAIRE_DB (rs1plo, rs1phi, rs2plo, rs2phi)
+def SDT_RISCVPPairE_DB : SDTypeProfile<2, 4, [SDTCisVT<0, v4i8>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisSameAs<0, 4>,
+ SDTCisSameAs<0, 5>]>;
def riscv_ppaire_db : RVSDNode<"PPAIRE_DB", SDT_RISCVPPairE_DB>;
// A 8-bit signed immediate allowing range [-128, 255]
>From 07f14918cc3dcbd28b76cfbee34872d376f5cc10 Mon Sep 17 00:00:00 2001
From: paperchalice <liujunchang97 at outlook.com>
Date: Fri, 6 Feb 2026 14:04:23 +0800
Subject: [PATCH 17/33] [LangRef] Correct `uitofp` example (#180123)
The destination type is incorrect; use `float` instead.
---
llvm/docs/LangRef.rst | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index ddd5087830acc..74c576d5128f8 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -12763,8 +12763,8 @@ Example:
%X = uitofp i32 257 to float ; yields float:257.0
%Y = uitofp i8 -1 to double ; yields double:255.0
- %a = uitofp nneg i32 256 to i32 ; yields float:256.0
- %b = uitofp nneg i32 -256 to i32 ; yields i32 poison
+ %a = uitofp nneg i32 256 to float ; yields float:256.0
+ %b = uitofp nneg i32 -256 to float ; yields float poison
'``sitofp .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>From 1da85aeaa6681466d52e19d18b667eccf94efe93 Mon Sep 17 00:00:00 2001
From: Michael Park <mcypark at gmail.com>
Date: Thu, 5 Feb 2026 22:04:54 -0800
Subject: [PATCH 18/33] [C++20][Modules] Fix relocatable PCH feature. (#180023)
---
clang/lib/Serialization/ASTWriter.cpp | 8 +++++---
clang/test/PCH/reloc.c | 5 +++++
2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 3e10bbfedfe65..b55440b4a4f39 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -1526,10 +1526,12 @@ void ASTWriter::WriteControlBlock(Preprocessor &PP, StringRef isysroot) {
// Write out all other paths relative to the base directory if possible.
BaseDirectory.assign(BaseDir->begin(), BaseDir->end());
- } else if (!isysroot.empty()) {
- // Write out paths relative to the sysroot if possible.
- BaseDirectory = std::string(isysroot);
}
+ } else if (!isysroot.empty()) {
+ // Write out paths relative to the sysroot if possible.
+ SmallString<128> CleanedSysroot(isysroot);
+ cleanPathForOutput(PP.getFileManager(), CleanedSysroot);
+ BaseDirectory.assign(CleanedSysroot.begin(), CleanedSysroot.end());
}
// Module map file
diff --git a/clang/test/PCH/reloc.c b/clang/test/PCH/reloc.c
index 435fde2e19234..019e3c495218b 100644
--- a/clang/test/PCH/reloc.c
+++ b/clang/test/PCH/reloc.c
@@ -3,6 +3,11 @@
// RUN: %clang -target x86_64-apple-darwin11 -fsyntax-only \
// RUN: -include-pch %t -isysroot %S/Inputs/libroot %s -Xclang -verify
// RUN: not %clang -target x86_64-apple-darwin11 -include-pch %t %s
+// RUN: llvm-bcanalyzer --dump --disable-histogram %t \
+// RUN: | FileCheck %s
+// CHECK: <ORIGINAL_FILE {{.*}}/> blob data = 'usr{{[/\\]}}include{{[/\\]}}reloc.h'
+// CHECK: <INPUT_FILE {{.*}}/> blob data = 'usr{{[/\\]}}include{{[/\\]}}reloc.h'
+// CHECK: <INPUT_FILE {{.*}}/> blob data = 'usr{{[/\\]}}include{{[/\\]}}reloc2.h'
// REQUIRES: x86-registered-target
#include <reloc.h>
>From 3e0b0891c9643470f490dea12d39c87d046763eb Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder at redhat.com>
Date: Fri, 6 Feb 2026 07:06:18 +0100
Subject: [PATCH 19/33] [clang][bytecode] Check pointer lifetime in
CheckDestructor (#179957)
So we diagnose double-destroy scenarios.
---
clang/lib/AST/ByteCode/Interp.cpp | 2 ++
clang/test/AST/ByteCode/records.cpp | 9 +++++++++
2 files changed, 11 insertions(+)
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index d095e6f862fc5..8eaff4bb07f7d 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1549,6 +1549,8 @@ bool CheckDestructor(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
return false;
if (!CheckRange(S, OpPC, Ptr, AK_Destroy))
return false;
+ if (!CheckLifetime(S, OpPC, Ptr.getLifetime(), AK_Destroy))
+ return false;
// Can't call a dtor on a global variable.
if (Ptr.block()->isStatic()) {
diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp
index 4799ebe25dde1..804b4c6ea466a 100644
--- a/clang/test/AST/ByteCode/records.cpp
+++ b/clang/test/AST/ByteCode/records.cpp
@@ -602,6 +602,15 @@ namespace Destructors {
}
static_assert(testS() == 1); // both-error {{not an integral constant expression}} \
// both-note {{in call to 'testS()'}}
+
+ struct A { int n; };
+ constexpr void double_destroy() {
+ A a;
+ a.~A();
+ a.~A(); // both-note {{destruction of object outside its lifetime}}
+ }
+ static_assert((double_destroy(), true)); // both-error {{not an integral constant expression}} \
+ // both-note {{in call to}}
}
namespace BaseToDerived {
>From e2bf6e923ba00b27c856614d9e4385b8988155c6 Mon Sep 17 00:00:00 2001
From: Brandon Wu <brandon.wu at sifive.com>
Date: Fri, 6 Feb 2026 14:12:18 +0800
Subject: [PATCH 20/33] [RISCV][llvm] Support INSERT_VECTOR_ELT codegen for P
extension (#179471)
Add custom lowering for INSERT_VECTOR_ELT on P extension vector types
using the MVM instruction.
TODO: Handle <4 x i8> on RV64 which is constructed to extract_vector_elt
+ build_vector instead of insert_vector_elt.
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 49 ++++++++++++-
llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 10 +++
llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll | 78 +++++++++++++++++++--
llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll | 76 ++++++++++++++++++--
4 files changed, 202 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 1303fa8383409..34616a4c09d0c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -575,7 +575,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, VTs, Legal);
setOperationAction({ISD::SHL, ISD::SRL, ISD::SRA}, VTs, Custom);
setOperationAction(ISD::BITCAST, VTs, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VTs, Custom);
+ setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, VTs,
+ Custom);
setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VTs,
Legal);
setOperationAction(ISD::SELECT, VTs, Custom);
@@ -10697,6 +10698,52 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
return DAG.getBitcast(VecVT, IntInsert);
}
+ if (Subtarget.enablePExtSIMDCodeGen() && VecVT.isFixedLengthVector()) {
+ auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
+ if (!IdxC)
+ return SDValue();
+
+ unsigned IdxVal = IdxC->getZExtValue();
+ unsigned NumElts = VecVT.getVectorNumElements();
+ MVT EltVT = VecVT.getVectorElementType();
+ Vec = DAG.getBitcast(XLenVT, Vec);
+ SDValue ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, XLenVT, Val);
+
+ // For 2-element vectors, BUILD_VECTOR is more efficient since it only needs
+ // at most 2 instructions.
+ if (NumElts == 2) {
+ unsigned EltBits = EltVT.getSizeInBits();
+ SDValue Elt0, Elt1;
+ if (IdxVal == 0) {
+ Elt0 = ExtVal;
+ Elt1 = DAG.getNode(ISD::SRL, DL, XLenVT, Vec,
+ DAG.getConstant(EltBits, DL, XLenVT));
+ } else {
+ Elt0 = Vec;
+ Elt1 = ExtVal;
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, Elt0, Elt1);
+ }
+
+ // For 4/8-element vectors, use MVM(or MERGE) instruction which does bitwise
+ // select: rd = (~mask & rd) | (mask & rs1).
+ // This generates: slli + lui/li + mvm
+ if (NumElts == 4 || NumElts == 8) {
+ unsigned EltBits = EltVT.getSizeInBits();
+ unsigned ShiftAmt = IdxVal * EltBits;
+ uint64_t PosMask = ((1ULL << EltBits) - 1) << ShiftAmt;
+
+ SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, XLenVT, ExtVal,
+ DAG.getConstant(ShiftAmt, DL, XLenVT));
+ SDValue Mask = DAG.getConstant(PosMask, DL, XLenVT);
+ SDValue Result =
+ DAG.getNode(RISCVISD::MVM, DL, XLenVT, Vec, ShiftedVal, Mask);
+ return DAG.getBitcast(VecVT, Result);
+ }
+
+ return SDValue();
+ }
+
MVT ContainerVT = VecVT;
// If the operand is a fixed-length vector, convert to a scalable one.
if (VecVT.isFixedLengthVector()) {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index d399010a6a777..386d9c7377eca 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -1486,6 +1486,12 @@ def riscv_mulhr : RVSDNode<"MULHR", SDTIntBinOp>;
def riscv_mulhru : RVSDNode<"MULHRU", SDTIntBinOp>;
def riscv_mulhrsu : RVSDNode<"MULHRSU", SDTIntBinOp>;
+def SDT_RISCVMVM : SDTypeProfile<1, 3, [SDTCisInt<0>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>]>;
+def riscv_mvm : RVSDNode<"MVM", SDT_RISCVMVM>;
+
let Predicates = [HasStdExtP] in {
def : PatGpr<abs, ABS>;
def : PatGpr<ctls, CLS>;
@@ -1495,6 +1501,10 @@ let Predicates = [HasStdExtP] in {
def : Pat<(XLenVT (fshr GPR:$rs1, GPR:$rd, shiftMaskXLen:$rs2)),
(SRX GPR:$rd, GPR:$rs1, shiftMaskXLen:$rs2)>;
+ // Pattern for insert_vector_elt
+ def : Pat<(XLenVT (riscv_mvm GPR:$rd, GPR:$rs1, GPR:$rs2)),
+ (MVM GPR:$rd, GPR:$rs1, GPR:$rs2)>;
+
// Basic 8-bit arithmetic patterns
def: Pat<(XLenVecI8VT (add GPR:$rs1, GPR:$rs2)), (PADD_B GPR:$rs1, GPR:$rs2)>;
def: Pat<(XLenVecI8VT (sub GPR:$rs1, GPR:$rs2)), (PSUB_B GPR:$rs1, GPR:$rs2)>;
diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
index 491c2e1ee8a0a..3193a0e48f428 100644
--- a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
@@ -488,6 +488,76 @@ define i8 @test_extract_vector_8_elem1(<4 x i8> %a) {
ret i8 %extracted
}
+define <2 x i16> @test_insert_vector_16(<2 x i16> %a, i16 %val) {
+; CHECK-RV32-LABEL: test_insert_vector_16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: srli a0, a0, 16
+; CHECK-RV32-NEXT: pack a0, a1, a0
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_insert_vector_16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: srli a0, a0, 16
+; CHECK-RV64-NEXT: ppaire.h a0, a1, a0
+; CHECK-RV64-NEXT: ret
+ %res = insertelement <2 x i16> %a, i16 %val, i32 0
+ ret <2 x i16> %res
+}
+
+define <2 x i16> @test_insert_vector_16_elem1(<2 x i16> %a, i16 %val) {
+; CHECK-RV32-LABEL: test_insert_vector_16_elem1:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: pack a0, a0, a1
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_insert_vector_16_elem1:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ppaire.h a0, a0, a1
+; CHECK-RV64-NEXT: ret
+ %res = insertelement <2 x i16> %a, i16 %val, i32 1
+ ret <2 x i16> %res
+}
+
+define <4 x i8> @test_insert_vector_8(<4 x i8> %a, i8 %val) {
+; CHECK-RV32-LABEL: test_insert_vector_8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: li a2, 255
+; CHECK-RV32-NEXT: mvm a0, a1, a2
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_insert_vector_8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: srli a2, a0, 8
+; CHECK-RV64-NEXT: srli a3, a0, 24
+; CHECK-RV64-NEXT: srli a0, a0, 16
+; CHECK-RV64-NEXT: ppaire.b a0, a0, a3
+; CHECK-RV64-NEXT: ppaire.b a1, a1, a2
+; CHECK-RV64-NEXT: ppaire.h a0, a1, a0
+; CHECK-RV64-NEXT: ret
+ %res = insertelement <4 x i8> %a, i8 %val, i32 0
+ ret <4 x i8> %res
+}
+
+define <4 x i8> @test_insert_vector_8_elem2(<4 x i8> %a, i8 %val) {
+; CHECK-RV32-LABEL: test_insert_vector_8_elem2:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: slli a1, a1, 16
+; CHECK-RV32-NEXT: lui a2, 4080
+; CHECK-RV32-NEXT: mvm a0, a1, a2
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: test_insert_vector_8_elem2:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: srli a2, a0, 8
+; CHECK-RV64-NEXT: srli a3, a0, 24
+; CHECK-RV64-NEXT: ppaire.b a1, a1, a3
+; CHECK-RV64-NEXT: ppaire.b a0, a0, a2
+; CHECK-RV64-NEXT: ppaire.h a0, a0, a1
+; CHECK-RV64-NEXT: ret
+ %res = insertelement <4 x i8> %a, i8 %val, i32 2
+ ret <4 x i8> %res
+}
+
; Test for splat
define <4 x i8> @test_non_const_splat_i8(i8 %elt) {
; CHECK-LABEL: test_non_const_splat_i8:
@@ -1629,10 +1699,10 @@ define <2 x i16> @test_select_v2i16(i1 %cond, <2 x i16> %a, <2 x i16> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: andi a3, a0, 1
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: bnez a3, .LBB115_2
+; CHECK-NEXT: bnez a3, .LBB119_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: .LBB115_2:
+; CHECK-NEXT: .LBB119_2:
; CHECK-NEXT: ret
%res = select i1 %cond, <2 x i16> %a, <2 x i16> %b
ret <2 x i16> %res
@@ -1643,10 +1713,10 @@ define <4 x i8> @test_select_v4i8(i1 %cond, <4 x i8> %a, <4 x i8> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: andi a3, a0, 1
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: bnez a3, .LBB116_2
+; CHECK-NEXT: bnez a3, .LBB120_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: .LBB116_2:
+; CHECK-NEXT: .LBB120_2:
; CHECK-NEXT: ret
%res = select i1 %cond, <4 x i8> %a, <4 x i8> %b
ret <4 x i8> %res
diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
index 5d249d580b64f..0da0bb339b0a2 100644
--- a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
@@ -545,6 +545,70 @@ define i32 @test_extract_vector_32_elem1(<2 x i32> %a) {
ret i32 %extracted
}
+define <4 x i16> @test_insert_vector_16(<4 x i16> %a, i16 %val) {
+; CHECK-LABEL: test_insert_vector_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a2, 16
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: mvm a0, a1, a2
+; CHECK-NEXT: ret
+ %res = insertelement <4 x i16> %a, i16 %val, i32 0
+ ret <4 x i16> %res
+}
+
+define <4 x i16> @test_insert_vector_16_elem2(<4 x i16> %a, i16 %val) {
+; CHECK-LABEL: test_insert_vector_16_elem2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a1, a1, 32
+; CHECK-NEXT: lui a2, 65535
+; CHECK-NEXT: slli a2, a2, 20
+; CHECK-NEXT: mvm a0, a1, a2
+; CHECK-NEXT: ret
+ %res = insertelement <4 x i16> %a, i16 %val, i32 2
+ ret <4 x i16> %res
+}
+
+define <8 x i8> @test_insert_vector_8(<8 x i8> %a, i8 %val) {
+; CHECK-LABEL: test_insert_vector_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 255
+; CHECK-NEXT: mvm a0, a1, a2
+; CHECK-NEXT: ret
+ %res = insertelement <8 x i8> %a, i8 %val, i32 0
+ ret <8 x i8> %res
+}
+
+define <8 x i8> @test_insert_vector_8_elem3(<8 x i8> %a, i8 %val) {
+; CHECK-LABEL: test_insert_vector_8_elem3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a1, a1, 24
+; CHECK-NEXT: li a2, 255
+; CHECK-NEXT: slli a2, a2, 24
+; CHECK-NEXT: mvm a0, a1, a2
+; CHECK-NEXT: ret
+ %res = insertelement <8 x i8> %a, i8 %val, i32 3
+ ret <8 x i8> %res
+}
+
+define <2 x i32> @test_insert_vector_32(<2 x i32> %a, i32 %val) {
+; CHECK-LABEL: test_insert_vector_32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: srli a0, a0, 32
+; CHECK-NEXT: pack a0, a1, a0
+; CHECK-NEXT: ret
+ %res = insertelement <2 x i32> %a, i32 %val, i32 0
+ ret <2 x i32> %res
+}
+
+define <2 x i32> @test_insert_vector_32_elem1(<2 x i32> %a, i32 %val) {
+; CHECK-LABEL: test_insert_vector_32_elem1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pack a0, a0, a1
+; CHECK-NEXT: ret
+ %res = insertelement <2 x i32> %a, i32 %val, i32 1
+ ret <2 x i32> %res
+}
+
; Test basic add/sub operations for v2i32 (RV64 only)
define <2 x i32> @test_padd_w(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_padd_w:
@@ -1941,10 +2005,10 @@ define <4 x i16> @test_select_v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: andi a3, a0, 1
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: bnez a3, .LBB155_2
+; CHECK-NEXT: bnez a3, .LBB161_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: .LBB155_2:
+; CHECK-NEXT: .LBB161_2:
; CHECK-NEXT: ret
%res = select i1 %cond, <4 x i16> %a, <4 x i16> %b
ret <4 x i16> %res
@@ -1955,10 +2019,10 @@ define <8 x i8> @test_select_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: andi a3, a0, 1
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: bnez a3, .LBB156_2
+; CHECK-NEXT: bnez a3, .LBB162_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: .LBB156_2:
+; CHECK-NEXT: .LBB162_2:
; CHECK-NEXT: ret
%res = select i1 %cond, <8 x i8> %a, <8 x i8> %b
ret <8 x i8> %res
@@ -1969,10 +2033,10 @@ define <2 x i32> @test_select_v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: andi a3, a0, 1
; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: bnez a3, .LBB157_2
+; CHECK-NEXT: bnez a3, .LBB163_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: .LBB157_2:
+; CHECK-NEXT: .LBB163_2:
; CHECK-NEXT: ret
%res = select i1 %cond, <2 x i32> %a, <2 x i32> %b
ret <2 x i32> %res
>From e576fc1a90c0984cc44fcf38d7b176216af1128a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 6 Feb 2026 07:20:14 +0000
Subject: [PATCH 21/33] [X86] combineSetCC - attempt to match more complex
icmp_eq/ne patterns before falling back to PTEST/PMOVMSKB patterns (#180034)
combineVectorSizedSetCCEquality attempts to convert equality comparisons
of larger-than-legal scalar integers to PTEST/PMOVMSKB vector
comparisons.
However, combineSetCC has a number of other folds with more complex
icmp_eq/ne patterns that work with big integers (including bit test and
reduction patterns) that don't get a chance to match as
combineVectorSizedSetCCEquality is run first, and the other folds are
then more difficult to match from PTEST/PMOVMSKB nodes.
This patch moves the combineVectorSizedSetCCEquality fold later to give
other icmp_eq/ne folds a chance to run first.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +-
llvm/test/CodeGen/X86/bitcast-vector-bool.ll | 162 ++++---------------
2 files changed, 34 insertions(+), 138 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 083a02b782779..3ec2bf9b19360 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57441,12 +57441,6 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
EVT OpVT = LHS.getValueType();
SDLoc DL(N);
- if (CC == ISD::SETNE || CC == ISD::SETEQ) {
- if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
- Subtarget))
- return V;
- }
-
if (VT == MVT::i1) {
X86::CondCode X86CC;
if (SDValue V =
@@ -57562,6 +57556,10 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
}
}
}
+
+ if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
+ Subtarget))
+ return V;
}
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index fae1ff90dd8d5..7f899e372caed 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -1193,162 +1193,60 @@ define i64 @bitcast_v128i8_to_v2i64(<128 x i8> %a0) nounwind {
define i1 @trunc_v128i8_cmp(<128 x i8> %a0) nounwind {
; SSE2-SSSE3-LABEL: trunc_v128i8_cmp:
; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: pand %xmm7, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm6, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: psllw $7, %xmm0
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSE2-SSSE3-NEXT: psllw $7, %xmm1
-; SSE2-SSSE3-NEXT: pmovmskb %xmm1, %ecx
-; SSE2-SSSE3-NEXT: shll $16, %ecx
-; SSE2-SSSE3-NEXT: orl %eax, %ecx
-; SSE2-SSSE3-NEXT: psllw $7, %xmm2
-; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %edx
-; SSE2-SSSE3-NEXT: psllw $7, %xmm3
-; SSE2-SSSE3-NEXT: pmovmskb %xmm3, %eax
-; SSE2-SSSE3-NEXT: shll $16, %eax
-; SSE2-SSSE3-NEXT: orl %edx, %eax
-; SSE2-SSSE3-NEXT: shlq $32, %rax
-; SSE2-SSSE3-NEXT: orq %rcx, %rax
-; SSE2-SSSE3-NEXT: psllw $7, %xmm4
-; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %ecx
-; SSE2-SSSE3-NEXT: psllw $7, %xmm5
-; SSE2-SSSE3-NEXT: pmovmskb %xmm5, %edx
-; SSE2-SSSE3-NEXT: shll $16, %edx
-; SSE2-SSSE3-NEXT: orl %ecx, %edx
-; SSE2-SSSE3-NEXT: psllw $7, %xmm6
-; SSE2-SSSE3-NEXT: pmovmskb %xmm6, %ecx
-; SSE2-SSSE3-NEXT: psllw $7, %xmm7
-; SSE2-SSSE3-NEXT: pmovmskb %xmm7, %esi
-; SSE2-SSSE3-NEXT: shll $16, %esi
-; SSE2-SSSE3-NEXT: orl %ecx, %esi
-; SSE2-SSSE3-NEXT: shlq $32, %rsi
-; SSE2-SSSE3-NEXT: orq %rdx, %rsi
-; SSE2-SSSE3-NEXT: movq %rsi, %xmm0
-; SSE2-SSSE3-NEXT: movq %rax, %xmm1
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-SSSE3-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSE2-SSSE3-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; SSE2-SSSE3-NEXT: xorl $65535, %eax # imm = 0xFFFF
; SSE2-SSSE3-NEXT: setne %al
; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_v128i8_cmp:
; SSE41: # %bb.0:
-; SSE41-NEXT: psllw $7, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: psllw $7, %xmm1
-; SSE41-NEXT: pmovmskb %xmm1, %ecx
-; SSE41-NEXT: shll $16, %ecx
-; SSE41-NEXT: orl %eax, %ecx
-; SSE41-NEXT: psllw $7, %xmm2
-; SSE41-NEXT: pmovmskb %xmm2, %edx
-; SSE41-NEXT: psllw $7, %xmm3
-; SSE41-NEXT: pmovmskb %xmm3, %eax
-; SSE41-NEXT: shll $16, %eax
-; SSE41-NEXT: orl %edx, %eax
-; SSE41-NEXT: shlq $32, %rax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: psllw $7, %xmm4
-; SSE41-NEXT: pmovmskb %xmm4, %ecx
-; SSE41-NEXT: psllw $7, %xmm5
-; SSE41-NEXT: pmovmskb %xmm5, %edx
-; SSE41-NEXT: shll $16, %edx
-; SSE41-NEXT: orl %ecx, %edx
-; SSE41-NEXT: psllw $7, %xmm6
-; SSE41-NEXT: pmovmskb %xmm6, %ecx
-; SSE41-NEXT: psllw $7, %xmm7
-; SSE41-NEXT: pmovmskb %xmm7, %esi
-; SSE41-NEXT: shll $16, %esi
-; SSE41-NEXT: orl %ecx, %esi
-; SSE41-NEXT: shlq $32, %rsi
-; SSE41-NEXT: orq %rdx, %rsi
-; SSE41-NEXT: movq %rsi, %xmm0
-; SSE41-NEXT: movq %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT: ptest %xmm0, %xmm1
+; SSE41-NEXT: pand %xmm7, %xmm3
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: pand %xmm3, %xmm1
+; SSE41-NEXT: pand %xmm6, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: setae %al
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_v128i8_cmp:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $7, %xmm0, %xmm4
-; AVX1-NEXT: vpmovmskb %xmm4, %eax
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %ecx
-; AVX1-NEXT: shll $16, %ecx
-; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpsllw $7, %xmm1, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %edx
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: orl %edx, %eax
-; AVX1-NEXT: shlq $32, %rax
-; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vpsllw $7, %xmm2, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %ecx
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
-; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %edx
-; AVX1-NEXT: shll $16, %edx
-; AVX1-NEXT: orl %ecx, %edx
-; AVX1-NEXT: vpsllw $7, %xmm3, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %ecx
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
-; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %esi
-; AVX1-NEXT: shll $16, %esi
-; AVX1-NEXT: orl %ecx, %esi
-; AVX1-NEXT: shlq $32, %rsi
-; AVX1-NEXT: orq %rdx, %rsi
-; AVX1-NEXT: vmovq %rsi, %xmm0
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vptest %xmm1, %xmm0
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
; AVX1-NEXT: setae %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v128i8_cmp:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $7, %ymm1, %ymm1
-; AVX2-NEXT: vpmovmskb %ymm1, %eax
-; AVX2-NEXT: shlq $32, %rax
-; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX2-NEXT: vpmovmskb %ymm0, %ecx
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: vpsllw $7, %ymm3, %ymm0
-; AVX2-NEXT: vpmovmskb %ymm0, %eax
-; AVX2-NEXT: shlq $32, %rax
-; AVX2-NEXT: vpsllw $7, %ymm2, %ymm0
-; AVX2-NEXT: vpmovmskb %ymm0, %edx
-; AVX2-NEXT: orq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vptest %xmm1, %xmm0
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673]
+; AVX2-NEXT: vptest %ymm1, %ymm0
; AVX2-NEXT: setae %al
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_v128i8_cmp:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $7, %zmm0, %zmm0
-; AVX512-NEXT: vpmovb2m %zmm0, %k0
-; AVX512-NEXT: kmovq %k0, %rax
-; AVX512-NEXT: vpsllw $7, %zmm1, %zmm0
-; AVX512-NEXT: vpmovb2m %zmm0, %k0
-; AVX512-NEXT: kmovq %k0, %rcx
-; AVX512-NEXT: vmovq %rcx, %xmm0
-; AVX512-NEXT: vmovq %rax, %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vptest %xmm1, %xmm0
-; AVX512-NEXT: setae %al
+; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 & zmm2 & zmm1
+; AVX512-NEXT: vpcmpneqd %zmm2, %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
+; AVX512-NEXT: setne %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = trunc <128 x i8> %a0 to <128 x i1>
>From 4b5504fbb1194e87ea94fb26cd6b1b724eac548e Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Thu, 5 Feb 2026 23:22:46 -0800
Subject: [PATCH 22/33] [msan][NFCI] Refactor icmp eq/ne into
propagateEqualityComparison() (#180115)
This will be useful for handling switch
(https://github.com/llvm/llvm-project/pull/179851).
---
.../Instrumentation/MemorySanitizer.cpp | 61 ++++++++++++-------
1 file changed, 39 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 98c4866cd7825..c304efa9cb21d 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2422,6 +2422,44 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
I.setSuccessOrdering(addReleaseOrdering(I.getSuccessOrdering()));
}
+ /// Generic handler to compute shadow for == and != comparisons.
+ ///
+ /// This function is used by handleEqualityComparison and visitSwitchInst.
+ ///
+ /// Sometimes the comparison result is known even if some of the bits of the
+ /// arguments are not.
+ Value *propagateEqualityComparison(IRBuilder<> &IRB, Value *A, Value *B,
+ Value *Sa, Value *Sb) {
+ assert(getShadowTy(A) == Sa->getType());
+ assert(getShadowTy(B) == Sb->getType());
+
+ // Get rid of pointers and vectors of pointers.
+ // For ints (and vectors of ints), types of A and Sa match,
+ // and this is a no-op.
+ A = IRB.CreatePointerCast(A, Sa->getType());
+ B = IRB.CreatePointerCast(B, Sb->getType());
+
+ // A == B <==> (C = A^B) == 0
+ // A != B <==> (C = A^B) != 0
+ // Sc = Sa | Sb
+ Value *C = IRB.CreateXor(A, B);
+ Value *Sc = IRB.CreateOr(Sa, Sb);
+ // Now dealing with i = (C == 0) comparison (or C != 0, does not matter now)
+ // Result is defined if one of the following is true
+ // * there is a defined 1 bit in C
+ // * C is fully defined
+ // Si = !(C & ~Sc) && Sc
+ Value *Zero = Constant::getNullValue(Sc->getType());
+ Value *MinusOne = Constant::getAllOnesValue(Sc->getType());
+ Value *LHS = IRB.CreateICmpNE(Sc, Zero);
+ Value *RHS =
+ IRB.CreateICmpEQ(IRB.CreateAnd(IRB.CreateXor(Sc, MinusOne), C), Zero);
+ Value *Si = IRB.CreateAnd(LHS, RHS);
+ Si->setName("_msprop_icmp");
+
+ return Si;
+ }
+
// Vector manipulation.
void visitExtractElementInst(ExtractElementInst &I) {
insertCheckShadowOf(I.getOperand(1), &I);
@@ -2992,29 +3030,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *Sa = getShadow(A);
Value *Sb = getShadow(B);
- // Get rid of pointers and vectors of pointers.
- // For ints (and vectors of ints), types of A and Sa match,
- // and this is a no-op.
- A = IRB.CreatePointerCast(A, Sa->getType());
- B = IRB.CreatePointerCast(B, Sb->getType());
+ Value *Si = propagateEqualityComparison(IRB, A, B, Sa, Sb);
- // A == B <==> (C = A^B) == 0
- // A != B <==> (C = A^B) != 0
- // Sc = Sa | Sb
- Value *C = IRB.CreateXor(A, B);
- Value *Sc = IRB.CreateOr(Sa, Sb);
- // Now dealing with i = (C == 0) comparison (or C != 0, does not matter now)
- // Result is defined if one of the following is true
- // * there is a defined 1 bit in C
- // * C is fully defined
- // Si = !(C & ~Sc) && Sc
- Value *Zero = Constant::getNullValue(Sc->getType());
- Value *MinusOne = Constant::getAllOnesValue(Sc->getType());
- Value *LHS = IRB.CreateICmpNE(Sc, Zero);
- Value *RHS =
- IRB.CreateICmpEQ(IRB.CreateAnd(IRB.CreateXor(Sc, MinusOne), C), Zero);
- Value *Si = IRB.CreateAnd(LHS, RHS);
- Si->setName("_msprop_icmp");
setShadow(&I, Si);
setOriginForNaryOp(I);
}
>From 54c1d34238fc96f74e88caf11db81fd18a883c7c Mon Sep 17 00:00:00 2001
From: Kyungtak Woo <kevinwkt at google.com>
Date: Fri, 6 Feb 2026 01:37:58 -0600
Subject: [PATCH 23/33] [NewPM] Port x86-indirect-branch-tracking (#179874)
Similar to other portings created by @boomanaiden154. No specific test
coverage as there are no MIR->MIR tests that exercise this pass.
---
llvm/lib/Target/X86/X86.h | 10 ++-
llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp | 3 +-
.../Target/X86/X86IndirectBranchTracking.cpp | 71 ++++++++++---------
llvm/lib/Target/X86/X86PassRegistry.def | 2 +-
llvm/lib/Target/X86/X86TargetMachine.cpp | 2 +-
llvm/test/CodeGen/X86/llc-pipeline-npm.ll | 4 ++
6 files changed, 55 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 355f0959b4c7d..1cb00fbe90d95 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -66,7 +66,14 @@ FunctionPass *createX86IssueVZeroUpperPass();
/// This pass inserts ENDBR instructions before indirect jump/call
/// destinations as part of CET IBT mechanism.
-FunctionPass *createX86IndirectBranchTrackingPass();
+class X86IndirectBranchTrackingPass
+ : public PassInfoMixin<X86IndirectBranchTrackingPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
+FunctionPass *createX86IndirectBranchTrackingLegacyPass();
/// Return a pass that pads short functions with NOOPs.
/// This will prevent a stall when returning on the Atom.
@@ -437,6 +444,7 @@ void initializeX86FastPreTileConfigLegacyPass(PassRegistry &);
void initializeX86FastTileConfigLegacyPass(PassRegistry &);
void initializeX86FixupSetCCLegacyPass(PassRegistry &);
void initializeX86FlagsCopyLoweringLegacyPass(PassRegistry &);
+void initializeX86IndirectBranchTrackingLegacyPass(PassRegistry &);
void initializeX86LoadValueInjectionLoadHardeningLegacyPass(PassRegistry &);
void initializeX86LoadValueInjectionRetHardeningLegacyPass(PassRegistry &);
void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
index d7c9932dd3f0d..9405761fb6714 100644
--- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
+++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp
@@ -173,8 +173,7 @@ void X86CodeGenPassBuilder::addPreEmitPass(PassManagerWrapper &PMW) const {
addMachineFunctionPass(BreakFalseDepsPass(), PMW);
}
- // TODO(boomanaiden154): Add X86IndirectBranchTrackingPass here once it has
- // been ported.
+ addMachineFunctionPass(X86IndirectBranchTrackingPass(), PMW);
// TODO(boomanaiden154): Add X86IssueVZeroUpperPass here once it has been
// ported.
diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
index 52be14228e555..6ccad26a890dd 100644
--- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -36,42 +36,29 @@ cl::opt<bool> IndirectBranchTracking(
STATISTIC(NumEndBranchAdded, "Number of ENDBR instructions added");
namespace {
-class X86IndirectBranchTrackingPass : public MachineFunctionPass {
+class X86IndirectBranchTrackingLegacy : public MachineFunctionPass {
public:
- X86IndirectBranchTrackingPass() : MachineFunctionPass(ID) {}
+ static char ID;
+
+ X86IndirectBranchTrackingLegacy() : MachineFunctionPass(ID) {}
StringRef getPassName() const override {
return "X86 Indirect Branch Tracking";
}
bool runOnMachineFunction(MachineFunction &MF) override;
-
-private:
- static char ID;
-
- /// Machine instruction info used throughout the class.
- const X86InstrInfo *TII = nullptr;
-
- /// Endbr opcode for the current machine function.
- unsigned int EndbrOpcode = 0;
-
- /// Adds a new ENDBR instruction to the beginning of the MBB.
- /// The function will not add it if already exists.
- /// It will add ENDBR32 or ENDBR64 opcode, depending on the target.
- /// \returns true if the ENDBR was added and false otherwise.
- bool addENDBR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const;
};
-} // end anonymous namespace
-
-char X86IndirectBranchTrackingPass::ID = 0;
-
-FunctionPass *llvm::createX86IndirectBranchTrackingPass() {
- return new X86IndirectBranchTrackingPass();
-}
+/// Adds a new ENDBR instruction to the beginning of the MBB.
+/// The function will not add it if already exists.
+/// It will add ENDBR32 or ENDBR64 opcode, depending on the target.
+/// \returns true if the ENDBR was added and false otherwise.
+static bool addENDBR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
+ MachineFunction &MF = *MBB.getParent();
+ const X86Subtarget &SubTarget = MF.getSubtarget<X86Subtarget>();
+ const X86InstrInfo *TII = SubTarget.getInstrInfo();
+ unsigned EndbrOpcode = SubTarget.is64Bit() ? X86::ENDBR64 : X86::ENDBR32;
-bool X86IndirectBranchTrackingPass::addENDBR(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
assert(TII && "Target instruction info was not initialized");
assert((X86::ENDBR64 == EndbrOpcode || X86::ENDBR32 == EndbrOpcode) &&
"Unexpected Endbr opcode");
@@ -86,6 +73,17 @@ bool X86IndirectBranchTrackingPass::addENDBR(
return false;
}
+} // end anonymous namespace
+
+char X86IndirectBranchTrackingLegacy::ID = 0;
+
+INITIALIZE_PASS(X86IndirectBranchTrackingLegacy, DEBUG_TYPE,
+ "X86 Indirect Branch Tracking", false, false)
+
+FunctionPass *llvm::createX86IndirectBranchTrackingLegacyPass() {
+ return new X86IndirectBranchTrackingLegacy();
+}
+
static bool IsCallReturnTwice(llvm::MachineOperand &MOp) {
if (!MOp.isGlobal())
return false;
@@ -113,9 +111,7 @@ static bool needsPrologueENDBR(MachineFunction &MF, const Module *M) {
}
}
-bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
- const X86Subtarget &SubTarget = MF.getSubtarget<X86Subtarget>();
-
+static bool runIndirectBranchTracking(MachineFunction &MF) {
const Module *M = MF.getFunction().getParent();
// Check that the cf-protection-branch is enabled.
Metadata *isCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
@@ -135,9 +131,6 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
// True if the current MF was changed and false otherwise.
bool Changed = false;
- TII = SubTarget.getInstrInfo();
- EndbrOpcode = SubTarget.is64Bit() ? X86::ENDBR64 : X86::ENDBR32;
-
// If function is reachable indirectly, mark the first BB with ENDBR.
if (needsPrologueENDBR(MF, M)) {
auto MBB = MF.begin();
@@ -189,3 +182,17 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
}
return Changed;
}
+
+bool X86IndirectBranchTrackingLegacy::runOnMachineFunction(
+ MachineFunction &MF) {
+ return runIndirectBranchTracking(MF);
+}
+
+PreservedAnalyses
+X86IndirectBranchTrackingPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ return runIndirectBranchTracking(MF)
+ ? getMachineFunctionPassPreservedAnalyses()
+ .preserveSet<CFGAnalyses>()
+ : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def
index 2f9b880249465..91156260d2c7c 100644
--- a/llvm/lib/Target/X86/X86PassRegistry.def
+++ b/llvm/lib/Target/X86/X86PassRegistry.def
@@ -48,6 +48,7 @@ MACHINE_FUNCTION_PASS("x86-fixup-setcc", X86FixupSetCCPass())
MACHINE_FUNCTION_PASS("x86-fixup-vector-constants", X86FixupVectorConstantsPass())
MACHINE_FUNCTION_PASS("x86-flags-copy-lowering", X86FlagsCopyLoweringPass())
MACHINE_FUNCTION_PASS("x86-fp-stackifier", X86FPStackifierPass())
+MACHINE_FUNCTION_PASS("x86-indirect-branch-tracking", X86IndirectBranchTrackingPass())
MACHINE_FUNCTION_PASS("x86-isel", X86ISelDAGToDAGPass(*this))
MACHINE_FUNCTION_PASS("x86-lower-tile-copy", X86LowerTileCopyPass())
MACHINE_FUNCTION_PASS("x86-lvi-load", X86LoadValueInjectionLoadHardeningPass())
@@ -67,7 +68,6 @@ MACHINE_FUNCTION_PASS("x86-wineh-unwindv2", X86WinEHUnwindV2Pass())
#endif
DUMMY_MACHINE_FUNCTION_PASS("x86-execution-domain-fix", X86ExecutionDomainFix())
DUMMY_MACHINE_FUNCTION_PASS("x86-global-base-reg", X86GlobalBaseRegPass())
-DUMMY_MACHINE_FUNCTION_PASS("x86-indirect-branch-tracking", X86IndirectBranchTrackingPass())
DUMMY_MACHINE_FUNCTION_PASS("x86-indirect-thunks", X86IndirectThunks())
DUMMY_MACHINE_FUNCTION_PASS("x86-insert-x87-wait", X86InsertX87WaitPass())
DUMMY_MACHINE_FUNCTION_PASS("x86-issue-vzero-upper", X86IssueVZeroUpperPass())
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 48fc38a1f11f5..95ad484d04ccd 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -566,7 +566,7 @@ void X86PassConfig::addPreEmitPass() {
addPass(createBreakFalseDeps());
}
- addPass(createX86IndirectBranchTrackingPass());
+ addPass(createX86IndirectBranchTrackingLegacyPass());
addPass(createX86IssueVZeroUpperPass());
diff --git a/llvm/test/CodeGen/X86/llc-pipeline-npm.ll b/llvm/test/CodeGen/X86/llc-pipeline-npm.ll
index a8d29b1affbbd..3b1992388ff63 100644
--- a/llvm/test/CodeGen/X86/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/X86/llc-pipeline-npm.ll
@@ -55,6 +55,7 @@
; O0-NEXT: fentry-insert
; O0-NEXT: xray-instrumentation
; O0-NEXT: patchable-function
+; O0-NEXT: x86-indirect-branch-tracking
; O0-NEXT: x86-compress-evex
; O0-NEXT: FuncletLayoutPass
; O0-NEXT: remove-loads-into-fake-uses
@@ -164,6 +165,7 @@
; O2-NEXT: xray-instrumentation
; O2-NEXT: patchable-function
; O2-NEXT: BreakFalseDepsPass
+; O2-NEXT: x86-indirect-branch-tracking
; O2-NEXT: x86-fixup-bw-insts
; O2-NEXT: x86-fixup-leas
; O2-NEXT: x86-fixup-inst-tuning
@@ -228,6 +230,7 @@
; O0-WINDOWS-NEXT: fentry-insert
; O0-WINDOWS-NEXT: xray-instrumentation
; O0-WINDOWS-NEXT: patchable-function
+; O0-WINDOWS-NEXT: x86-indirect-branch-tracking
; O0-WINDOWS-NEXT: x86-compress-evex
; O0-WINDOWS-NEXT: FuncletLayoutPass
; O0-WINDOWS-NEXT: remove-loads-into-fake-uses
@@ -340,6 +343,7 @@
; O3-WINDOWS-NEXT: xray-instrumentation
; O3-WINDOWS-NEXT: patchable-function
; O3-WINDOWS-NEXT: BreakFalseDepsPass
+; O3-WINDOWS-NEXT: x86-indirect-branch-tracking
; O3-WINDOWS-NEXT: x86-fixup-bw-insts
; O3-WINDOWS-NEXT: x86-fixup-leas
; O3-WINDOWS-NEXT: x86-fixup-inst-tuning
>From 525fa28cc3faf2e8da83d84199c9a065762f821d Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Thu, 5 Feb 2026 23:41:45 -0800
Subject: [PATCH 24/33] [ELF][test] Consolidate .eh_frame FDE encoding tests
Merge eh-frame-value-format{1..9}.s into eh-frame-fde-encoding.s
(encoding is the DWARF term) using split-file. Add .eh_frame and
.eh_frame_hdr content verification for absptr, udata2, sdata4, udata4.
Move error test (9) to eh-frame-invalid-fde-encoding.s.
---
lld/test/ELF/eh-frame-fde-encoding.s | 203 +++++++++++++++++++
lld/test/ELF/eh-frame-invalid-fde-encoding.s | 28 +++
lld/test/ELF/eh-frame-value-format1.s | 35 ----
lld/test/ELF/eh-frame-value-format2.s | 35 ----
lld/test/ELF/eh-frame-value-format3.s | 28 ---
lld/test/ELF/eh-frame-value-format4.s | 28 ---
lld/test/ELF/eh-frame-value-format5.s | 35 ----
lld/test/ELF/eh-frame-value-format6.s | 35 ----
lld/test/ELF/eh-frame-value-format7.s | 77 -------
lld/test/ELF/eh-frame-value-format8.s | 74 -------
lld/test/ELF/eh-frame-value-format9.s | 28 ---
11 files changed, 231 insertions(+), 375 deletions(-)
create mode 100644 lld/test/ELF/eh-frame-fde-encoding.s
delete mode 100644 lld/test/ELF/eh-frame-value-format1.s
delete mode 100644 lld/test/ELF/eh-frame-value-format2.s
delete mode 100644 lld/test/ELF/eh-frame-value-format3.s
delete mode 100644 lld/test/ELF/eh-frame-value-format4.s
delete mode 100644 lld/test/ELF/eh-frame-value-format5.s
delete mode 100644 lld/test/ELF/eh-frame-value-format6.s
delete mode 100644 lld/test/ELF/eh-frame-value-format7.s
delete mode 100644 lld/test/ELF/eh-frame-value-format8.s
delete mode 100644 lld/test/ELF/eh-frame-value-format9.s
diff --git a/lld/test/ELF/eh-frame-fde-encoding.s b/lld/test/ELF/eh-frame-fde-encoding.s
new file mode 100644
index 0000000000000..0a7943c8b2077
--- /dev/null
+++ b/lld/test/ELF/eh-frame-fde-encoding.s
@@ -0,0 +1,203 @@
+# REQUIRES: x86
+## Test that various DW_EH_PE_* encodings in CIE are accepted.
+
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 absptr.s -o absptr.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 sdata2.s -o sdata2.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 sdata4.s -o sdata4.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 sdata8.s -o sdata8.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 signed.s -o signed.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 udata2.s -o udata2.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 udata4.s -o udata4.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 udata8.s -o udata8.o
+
+# RUN: ld.lld --eh-frame-hdr sdata2.o -o /dev/null
+# RUN: ld.lld --eh-frame-hdr sdata8.o -o /dev/null
+# RUN: ld.lld --eh-frame-hdr signed.o -o /dev/null
+
+# RUN: ld.lld --eh-frame-hdr --image-base=0 -Ttext=0x1000 absptr.o -o absptr
+# RUN: ld.lld --eh-frame-hdr --image-base=0 -Ttext=0x1000 udata2.o -o udata2
+# RUN: ld.lld --eh-frame-hdr --image-base=0 -Ttext=0x2000 sdata4.o -o sdata4
+# RUN: ld.lld --eh-frame-hdr --image-base=0 -Ttext=0x2000 udata4.o -o udata4
+
+## absptr/udata2: Also verify .eh_frame content to test relocation with addend.
+## .eh_frame_hdr initial_location: foo(0x1000)+0x234 - .eh_frame_hdr(0x2004) = 0xfffff230
+# RUN: llvm-readobj -x .eh_frame_hdr -x .eh_frame absptr | FileCheck %s --check-prefix=ABSPTR
+# ABSPTR: Hex dump of section '.eh_frame_hdr':
+# ABSPTR-NEXT: 0x00002004 011b033b 10000000 01000000 30f2ffff
+# ABSPTR-NEXT: 0x00002014 24000000
+# ABSPTR: Hex dump of section '.eh_frame':
+# ABSPTR-NEXT: 0x00002018 0c000000 00000000 01520001 010100ff
+# ABSPTR-NEXT: 0x00002028 0c000000 14000000 34120000 00000000
+## CIE offset--^ ^-- PC begin = 0x1234 (foo + 0x234)
+
+# RUN: llvm-readobj -x .eh_frame_hdr -x .eh_frame udata2 | FileCheck %s --check-prefix=UDATA2
+# UDATA2: Hex dump of section '.eh_frame_hdr':
+# UDATA2-NEXT: 0x00002004 011b033b 10000000 01000000 30f2ffff
+# UDATA2-NEXT: 0x00002014 26000000
+# UDATA2: Hex dump of section '.eh_frame':
+# UDATA2-NEXT: 0x00002018 0e000000 00000000 01525300 01010102
+# UDATA2-NEXT: 0x00002028 ff000600 00001600 00003412
+## CIE offset--^ ^-- PC begin = 0x1234 (foo + 0x234)
+
+# RUN: llvm-readelf -x .eh_frame_hdr sdata4 udata4 | FileCheck %s --check-prefix=HDR4
+# HDR4: 0x00003004 011b033b 10000000 01000000 fcefffff
+# HDR4-NEXT: 0x00003014 24000000
+# HDR4: 0x00003004 011b033b 10000000 01000000 fcefffff
+# HDR4-NEXT: 0x00003014 24000000
+
+#--- absptr.s
+## DW_EH_PE_absptr (0x00) with FDE for verification
+.text
+.globl foo
+foo:
+ nop
+
+.section .eh_frame,"a",@unwind
+ .long 12 # Size
+ .long 0x00 # ID (CIE)
+ .byte 0x01 # Version
+ .byte 0x52 # Augmentation string: 'R','\0'
+ .byte 0x00
+ .byte 0x01 # Code alignment
+ .byte 0x01 # Data alignment
+ .byte 0x01 # Return address register
+ .byte 0x00 # DW_EH_PE_absptr
+ .byte 0xFF
+
+ .long 12 # Size
+ .long 0x14 # CIE offset
+ .quad foo + 0x234 # PC begin
+
+#--- sdata2.s
+## DW_EH_PE_sdata2 (0x0A)
+.section .eh_frame,"a",@unwind
+ .long 0x0E # Size
+ .long 0x00 # ID (CIE)
+ .byte 0x01 # Version
+ .byte 0x50 # Augmentation string: 'P','\0'
+ .byte 0x00
+ .byte 0x01 # Code alignment
+ .byte 0x01 # Data alignment (LEB128)
+ .byte 0x01 # Return address register (LEB128)
+ .byte 0x0A # DW_EH_PE_sdata2
+ .short 0xFFFF
+ .byte 0xFF
+
+#--- sdata4.s
+## DW_EH_PE_sdata4 (0x0B) with FDE for verification
+.text
+.globl foo
+foo:
+ nop
+
+.section .eh_frame,"a",@unwind
+ .long 12 # Size
+ .long 0x00 # ID (CIE)
+ .byte 0x01 # Version
+ .byte 0x52 # Augmentation string: 'R','\0'
+ .byte 0x00
+ .byte 0x01 # Code alignment
+ .byte 0x01 # Data alignment
+ .byte 0x01 # Return address register
+ .byte 0x0B # DW_EH_PE_sdata4
+ .byte 0xFF
+
+ .long 12 # Size
+ .long 0x14 # CIE offset
+ .long foo # PC begin
+ .long 1 # PC range
+
+#--- sdata8.s
+## DW_EH_PE_sdata8 (0x0C)
+.section .eh_frame,"a",@unwind
+ .long 0x14 # Size
+ .long 0x00 # ID (CIE)
+ .byte 0x01 # Version
+ .byte 0x50 # Augmentation string: 'P','\0'
+ .byte 0x00
+ .byte 0x01 # Code alignment
+ .byte 0x01 # Data alignment (LEB128)
+ .byte 0x01 # Return address register (LEB128)
+ .byte 0x0C # DW_EH_PE_sdata8
+ .quad 0xFFFFFFFFFFFFFFFF
+ .byte 0xFF
+
+#--- signed.s
+## DW_EH_PE_signed (0x08)
+.section .eh_frame,"a",@unwind
+ .long 0x14 # Size
+ .long 0x00 # ID (CIE)
+ .byte 0x01 # Version
+ .byte 0x50 # Augmentation string: 'P','\0'
+ .byte 0x00
+ .byte 0x01 # Code alignment
+ .byte 0x01 # Data alignment (LEB128)
+ .byte 0x01 # Return address register (LEB128)
+ .byte 0x08 # DW_EH_PE_signed
+ .quad 0xFFFFFFFFFFFFFFFF
+ .byte 0xFF
+
+#--- udata2.s
+## DW_EH_PE_udata2 (0x02) with FDE for verification
+.text
+.globl foo
+foo:
+ nop
+
+.section .eh_frame,"a",@unwind
+ .long 14 # Size
+ .long 0x00 # ID (CIE)
+ .byte 0x01 # Version
+ .byte 0x52 # Augmentation string: 'R','S','\0'
+ .byte 0x53
+ .byte 0x00
+ .byte 0x01 # Code alignment
+ .byte 0x01 # Data alignment
+ .byte 0x01 # Return address register
+ .byte 0x02 # DW_EH_PE_udata2
+ .byte 0xFF
+ .byte 0x00
+
+ .long 6 # Size
+ .long 0x16 # CIE offset
+ .short foo + 0x234 # PC begin
+
+#--- udata4.s
+## DW_EH_PE_udata4 (0x03) with FDE for verification
+.text
+.globl foo
+foo:
+ nop
+
+.section .eh_frame,"a",@unwind
+ .long 12 # Size
+ .long 0x00 # ID (CIE)
+ .byte 0x01 # Version
+ .byte 0x52 # Augmentation string: 'R','\0'
+ .byte 0x00
+ .byte 0x01 # Code alignment
+ .byte 0x01 # Data alignment
+ .byte 0x01 # Return address register
+ .byte 0x03 # DW_EH_PE_udata4
+ .byte 0xFF
+
+ .long 12 # Size
+ .long 0x14 # CIE offset
+ .long foo # PC begin
+ .long 1 # PC range
+
+#--- udata8.s
+## DW_EH_PE_udata8 (0x04)
+.section .eh_frame,"a",@unwind
+ .long 0x14 # Size
+ .long 0x00 # ID (CIE)
+ .byte 0x01 # Version
+ .byte 0x50 # Augmentation string: 'P','\0'
+ .byte 0x00
+ .byte 0x01 # Code alignment
+ .byte 0x01 # Data alignment (LEB128)
+ .byte 0x01 # Return address register (LEB128)
+ .byte 0x04 # DW_EH_PE_udata8
+ .quad 0xFFFFFFFFFFFFFFFF
+ .byte 0xFF
diff --git a/lld/test/ELF/eh-frame-invalid-fde-encoding.s b/lld/test/ELF/eh-frame-invalid-fde-encoding.s
index a4802d533ae70..9a463f95b5568 100644
--- a/lld/test/ELF/eh-frame-invalid-fde-encoding.s
+++ b/lld/test/ELF/eh-frame-invalid-fde-encoding.s
@@ -6,6 +6,7 @@
# RUN: llvm-mc -filetype=obj -triple=x86_64 corrupted.s -o corrupted.o
# RUN: llvm-mc -filetype=obj -triple=x86_64 unknown-fde-encoding.s -o unknown-fde-encoding.o
# RUN: llvm-mc -filetype=obj -triple=x86_64 aligned-encoding.s -o aligned-encoding.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 unknown-size-encoding.s -o unknown-size-encoding.o
# RUN: not ld.lld --eh-frame-hdr unknown-aug.o 2>&1 | FileCheck %s --check-prefix=UNKNOWN-AUG -DPREFIX=error --implicit-check-not=error:
# RUN: ld.lld --eh-frame-hdr unknown-aug.o --noinhibit-exec 2>&1 | FileCheck %s --check-prefix=UNKNOWN-AUG -DPREFIX=warning
@@ -13,6 +14,7 @@
# RUN: not ld.lld --eh-frame-hdr unknown-fde-encoding.o 2>&1 | FileCheck %s --check-prefix=UNKNOWN-FDE --implicit-check-not=error:
# RUN: ld.lld --eh-frame-hdr unknown-fde-encoding.o --noinhibit-exec
# RUN: not ld.lld --eh-frame-hdr aligned-encoding.o 2>&1 | FileCheck %s --check-prefix=ALIGNED --implicit-check-not=error:
+# RUN: not ld.lld --eh-frame-hdr unknown-size-encoding.o 2>&1 | FileCheck %s --check-prefix=UNKNOWN-SIZE --implicit-check-not=error:
# UNKNOWN-AUG: [[PREFIX]]: corrupted .eh_frame: unknown .eh_frame augmentation string: {{.+}}
@@ -23,6 +25,8 @@
# ALIGNED: error: corrupted .eh_frame: DW_EH_PE_aligned encoding is not supported
+# UNKNOWN-SIZE: error: unknown FDE size encoding
+
#--- unknown-aug.s
.section .eh_frame,"a",@unwind
.byte 0x0E
@@ -120,3 +124,27 @@
.byte 0x01
.byte 0x01
.byte 0x01
+
+#--- unknown-size-encoding.s
+.section .eh_frame,"a",@unwind
+ .long 12 # Size
+ .long 0x00 # ID
+ .byte 0x01 # Version.
+
+ .byte 0x52 # Augmentation string: 'R','\0'
+ .byte 0x00
+
+# Code and data alignment factors.
+ .byte 0x01 # LEB128
+ .byte 0x01 # LEB128
+
+# Return address register.
+ .byte 0x01 # LEB128
+
+ .byte 0xFE # 'R' value: invalid <0xFE>
+
+ .byte 0xFF
+
+ .long 12 # Size
+ .long 0x14 # ID
+ .quad .eh_frame
diff --git a/lld/test/ELF/eh-frame-value-format1.s b/lld/test/ELF/eh-frame-value-format1.s
deleted file mode 100644
index da078eb79b425..0000000000000
--- a/lld/test/ELF/eh-frame-value-format1.s
+++ /dev/null
@@ -1,35 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: ld.lld --eh-frame-hdr %t -o /dev/null
-
-.section .eh_frame,"a",@unwind
- .byte 0x14
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x01
-
- .byte 0x50 # Augmentation string: 'P','\0'
- .byte 0x00
-
- .byte 0x01
-
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
- .byte 0x04 # DW_EH_PE_udata8
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
-
- .byte 0xFF
diff --git a/lld/test/ELF/eh-frame-value-format2.s b/lld/test/ELF/eh-frame-value-format2.s
deleted file mode 100644
index 1c907501ead31..0000000000000
--- a/lld/test/ELF/eh-frame-value-format2.s
+++ /dev/null
@@ -1,35 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: ld.lld --eh-frame-hdr %t -o /dev/null
-
-.section .eh_frame,"a",@unwind
- .byte 0x14
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x01
-
- .byte 0x50 # Augmentation string: 'P','\0'
- .byte 0x00
-
- .byte 0x01
-
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
- .byte 0x0C # DW_EH_PE_sdata8
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
-
- .byte 0xFF
diff --git a/lld/test/ELF/eh-frame-value-format3.s b/lld/test/ELF/eh-frame-value-format3.s
deleted file mode 100644
index 46e8db90ec18e..0000000000000
--- a/lld/test/ELF/eh-frame-value-format3.s
+++ /dev/null
@@ -1,28 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: ld.lld --eh-frame-hdr %t -o /dev/null
-
-.section .eh_frame,"a",@unwind
- .byte 0x0E
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x01
-
- .byte 0x50 # Augmentation string: 'P','\0'
- .byte 0x00
-
- .byte 0x01
-
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
- .byte 0x0A # DW_EH_PE_sdata2
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
diff --git a/lld/test/ELF/eh-frame-value-format4.s b/lld/test/ELF/eh-frame-value-format4.s
deleted file mode 100644
index e3e516d4eed25..0000000000000
--- a/lld/test/ELF/eh-frame-value-format4.s
+++ /dev/null
@@ -1,28 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: ld.lld --eh-frame-hdr %t -o /dev/null
-
-.section .eh_frame,"a",@unwind
- .byte 0x0E
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x01
-
- .byte 0x50 # Augmentation string: 'P','\0'
- .byte 0x00
-
- .byte 0x01
-
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
- .byte 0x02 # DW_EH_PE_udata2
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
diff --git a/lld/test/ELF/eh-frame-value-format5.s b/lld/test/ELF/eh-frame-value-format5.s
deleted file mode 100644
index cfa23723ea296..0000000000000
--- a/lld/test/ELF/eh-frame-value-format5.s
+++ /dev/null
@@ -1,35 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: ld.lld --eh-frame-hdr %t -o /dev/null
-
-.section .eh_frame,"a",@unwind
- .byte 0x14
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x01
-
- .byte 0x50 # Augmentation string: 'P','\0'
- .byte 0x00
-
- .byte 0x01
-
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
- .byte 0x08 # DW_EH_PE_signed
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
-
- .byte 0xFF
diff --git a/lld/test/ELF/eh-frame-value-format6.s b/lld/test/ELF/eh-frame-value-format6.s
deleted file mode 100644
index 23093b204c501..0000000000000
--- a/lld/test/ELF/eh-frame-value-format6.s
+++ /dev/null
@@ -1,35 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t
-# RUN: ld.lld --eh-frame-hdr %t -o /dev/null
-
-.section .eh_frame,"a",@unwind
- .byte 0x14
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x00
- .byte 0x01
-
- .byte 0x50 # Augmentation string: 'P','\0'
- .byte 0x00
-
- .byte 0x01
-
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
- .byte 0x00 # DW_EH_PE_absptr
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
- .byte 0xFF
-
- .byte 0xFF
diff --git a/lld/test/ELF/eh-frame-value-format7.s b/lld/test/ELF/eh-frame-value-format7.s
deleted file mode 100644
index 2291f90f7c04a..0000000000000
--- a/lld/test/ELF/eh-frame-value-format7.s
+++ /dev/null
@@ -1,77 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
-# RUN: ld.lld --eh-frame-hdr --image-base=0 -Ttext=0x1000 %t.o -o %t
-# RUN: llvm-readobj -S --section-data %t | FileCheck %s
-
-## Check we are able to handle DW_EH_PE_udata2 encoding.
-
-# CHECK: Section {
-# CHECK: Index:
-# CHECK: Name: .eh_frame_hdr
-# CHECK-NEXT: Type: SHT_PROGBITS
-# CHECK-NEXT: Flags [
-# CHECK-NEXT: SHF_ALLOC
-# CHECK-NEXT: ]
-# CHECK-NEXT: Address: 0x2004
-# CHECK-NEXT: Offset: 0x1004
-# CHECK-NEXT: Size: 20
-# CHECK-NEXT: Link: 0
-# CHECK-NEXT: Info: 0
-# CHECK-NEXT: AddressAlignment: 4
-# CHECK-NEXT: EntrySize: 0
-# CHECK-NEXT: SectionData (
-# CHECK-NEXT: 0000: 011B033B 10000000 01000000 30F2FFFF
-# CHECK-NEXT: 0010: 26000000
-# Header (always 4 bytes): 011B033B
-# 10000000 = .eh_frame(0x2018) - .eh_frame_hdr(0x2004) - 4
-# 01000000 = 1 = the number of FDE pointers in the table.
-# 30F2FFFF = foo(0x1000) - 0x234(addend) - .eh_frame_hdr(0x2004)
-
-# CHECK: Section {
-# CHECK: Index:
-# CHECK: Name: .eh_frame
-# CHECK-NEXT: Type: SHT_PROGBITS
-# CHECK-NEXT: Flags [
-# CHECK-NEXT: SHF_ALLOC
-# CHECK-NEXT: ]
-# CHECK-NEXT: Address: 0x2018
-# CHECK-NEXT: Offset: 0x1018
-# CHECK-NEXT: Size:
-# CHECK-NEXT: Link:
-# CHECK-NEXT: Info:
-# CHECK-NEXT: AddressAlignment:
-# CHECK-NEXT: EntrySize:
-# CHECK-NEXT: SectionData (
-# CHECK-NEXT: 0000: 0E000000 00000000 01525300 01010102
-# CHECK-NEXT: 0010: FF000600 00001600 00003412 00000000
-# ^
-# ---> ADDR(foo) + 0x234 = 0x1234
-
-.text
-.global foo
-foo:
- nop
-
-.section .eh_frame,"a",@unwind
- .long 14 # Size
- .long 0x00 # ID
- .byte 0x01 # Version.
-
- .byte 0x52 # Augmentation string: 'R','S','\0'
- .byte 0x53
- .byte 0x00
-
- .byte 0x01
-
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
- .byte 0x02 # DW_EH_PE_udata2
-
- .byte 0xFF
- .byte 0
-
- .long 0x6 # Size
- .long 0x16 # ID
- .short foo + 0x234
diff --git a/lld/test/ELF/eh-frame-value-format8.s b/lld/test/ELF/eh-frame-value-format8.s
deleted file mode 100644
index 6e053ad558138..0000000000000
--- a/lld/test/ELF/eh-frame-value-format8.s
+++ /dev/null
@@ -1,74 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
-# RUN: ld.lld --eh-frame-hdr --image-base=0 -Ttext=0x1000 %t.o -o %t
-# RUN: llvm-readobj -S --section-data %t | FileCheck %s
-
-## Check we are able to handle DW_EH_PE_absptr encoding.
-
-# CHECK: Section {
-# CHECK: Index:
-# CHECK: Name: .eh_frame_hdr
-# CHECK-NEXT: Type: SHT_PROGBITS
-# CHECK-NEXT: Flags [
-# CHECK-NEXT: SHF_ALLOC
-# CHECK-NEXT: ]
-# CHECK-NEXT: Address: 0x2004
-# CHECK-NEXT: Offset: 0x1004
-# CHECK-NEXT: Size: 20
-# CHECK-NEXT: Link: 0
-# CHECK-NEXT: Info: 0
-# CHECK-NEXT: AddressAlignment: 4
-# CHECK-NEXT: EntrySize: 0
-# CHECK-NEXT: SectionData (
-# CHECK-NEXT: 0000: 011B033B 10000000 01000000 30F2FFFF
-# CHECK-NEXT: 0010: 24000000
-# Header (always 4 bytes): 011B033B
-# 10000000 = .eh_frame(0x2018) - .eh_frame_hdr(0x2004) - 4
-# 01000000 = 1 = the number of FDE pointers in the table.
-# 30F2FFFF = foo(0x1000) - 0x234(addend) - .eh_frame_hdr(0x2004)
-
-# CHECK: Section {
-# CHECK: Index:
-# CHECK: Name: .eh_frame
-# CHECK-NEXT: Type: SHT_PROGBITS
-# CHECK-NEXT: Flags [
-# CHECK-NEXT: SHF_ALLOC
-# CHECK-NEXT: ]
-# CHECK-NEXT: Address: 0x2018
-# CHECK-NEXT: Offset: 0x1018
-# CHECK-NEXT: Size:
-# CHECK-NEXT: Link:
-# CHECK-NEXT: Info:
-# CHECK-NEXT: AddressAlignment:
-# CHECK-NEXT: EntrySize:
-# CHECK-NEXT: SectionData (
-# CHECK-NEXT: 0000: 0C000000 00000000 01520001 010100FF
-# CHECK-NEXT: 0010: 0C000000 14000000 34120000 00000000
-# ^
-# ---> ADDR(foo) + 0x234 = 0x1234
-.text
-.global foo
-foo:
- nop
-
-.section .eh_frame,"a",@unwind
- .long 12 # Size
- .long 0x00 # ID
- .byte 0x01 # Version.
-
- .byte 0x52 # Augmentation string: 'R','\0'
- .byte 0x00
-
- .byte 0x01
-
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
- .byte 0x00 # DW_EH_PE_absptr
-
- .byte 0xFF
-
- .long 12 # Size
- .long 0x14 # ID
- .quad foo + 0x234
diff --git a/lld/test/ELF/eh-frame-value-format9.s b/lld/test/ELF/eh-frame-value-format9.s
deleted file mode 100644
index 1c5ca3bbafd7e..0000000000000
--- a/lld/test/ELF/eh-frame-value-format9.s
+++ /dev/null
@@ -1,28 +0,0 @@
-# REQUIRES: x86
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
-# RUN: not ld.lld --eh-frame-hdr %t.o -o /dev/null 2>&1 | FileCheck %s
-# CHECK: error: unknown FDE size encoding
-
-.section .eh_frame,"a",@unwind
- .long 12 # Size
- .long 0x00 # ID
- .byte 0x01 # Version.
-
- .byte 0x52 # Augmentation string: 'R','\0'
- .byte 0x00
-
-# Code and data alignment factors.
- .byte 0x01 # LEB128
- .byte 0x01 # LEB128
-
-# Return address register.
- .byte 0x01 # LEB128
-
- .byte 0xFE # 'R' value: invalid <0xFE>
-
- .byte 0xFF
-
- .long 12 # Size
- .long 0x14 # ID
- .quad .eh_frame
>From b6beec44e8395085ea5c9085b2df500b7c23b1ee Mon Sep 17 00:00:00 2001
From: NeKon69 <nobodqwe at gmail.com>
Date: Fri, 6 Feb 2026 10:51:08 +0300
Subject: [PATCH 25/33] [clangd] Fix call hierarchy crash on malformed request
(#179718)
The code for parsing a call hierarchy request was not using `ObjectMapper`
correctly: it was calling `map()` without first calling `operator bool()` to
check that an object was parsed at all.
Fixes #179109
---
clang-tools-extra/clangd/Protocol.cpp | 4 ++--
clang-tools-extra/clangd/test/call-hierarchy.test | 14 +++++++++++++-
2 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp
index 9926f2dd63de5..a697486d48f9c 100644
--- a/clang-tools-extra/clangd/Protocol.cpp
+++ b/clang-tools-extra/clangd/Protocol.cpp
@@ -1511,7 +1511,7 @@ bool fromJSON(const llvm::json::Value &Params, CallHierarchyItem &I,
bool fromJSON(const llvm::json::Value &Params,
CallHierarchyIncomingCallsParams &C, llvm::json::Path P) {
llvm::json::ObjectMapper O(Params, P);
- return O.map("item", C.item);
+ return O && O.map("item", C.item);
}
llvm::json::Value toJSON(const CallHierarchyIncomingCall &C) {
@@ -1521,7 +1521,7 @@ llvm::json::Value toJSON(const CallHierarchyIncomingCall &C) {
bool fromJSON(const llvm::json::Value &Params,
CallHierarchyOutgoingCallsParams &C, llvm::json::Path P) {
llvm::json::ObjectMapper O(Params, P);
- return O.map("item", C.item);
+ return O && O.map("item", C.item);
}
llvm::json::Value toJSON(const CallHierarchyOutgoingCall &C) {
diff --git a/clang-tools-extra/clangd/test/call-hierarchy.test b/clang-tools-extra/clangd/test/call-hierarchy.test
index 6548ea0068a8d..f0d57b60421a4 100644
--- a/clang-tools-extra/clangd/test/call-hierarchy.test
+++ b/clang-tools-extra/clangd/test/call-hierarchy.test
@@ -34,6 +34,18 @@
# CHECK-NEXT: "uri": "file://{{.*}}/clangd-test/main.cpp"
# CHECK-NEXT: }
---
-{"jsonrpc":"2.0","id":3,"method":"shutdown"}
+{"jsonrpc":"2.0","id":3,"method":"callHierarchy/incomingCalls","params":[]}
+# CHECK: "error": {
+# CHECK-NEXT: "code": -32602,
+# CHECK-NEXT: "message": "failed to decode callHierarchy/incomingCalls request: expected object"
+# CHECK-NEXT: }
+---
+{"jsonrpc":"2.0","id":4,"method":"callHierarchy/outgoingCalls","params":4}
+# CHECK: "error": {
+# CHECK-NEXT: "code": -32602,
+# CHECK-NEXT: "message": "failed to decode callHierarchy/outgoingCalls request: expected object"
+# CHECK-NEXT: }
+---
+{"jsonrpc":"2.0","id":5,"method":"shutdown"}
---
{"jsonrpc":"2.0","method":"exit"}
>From af4f1dc5b1da7b3f2d4b2b6ba7206b61b9f7ca84 Mon Sep 17 00:00:00 2001
From: JaydeepChauhan14 <chauhan.jaydeep.ashwinbhai at intel.com>
Date: Thu, 5 Feb 2026 23:53:42 -0800
Subject: [PATCH 26/33] Precommit test for combine ADC(ADD(X,Y),0,Carry) ->
ADC(X,Y,Carry) (#177539)
**Reference PR-** https://github.com/llvm/llvm-project/pull/176713
---
llvm/test/CodeGen/X86/combine-adc.ll | 115 +++++++++++++++++++++++++++
1 file changed, 115 insertions(+)
diff --git a/llvm/test/CodeGen/X86/combine-adc.ll b/llvm/test/CodeGen/X86/combine-adc.ll
index a2aaea31aa6ff..4c864f9b77b68 100644
--- a/llvm/test/CodeGen/X86/combine-adc.ll
+++ b/llvm/test/CodeGen/X86/combine-adc.ll
@@ -136,5 +136,120 @@ define i32 @adc_merge_sub(i32 %a0) nounwind {
ret i32 %result
}
+; Basic positive test
+define i32 @adc_add(i32 %0, i32 %1, i32 %2, i32 %3) nounwind {
+; X86-LABEL: adc_add:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmpl %ecx, %eax
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: js .LBB4_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: .LBB4_2:
+; X86-NEXT: retl
+;
+; X64-LABEL: adc_add:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: addl %ecx, %edx
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: adcl $0, %edx
+; X64-NEXT: cmovsl %edi, %eax
+; X64-NEXT: retq
+ %5 = icmp ult i32 %0, %1
+ %6 = add i32 %3, %2
+ %7 = zext i1 %5 to i32
+ %8 = add i32 %6, %7
+ %9 = icmp slt i32 %8, 0
+ %10 = select i1 %9, i32 %0, i32 %1
+ ret i32 %10
+}
+
+; Negative test: Carry or overflow flag is used
+define i32 @adc_add_wrong_flags(i32 %0, i32 %1, i32 %2, i32 %3) nounwind {
+; X86-LABEL: adc_add_wrong_flags:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmpl %ecx, %eax
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: jb .LBB5_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: .LBB5_2:
+; X86-NEXT: retl
+;
+; X64-LABEL: adc_add_wrong_flags:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: addl %ecx, %edx
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: adcl $0, %edx
+; X64-NEXT: cmovbl %edi, %eax
+; X64-NEXT: retq
+ %5 = icmp ult i32 %0, %1
+ %6 = add i32 %3, %2
+ %7 = zext i1 %5 to i32
+ %8 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %6, i32 %7)
+ %9 = extractvalue { i32, i1 } %8, 1
+ %10 = select i1 %9, i32 %0, i32 %1
+ ret i32 %10
+}
+
+; Negative test: Multi-use
+define i32 @adc_add_multi_use(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, ptr %5) nounwind {
+; X86-LABEL: adc_add_multi_use:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: leal (%edi,%esi), %ebx
+; X86-NEXT: cmpl %ecx, %eax
+; X86-NEXT: movl %ebx, (%edx)
+; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: addl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: js .LBB6_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: .LBB6_2:
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
+; X64-LABEL: adc_add_multi_use:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT: # kill: def $edx killed $edx def $rdx
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: leal (%rcx,%rdx), %esi
+; X64-NEXT: cmpl %eax, %edi
+; X64-NEXT: movl %esi, (%r9)
+; X64-NEXT: adcl %edx, %ecx
+; X64-NEXT: addl %r8d, %ecx
+; X64-NEXT: cmovsl %edi, %eax
+; X64-NEXT: retq
+ %7 = icmp ult i32 %0, %1
+ %8 = add i32 %3, %2
+ store i32 %8, ptr %5, align 4
+ %9 = zext i1 %7 to i32
+ %10 = add i32 %8, %9
+ %11 = add i32 %10, %4
+ %12 = icmp slt i32 %11, 0
+ %13 = select i1 %12, i32 %0, i32 %1
+ ret i32 %13
+}
+
declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)
declare void @use(i8)
>From 8b9161492d3d332474648e2486276469010bbae7 Mon Sep 17 00:00:00 2001
From: Nishant Sachdeva <32475507+nishant-sachdeva at users.noreply.github.com>
Date: Fri, 6 Feb 2026 13:29:57 +0530
Subject: [PATCH 27/33] [llvm-ir2vec] Adding FuncEmb API to ir2vec python
bindings (#179908)
Add a FuncEmb API (`getFuncEmb`) to the ir2vec Python bindings. Given the IR
name of a function, the API returns the function embedding for it.
---
.../llvm-ir2vec/bindings/ir2vec-bindings.py | 16 +++++++
llvm/tools/llvm-ir2vec/Bindings/PyIR2Vec.cpp | 45 +++++++++++++++----
2 files changed, 53 insertions(+), 8 deletions(-)
diff --git a/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-bindings.py b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-bindings.py
index a209a47cba42e..a0d61e4808292 100644
--- a/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-bindings.py
+++ b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-bindings.py
@@ -22,6 +22,15 @@
print(f"Function: {func_name}")
print(f" Embedding: {emb.tolist()}")
+ # Test getFuncEmb for individual functions
+ print("\n=== Single Function Embeddings ===")
+
+ # Test valid function names
+ for func_name in ["add", "multiply", "conditional"]:
+ func_emb = tool.getFuncEmb(func_name)
+ print(f"Function: {func_name}")
+ print(f" Embedding: {func_emb.tolist()}")
+
# CHECK: SUCCESS: Tool initialized
# CHECK: Tool type: IR2VecTool
# CHECK: === Function Embeddings ===
@@ -31,3 +40,10 @@
# CHECK-NEXT: Embedding: [413.20000000298023, 421.20000000298023, 429.20000000298023]
# CHECK: Function: multiply
# CHECK-NEXT: Embedding: [50.0, 52.0, 54.0]
+# CHECK: === Single Function Embeddings ===
+# CHECK: Function: add
+# CHECK-NEXT: Embedding: [38.0, 40.0, 42.0]
+# CHECK: Function: multiply
+# CHECK-NEXT: Embedding: [50.0, 52.0, 54.0]
+# CHECK: Function: conditional
+# CHECK-NEXT: Embedding: [413.20000000298023, 421.20000000298023, 429.20000000298023]
diff --git a/llvm/tools/llvm-ir2vec/Bindings/PyIR2Vec.cpp b/llvm/tools/llvm-ir2vec/Bindings/PyIR2Vec.cpp
index 08a0844b44eef..5032a053ce7b6 100644
--- a/llvm/tools/llvm-ir2vec/Bindings/PyIR2Vec.cpp
+++ b/llvm/tools/llvm-ir2vec/Bindings/PyIR2Vec.cpp
@@ -76,23 +76,48 @@ class PyIR2VecTool {
if (!ToolFuncEmbMap)
throw nb::value_error(toString(ToolFuncEmbMap.takeError()).c_str());
- nb::dict NBFuncEmbMap;
+ nb::dict NbFuncEmbMap;
for (const auto &[FuncPtr, FuncEmb] : *ToolFuncEmbMap) {
auto FuncEmbVec = FuncEmb.getData();
- double *NBFuncEmbVec = new double[FuncEmbVec.size()];
- std::copy(FuncEmbVec.begin(), FuncEmbVec.end(), NBFuncEmbVec);
+ double *NbFuncEmbVec = new double[FuncEmbVec.size()];
+ std::copy(FuncEmbVec.begin(), FuncEmbVec.end(), NbFuncEmbVec);
auto NbArray = nb::ndarray<nb::numpy, double>(
- NBFuncEmbVec, {FuncEmbVec.size()},
- nb::capsule(NBFuncEmbVec, [](void *P) noexcept {
+ NbFuncEmbVec, {FuncEmbVec.size()},
+ nb::capsule(NbFuncEmbVec, [](void *P) noexcept {
delete[] static_cast<double *>(P);
}));
- NBFuncEmbMap[nb::str(FuncPtr->getName().str().c_str())] = NbArray;
+ NbFuncEmbMap[nb::str(FuncPtr->getName().str().c_str())] = NbArray;
}
- return NBFuncEmbMap;
+ return NbFuncEmbMap;
+ }
+
+ nb::ndarray<nb::numpy, double> getFuncEmb(const std::string &FuncName) {
+ const Function *F = M->getFunction(FuncName);
+
+ if (!F)
+ throw nb::value_error(
+ ("Function '" + FuncName + "' not found in module").c_str());
+
+ auto ToolFuncEmb = Tool->getFunctionEmbedding(*F, OutputEmbeddingMode);
+
+ if (!ToolFuncEmb)
+ throw nb::value_error(toString(ToolFuncEmb.takeError()).c_str());
+
+ auto FuncEmbVec = ToolFuncEmb->getData();
+ double *NbFuncEmbVec = new double[FuncEmbVec.size()];
+ std::copy(FuncEmbVec.begin(), FuncEmbVec.end(), NbFuncEmbVec);
+
+ auto NbArray = nb::ndarray<nb::numpy, double>(
+ NbFuncEmbVec, {FuncEmbVec.size()},
+ nb::capsule(NbFuncEmbVec, [](void *P) noexcept {
+ delete[] static_cast<double *>(P);
+ }));
+
+ return NbArray;
}
};
@@ -108,7 +133,11 @@ NB_MODULE(ir2vec, m) {
.def("getFuncEmbMap", &PyIR2VecTool::getFuncEmbMap,
"Generate function-level embeddings for all functions\n"
"Returns: dict[str, ndarray[float64]] - "
- "{function_name: embedding}");
+ "{function_name: embedding}")
+ .def("getFuncEmb", &PyIR2VecTool::getFuncEmb, nb::arg("funcName"),
+ "Generate embedding for a single function by name\n"
+ "Args: funcName (str) - IR-Name of the function\n"
+ "Returns: ndarray[float64] - Function embedding vector");
m.def(
"initEmbedding",
>From 2f90e7117e1a3a145a52934d32f49c1cb23b185c Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas at devlieghere.com>
Date: Fri, 6 Feb 2026 09:12:00 +0100
Subject: [PATCH 28/33] [llvm] Update policy for Doxygen comments in the Coding
Standards (#179898)
This PR updates the policy regarding Doxygen comments in the Coding
Standards based on an RFC discussion on Discourse:
https://discourse.llvm.org/t/rfc-policy-for-doxygen-comments-in-lldb/89675/
---
llvm/docs/CodingStandards.rst | 29 +++++++++++++++++------------
1 file changed, 17 insertions(+), 12 deletions(-)
diff --git a/llvm/docs/CodingStandards.rst b/llvm/docs/CodingStandards.rst
index 63f6663d687ea..593388db7ddf2 100644
--- a/llvm/docs/CodingStandards.rst
+++ b/llvm/docs/CodingStandards.rst
@@ -278,11 +278,23 @@ Use the ``\file`` command to turn the standard file header into a file-level
comment.
Include descriptive paragraphs for all public interfaces (public classes,
-member and non-member functions). Avoid restating the information that can
-be inferred from the API name. The first sentence (or a paragraph beginning
-with ``\brief``) is used as an abstract. Try to use a single sentence as the
-``\brief`` adds visual clutter. Put detailed discussion into separate
-paragraphs.
+member and non-member functions). Avoid restating the information that can be
+inferred from the API name or signature. The first sentence (or a paragraph
+beginning with ``\brief``) is used as an abstract. Try to use a single
+sentence as the ``\brief`` adds visual clutter. Put detailed discussion into
+separate paragraphs.
+
+A minimal documentation comment:
+
+.. code-block:: c++
+
+ /// Sets the xyzzy property to \p Baz.
+ void setXyzzy(bool Baz);
+
+Only include code examples, function parameters and return values when they
+provide additional information, such as intent, usage, or behavior that’s
+non-obvious. Use descriptive function and argument names to
+eliminate the need for documentation comments when possible.
To refer to parameter names inside a paragraph, use the ``\p name`` command.
Don't use the ``\arg name`` command since it starts a new paragraph that
@@ -298,13 +310,6 @@ respectively.
To describe function return value, start a new paragraph with the ``\returns``
command.
-A minimal documentation comment:
-
-.. code-block:: c++
-
- /// Sets the xyzzy property to \p Baz.
- void setXyzzy(bool Baz);
-
A documentation comment that uses all Doxygen features in a preferred way:
.. code-block:: c++
>From 64c1ee72486080faec76d9930f84d3418bb5cb44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1s=20Longeri?= <tlongeri at google.com>
Date: Fri, 6 Feb 2026 00:16:35 -0800
Subject: [PATCH 29/33] Fix Bazel build for ba58225 (#180136)
---
.../mlir/python/BUILD.bazel | 43 +++++++++++++++++++
1 file changed, 43 insertions(+)
diff --git a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
index 912fcd33b4a2a..6291457081cb4 100644
--- a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
@@ -523,6 +523,34 @@ filegroup(
],
)
+##---------------------------------------------------------------------------##
+# X86Vector dialect.
+##---------------------------------------------------------------------------##
+
+gentbl_filegroup(
+ name = "X86VectorPyGen",
+ tbl_outs = {
+ "mlir/dialects/_x86vector_ops_gen.py": [
+ "-gen-python-op-bindings",
+ "-bind-dialect=x86vector",
+ ],
+ },
+ tblgen = "//mlir:mlir-tblgen",
+ td_file = "mlir/dialects/X86Vector.td",
+ deps = [
+ "//mlir:X86VectorTdFiles",
+ "//mlir:OpBaseTdFiles",
+ ],
+)
+
+filegroup(
+ name = "X86VectorPyFiles",
+ srcs = [
+ "mlir/dialects/x86vector.py",
+ ":X86VectorPyGen",
+ ],
+)
+
##---------------------------------------------------------------------------##
# IRDL dialect.
##---------------------------------------------------------------------------##
@@ -1332,6 +1360,20 @@ gentbl_filegroup(
],
)
+gentbl_filegroup(
+ name = "X86VectorTransformOpsPyGen",
+ tbl_outs = {"mlir/dialects/_x86vector_transform_ops_gen.py": [
+ "-gen-python-op-bindings",
+ "-bind-dialect=transform",
+ "-dialect-extension=x86vector_transform"
+ ]},
+ tblgen = "//mlir:mlir-tblgen",
+ td_file = "mlir/dialects/X86VectorTransformOps.td",
+ deps = [
+ "//mlir:X86VectorTransformOpsTdFiles",
+ ]
+)
+
gentbl_filegroup(
name = "GPUTransformOpsPyGen",
tbl_outs = {"mlir/dialects/_gpu_transform_ops_gen.py": [
@@ -1542,6 +1584,7 @@ filegroup(
":TransformSMTExtensionOpsPyGen",
":VectorTransformEnumPyGen",
":VectorTransformOpsPyGen",
+ ":X86VectorTransformOpsPyGen",
":XeGPUTransformOpsPyGen",
],
)
>From 221d9720367b0b5e29d411b3f939fb94e619b594 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Fri, 6 Feb 2026 09:36:42 +0100
Subject: [PATCH 30/33] [NFC][LowerMemIntrinsics] Use TypeSize consistently for
type sizes (#179945)
PR #169040 already started using `TypeSize` for the return value of
`DataLayout::getType*Size` in the memset lowering; this PR adjusts other uses
in LowerMemIntrinsics to do the same. Currently, scalable vector types are not
supported as access types for the mem-intrinsic lowering.
---
.../Transforms/Utils/LowerMemIntrinsics.cpp | 24 +++++++++++--------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 0bf3f262f87e1..7623f3b9a6c08 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -317,11 +317,13 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
"Atomic memcpy lowering is not supported for vector operand type");
Type *Int8Type = Type::getInt8Ty(Ctx);
- unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+ TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+ assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");
assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
"Atomic memcpy lowering is not supported for selected operand size");
- uint64_t LoopEndCount = alignDown(CopyLen->getZExtValue(), LoopOpSize);
+ uint64_t LoopEndCount =
+ alignDown(CopyLen->getZExtValue(), LoopOpSize.getFixedValue());
// Skip the loop expansion entirely if the loop would never be taken.
if (LoopEndCount != 0) {
@@ -379,7 +381,7 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));
- unsigned OperandSize = DL.getTypeStoreSize(OpTy);
+ TypeSize OperandSize = DL.getTypeStoreSize(OpTy);
assert((!AtomicElementSize || OperandSize % *AtomicElementSize == 0) &&
"Atomic memcpy lowering is not supported for selected operand size");
@@ -432,7 +434,7 @@ void llvm::createMemCpyLoopUnknownSize(
Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
"Atomic memcpy lowering is not supported for vector operand type");
- unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+ TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
"Atomic memcpy lowering is not supported for selected operand size");
@@ -441,7 +443,7 @@ void llvm::createMemCpyLoopUnknownSize(
Type *ResidualLoopOpType = AtomicElementSize
? Type::getIntNTy(Ctx, *AtomicElementSize * 8)
: Int8Type;
- unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
+ TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
assert(ResidualLoopOpSize == (AtomicElementSize ? *AtomicElementSize : 1) &&
"Store size is expected to match type size");
@@ -576,7 +578,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
SrcAlign, DstAlign);
- unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+ TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
Type *Int8Type = Type::getInt8Ty(Ctx);
bool LoopOpIsInt8 = LoopOpType == Int8Type;
@@ -585,7 +587,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
bool RequiresResidual = !LoopOpIsInt8;
Type *ResidualLoopOpType = Int8Type;
- unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
+ TypeSize ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
// Calculate the loop trip count and remaining bytes to copy after the loop.
IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
@@ -847,11 +849,13 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
SrcAlign, DstAlign);
- unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+ TypeSize LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+ assert(LoopOpSize.isFixed() && "LoopOpType cannot be a scalable vector type");
Type *Int8Type = Type::getInt8Ty(Ctx);
// Calculate the loop trip count and remaining bytes to copy after the loop.
- uint64_t BytesCopiedInLoop = alignDown(CopyLen->getZExtValue(), LoopOpSize);
+ uint64_t BytesCopiedInLoop =
+ alignDown(CopyLen->getZExtValue(), LoopOpSize.getFixedValue());
uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopiedInLoop;
IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
@@ -886,7 +890,7 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
Align ResSrcAlign(commonAlignment(SrcAlign, BytesCopied));
Align ResDstAlign(commonAlignment(DstAlign, BytesCopied));
- unsigned OperandSize = DL.getTypeStoreSize(OpTy);
+ TypeSize OperandSize = DL.getTypeStoreSize(OpTy);
// If we used LoopOpType as GEP element type, we would iterate over the
// buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
>From e60828e5d810d6eedbe83a97c011f8c27f544109 Mon Sep 17 00:00:00 2001
From: Steffen Larsen <sholstla at amd.com>
Date: Fri, 6 Feb 2026 09:40:32 +0100
Subject: [PATCH 31/33] [DAGCombiner] Fix exact power-of-two signed division
for large integers (#177340)
Previously, the DAG combiner did not optimize exact signed division by a
power-of-two constant divisor for integer types exceeding the size of
division supported by the target architecture (e.g., i128 on x86-64).
However, such an optimization was expected by the division expansion
logic, leading to unsupported division operations making it to
instruction selection.
This commit addresses this issue by making an exception to the existing
exclusion of signed division with the exact flag for the aforementioned
operations. That is, the DAG combiner will now optimize exact signed
division if the divisor is a power-of-two constant and the integer type
exceeds the size of division supported by the target architecture.
---------
Signed-off-by: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 +-
llvm/test/CodeGen/AMDGPU/div_i128.ll | 182 ++
llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 1760 ++++++++++++++---
llvm/test/CodeGen/X86/div_i129_v_pow2k.ll | 405 ++++
4 files changed, 2127 insertions(+), 228 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/div_i129_v_pow2k.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index df69f0870d27a..6c60161f51068 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5236,12 +5236,16 @@ SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
EVT VT = N->getValueType(0);
EVT CCVT = getSetCCResultType(VT);
unsigned BitWidth = VT.getScalarSizeInBits();
+ unsigned MaxLegalDivRemBitWidth = TLI.getMaxDivRemBitWidthSupported();
// fold (sdiv X, pow2) -> simple ops after legalize
// FIXME: We check for the exact bit here because the generic lowering gives
// better results in that case. The target-specific lowering should learn how
- // to handle exact sdivs efficiently.
- if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1)) {
+ // to handle exact sdivs efficiently. An exception is made for large bitwidths
+ // exceeding what the target can natively support, as division expansion was
+ // skipped in favor of this optimization.
+ if ((!N->getFlags().hasExact() || BitWidth > MaxLegalDivRemBitWidth) &&
+ isDivisorPowerOfTwo(N1)) {
// Target-specific implementation of sdiv x, pow2.
if (SDValue Res = BuildSDIVPow2(N))
return Res;
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index d5b5ab6e457f9..5a4aa4effac00 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -4373,6 +4373,115 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
ret i128 %div
}
+define i128 @v_sdiv_exact_i128_v_pow2k(i128 %lhs) {
+; GFX9-LABEL: v_sdiv_exact_i128_v_pow2k:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], 31, v[4:5]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v4
+; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v3
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-O0-LABEL: v_sdiv_exact_i128_v_pow2k:
+; GFX9-O0: ; %bb.0:
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-O0-NEXT: s_mov_b32 s4, 63
+; GFX9-O0-NEXT: v_ashrrev_i64 v[4:5], s4, v[4:5]
+; GFX9-O0-NEXT: s_mov_b32 s5, 31
+; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s5, v[4:5]
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7
+; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-O0-NEXT: s_mov_b32 s6, s8
+; GFX9-O0-NEXT: s_mov_b32 s4, s9
+; GFX9-O0-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v5, vcc, v2, v4, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: s_mov_b32 s4, 33
+; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_lshl_or_b32 v0, v2, s5, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6
+; GFX9-O0-NEXT: v_ashrrev_i64 v[3:4], s4, v[3:4]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-O0-NEXT: s_mov_b32 s4, 1
+; GFX9-O0-NEXT: v_alignbit_b32 v1, v1, v2, s4
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-O0-NEXT: s_mov_b32 s4, 32
+; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4]
+; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec
+; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-LABEL: v_sdiv_exact_i128_v_pow2k:
+; GFX9-G: ; %bb.0:
+; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v4
+; GFX9-G-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX9-G-NEXT: v_ashrrev_i32_e32 v2, 1, v3
+; GFX9-G-NEXT: v_mov_b32_e32 v3, v4
+; GFX9-G-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-O0-LABEL: v_sdiv_exact_i128_v_pow2k:
+; GFX9-G-O0: ; %bb.0:
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v0, v1
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v0, v[5:6]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v3, v2, v4
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v2, v2, v4
+; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31]
+ %div = sdiv exact i128 %lhs, 8589934592
+ ret i128 %div
+}
+
define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
; GFX9-LABEL: v_udiv_i128_v_pow2k:
; GFX9: ; %bb.0:
@@ -4445,3 +4554,76 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
%div = udiv i128 %lhs, 8589934592
ret i128 %div
}
+
+define i128 @v_udiv_exact_i128_v_pow2k(i128 %lhs) {
+; GFX9-LABEL: v_udiv_exact_i128_v_pow2k:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v4
+; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-O0-LABEL: v_udiv_exact_i128_v_pow2k:
+; GFX9-O0: ; %bb.0:
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
+; GFX9-O0-NEXT: s_mov_b32 s4, 33
+; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX9-O0-NEXT: s_mov_b32 s5, 31
+; GFX9-O0-NEXT: v_lshl_or_b32 v0, v4, s5, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-O0-NEXT: v_lshrrev_b64 v[2:3], s4, v[1:2]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-O0-NEXT: s_mov_b32 s4, 1
+; GFX9-O0-NEXT: v_alignbit_b32 v1, v1, v4, s4
+; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-LABEL: v_udiv_exact_i128_v_pow2k:
+; GFX9-G: ; %bb.0:
+; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v4
+; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v3
+; GFX9-G-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-G-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-O0-LABEL: v_udiv_exact_i128_v_pow2k:
+; GFX9-G-O0: ; %bb.0:
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v0, v1
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v0, v[4:5]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v2
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v2, v3
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31]
+ %div = udiv exact i128 %lhs, 8589934592
+ ret i128 %div
+}
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 1e96b63bcd321..2b434c54da9c2 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -822,6 +822,688 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
ret <2 x i128> %shl
}
+define <2 x i128> @v_sdiv_v2i128_v_pow2k(<2 x i128> %lhs) {
+; SDAG-LABEL: v_sdiv_v2i128_v_pow2k:
+; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_ashrrev_i32_e32 v18, 31, v3
+; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 0, v0
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], 0x7f
+; SDAG-NEXT: v_mov_b32_e32 v19, v18
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v2, vcc
+; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, v1, v10, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, v0, v8, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v2, v11, s[4:5]
+; SDAG-NEXT: v_ffbh_u32_e32 v1, v12
+; SDAG-NEXT: v_ffbh_u32_e32 v2, v13
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v3, v0, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v0, v12, v10
+; SDAG-NEXT: v_ffbh_u32_e32 v3, v10
+; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v1
+; SDAG-NEXT: v_or_b32_e32 v1, v13, v11
+; SDAG-NEXT: v_add_i32_e32 v3, vcc, 32, v3
+; SDAG-NEXT: v_ffbh_u32_e32 v14, v11
+; SDAG-NEXT: v_min_u32_e32 v2, v8, v2
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; SDAG-NEXT: v_min_u32_e32 v0, v3, v14
+; SDAG-NEXT: v_add_i32_e32 v1, vcc, 64, v2
+; SDAG-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v2, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, 0x5e, v0
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc
+; SDAG-NEXT: v_xor_b32_e32 v0, 0x7f, v2
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v9, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v8
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v1, v3, v9
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_cndmask_b32_e32 v14, v15, v14, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_and_b32_e32 v0, 1, v14
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0
+; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, 0, s[4:5]
+; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v13, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v12, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB1_6
+; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
+; SDAG-NEXT: v_add_i32_e32 v20, vcc, 1, v2
+; SDAG-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v2
+; SDAG-NEXT: v_addc_u32_e32 v21, vcc, 0, v3, vcc
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[12:13], v0
+; SDAG-NEXT: v_addc_u32_e32 v22, vcc, 0, v8, vcc
+; SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v9, vcc
+; SDAG-NEXT: v_or_b32_e32 v8, v20, v22
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0x7f, v2
+; SDAG-NEXT: v_or_b32_e32 v9, v21, v23
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], v16
+; SDAG-NEXT: v_sub_i32_e32 v17, vcc, 64, v16
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[12:13], v16
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[12:13], v17
+; SDAG-NEXT: v_or_b32_e32 v3, v3, v9
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v8
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v16
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v15, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v14, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v10, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v3, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB1_5
+; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
+; SDAG-NEXT: v_lshr_b64 v[2:3], v[12:13], v20
+; SDAG-NEXT: v_sub_i32_e32 v14, vcc, 64, v20
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[10:11], v14
+; SDAG-NEXT: v_or_b32_e32 v15, v3, v15
+; SDAG-NEXT: v_or_b32_e32 v14, v2, v14
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v20
+; SDAG-NEXT: v_subrev_i32_e64 v2, s[4:5], 64, v20
+; SDAG-NEXT: v_lshr_b64 v[2:3], v[10:11], v2
+; SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v15, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20
+; SDAG-NEXT: v_cndmask_b32_e64 v13, v3, v13, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v12, v2, v12, s[4:5]
+; SDAG-NEXT: v_lshr_b64 v[2:3], v[10:11], v20
+; SDAG-NEXT: v_cndmask_b32_e32 v15, 0, v3, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v14, 0, v2, vcc
+; SDAG-NEXT: v_mov_b32_e32 v10, 0
+; SDAG-NEXT: v_mov_b32_e32 v11, 0
+; SDAG-NEXT: s_mov_b64 s[4:5], 0
+; SDAG-NEXT: v_mov_b32_e32 v3, 0
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 31, v9
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v13
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v25, 31, v1
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; SDAG-NEXT: v_or_b32_e32 v9, v17, v9
+; SDAG-NEXT: v_or_b32_e32 v8, v16, v8
+; SDAG-NEXT: v_or_b32_e32 v14, v14, v24
+; SDAG-NEXT: v_or_b32_e32 v12, v12, v25
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, -1, v12
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 1, v13, vcc
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v14, vcc
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v15, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v2
+; SDAG-NEXT: v_subrev_i32_e32 v12, vcc, 0, v12
+; SDAG-NEXT: v_and_b32_e32 v16, 2, v2
+; SDAG-NEXT: v_and_b32_e32 v2, 1, v2
+; SDAG-NEXT: v_subb_u32_e32 v13, vcc, v13, v16, vcc
+; SDAG-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v14, vcc
+; SDAG-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v15, vcc
+; SDAG-NEXT: v_add_i32_e32 v20, vcc, -1, v20
+; SDAG-NEXT: v_addc_u32_e32 v21, vcc, -1, v21, vcc
+; SDAG-NEXT: v_addc_u32_e32 v22, vcc, -1, v22, vcc
+; SDAG-NEXT: v_addc_u32_e32 v23, vcc, -1, v23, vcc
+; SDAG-NEXT: v_or_b32_e32 v16, v20, v22
+; SDAG-NEXT: v_or_b32_e32 v17, v21, v23
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_or_b32_e32 v1, v11, v1
+; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v0, v10, v0
+; SDAG-NEXT: v_mov_b32_e32 v17, v3
+; SDAG-NEXT: v_mov_b32_e32 v16, v2
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execnz .LBB1_3
+; SDAG-NEXT: ; %bb.4: ; %Flow13
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB1_5: ; %Flow14
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v9
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v10
+; SDAG-NEXT: v_or_b32_e32 v16, v3, v9
+; SDAG-NEXT: v_or_b32_e32 v17, v2, v8
+; SDAG-NEXT: .LBB1_6: ; %Flow16
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_ashrrev_i32_e32 v20, 31, v7
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, 0, v4
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], 0x7f
+; SDAG-NEXT: v_mov_b32_e32 v21, v20
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc
+; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v2, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5]
+; SDAG-NEXT: v_ffbh_u32_e32 v3, v10
+; SDAG-NEXT: v_ffbh_u32_e32 v4, v11
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v2, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v2, v10, v8
+; SDAG-NEXT: v_ffbh_u32_e32 v5, v8
+; SDAG-NEXT: v_add_i32_e32 v6, vcc, 32, v3
+; SDAG-NEXT: v_or_b32_e32 v3, v11, v9
+; SDAG-NEXT: v_add_i32_e32 v5, vcc, 32, v5
+; SDAG-NEXT: v_ffbh_u32_e32 v7, v9
+; SDAG-NEXT: v_min_u32_e32 v4, v6, v4
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT: v_min_u32_e32 v2, v5, v7
+; SDAG-NEXT: v_add_i32_e32 v3, vcc, 64, v4
+; SDAG-NEXT: v_addc_u32_e64 v4, s[8:9], 0, 0, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, 0x5e, v2
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v4, vcc
+; SDAG-NEXT: v_xor_b32_e32 v6, 0x7f, v2
+; SDAG-NEXT: v_subb_u32_e32 v4, vcc, 0, v12, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, 0, v12, vcc
+; SDAG-NEXT: v_or_b32_e32 v6, v6, v4
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v7, v3, v5
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT: v_and_b32_e32 v6, 1, v12
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v6
+; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, v9, 0, s[4:5]
+; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: v_cndmask_b32_e64 v12, v8, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v11, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v10, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB1_12
+; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
+; SDAG-NEXT: v_add_i32_e32 v22, vcc, 1, v2
+; SDAG-NEXT: v_sub_i32_e64 v6, s[4:5], 63, v2
+; SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v3, vcc
+; SDAG-NEXT: v_lshl_b64 v[6:7], v[10:11], v6
+; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v4, vcc
+; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v5, vcc
+; SDAG-NEXT: v_or_b32_e32 v3, v22, v24
+; SDAG-NEXT: v_sub_i32_e32 v5, vcc, 0x7f, v2
+; SDAG-NEXT: v_or_b32_e32 v4, v23, v25
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[8:9], v5
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, 64, v5
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[10:11], v5
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4]
+; SDAG-NEXT: v_lshr_b64 v[2:3], v[10:11], v2
+; SDAG-NEXT: v_or_b32_e32 v3, v13, v3
+; SDAG-NEXT: v_or_b32_e32 v2, v12, v2
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v5
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v7, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v15, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v14, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5
+; SDAG-NEXT: v_cndmask_b32_e64 v5, v4, v9, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v8, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v6, 0
+; SDAG-NEXT: v_mov_b32_e32 v7, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB1_11
+; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
+; SDAG-NEXT: v_lshr_b64 v[6:7], v[10:11], v22
+; SDAG-NEXT: v_sub_i32_e32 v12, vcc, 64, v22
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[8:9], v12
+; SDAG-NEXT: v_or_b32_e32 v13, v7, v13
+; SDAG-NEXT: v_or_b32_e32 v12, v6, v12
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22
+; SDAG-NEXT: v_subrev_i32_e64 v6, s[4:5], 64, v22
+; SDAG-NEXT: v_lshr_b64 v[6:7], v[8:9], v6
+; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v7, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v6, v10, s[4:5]
+; SDAG-NEXT: v_lshr_b64 v[6:7], v[8:9], v22
+; SDAG-NEXT: v_cndmask_b32_e32 v13, 0, v7, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v6, vcc
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: s_mov_b64 s[4:5], 0
+; SDAG-NEXT: v_mov_b32_e32 v7, 0
+; SDAG-NEXT: v_mov_b32_e32 v14, 0
+; SDAG-NEXT: v_mov_b32_e32 v15, 0
+; SDAG-NEXT: .LBB1_9: ; %udiv-do-while
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v11
+; SDAG-NEXT: v_or_b32_e32 v12, v12, v6
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v5
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v6
+; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v3
+; SDAG-NEXT: v_or_b32_e32 v4, v4, v6
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_or_b32_e32 v5, v9, v5
+; SDAG-NEXT: v_or_b32_e32 v4, v8, v4
+; SDAG-NEXT: v_or_b32_e32 v3, v15, v3
+; SDAG-NEXT: v_or_b32_e32 v2, v14, v2
+; SDAG-NEXT: v_sub_i32_e32 v6, vcc, -1, v10
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, 1, v11, vcc
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, 0, v12, vcc
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, 0, v13, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v14, 31, v6
+; SDAG-NEXT: v_and_b32_e32 v6, 1, v14
+; SDAG-NEXT: v_and_b32_e32 v14, 2, v14
+; SDAG-NEXT: v_subrev_i32_e32 v10, vcc, 0, v10
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v14, vcc
+; SDAG-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v12, vcc
+; SDAG-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc
+; SDAG-NEXT: v_add_i32_e32 v22, vcc, -1, v22
+; SDAG-NEXT: v_addc_u32_e32 v23, vcc, -1, v23, vcc
+; SDAG-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc
+; SDAG-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc
+; SDAG-NEXT: v_or_b32_e32 v15, v23, v25
+; SDAG-NEXT: v_or_b32_e32 v14, v22, v24
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v15, v7
+; SDAG-NEXT: v_mov_b32_e32 v14, v6
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execnz .LBB1_9
+; SDAG-NEXT: ; %bb.10: ; %Flow
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB1_11: ; %Flow11
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[4:5], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v3
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_or_b32_e32 v12, v12, v4
+; SDAG-NEXT: v_or_b32_e32 v7, v7, v3
+; SDAG-NEXT: v_or_b32_e32 v6, v6, v2
+; SDAG-NEXT: .LBB1_12: ; %Flow12
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_xor_b32_e32 v3, v1, v19
+; SDAG-NEXT: v_xor_b32_e32 v2, v0, v18
+; SDAG-NEXT: v_xor_b32_e32 v1, v16, v19
+; SDAG-NEXT: v_xor_b32_e32 v0, v17, v18
+; SDAG-NEXT: v_xor_b32_e32 v8, v13, v21
+; SDAG-NEXT: v_xor_b32_e32 v9, v12, v20
+; SDAG-NEXT: v_xor_b32_e32 v5, v7, v21
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v18
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v19, vcc
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v18, vcc
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v19, vcc
+; SDAG-NEXT: v_xor_b32_e32 v4, v6, v20
+; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v20
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v21, vcc
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v9, v20, vcc
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v21, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sdiv_v2i128_v_pow2k:
+; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v3
+; GISEL-NEXT: v_mov_b32_e32 v14, 0x5e
+; GISEL-NEXT: v_mov_b32_e32 v8, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v9, 0
+; GISEL-NEXT: v_xor_b32_e32 v0, v18, v0
+; GISEL-NEXT: v_xor_b32_e32 v1, v18, v1
+; GISEL-NEXT: v_xor_b32_e32 v2, v18, v2
+; GISEL-NEXT: v_xor_b32_e32 v3, v18, v3
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v0, v18
+; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v1, v18, vcc
+; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc
+; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v2, v11
+; GISEL-NEXT: v_ffbh_u32_e32 v3, v10
+; GISEL-NEXT: v_or_b32_e32 v0, v10, v12
+; GISEL-NEXT: v_or_b32_e32 v1, v11, v13
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 32, v3
+; GISEL-NEXT: v_ffbh_u32_e32 v15, v13
+; GISEL-NEXT: v_ffbh_u32_e32 v16, v12
+; GISEL-NEXT: v_min_u32_e32 v2, v2, v3
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 32, v16
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, 64, v2
+; GISEL-NEXT: v_min_u32_e32 v1, v15, v3
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v0
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v2
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v8, v8, v0
+; GISEL-NEXT: v_or_b32_e32 v9, v3, v1
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v15, v14, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v9, v16, v14
+; GISEL-NEXT: v_and_b32_e32 v14, 1, v9
+; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v16, v10, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v14, 1, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v17, v11, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, v12, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v9, v13, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB1_6
+; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
+; GISEL-NEXT: v_add_i32_e32 v19, vcc, 1, v2
+; GISEL-NEXT: v_addc_u32_e64 v20, s[4:5], 0, v3, vcc
+; GISEL-NEXT: v_sub_i32_e32 v23, vcc, 0x7f, v2
+; GISEL-NEXT: v_not_b32_e32 v2, 63
+; GISEL-NEXT: v_addc_u32_e64 v21, vcc, 0, v0, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v22, vcc, 0, v1, vcc
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v23, v2
+; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 64, v23
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[10:11], v23
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], v23
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_lshr_b64 v[8:9], v[10:11], v8
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[10:11], v14
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
+; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v8, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v9, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v12, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v13, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GISEL-NEXT: v_mov_b32_e32 v2, s10
+; GISEL-NEXT: v_mov_b32_e32 v3, s11
+; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9]
+; GISEL-NEXT: s_cbranch_execz .LBB1_5
+; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, 0xffffffc0, v19
+; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 64, v19
+; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v19
+; GISEL-NEXT: v_lshr_b64 v[2:3], v[10:11], v19
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_lshl_b64 v[23:24], v[12:13], v17
+; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v16
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v2, v23
+; GISEL-NEXT: v_or_b32_e32 v1, v3, v24
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v0, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v1, v11, vcc
+; GISEL-NEXT: v_mov_b32_e32 v11, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GISEL-NEXT: v_mov_b32_e32 v2, s10
+; GISEL-NEXT: v_mov_b32_e32 v3, s11
+; GISEL-NEXT: .LBB1_3: ; %udiv-do-while3
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v15
+; GISEL-NEXT: v_lshl_b64 v[23:24], v[12:13], 1
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v12, 31, v13
+; GISEL-NEXT: v_lshrrev_b32_e32 v13, 31, v9
+; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GISEL-NEXT: v_add_i32_e32 v19, vcc, -1, v19
+; GISEL-NEXT: v_addc_u32_e32 v20, vcc, -1, v20, vcc
+; GISEL-NEXT: v_or_b32_e32 v14, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v15, v1, v3
+; GISEL-NEXT: v_or_b32_e32 v2, v16, v12
+; GISEL-NEXT: v_or_b32_e32 v0, v23, v13
+; GISEL-NEXT: v_or_b32_e32 v8, v8, v10
+; GISEL-NEXT: v_addc_u32_e32 v21, vcc, -1, v21, vcc
+; GISEL-NEXT: v_addc_u32_e32 v22, vcc, -1, v22, vcc
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, 1, v24
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, 0, v2, vcc
+; GISEL-NEXT: v_subrev_i32_e64 v12, s[4:5], 0, v0
+; GISEL-NEXT: v_or_b32_e32 v0, v19, v21
+; GISEL-NEXT: v_or_b32_e32 v1, v20, v22
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, 0, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GISEL-NEXT: v_and_b32_e32 v10, 1, v0
+; GISEL-NEXT: v_and_b32_e32 v3, 2, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v10
+; GISEL-NEXT: v_mov_b32_e32 v1, v11
+; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v24, v3
+; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v2, s[4:5]
+; GISEL-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GISEL-NEXT: v_subbrev_u32_e64 v17, vcc, 0, v17, s[4:5]
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GISEL-NEXT: s_cbranch_execnz .LBB1_3
+; GISEL-NEXT: ; %bb.4: ; %Flow13
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: .LBB1_5: ; %Flow14
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1
+; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v15
+; GISEL-NEXT: v_or_b32_e32 v8, v8, v10
+; GISEL-NEXT: v_or_b32_e32 v16, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v17, v1, v3
+; GISEL-NEXT: .LBB1_6: ; %Flow16
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v7
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x5e
+; GISEL-NEXT: v_mov_b32_e32 v12, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v13, 0
+; GISEL-NEXT: v_xor_b32_e32 v0, v19, v4
+; GISEL-NEXT: v_xor_b32_e32 v1, v19, v5
+; GISEL-NEXT: v_xor_b32_e32 v3, v19, v6
+; GISEL-NEXT: v_xor_b32_e32 v6, v19, v7
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v19
+; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v1, v19, vcc
+; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v3, v19, vcc
+; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v6, v19, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v3, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v6, v4
+; GISEL-NEXT: v_or_b32_e32 v0, v4, v10
+; GISEL-NEXT: v_or_b32_e32 v1, v5, v11
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, 32, v6
+; GISEL-NEXT: v_ffbh_u32_e32 v7, v11
+; GISEL-NEXT: v_ffbh_u32_e32 v14, v10
+; GISEL-NEXT: v_min_u32_e32 v3, v3, v6
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, 32, v14
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, 64, v3
+; GISEL-NEXT: v_min_u32_e32 v1, v7, v6
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v6, 0x7f, v2
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v6, v6, v0
+; GISEL-NEXT: v_or_b32_e32 v7, v3, v1
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v7, v14, v12
+; GISEL-NEXT: v_and_b32_e32 v12, 1, v7
+; GISEL-NEXT: v_or_b32_e32 v6, v7, v6
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, v4, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v14, 1, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v13, v5, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB1_12
+; GISEL-NEXT: ; %bb.7: ; %udiv-bb1
+; GISEL-NEXT: v_add_i32_e32 v20, vcc, 1, v2
+; GISEL-NEXT: v_addc_u32_e64 v21, s[4:5], 0, v3, vcc
+; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v2
+; GISEL-NEXT: v_not_b32_e32 v2, 63
+; GISEL-NEXT: v_addc_u32_e64 v22, vcc, 0, v0, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc
+; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v24, v2
+; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], 64, v24
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v24
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[10:11], v24
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_lshr_b64 v[6:7], v[4:5], v6
+; GISEL-NEXT: v_lshl_b64 v[14:15], v[4:5], v12
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v6, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v7, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v14, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v11, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GISEL-NEXT: v_mov_b32_e32 v2, s10
+; GISEL-NEXT: v_mov_b32_e32 v3, s11
+; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9]
+; GISEL-NEXT: s_cbranch_execz .LBB1_11
+; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
+; GISEL-NEXT: v_add_i32_e32 v24, vcc, 0xffffffc0, v20
+; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 64, v20
+; GISEL-NEXT: v_lshr_b64 v[0:1], v[10:11], v20
+; GISEL-NEXT: v_lshr_b64 v[2:3], v[4:5], v20
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_lshl_b64 v[14:15], v[10:11], v14
+; GISEL-NEXT: v_lshr_b64 v[10:11], v[10:11], v24
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v14
+; GISEL-NEXT: v_or_b32_e32 v3, v3, v15
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v11, v3, v5, vcc
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GISEL-NEXT: v_mov_b32_e32 v2, s10
+; GISEL-NEXT: v_mov_b32_e32 v3, s11
+; GISEL-NEXT: .LBB1_9: ; %udiv-do-while
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshl_b64 v[24:25], v[10:11], 1
+; GISEL-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 31, v11
+; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v7
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], 1
+; GISEL-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v11, 31, v13
+; GISEL-NEXT: v_add_i32_e32 v20, vcc, -1, v20
+; GISEL-NEXT: v_addc_u32_e32 v21, vcc, -1, v21, vcc
+; GISEL-NEXT: v_or_b32_e32 v5, v14, v5
+; GISEL-NEXT: v_or_b32_e32 v10, v24, v10
+; GISEL-NEXT: v_or_b32_e32 v6, v6, v11
+; GISEL-NEXT: v_or_b32_e32 v12, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v13, v1, v3
+; GISEL-NEXT: v_addc_u32_e32 v22, vcc, -1, v22, vcc
+; GISEL-NEXT: v_addc_u32_e32 v23, vcc, -1, v23, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, 1, v25
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, 0, v5, vcc
+; GISEL-NEXT: v_subrev_i32_e64 v10, s[4:5], 0, v10
+; GISEL-NEXT: v_or_b32_e32 v0, v20, v22
+; GISEL-NEXT: v_or_b32_e32 v1, v21, v23
+; GISEL-NEXT: v_subb_u32_e32 v2, vcc, 0, v15, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v2
+; GISEL-NEXT: v_and_b32_e32 v3, 1, v0
+; GISEL-NEXT: v_and_b32_e32 v2, 2, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-NEXT: v_mov_b32_e32 v1, v4
+; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v25, v2
+; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v5, s[4:5]
+; GISEL-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GISEL-NEXT: v_subbrev_u32_e64 v15, vcc, 0, v15, s[4:5]
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GISEL-NEXT: s_cbranch_execnz .LBB1_9
+; GISEL-NEXT: ; %bb.10: ; %Flow
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: .LBB1_11: ; %Flow11
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], 1
+; GISEL-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v13
+; GISEL-NEXT: v_or_b32_e32 v6, v6, v4
+; GISEL-NEXT: v_or_b32_e32 v12, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v13, v1, v3
+; GISEL-NEXT: .LBB1_12: ; %Flow12
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: v_xor_b32_e32 v0, v16, v18
+; GISEL-NEXT: v_xor_b32_e32 v1, v17, v18
+; GISEL-NEXT: v_xor_b32_e32 v2, v8, v18
+; GISEL-NEXT: v_xor_b32_e32 v3, v9, v18
+; GISEL-NEXT: v_xor_b32_e32 v4, v12, v19
+; GISEL-NEXT: v_xor_b32_e32 v5, v13, v19
+; GISEL-NEXT: v_xor_b32_e32 v6, v6, v19
+; GISEL-NEXT: v_xor_b32_e32 v7, v7, v19
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v18
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v18, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v19
+; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v19, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v18, vcc
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v18, vcc
+; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v19, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v19, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %shl = sdiv <2 x i128> %lhs, <i128 8589934592, i128 8589934592>
+ ret <2 x i128> %shl
+}
+
define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-LABEL: v_udiv_v2i128_vv:
; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
@@ -887,7 +1569,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
-; SDAG-NEXT: s_cbranch_execz .LBB1_6
+; SDAG-NEXT: s_cbranch_execz .LBB2_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
; SDAG-NEXT: v_add_i32_e32 v26, vcc, 1, v20
; SDAG-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v20
@@ -917,7 +1599,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; SDAG-NEXT: s_cbranch_execz .LBB1_5
+; SDAG-NEXT: s_cbranch_execz .LBB2_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v26
; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 64, v26
@@ -945,7 +1627,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: v_mov_b32_e32 v24, 0
; SDAG-NEXT: v_mov_b32_e32 v25, 0
-; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3
+; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v19
; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
@@ -986,10 +1668,10 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v25, v21
; SDAG-NEXT: v_mov_b32_e32 v24, v20
; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; SDAG-NEXT: s_cbranch_execnz .LBB1_3
+; SDAG-NEXT: s_cbranch_execnz .LBB2_3
; SDAG-NEXT: ; %bb.4: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; SDAG-NEXT: .LBB1_5: ; %Flow14
+; SDAG-NEXT: .LBB2_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v19
@@ -997,7 +1679,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v2, v2, v8
; SDAG-NEXT: v_or_b32_e32 v18, v21, v1
; SDAG-NEXT: v_or_b32_e32 v19, v20, v0
-; SDAG-NEXT: .LBB1_6: ; %Flow16
+; SDAG-NEXT: .LBB2_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_or_b32_e32 v1, v13, v15
; SDAG-NEXT: v_or_b32_e32 v0, v12, v14
@@ -1058,7 +1740,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; SDAG-NEXT: s_cbranch_execz .LBB1_12
+; SDAG-NEXT: s_cbranch_execz .LBB2_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
; SDAG-NEXT: v_add_i32_e32 v22, vcc, 1, v0
; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v0
@@ -1088,7 +1770,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v17, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; SDAG-NEXT: s_cbranch_execz .LBB1_11
+; SDAG-NEXT: s_cbranch_execz .LBB2_11
; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v22
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v22
@@ -1116,7 +1798,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc
; SDAG-NEXT: v_mov_b32_e32 v4, 0
; SDAG-NEXT: v_mov_b32_e32 v5, 0
-; SDAG-NEXT: .LBB1_9: ; %udiv-do-while
+; SDAG-NEXT: .LBB2_9: ; %udiv-do-while
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v7
@@ -1157,10 +1839,10 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v4, v16
; SDAG-NEXT: v_mov_b32_e32 v5, v17
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; SDAG-NEXT: s_cbranch_execnz .LBB1_9
+; SDAG-NEXT: s_cbranch_execnz .LBB2_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB1_11: ; %Flow11
+; SDAG-NEXT: .LBB2_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[8:9], v[0:1], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v11
@@ -1168,7 +1850,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v8, v8, v4
; SDAG-NEXT: v_or_b32_e32 v10, v17, v1
; SDAG-NEXT: v_or_b32_e32 v11, v16, v0
-; SDAG-NEXT: .LBB1_12: ; %Flow12
+; SDAG-NEXT: .LBB2_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v0, v19
; SDAG-NEXT: v_mov_b32_e32 v1, v18
@@ -1243,7 +1925,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
-; GISEL-NEXT: s_cbranch_execz .LBB1_6
+; GISEL-NEXT: s_cbranch_execz .LBB2_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v22
; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v23, vcc
@@ -1275,7 +1957,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v18, s8
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
-; GISEL-NEXT: s_cbranch_execz .LBB1_5
+; GISEL-NEXT: s_cbranch_execz .LBB2_5
; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v26
; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v26
@@ -1304,7 +1986,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v20, s6
; GISEL-NEXT: v_mov_b32_e32 v19, s5
; GISEL-NEXT: v_mov_b32_e32 v18, s4
-; GISEL-NEXT: .LBB1_3: ; %udiv-do-while3
+; GISEL-NEXT: .LBB2_3: ; %udiv-do-while3
; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v23
; GISEL-NEXT: v_lshl_b64 v[20:21], v[22:23], 1
@@ -1343,200 +2025,826 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v19, v1
; GISEL-NEXT: v_mov_b32_e32 v18, v0
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GISEL-NEXT: s_cbranch_execnz .LBB1_3
+; GISEL-NEXT: s_cbranch_execnz .LBB2_3
+; GISEL-NEXT: ; %bb.4: ; %Flow13
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB2_5: ; %Flow14
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[22:23], 1
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v8, 31, v23
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v8
+; GISEL-NEXT: v_or_b32_e32 v18, v18, v0
+; GISEL-NEXT: v_or_b32_e32 v19, v19, v1
+; GISEL-NEXT: .LBB2_6: ; %Flow16
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_or_b32_e32 v0, v12, v14
+; GISEL-NEXT: v_or_b32_e32 v1, v13, v15
+; GISEL-NEXT: v_or_b32_e32 v8, v4, v6
+; GISEL-NEXT: v_or_b32_e32 v9, v5, v7
+; GISEL-NEXT: v_ffbh_u32_e32 v16, v13
+; GISEL-NEXT: v_ffbh_u32_e32 v17, v12
+; GISEL-NEXT: v_ffbh_u32_e32 v20, v15
+; GISEL-NEXT: v_ffbh_u32_e32 v21, v14
+; GISEL-NEXT: v_ffbh_u32_e32 v22, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v23, v4
+; GISEL-NEXT: v_ffbh_u32_e32 v24, v7
+; GISEL-NEXT: v_ffbh_u32_e32 v25, v6
+; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v11, 0
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 32, v17
+; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v21
+; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], 32, v23
+; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], 32, v25
+; GISEL-NEXT: v_min_u32_e32 v0, v16, v0
+; GISEL-NEXT: v_min_u32_e32 v1, v20, v1
+; GISEL-NEXT: v_min_u32_e32 v8, v22, v8
+; GISEL-NEXT: v_min_u32_e32 v9, v24, v9
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, 64, v0
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 64, v8
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v1
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[10:11]
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v16
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v8, v8, v0
+; GISEL-NEXT: v_or_b32_e32 v9, v17, v1
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v9, v20, v10
+; GISEL-NEXT: v_and_b32_e32 v10, 1, v9
+; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v4, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v20, 1, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v5, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, v6, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v9, v7, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB2_12
+; GISEL-NEXT: ; %bb.7: ; %udiv-bb1
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v16
+; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v17, vcc
+; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16
+; GISEL-NEXT: v_not_b32_e32 v9, 63
+; GISEL-NEXT: v_addc_u32_e64 v24, vcc, 0, v0, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v26, v9
+; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v26
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v26
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v26
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v10
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[4:5], v9
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
+; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v20, v16
+; GISEL-NEXT: v_or_b32_e32 v1, v21, v17
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v23, s11
+; GISEL-NEXT: v_mov_b32_e32 v22, s10
+; GISEL-NEXT: v_mov_b32_e32 v21, s9
+; GISEL-NEXT: v_mov_b32_e32 v20, s8
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB2_11
+; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
+; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v8
+; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v8
+; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v8
+; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v8
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v12
+; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], v22
+; GISEL-NEXT: v_lshr_b64 v[6:7], v[6:7], v28
+; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc
+; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: v_or_b32_e32 v20, v20, v22
+; GISEL-NEXT: v_or_b32_e32 v21, v21, v23
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v20, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v21, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc
+; GISEL-NEXT: v_mov_b32_e32 v5, 0
+; GISEL-NEXT: v_mov_b32_e32 v23, s7
+; GISEL-NEXT: v_mov_b32_e32 v22, s6
+; GISEL-NEXT: v_mov_b32_e32 v21, s5
+; GISEL-NEXT: v_mov_b32_e32 v20, s4
+; GISEL-NEXT: .LBB2_9: ; %udiv-do-while
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], 1
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v7
+; GISEL-NEXT: v_lshrrev_b32_e32 v30, 31, v1
+; GISEL-NEXT: v_lshl_b64 v[6:7], v[9:10], 1
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v9, 31, v10
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, -1, v8
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc
+; GISEL-NEXT: v_or_b32_e32 v16, v16, v4
+; GISEL-NEXT: v_or_b32_e32 v22, v22, v30
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT: v_or_b32_e32 v9, v20, v6
+; GISEL-NEXT: v_or_b32_e32 v10, v21, v7
+; GISEL-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc
+; GISEL-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v26, v22
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v27, v23, vcc
+; GISEL-NEXT: v_or_b32_e32 v6, v8, v24
+; GISEL-NEXT: v_or_b32_e32 v7, v11, v25
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v28, v16, vcc
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v29, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v4
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v4, 1, v6
+; GISEL-NEXT: v_and_b32_e32 v7, v6, v12
+; GISEL-NEXT: v_and_b32_e32 v30, v6, v13
+; GISEL-NEXT: v_and_b32_e32 v31, v6, v14
+; GISEL-NEXT: v_and_b32_e32 v32, v6, v15
+; GISEL-NEXT: v_mov_b32_e32 v21, v5
+; GISEL-NEXT: v_mov_b32_e32 v20, v4
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v22, v7
+; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v23, v30, vcc
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v31, vcc
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v32, vcc
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execnz .LBB2_9
+; GISEL-NEXT: ; %bb.10: ; %Flow
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB2_11: ; %Flow11
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], 1
+; GISEL-NEXT: v_lshl_b64 v[8:9], v[0:1], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v10
+; GISEL-NEXT: v_or_b32_e32 v8, v8, v0
+; GISEL-NEXT: v_or_b32_e32 v10, v20, v4
+; GISEL-NEXT: v_or_b32_e32 v11, v21, v5
+; GISEL-NEXT: .LBB2_12: ; %Flow12
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: v_mov_b32_e32 v0, v18
+; GISEL-NEXT: v_mov_b32_e32 v1, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v10
+; GISEL-NEXT: v_mov_b32_e32 v5, v11
+; GISEL-NEXT: v_mov_b32_e32 v6, v8
+; GISEL-NEXT: v_mov_b32_e32 v7, v9
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %shl = udiv <2 x i128> %lhs, %rhs
+ ret <2 x i128> %shl
+}
+
+define <2 x i128> @v_udiv_v2i128_v_pow2k(<2 x i128> %lhs) {
+; SDAG-LABEL: v_udiv_v2i128_v_pow2k:
+; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v9, v3
+; SDAG-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-NEXT: v_or_b32_e32 v3, v1, v9
+; SDAG-NEXT: v_or_b32_e32 v2, v0, v8
+; SDAG-NEXT: v_ffbh_u32_e32 v10, v8
+; SDAG-NEXT: v_ffbh_u32_e32 v11, v9
+; SDAG-NEXT: v_ffbh_u32_e32 v12, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v13, v1
+; SDAG-NEXT: v_mov_b32_e32 v15, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], 0x7f
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT: v_add_i32_e32 v2, vcc, 32, v10
+; SDAG-NEXT: v_add_i32_e32 v3, vcc, 32, v12
+; SDAG-NEXT: v_min_u32_e32 v2, v2, v11
+; SDAG-NEXT: v_min_u32_e32 v3, v3, v13
+; SDAG-NEXT: v_add_i32_e32 v3, vcc, 64, v3
+; SDAG-NEXT: v_addc_u32_e64 v10, s[8:9], 0, 0, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; SDAG-NEXT: v_sub_i32_e32 v12, vcc, 0x5e, v2
+; SDAG-NEXT: v_subb_u32_e32 v13, vcc, 0, v10, vcc
+; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v12
+; SDAG-NEXT: v_subb_u32_e32 v14, vcc, 0, v15, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[12:13]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v15, vcc, 0, v15, vcc
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v14
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v3, v13, v15
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_and_b32_e32 v2, 1, v10
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v2
+; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v9, 0, s[4:5]
+; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v0, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB3_6
+; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
+; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v12
+; SDAG-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v12
+; SDAG-NEXT: v_addc_u32_e32 v19, vcc, 0, v13, vcc
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
+; SDAG-NEXT: v_addc_u32_e32 v20, vcc, 0, v14, vcc
+; SDAG-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc
+; SDAG-NEXT: v_or_b32_e32 v10, v18, v20
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0x7f, v12
+; SDAG-NEXT: v_or_b32_e32 v11, v19, v21
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[8:9], v16
+; SDAG-NEXT: v_sub_i32_e32 v17, vcc, 64, v16
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[0:1], v16
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_lshr_b64 v[10:11], v[0:1], v17
+; SDAG-NEXT: v_or_b32_e32 v11, v13, v11
+; SDAG-NEXT: v_or_b32_e32 v10, v12, v10
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v16
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v13, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB3_5
+; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
+; SDAG-NEXT: v_lshr_b64 v[12:13], v[0:1], v18
+; SDAG-NEXT: v_sub_i32_e32 v14, vcc, 64, v18
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[8:9], v14
+; SDAG-NEXT: v_or_b32_e32 v15, v13, v15
+; SDAG-NEXT: v_or_b32_e32 v14, v12, v14
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
+; SDAG-NEXT: v_subrev_i32_e64 v12, s[4:5], 64, v18
+; SDAG-NEXT: v_lshr_b64 v[12:13], v[8:9], v12
+; SDAG-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
+; SDAG-NEXT: v_cndmask_b32_e64 v15, v13, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v12, v14, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v1, v0, s[4:5]
+; SDAG-NEXT: v_lshr_b64 v[0:1], v[8:9], v18
+; SDAG-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-NEXT: s_mov_b64 s[4:5], 0
+; SDAG-NEXT: v_mov_b32_e32 v13, 0
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: .LBB3_3: ; %udiv-do-while3
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v12, 31, v15
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v3
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v12
+; SDAG-NEXT: v_or_b32_e32 v12, v14, v22
+; SDAG-NEXT: v_sub_i32_e32 v14, vcc, -1, v12
+; SDAG-NEXT: v_subb_u32_e32 v14, vcc, 1, v15, vcc
+; SDAG-NEXT: v_subb_u32_e32 v14, vcc, 0, v16, vcc
+; SDAG-NEXT: v_subb_u32_e32 v14, vcc, 0, v17, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v22, 31, v14
+; SDAG-NEXT: v_subrev_i32_e32 v14, vcc, 0, v12
+; SDAG-NEXT: v_and_b32_e32 v12, 2, v22
+; SDAG-NEXT: v_subb_u32_e32 v15, vcc, v15, v12, vcc
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v12, 31, v11
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v12
+; SDAG-NEXT: v_or_b32_e32 v3, v1, v3
+; SDAG-NEXT: v_or_b32_e32 v2, v0, v2
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT: v_and_b32_e32 v12, 1, v22
+; SDAG-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v16, vcc
+; SDAG-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v17, vcc
+; SDAG-NEXT: v_add_i32_e32 v18, vcc, -1, v18
+; SDAG-NEXT: v_addc_u32_e32 v19, vcc, -1, v19, vcc
+; SDAG-NEXT: v_addc_u32_e32 v20, vcc, -1, v20, vcc
+; SDAG-NEXT: v_addc_u32_e32 v21, vcc, -1, v21, vcc
+; SDAG-NEXT: v_or_b32_e32 v22, v18, v20
+; SDAG-NEXT: v_or_b32_e32 v23, v19, v21
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; SDAG-NEXT: v_or_b32_e32 v11, v9, v11
+; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT: v_mov_b32_e32 v8, v12
+; SDAG-NEXT: v_mov_b32_e32 v9, v13
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execnz .LBB3_3
+; SDAG-NEXT: ; %bb.4: ; %Flow13
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB3_5: ; %Flow14
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v11
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[10:11], 1
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v8
+; SDAG-NEXT: v_or_b32_e32 v10, v13, v1
+; SDAG-NEXT: v_or_b32_e32 v11, v12, v0
+; SDAG-NEXT: .LBB3_6: ; %Flow16
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_or_b32_e32 v1, v5, v7
+; SDAG-NEXT: v_or_b32_e32 v0, v4, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v8, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v9, v7
+; SDAG-NEXT: v_ffbh_u32_e32 v12, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v13, v5
+; SDAG-NEXT: v_mov_b32_e32 v15, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], 0x7f
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; SDAG-NEXT: v_add_i32_e32 v0, vcc, 32, v8
+; SDAG-NEXT: v_add_i32_e32 v1, vcc, 32, v12
+; SDAG-NEXT: v_min_u32_e32 v0, v0, v9
+; SDAG-NEXT: v_min_u32_e32 v1, v1, v13
+; SDAG-NEXT: v_add_i32_e32 v1, vcc, 64, v1
+; SDAG-NEXT: v_addc_u32_e64 v8, s[8:9], 0, 0, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0x5e, v0
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v8, vcc
+; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v0
+; SDAG-NEXT: v_subb_u32_e32 v14, vcc, 0, v15, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v15, vcc, 0, v15, vcc
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v14
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v9, v1, v15
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_and_b32_e32 v8, 1, v12
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v8
+; SDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, 0, s[4:5]
+; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, v5, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, v4, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB3_12
+; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
+; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v0
+; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v0
+; SDAG-NEXT: v_addc_u32_e32 v19, vcc, 0, v1, vcc
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[4:5], v8
+; SDAG-NEXT: v_addc_u32_e32 v20, vcc, 0, v14, vcc
+; SDAG-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc
+; SDAG-NEXT: v_or_b32_e32 v12, v18, v20
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0x7f, v0
+; SDAG-NEXT: v_or_b32_e32 v13, v19, v21
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], v16
+; SDAG-NEXT: v_sub_i32_e32 v17, vcc, 64, v16
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[4:5], v16
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; SDAG-NEXT: v_lshr_b64 v[12:13], v[4:5], v17
+; SDAG-NEXT: v_or_b32_e32 v1, v1, v13
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v12
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v16
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v15, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v14, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v14, 0
+; SDAG-NEXT: v_mov_b32_e32 v15, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB3_11
+; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[4:5], v18
+; SDAG-NEXT: v_sub_i32_e32 v14, vcc, 64, v18
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[6:7], v14
+; SDAG-NEXT: v_or_b32_e32 v15, v9, v15
+; SDAG-NEXT: v_or_b32_e32 v14, v8, v14
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
+; SDAG-NEXT: v_subrev_i32_e64 v8, s[4:5], 64, v18
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[6:7], v8
+; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v5, v8, v14, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v5, v4, s[4:5]
+; SDAG-NEXT: v_lshr_b64 v[4:5], v[6:7], v18
+; SDAG-NEXT: v_cndmask_b32_e32 v17, 0, v5, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v16, 0, v4, vcc
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
+; SDAG-NEXT: v_mov_b32_e32 v5, 0
+; SDAG-NEXT: s_mov_b64 s[4:5], 0
+; SDAG-NEXT: v_mov_b32_e32 v15, 0
+; SDAG-NEXT: v_mov_b32_e32 v6, 0
+; SDAG-NEXT: v_mov_b32_e32 v7, 0
+; SDAG-NEXT: .LBB3_9: ; %udiv-do-while
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v14, 31, v9
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v1
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v14
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v22
+; SDAG-NEXT: v_sub_i32_e32 v14, vcc, -1, v8
+; SDAG-NEXT: v_subb_u32_e32 v14, vcc, 1, v9, vcc
+; SDAG-NEXT: v_subb_u32_e32 v14, vcc, 0, v16, vcc
+; SDAG-NEXT: v_subb_u32_e32 v14, vcc, 0, v17, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v14, 31, v14
+; SDAG-NEXT: v_subrev_i32_e32 v8, vcc, 0, v8
+; SDAG-NEXT: v_and_b32_e32 v22, 2, v14
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v22, vcc
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v13
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v22
+; SDAG-NEXT: v_or_b32_e32 v1, v5, v1
+; SDAG-NEXT: v_or_b32_e32 v0, v4, v0
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
+; SDAG-NEXT: v_and_b32_e32 v14, 1, v14
+; SDAG-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v16, vcc
+; SDAG-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v17, vcc
+; SDAG-NEXT: v_add_i32_e32 v18, vcc, -1, v18
+; SDAG-NEXT: v_addc_u32_e32 v19, vcc, -1, v19, vcc
+; SDAG-NEXT: v_addc_u32_e32 v20, vcc, -1, v20, vcc
+; SDAG-NEXT: v_addc_u32_e32 v21, vcc, -1, v21, vcc
+; SDAG-NEXT: v_or_b32_e32 v22, v18, v20
+; SDAG-NEXT: v_or_b32_e32 v23, v19, v21
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; SDAG-NEXT: v_or_b32_e32 v13, v7, v13
+; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v12, v6, v12
+; SDAG-NEXT: v_mov_b32_e32 v6, v14
+; SDAG-NEXT: v_mov_b32_e32 v7, v15
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execnz .LBB3_9
+; SDAG-NEXT: ; %bb.10: ; %Flow
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB3_11: ; %Flow11
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[0:1], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v13
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[12:13], 1
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v4
+; SDAG-NEXT: v_or_b32_e32 v12, v15, v1
+; SDAG-NEXT: v_or_b32_e32 v13, v14, v0
+; SDAG-NEXT: .LBB3_12: ; %Flow12
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v0, v11
+; SDAG-NEXT: v_mov_b32_e32 v1, v10
+; SDAG-NEXT: v_mov_b32_e32 v4, v13
+; SDAG-NEXT: v_mov_b32_e32 v5, v12
+; SDAG-NEXT: v_mov_b32_e32 v6, v8
+; SDAG-NEXT: v_mov_b32_e32 v7, v9
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_udiv_v2i128_v_pow2k:
+; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v8, v2
+; GISEL-NEXT: v_mov_b32_e32 v9, v3
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_or_b32_e32 v2, v0, v8
+; GISEL-NEXT: v_or_b32_e32 v3, v1, v9
+; GISEL-NEXT: v_ffbh_u32_e32 v12, v1
+; GISEL-NEXT: v_ffbh_u32_e32 v13, v0
+; GISEL-NEXT: v_ffbh_u32_e32 v14, v9
+; GISEL-NEXT: v_ffbh_u32_e32 v15, v8
+; GISEL-NEXT: v_mov_b32_e32 v16, 0x5e
+; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v11, 0
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, 32, v13
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_min_u32_e32 v2, v12, v13
+; GISEL-NEXT: v_min_u32_e32 v3, v14, v15
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 64, v2
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v16, v2
+; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[10:11]
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v14
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v12
+; GISEL-NEXT: v_or_b32_e32 v3, v15, v13
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v3, v17, v10
+; GISEL-NEXT: v_and_b32_e32 v10, 1, v3
+; GISEL-NEXT: v_or_b32_e32 v2, v3, v2
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v0, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v16, 1, v2
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB3_6
+; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v14
+; GISEL-NEXT: v_addc_u32_e64 v19, s[4:5], 0, v15, vcc
+; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 0x7f, v14
+; GISEL-NEXT: v_not_b32_e32 v2, 63
+; GISEL-NEXT: v_addc_u32_e64 v20, vcc, 0, v12, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v13, vcc
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v22, v2
+; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], 64, v22
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[0:1], v22
+; GISEL-NEXT: v_lshl_b64 v[10:11], v[8:9], v22
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_lshr_b64 v[12:13], v[0:1], v12
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[0:1], v14
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22
+; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v3, vcc
+; GISEL-NEXT: v_or_b32_e32 v2, v12, v10
+; GISEL-NEXT: v_or_b32_e32 v3, v13, v11
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v13, s11
+; GISEL-NEXT: v_mov_b32_e32 v12, s10
+; GISEL-NEXT: v_mov_b32_e32 v11, s9
+; GISEL-NEXT: v_mov_b32_e32 v10, s8
+; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9]
+; GISEL-NEXT: s_cbranch_execz .LBB3_5
+; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
+; GISEL-NEXT: v_add_i32_e32 v22, vcc, 0xffffffc0, v18
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v18
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GISEL-NEXT: v_lshr_b64 v[10:11], v[8:9], v18
+; GISEL-NEXT: v_lshr_b64 v[12:13], v[0:1], v18
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[8:9], v16
+; GISEL-NEXT: v_or_b32_e32 v12, v12, v16
+; GISEL-NEXT: v_or_b32_e32 v13, v13, v17
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_lshr_b64 v[8:9], v[8:9], v22
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v11, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: v_mov_b32_e32 v13, s11
+; GISEL-NEXT: v_mov_b32_e32 v12, s10
+; GISEL-NEXT: v_mov_b32_e32 v11, s9
+; GISEL-NEXT: v_mov_b32_e32 v10, s8
+; GISEL-NEXT: .LBB3_3: ; %udiv-do-while3
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshl_b64 v[12:13], v[14:15], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v15
+; GISEL-NEXT: v_or_b32_e32 v14, v10, v12
+; GISEL-NEXT: v_or_b32_e32 v15, v11, v13
+; GISEL-NEXT: v_lshl_b64 v[10:11], v[8:9], 1
+; GISEL-NEXT: v_lshl_b64 v[12:13], v[16:17], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v8, 31, v9
+; GISEL-NEXT: v_lshrrev_b32_e32 v9, 31, v3
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, -1, v18
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, -1, v19, vcc
+; GISEL-NEXT: v_or_b32_e32 v12, v12, v8
+; GISEL-NEXT: v_or_b32_e32 v8, v10, v9
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v0
+; GISEL-NEXT: v_addc_u32_e32 v20, vcc, -1, v20, vcc
+; GISEL-NEXT: v_addc_u32_e32 v21, vcc, -1, v21, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, 1, v11
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, 0, v12, vcc
+; GISEL-NEXT: v_subrev_i32_e64 v8, s[4:5], 0, v8
+; GISEL-NEXT: v_or_b32_e32 v9, v18, v20
+; GISEL-NEXT: v_or_b32_e32 v10, v19, v21
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, 0, v13, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[9:10]
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v0
+; GISEL-NEXT: v_and_b32_e32 v0, 1, v9
+; GISEL-NEXT: v_and_b32_e32 v9, 2, v9
+; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], v11, v9
+; GISEL-NEXT: v_mov_b32_e32 v11, v1
+; GISEL-NEXT: v_mov_b32_e32 v10, v0
+; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v12, s[4:5]
+; GISEL-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GISEL-NEXT: v_subbrev_u32_e64 v17, vcc, 0, v13, s[4:5]
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GISEL-NEXT: s_cbranch_execnz .LBB3_3
; GISEL-NEXT: ; %bb.4: ; %Flow13
-; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB1_5: ; %Flow14
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
-; GISEL-NEXT: v_lshl_b64 v[0:1], v[22:23], 1
+; GISEL-NEXT: .LBB3_5: ; %Flow14
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[14:15], 1
; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GISEL-NEXT: v_lshrrev_b32_e32 v8, 31, v23
+; GISEL-NEXT: v_lshrrev_b32_e32 v8, 31, v15
; GISEL-NEXT: v_or_b32_e32 v2, v2, v8
-; GISEL-NEXT: v_or_b32_e32 v18, v18, v0
-; GISEL-NEXT: v_or_b32_e32 v19, v19, v1
-; GISEL-NEXT: .LBB1_6: ; %Flow16
-; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: v_or_b32_e32 v10, v10, v0
+; GISEL-NEXT: v_or_b32_e32 v11, v11, v1
+; GISEL-NEXT: .LBB3_6: ; %Flow16
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL-NEXT: s_mov_b64 s[8:9], 0
-; GISEL-NEXT: v_or_b32_e32 v0, v12, v14
-; GISEL-NEXT: v_or_b32_e32 v1, v13, v15
-; GISEL-NEXT: v_or_b32_e32 v8, v4, v6
-; GISEL-NEXT: v_or_b32_e32 v9, v5, v7
-; GISEL-NEXT: v_ffbh_u32_e32 v16, v13
-; GISEL-NEXT: v_ffbh_u32_e32 v17, v12
-; GISEL-NEXT: v_ffbh_u32_e32 v20, v15
-; GISEL-NEXT: v_ffbh_u32_e32 v21, v14
-; GISEL-NEXT: v_ffbh_u32_e32 v22, v5
-; GISEL-NEXT: v_ffbh_u32_e32 v23, v4
-; GISEL-NEXT: v_ffbh_u32_e32 v24, v7
-; GISEL-NEXT: v_ffbh_u32_e32 v25, v6
-; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f
-; GISEL-NEXT: v_mov_b32_e32 v11, 0
+; GISEL-NEXT: v_or_b32_e32 v0, v4, v6
+; GISEL-NEXT: v_or_b32_e32 v1, v5, v7
+; GISEL-NEXT: v_ffbh_u32_e32 v12, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v13, v4
+; GISEL-NEXT: v_ffbh_u32_e32 v14, v7
+; GISEL-NEXT: v_ffbh_u32_e32 v15, v6
+; GISEL-NEXT: v_mov_b32_e32 v16, 0x5e
+; GISEL-NEXT: v_mov_b32_e32 v8, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v9, 0
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, 32, v13
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
-; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 32, v17
-; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v21
-; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], 32, v23
-; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], 32, v25
-; GISEL-NEXT: v_min_u32_e32 v0, v16, v0
-; GISEL-NEXT: v_min_u32_e32 v1, v20, v1
-; GISEL-NEXT: v_min_u32_e32 v8, v22, v8
-; GISEL-NEXT: v_min_u32_e32 v9, v24, v9
-; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_min_u32_e32 v0, v12, v13
+; GISEL-NEXT: v_min_u32_e32 v1, v14, v15
; GISEL-NEXT: v_add_i32_e32 v0, vcc, 64, v0
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 64, v8
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v1
-; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v16, v0
+; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[10:11]
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v16
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v14
; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v8, v8, v0
-; GISEL-NEXT: v_or_b32_e32 v9, v17, v1
+; GISEL-NEXT: v_or_b32_e32 v9, v15, v1
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_or_b32_e32 v9, v20, v10
-; GISEL-NEXT: v_and_b32_e32 v10, 1, v9
+; GISEL-NEXT: v_or_b32_e32 v9, v17, v12
+; GISEL-NEXT: v_and_b32_e32 v12, 1, v9
; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v4, 0, vcc
-; GISEL-NEXT: v_and_b32_e32 v20, 1, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v11, v5, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, v4, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v16, 1, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v13, v5, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v8, v6, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v9, v7, 0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
-; GISEL-NEXT: s_cbranch_execz .LBB1_12
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB3_12
; GISEL-NEXT: ; %bb.7: ; %udiv-bb1
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v16
-; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v17, vcc
-; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16
-; GISEL-NEXT: v_not_b32_e32 v9, 63
-; GISEL-NEXT: v_addc_u32_e64 v24, vcc, 0, v0, s[4:5]
-; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v26, v9
-; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v26
-; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v26
-; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v26
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v14
+; GISEL-NEXT: v_addc_u32_e64 v19, s[4:5], 0, v15, vcc
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0x7f, v14
+; GISEL-NEXT: v_not_b32_e32 v8, 63
+; GISEL-NEXT: v_addc_u32_e64 v20, vcc, 0, v0, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v1, vcc
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v8
+; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], 64, v16
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v16
+; GISEL-NEXT: v_lshl_b64 v[8:9], v[6:7], v16
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
-; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v10
-; GISEL-NEXT: v_lshl_b64 v[22:23], v[4:5], v9
-; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
-; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc
-; GISEL-NEXT: v_or_b32_e32 v0, v20, v16
-; GISEL-NEXT: v_or_b32_e32 v1, v21, v17
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GISEL-NEXT: v_lshr_b64 v[12:13], v[4:5], v12
+; GISEL-NEXT: v_lshl_b64 v[14:15], v[4:5], v14
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
+; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v8, v12, v8
+; GISEL-NEXT: v_or_b32_e32 v9, v13, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v15, v9, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc
; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
-; GISEL-NEXT: v_mov_b32_e32 v23, s11
-; GISEL-NEXT: v_mov_b32_e32 v22, s10
-; GISEL-NEXT: v_mov_b32_e32 v21, s9
-; GISEL-NEXT: v_mov_b32_e32 v20, s8
-; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
-; GISEL-NEXT: s_cbranch_execz .LBB1_11
+; GISEL-NEXT: v_mov_b32_e32 v15, s11
+; GISEL-NEXT: v_mov_b32_e32 v14, s10
+; GISEL-NEXT: v_mov_b32_e32 v13, s9
+; GISEL-NEXT: v_mov_b32_e32 v12, s8
+; GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[12:13], exec, s[8:9]
+; GISEL-NEXT: s_cbranch_execz .LBB3_11
; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
-; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v8
-; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v8
-; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v8
-; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v8
-; GISEL-NEXT: s_mov_b64 s[4:5], 0
-; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v12
-; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc
-; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], v22
-; GISEL-NEXT: v_lshr_b64 v[6:7], v[6:7], v28
-; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc
-; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc
-; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GISEL-NEXT: v_or_b32_e32 v20, v20, v22
-; GISEL-NEXT: v_or_b32_e32 v21, v21, v23
-; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v20, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v21, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc
+; GISEL-NEXT: v_add_i32_e32 v22, vcc, 0xffffffc0, v18
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v18
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GISEL-NEXT: v_lshr_b64 v[12:13], v[6:7], v18
+; GISEL-NEXT: v_lshr_b64 v[14:15], v[4:5], v18
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v16
+; GISEL-NEXT: v_or_b32_e32 v14, v14, v16
+; GISEL-NEXT: v_or_b32_e32 v15, v15, v17
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_lshr_b64 v[6:7], v[6:7], v22
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v12, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v13, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
; GISEL-NEXT: v_mov_b32_e32 v5, 0
-; GISEL-NEXT: v_mov_b32_e32 v23, s7
-; GISEL-NEXT: v_mov_b32_e32 v22, s6
-; GISEL-NEXT: v_mov_b32_e32 v21, s5
-; GISEL-NEXT: v_mov_b32_e32 v20, s4
-; GISEL-NEXT: .LBB1_9: ; %udiv-do-while
+; GISEL-NEXT: v_mov_b32_e32 v15, s11
+; GISEL-NEXT: v_mov_b32_e32 v14, s10
+; GISEL-NEXT: v_mov_b32_e32 v13, s9
+; GISEL-NEXT: v_mov_b32_e32 v12, s8
+; GISEL-NEXT: .LBB3_9: ; %udiv-do-while
; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], 1
-; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
-; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v7
-; GISEL-NEXT: v_lshrrev_b32_e32 v30, 31, v1
-; GISEL-NEXT: v_lshl_b64 v[6:7], v[9:10], 1
-; GISEL-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; GISEL-NEXT: v_lshrrev_b32_e32 v9, 31, v10
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, -1, v8
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc
-; GISEL-NEXT: v_or_b32_e32 v16, v16, v4
-; GISEL-NEXT: v_or_b32_e32 v22, v22, v30
-; GISEL-NEXT: v_or_b32_e32 v0, v0, v9
-; GISEL-NEXT: v_or_b32_e32 v9, v20, v6
-; GISEL-NEXT: v_or_b32_e32 v10, v21, v7
-; GISEL-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc
-; GISEL-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc
-; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v26, v22
-; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v27, v23, vcc
-; GISEL-NEXT: v_or_b32_e32 v6, v8, v24
-; GISEL-NEXT: v_or_b32_e32 v7, v11, v25
-; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v28, v16, vcc
-; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v29, v17, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v4
-; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GISEL-NEXT: v_and_b32_e32 v4, 1, v6
-; GISEL-NEXT: v_and_b32_e32 v7, v6, v12
-; GISEL-NEXT: v_and_b32_e32 v30, v6, v13
-; GISEL-NEXT: v_and_b32_e32 v31, v6, v14
-; GISEL-NEXT: v_and_b32_e32 v32, v6, v15
-; GISEL-NEXT: v_mov_b32_e32 v21, v5
-; GISEL-NEXT: v_mov_b32_e32 v20, v4
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v22, v7
-; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v23, v30, vcc
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v31, vcc
-; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v32, vcc
-; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GISEL-NEXT: s_cbranch_execnz .LBB1_9
+; GISEL-NEXT: v_lshl_b64 v[14:15], v[0:1], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v1
+; GISEL-NEXT: v_or_b32_e32 v0, v12, v14
+; GISEL-NEXT: v_or_b32_e32 v1, v13, v15
+; GISEL-NEXT: v_lshl_b64 v[12:13], v[6:7], 1
+; GISEL-NEXT: v_lshl_b64 v[14:15], v[16:17], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v7
+; GISEL-NEXT: v_lshrrev_b32_e32 v7, 31, v9
+; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, -1, v18
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, -1, v19, vcc
+; GISEL-NEXT: v_or_b32_e32 v14, v14, v6
+; GISEL-NEXT: v_or_b32_e32 v6, v12, v7
+; GISEL-NEXT: v_or_b32_e32 v8, v8, v4
+; GISEL-NEXT: v_addc_u32_e32 v20, vcc, -1, v20, vcc
+; GISEL-NEXT: v_addc_u32_e32 v21, vcc, -1, v21, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 1, v13
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, 0, v14, vcc
+; GISEL-NEXT: v_subrev_i32_e64 v6, s[4:5], 0, v6
+; GISEL-NEXT: v_or_b32_e32 v16, v18, v20
+; GISEL-NEXT: v_or_b32_e32 v17, v19, v21
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, 0, v15, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GISEL-NEXT: v_and_b32_e32 v4, 1, v7
+; GISEL-NEXT: v_and_b32_e32 v7, 2, v7
+; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v13, v7
+; GISEL-NEXT: v_mov_b32_e32 v13, v5
+; GISEL-NEXT: v_mov_b32_e32 v12, v4
+; GISEL-NEXT: v_subbrev_u32_e64 v16, s[4:5], 0, v14, s[4:5]
+; GISEL-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GISEL-NEXT: v_subbrev_u32_e64 v17, vcc, 0, v15, s[4:5]
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GISEL-NEXT: s_cbranch_execnz .LBB3_9
; GISEL-NEXT: ; %bb.10: ; %Flow
-; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB1_11: ; %Flow11
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
-; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], 1
-; GISEL-NEXT: v_lshl_b64 v[8:9], v[0:1], 1
-; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v10
-; GISEL-NEXT: v_or_b32_e32 v8, v8, v0
-; GISEL-NEXT: v_or_b32_e32 v10, v20, v4
-; GISEL-NEXT: v_or_b32_e32 v11, v21, v5
-; GISEL-NEXT: .LBB1_12: ; %Flow12
+; GISEL-NEXT: .LBB3_11: ; %Flow11
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v0, v18
-; GISEL-NEXT: v_mov_b32_e32 v1, v19
-; GISEL-NEXT: v_mov_b32_e32 v4, v10
-; GISEL-NEXT: v_mov_b32_e32 v5, v11
+; GISEL-NEXT: v_lshl_b64 v[4:5], v[0:1], 1
+; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1
+; GISEL-NEXT: v_or_b32_e32 v8, v8, v0
+; GISEL-NEXT: v_or_b32_e32 v12, v12, v4
+; GISEL-NEXT: v_or_b32_e32 v13, v13, v5
+; GISEL-NEXT: .LBB3_12: ; %Flow12
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: v_mov_b32_e32 v0, v10
+; GISEL-NEXT: v_mov_b32_e32 v1, v11
+; GISEL-NEXT: v_mov_b32_e32 v4, v12
+; GISEL-NEXT: v_mov_b32_e32 v5, v13
; GISEL-NEXT: v_mov_b32_e32 v6, v8
; GISEL-NEXT: v_mov_b32_e32 v7, v9
; GISEL-NEXT: s_setpc_b64 s[30:31]
- %shl = udiv <2 x i128> %lhs, %rhs
+ %shl = udiv <2 x i128> %lhs, <i128 8589934592, i128 8589934592>
ret <2 x i128> %shl
}
@@ -1624,7 +2932,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
; SDAG-NEXT: v_cndmask_b32_e64 v34, v16, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
-; SDAG-NEXT: s_cbranch_execz .LBB2_6
+; SDAG-NEXT: s_cbranch_execz .LBB4_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10
; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v10
@@ -1654,7 +2962,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v11, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; SDAG-NEXT: s_cbranch_execz .LBB2_5
+; SDAG-NEXT: s_cbranch_execz .LBB4_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
; SDAG-NEXT: v_lshr_b64 v[22:23], v[16:17], v32
; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v32
@@ -1682,7 +2990,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cndmask_b32_e32 v24, v22, v16, vcc
; SDAG-NEXT: v_mov_b32_e32 v22, 0
; SDAG-NEXT: v_mov_b32_e32 v23, 0
-; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3
+; SDAG-NEXT: .LBB4_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v19
; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
@@ -1723,10 +3031,10 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v23, v11
; SDAG-NEXT: v_mov_b32_e32 v22, v10
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; SDAG-NEXT: s_cbranch_execnz .LBB2_3
+; SDAG-NEXT: s_cbranch_execnz .LBB4_3
; SDAG-NEXT: ; %bb.4: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB2_5: ; %Flow14
+; SDAG-NEXT: .LBB4_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v19
@@ -1734,7 +3042,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v8, v8, v20
; SDAG-NEXT: v_or_b32_e32 v33, v11, v19
; SDAG-NEXT: v_or_b32_e32 v34, v10, v18
-; SDAG-NEXT: .LBB2_6: ; %Flow16
+; SDAG-NEXT: .LBB4_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_ashrrev_i32_e32 v32, 31, v7
; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
@@ -1815,7 +3123,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cndmask_b32_e64 v21, v10, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; SDAG-NEXT: s_cbranch_execz .LBB2_12
+; SDAG-NEXT: s_cbranch_execz .LBB4_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v12
; SDAG-NEXT: v_sub_i32_e64 v14, s[4:5], 63, v12
@@ -1845,7 +3153,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; SDAG-NEXT: s_cbranch_execz .LBB2_11
+; SDAG-NEXT: s_cbranch_execz .LBB4_11
; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
; SDAG-NEXT: v_lshr_b64 v[22:23], v[10:11], v38
; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v38
@@ -1873,7 +3181,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cndmask_b32_e32 v24, v22, v10, vcc
; SDAG-NEXT: v_mov_b32_e32 v22, 0
; SDAG-NEXT: v_mov_b32_e32 v23, 0
-; SDAG-NEXT: .LBB2_9: ; %udiv-do-while
+; SDAG-NEXT: .LBB4_9: ; %udiv-do-while
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v25
@@ -1914,10 +3222,10 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v23, v19
; SDAG-NEXT: v_mov_b32_e32 v22, v18
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; SDAG-NEXT: s_cbranch_execnz .LBB2_9
+; SDAG-NEXT: s_cbranch_execnz .LBB4_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB2_11: ; %Flow11
+; SDAG-NEXT: .LBB4_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v13
@@ -1925,7 +3233,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v14, v14, v20
; SDAG-NEXT: v_or_b32_e32 v20, v19, v13
; SDAG-NEXT: v_or_b32_e32 v21, v18, v12
-; SDAG-NEXT: .LBB2_12: ; %Flow12
+; SDAG-NEXT: .LBB4_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_mul_lo_u32 v18, v34, v3
; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v34, v2, 0
@@ -2075,7 +3383,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
-; GISEL-NEXT: s_cbranch_execz .LBB2_6
+; GISEL-NEXT: s_cbranch_execz .LBB4_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
; GISEL-NEXT: v_add_i32_e32 v31, vcc, 1, v2
; GISEL-NEXT: v_addc_u32_e64 v32, s[4:5], 0, v3, vcc
@@ -2107,7 +3415,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v3, s11
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
-; GISEL-NEXT: s_cbranch_execz .LBB2_5
+; GISEL-NEXT: s_cbranch_execz .LBB4_5
; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
; GISEL-NEXT: v_add_i32_e32 v24, vcc, 0xffffffc0, v31
; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v31
@@ -2136,7 +3444,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v1, s5
; GISEL-NEXT: v_mov_b32_e32 v2, s6
; GISEL-NEXT: v_mov_b32_e32 v3, s7
-; GISEL-NEXT: .LBB2_3: ; %udiv-do-while3
+; GISEL-NEXT: .LBB4_3: ; %udiv-do-while3
; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_lshrrev_b32_e32 v39, 31, v21
; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
@@ -2175,10 +3483,10 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v0, v22
; GISEL-NEXT: v_mov_b32_e32 v1, v23
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GISEL-NEXT: s_cbranch_execnz .LBB2_3
+; GISEL-NEXT: s_cbranch_execnz .LBB4_3
; GISEL-NEXT: ; %bb.4: ; %Flow13
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB2_5: ; %Flow14
+; GISEL-NEXT: .LBB4_5: ; %Flow14
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
@@ -2186,7 +3494,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v18, v18, v20
; GISEL-NEXT: v_or_b32_e32 v31, v0, v2
; GISEL-NEXT: v_or_b32_e32 v32, v1, v3
-; GISEL-NEXT: .LBB2_6: ; %Flow16
+; GISEL-NEXT: .LBB4_6: ; %Flow16
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_ashrrev_i32_e32 v33, 31, v7
@@ -2266,7 +3574,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
-; GISEL-NEXT: s_cbranch_execz .LBB2_12
+; GISEL-NEXT: s_cbranch_execz .LBB4_12
; GISEL-NEXT: ; %bb.7: ; %udiv-bb1
; GISEL-NEXT: v_add_i32_e32 v36, vcc, 1, v14
; GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v15, vcc
@@ -2298,7 +3606,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v3, s11
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
-; GISEL-NEXT: s_cbranch_execz .LBB2_11
+; GISEL-NEXT: s_cbranch_execz .LBB4_11
; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
; GISEL-NEXT: v_add_i32_e32 v24, vcc, 0xffffffc0, v36
; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v36
@@ -2327,7 +3635,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v1, s5
; GISEL-NEXT: v_mov_b32_e32 v2, s6
; GISEL-NEXT: v_mov_b32_e32 v3, s7
-; GISEL-NEXT: .LBB2_9: ; %udiv-do-while
+; GISEL-NEXT: .LBB4_9: ; %udiv-do-while
; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v21
@@ -2366,10 +3674,10 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc
; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GISEL-NEXT: s_cbranch_execnz .LBB2_9
+; GISEL-NEXT: s_cbranch_execnz .LBB4_9
; GISEL-NEXT: ; %bb.10: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB2_11: ; %Flow11
+; GISEL-NEXT: .LBB4_11: ; %Flow11
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], 1
; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1
@@ -2377,7 +3685,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v2, v2, v14
; GISEL-NEXT: v_or_b32_e32 v20, v0, v22
; GISEL-NEXT: v_or_b32_e32 v21, v1, v23
-; GISEL-NEXT: .LBB2_12: ; %Flow12
+; GISEL-NEXT: .LBB4_12: ; %Flow12
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0
@@ -2495,7 +3803,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
; SDAG-NEXT: v_cndmask_b32_e64 v33, v0, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
-; SDAG-NEXT: s_cbranch_execz .LBB3_6
+; SDAG-NEXT: s_cbranch_execz .LBB5_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v18
; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v18
@@ -2525,7 +3833,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; SDAG-NEXT: s_cbranch_execz .LBB3_5
+; SDAG-NEXT: s_cbranch_execz .LBB5_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v30
; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v30
@@ -2553,7 +3861,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cndmask_b32_e32 v26, v24, v0, vcc
; SDAG-NEXT: v_mov_b32_e32 v24, 0
; SDAG-NEXT: v_mov_b32_e32 v25, 0
-; SDAG-NEXT: .LBB3_3: ; %udiv-do-while3
+; SDAG-NEXT: .LBB5_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v21
; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
@@ -2594,10 +3902,10 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v25, v19
; SDAG-NEXT: v_mov_b32_e32 v24, v18
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; SDAG-NEXT: s_cbranch_execnz .LBB3_3
+; SDAG-NEXT: s_cbranch_execnz .LBB5_3
; SDAG-NEXT: ; %bb.4: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB3_5: ; %Flow14
+; SDAG-NEXT: .LBB5_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21
@@ -2605,7 +3913,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
; SDAG-NEXT: v_or_b32_e32 v32, v19, v21
; SDAG-NEXT: v_or_b32_e32 v33, v18, v20
-; SDAG-NEXT: .LBB3_6: ; %Flow16
+; SDAG-NEXT: .LBB5_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_or_b32_e32 v19, v13, v15
; SDAG-NEXT: v_or_b32_e32 v18, v12, v14
@@ -2666,7 +3974,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cndmask_b32_e64 v25, v4, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; SDAG-NEXT: s_cbranch_execz .LBB3_12
+; SDAG-NEXT: s_cbranch_execz .LBB5_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v20
; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v20
@@ -2696,7 +4004,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v23, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
-; SDAG-NEXT: s_cbranch_execz .LBB3_11
+; SDAG-NEXT: s_cbranch_execz .LBB5_11
; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
; SDAG-NEXT: v_lshr_b64 v[26:27], v[4:5], v34
; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 64, v34
@@ -2724,7 +4032,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cndmask_b32_e32 v28, v26, v4, vcc
; SDAG-NEXT: v_mov_b32_e32 v26, 0
; SDAG-NEXT: v_mov_b32_e32 v27, 0
-; SDAG-NEXT: .LBB3_9: ; %udiv-do-while
+; SDAG-NEXT: .LBB5_9: ; %udiv-do-while
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; SDAG-NEXT: v_lshl_b64 v[30:31], v[30:31], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v29
@@ -2765,10 +4073,10 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v27, v23
; SDAG-NEXT: v_mov_b32_e32 v26, v22
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; SDAG-NEXT: s_cbranch_execnz .LBB3_9
+; SDAG-NEXT: s_cbranch_execnz .LBB5_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
-; SDAG-NEXT: .LBB3_11: ; %Flow11
+; SDAG-NEXT: .LBB5_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v21
@@ -2776,7 +4084,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v18, v18, v24
; SDAG-NEXT: v_or_b32_e32 v24, v23, v21
; SDAG-NEXT: v_or_b32_e32 v25, v22, v20
-; SDAG-NEXT: .LBB3_12: ; %Flow12
+; SDAG-NEXT: .LBB5_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_mul_lo_u32 v23, v33, v11
; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v33, v10, 0
@@ -2890,7 +4198,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
-; GISEL-NEXT: s_cbranch_execz .LBB3_6
+; GISEL-NEXT: s_cbranch_execz .LBB5_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
; GISEL-NEXT: v_add_i32_e32 v30, vcc, 1, v18
; GISEL-NEXT: v_addc_u32_e64 v31, s[4:5], 0, v19, vcc
@@ -2922,7 +4230,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v16, s8
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
-; GISEL-NEXT: s_cbranch_execz .LBB3_5
+; GISEL-NEXT: s_cbranch_execz .LBB5_5
; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
; GISEL-NEXT: v_add_i32_e32 v26, vcc, 0xffffffc0, v30
; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v30
@@ -2951,7 +4259,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v18, s6
; GISEL-NEXT: v_mov_b32_e32 v17, s5
; GISEL-NEXT: v_mov_b32_e32 v16, s4
-; GISEL-NEXT: .LBB3_3: ; %udiv-do-while3
+; GISEL-NEXT: .LBB5_3: ; %udiv-do-while3
; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_lshrrev_b32_e32 v38, 31, v23
; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1
@@ -2990,10 +4298,10 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v16, v24
; GISEL-NEXT: v_mov_b32_e32 v17, v25
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GISEL-NEXT: s_cbranch_execnz .LBB3_3
+; GISEL-NEXT: s_cbranch_execnz .LBB5_3
; GISEL-NEXT: ; %bb.4: ; %Flow13
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB3_5: ; %Flow14
+; GISEL-NEXT: .LBB5_5: ; %Flow14
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1
; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
@@ -3001,7 +4309,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v20, v20, v22
; GISEL-NEXT: v_or_b32_e32 v32, v16, v18
; GISEL-NEXT: v_or_b32_e32 v33, v17, v19
-; GISEL-NEXT: .LBB3_6: ; %Flow16
+; GISEL-NEXT: .LBB5_6: ; %Flow16
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: s_mov_b64 s[8:9], 0
; GISEL-NEXT: v_or_b32_e32 v16, v12, v14
@@ -3063,7 +4371,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
-; GISEL-NEXT: s_cbranch_execz .LBB3_12
+; GISEL-NEXT: s_cbranch_execz .LBB5_12
; GISEL-NEXT: ; %bb.7: ; %udiv-bb1
; GISEL-NEXT: v_add_i32_e32 v34, vcc, 1, v22
; GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v23, vcc
@@ -3095,7 +4403,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v16, s8
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
-; GISEL-NEXT: s_cbranch_execz .LBB3_11
+; GISEL-NEXT: s_cbranch_execz .LBB5_11
; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v34
; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 64, v34
@@ -3124,7 +4432,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v18, s6
; GISEL-NEXT: v_mov_b32_e32 v17, s5
; GISEL-NEXT: v_mov_b32_e32 v16, s4
-; GISEL-NEXT: .LBB3_9: ; %udiv-do-while
+; GISEL-NEXT: .LBB5_9: ; %udiv-do-while
; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_lshl_b64 v[18:19], v[24:25], 1
; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v25
@@ -3163,10 +4471,10 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_subb_u32_e32 v30, vcc, v18, v30, vcc
; GISEL-NEXT: v_subb_u32_e32 v31, vcc, v31, v50, vcc
; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GISEL-NEXT: s_cbranch_execnz .LBB3_9
+; GISEL-NEXT: s_cbranch_execnz .LBB5_9
; GISEL-NEXT: ; %bb.10: ; %Flow
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: .LBB3_11: ; %Flow11
+; GISEL-NEXT: .LBB5_11: ; %Flow11
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_lshl_b64 v[26:27], v[24:25], 1
; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1
@@ -3174,7 +4482,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v18, v18, v22
; GISEL-NEXT: v_or_b32_e32 v24, v16, v26
; GISEL-NEXT: v_or_b32_e32 v25, v17, v27
-; GISEL-NEXT: .LBB3_12: ; %Flow12
+; GISEL-NEXT: .LBB5_12: ; %Flow12
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0
; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0
diff --git a/llvm/test/CodeGen/X86/div_i129_v_pow2k.ll b/llvm/test/CodeGen/X86/div_i129_v_pow2k.ll
new file mode 100644
index 0000000000000..4d6d795e3beb8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/div_i129_v_pow2k.ll
@@ -0,0 +1,405 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-- -O0 | FileCheck %s --check-prefix=X64-O0
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=i686-- -O0 | FileCheck %s --check-prefix=X86-O0
+
+define i129 @v_sdiv_i129_v_pow2k(i129 %lhs) nounwind {
+; X64-LABEL: v_sdiv_i129_v_pow2k:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: andl $1, %edx
+; X64-NEXT: negq %rdx
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: shldq $32, %rdx, %rax
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: shrdq $33, %rsi, %rax
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: negq %rdx
+; X64-NEXT: shldq $31, %rsi, %rdx
+; X64-NEXT: retq
+;
+; X64-O0-LABEL: v_sdiv_i129_v_pow2k:
+; X64-O0: # %bb.0:
+; X64-O0-NEXT: movl %edx, %eax
+; X64-O0-NEXT: andl $1, %eax
+; X64-O0-NEXT: movl %eax, %ecx
+; X64-O0-NEXT: negq %rcx
+; X64-O0-NEXT: movl %ecx, %r8d
+; X64-O0-NEXT: andl $1, %r8d
+; X64-O0-NEXT: # implicit-def: $rax
+; X64-O0-NEXT: movl %r8d, %eax
+; X64-O0-NEXT: shldq $32, %rcx, %rax
+; X64-O0-NEXT: addq %rax, %rdi
+; X64-O0-NEXT: adcq $0, %rsi
+; X64-O0-NEXT: adcq $0, %rdx
+; X64-O0-NEXT: movq %rsi, %rax
+; X64-O0-NEXT: shldq $31, %rdi, %rax
+; X64-O0-NEXT: movl %edx, %ecx
+; X64-O0-NEXT: andl $1, %ecx
+; X64-O0-NEXT: # kill: def $rcx killed $ecx
+; X64-O0-NEXT: movq %rcx, %rdx
+; X64-O0-NEXT: negq %rdx
+; X64-O0-NEXT: shldq $31, %rsi, %rdx
+; X64-O0-NEXT: retq
+;
+; X86-LABEL: v_sdiv_i129_v_pow2k:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: addl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: shldl $31, %edx, %ebx
+; X86-NEXT: shldl $31, %esi, %edx
+; X86-NEXT: shldl $31, %edi, %esi
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: negl %edi
+; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movb %cl, 16(%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl $4
+;
+; X86-O0-LABEL: v_sdiv_i129_v_pow2k:
+; X86-O0: # %bb.0:
+; X86-O0-NEXT: pushl %ebp
+; X86-O0-NEXT: pushl %ebx
+; X86-O0-NEXT: pushl %edi
+; X86-O0-NEXT: pushl %esi
+; X86-O0-NEXT: subl $8, %esp
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-O0-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-O0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-O0-NEXT: movl %edx, %ecx
+; X86-O0-NEXT: andl $1, %ecx
+; X86-O0-NEXT: negl %ecx
+; X86-O0-NEXT: movl %ecx, %edi
+; X86-O0-NEXT: andl $1, %edi
+; X86-O0-NEXT: addl %ecx, %eax
+; X86-O0-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-O0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-O0-NEXT: adcl %edi, %esi
+; X86-O0-NEXT: adcl $0, %ebp
+; X86-O0-NEXT: adcl $0, %ebx
+; X86-O0-NEXT: adcl $0, %edx
+; X86-O0-NEXT: movl %edx, %edi
+; X86-O0-NEXT: shldl $31, %ebx, %edi
+; X86-O0-NEXT: shldl $31, %ebp, %ebx
+; X86-O0-NEXT: shldl $31, %esi, %ebp
+; X86-O0-NEXT: andl $1, %edx
+; X86-O0-NEXT: movl %edx, %esi
+; X86-O0-NEXT: negl %esi
+; X86-O0-NEXT: movl %ebp, (%ecx)
+; X86-O0-NEXT: movl %ebx, 4(%ecx)
+; X86-O0-NEXT: movl %edi, 8(%ecx)
+; X86-O0-NEXT: movl %esi, 12(%ecx)
+; X86-O0-NEXT: # kill: def $dl killed $dl killed $edx
+; X86-O0-NEXT: movb %dl, 16(%ecx)
+; X86-O0-NEXT: addl $8, %esp
+; X86-O0-NEXT: popl %esi
+; X86-O0-NEXT: popl %edi
+; X86-O0-NEXT: popl %ebx
+; X86-O0-NEXT: popl %ebp
+; X86-O0-NEXT: retl $4
+ %div = sdiv i129 %lhs, 8589934592
+ ret i129 %div
+}
+
+define i129 @v_sdiv_exact_i129_v_pow2k(i129 %lhs) nounwind {
+; X64-LABEL: v_sdiv_exact_i129_v_pow2k:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: andl $1, %edx
+; X64-NEXT: negq %rdx
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: shldq $32, %rdx, %rax
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: shrdq $33, %rsi, %rax
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: negq %rdx
+; X64-NEXT: shldq $31, %rsi, %rdx
+; X64-NEXT: retq
+;
+; X64-O0-LABEL: v_sdiv_exact_i129_v_pow2k:
+; X64-O0: # %bb.0:
+; X64-O0-NEXT: movl %edx, %eax
+; X64-O0-NEXT: andl $1, %eax
+; X64-O0-NEXT: movl %eax, %ecx
+; X64-O0-NEXT: negq %rcx
+; X64-O0-NEXT: movl %ecx, %r8d
+; X64-O0-NEXT: andl $1, %r8d
+; X64-O0-NEXT: # implicit-def: $rax
+; X64-O0-NEXT: movl %r8d, %eax
+; X64-O0-NEXT: shldq $32, %rcx, %rax
+; X64-O0-NEXT: addq %rax, %rdi
+; X64-O0-NEXT: adcq $0, %rsi
+; X64-O0-NEXT: adcq $0, %rdx
+; X64-O0-NEXT: movq %rsi, %rax
+; X64-O0-NEXT: shldq $31, %rdi, %rax
+; X64-O0-NEXT: movl %edx, %ecx
+; X64-O0-NEXT: andl $1, %ecx
+; X64-O0-NEXT: # kill: def $rcx killed $ecx
+; X64-O0-NEXT: movq %rcx, %rdx
+; X64-O0-NEXT: negq %rdx
+; X64-O0-NEXT: shldq $31, %rsi, %rdx
+; X64-O0-NEXT: retq
+;
+; X86-LABEL: v_sdiv_exact_i129_v_pow2k:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: addl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: shldl $31, %edx, %ebx
+; X86-NEXT: shldl $31, %esi, %edx
+; X86-NEXT: shldl $31, %edi, %esi
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: negl %edi
+; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movb %cl, 16(%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl $4
+;
+; X86-O0-LABEL: v_sdiv_exact_i129_v_pow2k:
+; X86-O0: # %bb.0:
+; X86-O0-NEXT: pushl %ebp
+; X86-O0-NEXT: pushl %ebx
+; X86-O0-NEXT: pushl %edi
+; X86-O0-NEXT: pushl %esi
+; X86-O0-NEXT: subl $8, %esp
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-O0-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-O0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-O0-NEXT: movl %edx, %ecx
+; X86-O0-NEXT: andl $1, %ecx
+; X86-O0-NEXT: negl %ecx
+; X86-O0-NEXT: movl %ecx, %edi
+; X86-O0-NEXT: andl $1, %edi
+; X86-O0-NEXT: addl %ecx, %eax
+; X86-O0-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-O0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-O0-NEXT: adcl %edi, %esi
+; X86-O0-NEXT: adcl $0, %ebp
+; X86-O0-NEXT: adcl $0, %ebx
+; X86-O0-NEXT: adcl $0, %edx
+; X86-O0-NEXT: movl %edx, %edi
+; X86-O0-NEXT: shldl $31, %ebx, %edi
+; X86-O0-NEXT: shldl $31, %ebp, %ebx
+; X86-O0-NEXT: shldl $31, %esi, %ebp
+; X86-O0-NEXT: andl $1, %edx
+; X86-O0-NEXT: movl %edx, %esi
+; X86-O0-NEXT: negl %esi
+; X86-O0-NEXT: movl %ebp, (%ecx)
+; X86-O0-NEXT: movl %ebx, 4(%ecx)
+; X86-O0-NEXT: movl %edi, 8(%ecx)
+; X86-O0-NEXT: movl %esi, 12(%ecx)
+; X86-O0-NEXT: # kill: def $dl killed $dl killed $edx
+; X86-O0-NEXT: movb %dl, 16(%ecx)
+; X86-O0-NEXT: addl $8, %esp
+; X86-O0-NEXT: popl %esi
+; X86-O0-NEXT: popl %edi
+; X86-O0-NEXT: popl %ebx
+; X86-O0-NEXT: popl %ebp
+; X86-O0-NEXT: retl $4
+ %div = sdiv exact i129 %lhs, 8589934592
+ ret i129 %div
+}
+
+define i129 @v_udiv_i129_v_pow2k(i129 %lhs) nounwind {
+; X64-LABEL: v_udiv_i129_v_pow2k:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: andl $1, %edx
+; X64-NEXT: shrdq $33, %rsi, %rax
+; X64-NEXT: shldq $31, %rsi, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: retq
+;
+; X64-O0-LABEL: v_udiv_i129_v_pow2k:
+; X64-O0: # %bb.0:
+; X64-O0-NEXT: movq %rsi, %rax
+; X64-O0-NEXT: shldq $31, %rdi, %rax
+; X64-O0-NEXT: movl %edx, %ecx
+; X64-O0-NEXT: andl $1, %ecx
+; X64-O0-NEXT: movl %ecx, %edx
+; X64-O0-NEXT: shldq $31, %rsi, %rdx
+; X64-O0-NEXT: xorl %ecx, %ecx
+; X64-O0-NEXT: # kill: def $rcx killed $ecx
+; X64-O0-NEXT: retq
+;
+; X86-LABEL: v_udiv_i129_v_pow2k:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: shrdl $1, %esi, %edx
+; X86-NEXT: shldl $31, %edi, %ecx
+; X86-NEXT: shldl $31, %esi, %edi
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl $0, 12(%eax)
+; X86-NEXT: movb $0, 16(%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
+;
+; X86-O0-LABEL: v_udiv_i129_v_pow2k:
+; X86-O0: # %bb.0:
+; X86-O0-NEXT: pushl %ebx
+; X86-O0-NEXT: pushl %edi
+; X86-O0-NEXT: pushl %esi
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-O0-NEXT: movl %ecx, %eax
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-O0-NEXT: shldl $31, %esi, %edx
+; X86-O0-NEXT: shldl $31, %edi, %esi
+; X86-O0-NEXT: shldl $31, %ebx, %edi
+; X86-O0-NEXT: movl %edi, (%ecx)
+; X86-O0-NEXT: movl %esi, 4(%ecx)
+; X86-O0-NEXT: movl %edx, 8(%ecx)
+; X86-O0-NEXT: movl $0, 12(%ecx)
+; X86-O0-NEXT: movb $0, 16(%ecx)
+; X86-O0-NEXT: popl %esi
+; X86-O0-NEXT: popl %edi
+; X86-O0-NEXT: popl %ebx
+; X86-O0-NEXT: retl $4
+ %div = udiv i129 %lhs, 8589934592
+ ret i129 %div
+}
+
+define i129 @v_udiv_exact_i129_v_pow2k(i129 %lhs) nounwind {
+; X64-LABEL: v_udiv_exact_i129_v_pow2k:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: andl $1, %edx
+; X64-NEXT: shrdq $33, %rsi, %rax
+; X64-NEXT: shldq $31, %rsi, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: retq
+;
+; X64-O0-LABEL: v_udiv_exact_i129_v_pow2k:
+; X64-O0: # %bb.0:
+; X64-O0-NEXT: movq %rsi, %rax
+; X64-O0-NEXT: shldq $31, %rdi, %rax
+; X64-O0-NEXT: movl %edx, %ecx
+; X64-O0-NEXT: andl $1, %ecx
+; X64-O0-NEXT: movl %ecx, %edx
+; X64-O0-NEXT: shldq $31, %rsi, %rdx
+; X64-O0-NEXT: xorl %ecx, %ecx
+; X64-O0-NEXT: # kill: def $rcx killed $ecx
+; X64-O0-NEXT: retq
+;
+; X86-LABEL: v_udiv_exact_i129_v_pow2k:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: shrdl $1, %esi, %edx
+; X86-NEXT: shldl $31, %edi, %ecx
+; X86-NEXT: shldl $31, %esi, %edi
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl $0, 12(%eax)
+; X86-NEXT: movb $0, 16(%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
+;
+; X86-O0-LABEL: v_udiv_exact_i129_v_pow2k:
+; X86-O0: # %bb.0:
+; X86-O0-NEXT: pushl %ebx
+; X86-O0-NEXT: pushl %edi
+; X86-O0-NEXT: pushl %esi
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-O0-NEXT: movl %ecx, %eax
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-O0-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-O0-NEXT: shldl $31, %esi, %edx
+; X86-O0-NEXT: shldl $31, %edi, %esi
+; X86-O0-NEXT: shldl $31, %ebx, %edi
+; X86-O0-NEXT: movl %edi, (%ecx)
+; X86-O0-NEXT: movl %esi, 4(%ecx)
+; X86-O0-NEXT: movl %edx, 8(%ecx)
+; X86-O0-NEXT: movl $0, 12(%ecx)
+; X86-O0-NEXT: movb $0, 16(%ecx)
+; X86-O0-NEXT: popl %esi
+; X86-O0-NEXT: popl %edi
+; X86-O0-NEXT: popl %ebx
+; X86-O0-NEXT: retl $4
+ %div = udiv exact i129 %lhs, 8589934592
+ ret i129 %div
+}
>From a2fe1d8447c9b6ce9cc06e0be3e465a699da99ca Mon Sep 17 00:00:00 2001
From: Anshul Nigham <nigham at google.com>
Date: Fri, 6 Feb 2026 00:45:46 -0800
Subject: [PATCH 32/33] Fix Bazel build for d005cb2 (#180134)
Co-authored-by: Pranav Kant <prka at google.com>
---
.../llvm-project-overlay/llvm/BUILD.bazel | 30 +++++++++++++++----
1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index e2a6a97f62a2b..97ec5c8bca39e 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -1542,13 +1542,29 @@ cc_library(
],
)
+cc_library(
+ name = "AVRTargetParser",
+ srcs = ["lib/TargetParser/AVRTargetParser.cpp"],
+ hdrs = ["include/llvm/TargetParser/AVRTargetParser.h"],
+ copts = llvm_copts,
+ deps = [
+ ":BinaryFormat",
+ ":Object",
+ ":Support",
+ ":TargetParser",
+ ],
+)
+
cc_library(
name = "TargetParser",
srcs = glob(
[
"lib/TargetParser/*.cpp",
],
- exclude = ["lib/TargetParser/PPCTargetParser.cpp"],
+ exclude = [
+ "lib/TargetParser/AVRTargetParser.cpp",
+ "lib/TargetParser/PPCTargetParser.cpp",
+ ],
) + select({
"@platforms//os:windows": glob([
"lib/TargetParser/Windows/*.inc",
@@ -1557,9 +1573,10 @@ cc_library(
"lib/TargetParser/Unix/*.inc",
]),
}),
- hdrs = glob([
- "include/llvm/TargetParser/*.h",
- ]),
+ hdrs = glob(
+ ["include/llvm/TargetParser/*.h"],
+ exclude = ["include/llvm/TargetParser/AVRTargetParser.h"],
+ ),
copts = llvm_copts,
includes = ["include"],
textual_hdrs = [
@@ -3780,7 +3797,9 @@ gentbl_cc_library(
":config",
":" + target["name"] + "CommonTableGen",
":" + target["name"] + "Info",
- ] + target.get("tbl_deps", []),
+ ] + target.get("tbl_deps", []) + (
+ [":AVRTargetParser"] if target["name"] == "AVR" else []
+ ),
)],
[cc_library(
name = target["name"] + "CodeGen",
@@ -5723,6 +5742,7 @@ cc_library(
]),
copts = llvm_copts,
deps = [
+ ":AVRTargetParser",
":AllTargetsAsmParsers",
":AllTargetsCodeGens",
":AllTargetsDisassemblers",
>From d41746e9d545eb6fdd00a9977110ee703473483d Mon Sep 17 00:00:00 2001
From: YunQiang Su <syq at debian.org>
Date: Sun, 8 Feb 2026 19:45:08 +0800
Subject: [PATCH 33/33] Not swap op0 and op1 if op1 is NaN
---
llvm/lib/Analysis/InstructionSimplify.cpp | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index a3e0994b72986..2701e229df9f0 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -7064,7 +7064,11 @@ Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType,
return Op0;
// Canonicalize constant operand as Op1.
- if (isa<Constant>(Op0))
+ bool Op1IsNaN = false;
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Op1)) {
+ Op1IsNaN = CFP->getValueAPF().isNaN();
+ }
+ if (isa<Constant>(Op0) && !Op1IsNaN)
std::swap(Op0, Op1);
if (Constant *C = dyn_cast<Constant>(Op1)) {
More information about the Mlir-commits
mailing list