[llvm] [AArch64] Guard against non-simple types in udiv sve costs. (PR #148580)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 15 00:46:20 PDT 2025
https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/148580
>From dd26153ec59a769d586b084f7172a410fa226af0 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Tue, 15 Jul 2025 08:45:53 +0100
Subject: [PATCH] [AArch64] Guard against non-simple types in udiv sve costs.
The code here probably needs to change to handle types more uniformly, but this
prevents it from trying to use a simple type where one does not exist.
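For context, the failure mode being guarded against: the cost-table lookup takes
an MVT, and EVT::getSimpleVT() asserts when the EVT has no corresponding MVT, so
isSimple() has to be checked before the lookup. Below is a minimal, hypothetical
sketch of the guarded pattern, not the patch itself; the table entries and the
getDivCost helper name are illustrative only.

// Hypothetical sketch of the guarded cost-table lookup; the DivTbl
// entries and the helper name are illustrative, not part of the patch.
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/InstructionCost.h"
#include <optional>
using namespace llvm;

static const CostTblEntry DivTbl[] = {
    {ISD::UDIV, MVT::v4i16, 5}, // illustrative entry only
};

// Returns a table cost only when Ty's EVT maps to a simple (MVT) type.
// Calling VT.getSimpleVT() without the isSimple() guard asserts for a
// fixed vector whose EVT has no MVT equivalent and which legalization
// must first split or widen.
static std::optional<InstructionCost>
getDivCost(const TargetLoweringBase &TLI, const DataLayout &DL, Type *Ty,
           unsigned ISDOpc) {
  EVT VT = TLI.getValueType(DL, Ty);
  if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
      Ty->getPrimitiveSizeInBits().getFixedValue() < 128)
    if (const auto *Entry = CostTableLookup(DivTbl, ISDOpc, VT.getSimpleVT()))
      return InstructionCost(Entry->Cost);
  return std::nullopt;
}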
---
.../AArch64/AArch64TargetTransformInfo.cpp | 6 +-
.../Analysis/CostModel/AArch64/sve-div.ll | 8 +
.../CodeGen/AArch64/saturating-vec-smull.ll | 265 ++++++++++++++++++
3 files changed, 275 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8904402065696..07baf29ce7016 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4126,10 +4126,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
// SDIV/UDIV operations are lowered using SVE, so their cost can be
// lower.
- if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
- ->getPrimitiveSizeInBits()
- .getFixedValue() < 128) {
- EVT VT = TLI->getValueType(DL, Ty);
+ if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
+ Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
static const CostTblEntry DivTbl[]{
{ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
{ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-div.ll b/llvm/test/Analysis/CostModel/AArch64/sve-div.ll
index c055d3218f65b..84a9ca0fab6d8 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-div.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-div.ll
@@ -10,6 +10,7 @@ define void @sdiv() {
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V6i32 = sdiv <6 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i32 = sdiv <8 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, undef
@@ -19,6 +20,7 @@ define void @sdiv() {
; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V6i8 = sdiv <6 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, undef
@@ -48,6 +50,7 @@ define void @sdiv() {
%V8i64 = sdiv <8 x i64> undef, undef
%V2i32 = sdiv <2 x i32> undef, undef
%V4i32 = sdiv <4 x i32> undef, undef
+ %V6i32 = sdiv <6 x i32> undef, undef
%V8i32 = sdiv <8 x i32> undef, undef
%V16i32 = sdiv <16 x i32> undef, undef
%V2i16 = sdiv <2 x i16> undef, undef
@@ -57,6 +60,7 @@ define void @sdiv() {
%V32i16 = sdiv <32 x i16> undef, undef
%V2i8 = sdiv <2 x i8> undef, undef
%V4i8 = sdiv <4 x i8> undef, undef
+ %V6i8 = sdiv <6 x i8> undef, undef
%V8i8 = sdiv <8 x i8> undef, undef
%V16i8 = sdiv <16 x i8> undef, undef
%V32i8 = sdiv <32 x i8> undef, undef
@@ -89,6 +93,7 @@ define void @udiv() {
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = udiv <8 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = udiv <2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = udiv <4 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V6i32 = udiv <6 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i32 = udiv <8 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = udiv <16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = udiv <2 x i16> undef, undef
@@ -98,6 +103,7 @@ define void @udiv() {
; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = udiv <32 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = udiv <2 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = udiv <4 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V6i8 = udiv <6 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = udiv <8 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = udiv <16 x i8> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = udiv <32 x i8> undef, undef
@@ -127,6 +133,7 @@ define void @udiv() {
%V8i64 = udiv <8 x i64> undef, undef
%V2i32 = udiv <2 x i32> undef, undef
%V4i32 = udiv <4 x i32> undef, undef
+ %V6i32 = udiv <6 x i32> undef, undef
%V8i32 = udiv <8 x i32> undef, undef
%V16i32 = udiv <16 x i32> undef, undef
%V2i16 = udiv <2 x i16> undef, undef
@@ -136,6 +143,7 @@ define void @udiv() {
%V32i16 = udiv <32 x i16> undef, undef
%V2i8 = udiv <2 x i8> undef, undef
%V4i8 = udiv <4 x i8> undef, undef
+ %V6i8 = udiv <6 x i8> undef, undef
%V8i8 = udiv <8 x i8> undef, undef
%V16i8 = udiv <16 x i8> undef, undef
%V32i8 = udiv <32 x i8> undef, undef
diff --git a/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
new file mode 100644
index 0000000000000..cd82b35a48292
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll
@@ -0,0 +1,265 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-none-elf < %s | FileCheck %s
+
+
+define <2 x i16> @saturating_2xi16(<2 x i16> %a, <2 x i16> %b) {
+; CHECK-LABEL: saturating_2xi16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #16
+; CHECK-NEXT: shl v1.2s, v1.2s, #16
+; CHECK-NEXT: sshr v0.2s, v0.2s, #16
+; CHECK-NEXT: sshr v1.2s, v1.2s, #16
+; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: movi v1.2s, #127, msl #8
+; CHECK-NEXT: sshr v0.2s, v0.2s, #15
+; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
+ %as = sext <2 x i16> %a to <2 x i32>
+ %bs = sext <2 x i16> %b to <2 x i32>
+ %m = mul <2 x i32> %bs, %as
+ %sh = ashr <2 x i32> %m, splat (i32 15)
+ %ma = tail call <2 x i32> @llvm.smin.v4i32(<2 x i32> %sh, <2 x i32> splat (i32 32767))
+ %t = trunc <2 x i32> %ma to <2 x i16>
+ ret <2 x i16> %t
+}
+
+define <4 x i16> @saturating_4xi16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: saturating_4xi16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT: movi v1.4s, #127, msl #8
+; CHECK-NEXT: sshr v0.4s, v0.4s, #15
+; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %as = sext <4 x i16> %a to <4 x i32>
+ %bs = sext <4 x i16> %b to <4 x i32>
+ %m = mul <4 x i32> %bs, %as
+ %sh = ashr <4 x i32> %m, splat (i32 15)
+ %ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 32767))
+ %t = trunc <4 x i32> %ma to <4 x i16>
+ ret <4 x i16> %t
+}
+
+define <6 x i16> @saturating_6xi16(<6 x i16> %a, <6 x i16> %b) {
+; CHECK-LABEL: saturating_6xi16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smull2 v3.4s, v1.8h, v0.8h
+; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: sshr v0.4s, v0.4s, #15
+; CHECK-NEXT: sshr v1.4s, v3.4s, #15
+; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %as = sext <6 x i16> %a to <6 x i32>
+ %bs = sext <6 x i16> %b to <6 x i32>
+ %m = mul <6 x i32> %bs, %as
+ %sh = ashr <6 x i32> %m, splat (i32 15)
+ %ma = tail call <6 x i32> @llvm.smin.v6i32(<6 x i32> %sh, <6 x i32> splat (i32 32767))
+ %t = trunc <6 x i32> %ma to <6 x i16>
+ ret <6 x i16> %t
+}
+
+define <8 x i16> @saturating_8xi16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: saturating_8xi16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smull2 v3.4s, v1.8h, v0.8h
+; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: sshr v0.4s, v0.4s, #15
+; CHECK-NEXT: sshr v1.4s, v3.4s, #15
+; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %as = sext <8 x i16> %a to <8 x i32>
+ %bs = sext <8 x i16> %b to <8 x i32>
+ %m = mul <8 x i32> %bs, %as
+ %sh = ashr <8 x i32> %m, splat (i32 15)
+ %ma = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 32767))
+ %t = trunc <8 x i32> %ma to <8 x i16>
+ ret <8 x i16> %t
+}
+
+define <2 x i32> @saturating_2xi32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: saturating_2xi32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: sshr v0.2d, v0.2d, #31
+; CHECK-NEXT: cmgt v2.2d, v1.2d, v0.2d
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: ret
+ %as = sext <2 x i32> %a to <2 x i64>
+ %bs = sext <2 x i32> %b to <2 x i64>
+ %m = mul <2 x i64> %bs, %as
+ %sh = ashr <2 x i64> %m, splat (i64 31)
+ %ma = tail call <2 x i64> @llvm.smin.v8i64(<2 x i64> %sh, <2 x i64> splat (i64 2147483647))
+ %t = trunc <2 x i64> %ma to <2 x i32>
+ ret <2 x i32> %t
+}
+
+define <4 x i32> @saturating_4xi32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: saturating_4xi32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smull2 v2.2d, v1.4s, v0.4s
+; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
+; CHECK-NEXT: sshr v0.2d, v0.2d, #31
+; CHECK-NEXT: sshr v1.2d, v2.2d, #31
+; CHECK-NEXT: dup v2.2d, x8
+; CHECK-NEXT: cmgt v3.2d, v2.2d, v1.2d
+; CHECK-NEXT: cmgt v4.2d, v2.2d, v0.2d
+; CHECK-NEXT: bif v1.16b, v2.16b, v3.16b
+; CHECK-NEXT: bif v0.16b, v2.16b, v4.16b
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+ %as = sext <4 x i32> %a to <4 x i64>
+ %bs = sext <4 x i32> %b to <4 x i64>
+ %m = mul <4 x i64> %bs, %as
+ %sh = ashr <4 x i64> %m, splat (i64 31)
+ %ma = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %sh, <4 x i64> splat (i64 2147483647))
+ %t = trunc <4 x i64> %ma to <4 x i32>
+ ret <4 x i32> %t
+}
+
+define <8 x i32> @saturating_8xi32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: saturating_8xi32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smull2 v4.2d, v3.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v3.2s, v1.2s
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
+; CHECK-NEXT: smull2 v3.2d, v2.4s, v0.4s
+; CHECK-NEXT: smull v0.2d, v2.2s, v0.2s
+; CHECK-NEXT: dup v2.2d, x8
+; CHECK-NEXT: sshr v4.2d, v4.2d, #31
+; CHECK-NEXT: sshr v1.2d, v1.2d, #31
+; CHECK-NEXT: sshr v0.2d, v0.2d, #31
+; CHECK-NEXT: sshr v3.2d, v3.2d, #31
+; CHECK-NEXT: cmgt v5.2d, v2.2d, v4.2d
+; CHECK-NEXT: cmgt v6.2d, v2.2d, v1.2d
+; CHECK-NEXT: cmgt v7.2d, v2.2d, v3.2d
+; CHECK-NEXT: cmgt v16.2d, v2.2d, v0.2d
+; CHECK-NEXT: bif v4.16b, v2.16b, v5.16b
+; CHECK-NEXT: bif v1.16b, v2.16b, v6.16b
+; CHECK-NEXT: bif v3.16b, v2.16b, v7.16b
+; CHECK-NEXT: bif v0.16b, v2.16b, v16.16b
+; CHECK-NEXT: uzp1 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: ret
+ %as = sext <8 x i32> %a to <8 x i64>
+ %bs = sext <8 x i32> %b to <8 x i64>
+ %m = mul <8 x i64> %bs, %as
+ %sh = ashr <8 x i64> %m, splat (i64 31)
+ %ma = tail call <8 x i64> @llvm.smin.v8i64(<8 x i64> %sh, <8 x i64> splat (i64 2147483647))
+ %t = trunc <8 x i64> %ma to <8 x i32>
+ ret <8 x i32> %t
+}
+
+define <2 x i64> @saturating_2xi32_2xi64(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: saturating_2xi32_2xi64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: sshr v0.2d, v0.2d, #31
+; CHECK-NEXT: cmgt v2.2d, v1.2d, v0.2d
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: ret
+ %as = sext <2 x i32> %a to <2 x i64>
+ %bs = sext <2 x i32> %b to <2 x i64>
+ %m = mul <2 x i64> %bs, %as
+ %sh = ashr <2 x i64> %m, splat (i64 31)
+ %ma = tail call <2 x i64> @llvm.smin.v8i64(<2 x i64> %sh, <2 x i64> splat (i64 2147483647))
+ ret <2 x i64> %ma
+}
+
+define <4 x i16> @unsupported_saturation_value_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: unsupported_saturation_value_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT: movi v1.4s, #42
+; CHECK-NEXT: sshr v0.4s, v0.4s, #15
+; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %as = sext <4 x i16> %a to <4 x i32>
+ %bs = sext <4 x i16> %b to <4 x i32>
+ %m = mul <4 x i32> %bs, %as
+ %sh = ashr <4 x i32> %m, splat (i32 15)
+ %ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 42))
+ %t = trunc <4 x i32> %ma to <4 x i16>
+ ret <4 x i16> %t
+}
+
+define <4 x i16> @unsupported_shift_value_v4i16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: unsupported_shift_value_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT: movi v1.4s, #127, msl #8
+; CHECK-NEXT: sshr v0.4s, v0.4s, #3
+; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %as = sext <4 x i16> %a to <4 x i32>
+ %bs = sext <4 x i16> %b to <4 x i32>
+ %m = mul <4 x i32> %bs, %as
+ %sh = ashr <4 x i32> %m, splat (i32 3)
+ %ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 32767))
+ %t = trunc <4 x i32> %ma to <4 x i16>
+ ret <4 x i16> %t
+}
+
+define <2 x i16> @extend_to_illegal_type(<2 x i16> %a, <2 x i16> %b) {
+; CHECK-LABEL: extend_to_illegal_type:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-NEXT: shl v0.2d, v0.2d, #48
+; CHECK-NEXT: shl v1.2d, v1.2d, #48
+; CHECK-NEXT: sshr v0.2d, v0.2d, #48
+; CHECK-NEXT: sshr v1.2d, v1.2d, #48
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: xtn v1.2s, v1.2d
+; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: sshr v0.2d, v0.2d, #15
+; CHECK-NEXT: cmgt v2.2d, v1.2d, v0.2d
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: ret
+ %as = sext <2 x i16> %a to <2 x i48>
+ %bs = sext <2 x i16> %b to <2 x i48>
+ %m = mul <2 x i48> %bs, %as
+ %sh = ashr <2 x i48> %m, splat (i48 15)
+ %ma = tail call <2 x i48> @llvm.smin.v4i32(<2 x i48> %sh, <2 x i48> splat (i48 32767))
+ %t = trunc <2 x i48> %ma to <2 x i16>
+ ret <2 x i16> %t
+}
+
+define <2 x i11> @illegal_source(<2 x i11> %a, <2 x i11> %b) {
+; CHECK-LABEL: illegal_source:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #21
+; CHECK-NEXT: shl v1.2s, v1.2s, #21
+; CHECK-NEXT: sshr v0.2s, v0.2s, #21
+; CHECK-NEXT: sshr v1.2s, v1.2s, #21
+; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: movi v1.2s, #127, msl #8
+; CHECK-NEXT: sshr v0.2s, v0.2s, #15
+; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
+ %as = sext <2 x i11> %a to <2 x i32>
+ %bs = sext <2 x i11> %b to <2 x i32>
+ %m = mul <2 x i32> %bs, %as
+ %sh = ashr <2 x i32> %m, splat (i32 15)
+ %ma = tail call <2 x i32> @llvm.smin.v4i32(<2 x i32> %sh, <2 x i32> splat (i32 32767))
+ %t = trunc <2 x i32> %ma to <2 x i11>
+ ret <2 x i11> %t
+}
+